In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score,confusion_matrix


In [4]:
# Load Both Datasets
df1 = pd.read_csv('indian_crop_weather.csv')
df2 = pd.read_csv('crop_recommendation.csv')


In [5]:
df1.head()

Unnamed: 0,Dist Code,Year,State Code,State Name,Dist Name,Crop,Area_ha,Yield_kg_per_ha,N_req_kg_per_ha,P_req_kg_per_ha,K_req_kg_per_ha,Total_N_kg,Total_P_kg,Total_K_kg,Temperature_C,Humidity_%,pH,Rainfall_mm,Wind_Speed_m_s,Solar_Radiation_MJ_m2_day
0,1,1966,14,Chhattisgarh,Durg,rice,548000.0,337.59,8.43975,4.05108,7.42698,4624983.0,2219991.84,4069985.04,25,80,6.5,1200,2.0,18
1,1,1966,14,Chhattisgarh,Durg,maize,3000.0,666.67,18.00009,8.00004,11.33339,54000.27,24000.12,34000.17,22,70,6.0,800,2.5,20
2,1,1966,14,Chhattisgarh,Durg,chickpea,54000.0,500.0,9.0,5.0,9.0,486000.0,270000.0,486000.0,20,60,6.5,600,1.5,16
3,1,1967,14,Chhattisgarh,Durg,rice,547000.0,747.71,18.69275,8.97252,16.44962,10224934.25,4907968.44,8997942.14,25,80,6.5,1200,2.0,18
4,1,1967,14,Chhattisgarh,Durg,maize,3000.0,1000.0,27.0,12.0,17.0,81000.0,36000.0,51000.0,22,70,6.0,800,2.5,20


In [6]:
df2.head()

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label
0,90,42,43,20.879744,82.002744,6.502985,202.935536,rice
1,85,58,41,21.770462,80.319644,7.038096,226.655537,rice
2,60,55,44,23.004459,82.320763,7.840207,263.964248,rice
3,74,35,40,26.491096,80.158363,6.980401,242.864034,rice
4,78,42,42,20.130175,81.604873,7.628473,262.71734,rice


In [7]:
df1.rename(columns={
    "Temperature_C": "temperature",
    "Humidity_%": "humidity",
    "Rainfall_mm": "rainfall",
    "Crop": "label",
    "pH": "ph",
    "Yield_kg_per_ha": "Production",
    "Dist Name": "District"
}, inplace=True)
# for debugging
print("\nRenamed df1 columns:", df1.columns.tolist()) 


Renamed df1 columns: ['Dist Code', 'Year', 'State Code', 'State Name', 'District', 'label', 'Area_ha', 'Production', 'N_req_kg_per_ha', 'P_req_kg_per_ha', 'K_req_kg_per_ha', 'Total_N_kg', 'Total_P_kg', 'Total_K_kg', 'temperature', 'humidity', 'ph', 'rainfall', 'Wind_Speed_m_s', 'Solar_Radiation_MJ_m2_day']


In [8]:
# filter Required Columns
df1_filtered = df1[["temperature", "humidity", "ph", "rainfall", "label", "Production","District"]].copy()
df2_filtered = df2[["temperature", "humidity", "ph", "rainfall", "label"]].copy()
df2_filtered["Production"] = np.nan
filler_district = "Unknown"
df2_filtered["District"] = filler_district

In [9]:
 # Combining Both Datasets 
balanced_df = pd.concat([df1_filtered, df2_filtered], ignore_index=True)
print("Columns in balanced_df:", balanced_df.columns.tolist())



Columns in balanced_df: ['temperature', 'humidity', 'ph', 'rainfall', 'label', 'Production', 'District']


In [10]:
#  Checking class distribution before encoding
print("\nCrop Distribution (Before Encoding):")
print(merged_df['label'].value_counts())

plt.figure(figsize=(12,6))
merged_df['label'].value_counts().plot(kind='bar',color="lightgreen")
plt.title("Crop Distribution (Raw Labels)")
plt.ylabel("Number of Samples")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()



Crop Distribution (Before Encoding):


NameError: name 'merged_df' is not defined

In [None]:
# Encode Crop Labels 
le = LabelEncoder()
balanced_df['label'] = le.fit_transform(balanced_df['label'].astype(str))

print("\nLabel Mapping:")
for i, crop in enumerate(le.classes_):
    print(f"{i}: {crop}")


In [None]:
# Encode district 
district_encoder = LabelEncoder()
balanced_df['District'] = district_encoder.fit_transform(balanced_df['District'].astype(str))


In [None]:
# Prepare Features and Labels 
X = balanced_df.drop(["label","Production"], axis=1)
y = balanced_df["label"]


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
model = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
model.fit(X_train, y_train)
print("\nTraining Data Ranges:")
for col in X.columns:
    print(f"{col}: min = {X[col].min()}, max = {X[col].max()}")

In [None]:
# Evaluate Model 
y_pred = model.predict(X_test)
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

In [None]:
# feature Importance
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances.sort_values().plot(kind='barh', figsize=(8,4), color="skyblue")
plt.title("Feature Importance")
plt.xlabel("Importance Score")
plt.ylabel("Feature")
plt.tight_layout()
plt.show()

In [None]:
# Yield Info Lookup Function
def get_yield_info(crop_name):
    crop_rows = balanced_df[balanced_df['label'] == le.transform([crop_name])[0]]
    valid_production = crop_rows['Production'].dropna()
    if valid_production.empty:
        return " No production data available."
    return f"Average Yield: {valid_production.mean():.2f} kg/ha"

In [None]:
# predicting function
def recommend_crop(temp, hum, ph, rain, district_name):
    if district_name not in district_encoder.classes_:
        return "District not recognized. Please enter a valid district."

    district_code = district_encoder.transform([district_name])[0]

    if not (8.8 <= temp <= 43.6 and 14 <= hum <= 99.9 and 3.5 <= ph <= 9.9 and 20 <= rain <= 1200):
        return "Input values are out of training data range. Please enter realistic values."

    input_df = pd.DataFrame([[temp, hum, ph, rain, district_code]], columns=X.columns)
    prediction = model.predict(input_df)
    predicted_crop = le.inverse_transform(prediction)[0]
    yield_info = get_yield_info(predicted_crop)
   c

In [None]:
# Test the Function 
print("\n,Recommended Crop:", recommend_crop(26, 80, 6.5, 1200,"Durg"))


In [None]:
# Save Model and Label Encoder 
joblib.dump(model, "crop_recommendation_model.pkl")
joblib.dump(le, "label_encoder.pkl")
joblib.dump(district_encoder, "district_encoder.pkl")
print("\n Model and encoders saved successfully!")

In [None]:
recommend_crop(27.5, 85, 6.8, 1000, "Aligarh")

In [None]:
recommend_crop(30.2, 70, 7.2, 800, "Gorakhpur")

In [None]:
recommend_crop(31.0, 60, 7.5, 350, "Lucknow")

In [11]:
import os
print("Current Working Directory:", os.getcwd())


Current Working Directory: C:\Users\nikki
