<a href="https://colab.research.google.com/github/bhargavi1973/FasalSaathi/blob/main/CropClassifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Extracting the file from zip folder


In [1]:
import zipfile
import os

zip_path = '/content/dataset8.zip'
extract_path = '/content/dataset8_extracted'

# Unzip the file
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

print(f"'{zip_path}' unzipped to '{extract_path}'")

# Find the CSV file in the extracted directory
csv_file = None
for root, _, files in os.walk(extract_path):
    for file in files:
        if file.endswith('.csv'):
            csv_file = os.path.join(root, file)
            break
    if csv_file:
        break

if csv_file:
    print(f"Found CSV file: {os.path.basename(csv_file)}")
    # Optionally, update the CSV_PATH in the next cell if necessary
    # For this notebook, the path is already set correctly in the next cell
    print(f"New CSV path: {csv_file}")
else:
    print("No CSV file found in the extracted directory.")

'/content/dataset8.zip' unzipped to '/content/dataset8_extracted'
Found CSV file: Crop_recommendation.csv
New CSV path: /content/dataset8_extracted/Crop_recommendation.csv


Training the xgboost classifier

In [2]:
# yield_predict.py
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

# -----------------------
# Config
# -----------------------
# Updated path to use the extracted CSV from dataset8.zip
CSV_PATH = "/content/dataset8_extracted/Crop_recommendation.csv"
# Update TARGET_COL based on the new dataset (assuming 'label' or similar, adjust if needed)
TARGET_COL = "label" # This is the target column for classification
RANDOM_STATE = 42

In [3]:
# -----------------------
# Load dataset
# -----------------------
if not os.path.exists(CSV_PATH):
    # Removed the demo dataset creation as we are using a specific file
    raise FileNotFoundError(f"{CSV_PATH} not found.")


df = pd.read_csv(CSV_PATH)
# Update required_cols based on the new dataset (adjust if needed after inspecting the data)
required_cols = {'N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall', TARGET_COL}
if not required_cols.issubset(set(df.columns)):
    # Print existing columns to help identify the correct ones
    print(f"Columns in the new CSV: {df.columns.tolist()}")
    raise ValueError(f"CSV must contain columns: {required_cols}. Found: {df.columns.tolist()}")


In [7]:


# -----------------------
# Helper: create synthetic demo dataset if none provided
# This function is no longer needed as we are using a specific dataset
# -----------------------
# def create_demo_csv(path):
#     np.random.seed(RANDOM_STATE)
#     n = 1000
#     crops = ['rice', 'maize', 'wheat', 'groundnut']
#     df = pd.DataFrame({
#         'N': np.random.uniform(10, 180, n),            # kg/ha available or applied
#         'P': np.random.uniform(5, 80, n),
#         'K': np.random.uniform(5, 120, n),
#         'temp': np.random.uniform(18, 35, n),         # mean temp degC
#         'humidity': np.random.uniform(40, 95, n),     # %
#         'ph': np.random.uniform(4.5, 8.5, n),
#         'rainfall': np.random.uniform(100, 1500, n),  # mm over season
#         'crop_type': np.random.choice(crops, n)
#     })
#     # synthetic yield generation (toy function)
#     def synthetic_yield(row):
#         base = {'rice': 4000, 'maize': 5000, 'wheat': 3500, 'groundnut': 2500}[row['crop_type']]
#         # contributions
#         n_eff = min(row['N'], 150) * 6    # rough kg/ha per N kg
#         p_eff = min(row['P'], 60) * 4
#         k_eff = min(row['K'], 80) * 3
#         rain_factor = -abs(row['rainfall'] - 800) * 0.5  # penalty if too far from 800
#         temp_penalty = -max(0, (row['temp'] - 30)) * 20
#         ph_penalty = -abs(row['ph'] - 6.5) * 100
#         noise = np.random.normal(0, 200)
#         return base + n_eff + p_eff + k_eff + rain_factor + temp_penalty + ph_penalty + noise

#     df[TARGET_COL] = df.apply(synthetic_yield, axis=1).clip(lower=200)
#     df.to_csv(path, index=False)
#     print(f"Demo dataset written to {path}")
# Basic cleaning
df = df.dropna(subset=[TARGET_COL])  # must have target
# If there are obvious outliers or bad rows you may want to handle them here

# -----------------------
# Features & preprocessing
# -----------------------
# Update feature names based on the new dataset
numeric_features = ['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']
categorical_features = [] # Assuming the new dataset has no categorical features besides the target

# Pipelines
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    # for tree models scaling is not necessary; if using linear models add StandardScaler()
])

# Keep categorical transformer in case needed later, but it's empty for now
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Adjust preprocessor based on the updated feature lists
if categorical_features:
    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])
else:
     preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features)
    ])

# Encode the target variable since it's categorical (text)
label_encoder = LabelEncoder()
df[TARGET_COL] = label_encoder.fit_transform(df[TARGET_COL])


X = df[numeric_features + categorical_features]
y = df[TARGET_COL].values

# Create train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=RANDOM_STATE)

# Fit preprocessing
X_train_trans = preprocessor.fit_transform(X_train)
X_test_trans = preprocessor.transform(X_test)

# Get feature names for interpretation
# Adjust feature name retrieval based on whether categorical features exist
if categorical_features:
    ohe = preprocessor.named_transformers_['cat'].named_steps['onehot']
    # get_feature_names_out returns a numpy array in newer versions
    cat_names = list(ohe.get_feature_names_out(categorical_features))
    feature_names = numeric_features + cat_names
else:
    feature_names = numeric_features


# -----------------------
# Train XGBoost classifier
# Note: We are switching to a classifier as the target is categorical
# -----------------------
model = xgb.XGBClassifier(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=RANDOM_STATE,
    use_label_encoder=False, # Deprecated in newer XGBoost, set to False
    eval_metric='merror' # Metric for multi-class classification error
)

# use early stopping on a validation split
X_tr, X_val, y_tr, y_val = train_test_split(X_train_trans, y_train, test_size=0.15, random_state=RANDOM_STATE)
model.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    verbose=50
)

# -----------------------
# Evaluate
# Note: These evaluation metrics are for classification
# -----------------------
y_pred = model.predict(X_test_trans)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


# -----------------------
# Feature importance (gain)
# -----------------------
try:
    importances = model.get_booster().get_score(importance_type='gain')
    # map to full feature names (xgboost uses f0..fN)
    fmap = {}
    for k, v in importances.items():
        idx = int(k.replace('f', ''))
        fmap[feature_names[idx]] = v
    print("\nFeature importances (by gain):")
    for fn, val in sorted(fmap.items(), key=lambda x: -x[1])[:20]:
        print(f"  {fn}: {val:.4f}")
except Exception:
    pass

# -----------------------
# Plot predicted vs actual (Not applicable for classification, removing)
# Note: This plot is for regression. If the new dataset is for classification,
# a different visualization (e.g., confusion matrix, classification report) is needed.
# -----------------------
# plt.figure(figsize=(6,6))
# sns.scatterplot(x=y_test, y=y_pred, alpha=0.6)
# maxv = max(max(y_test), max(y_pred))
# minv = min(min(y_test), min(y_pred))
# plt.plot([minv, maxv], [minv, maxv], 'r--', linewidth=1.2)
# plt.xlabel('Actual yield (kg/ha)') # Label might need adjustment
# plt.ylabel('Predicted yield (kg/ha)') # Label might need adjustment
# plt.title('Predicted vs Actual Yield') # Title might need adjustment
# plt.grid(alpha=0.3)
# plt.tight_layout()
# plt.show()


# -----------------------
# Save preprocessing + model
# -----------------------
os.makedirs('artifacts', exist_ok=True)
joblib.dump(preprocessor, 'artifacts/preprocessor.joblib')
joblib.dump(model, 'artifacts/xgb_crop_classifier.joblib') # Model name adjusted
print("Saved preprocessor and model to artifacts/")

# -----------------------
# Example: predict for a single farmer input - function definition moved to this cell
# -----------------------
def predict_crop(N, P, K, temperature, humidity, ph, rainfall):
    """
    Predicts the recommended crop based on environmental conditions.

    Args:
        N (float): Nitrogen content in the soil.
        P (float): Phosphorus content in the soil.
        K (float): Potassium content in the soil.
        temperature (float): Temperature in Celsius.
        humidity (float): Humidity percentage.
        ph (float): pH level of the soil.
        rainfall (float): Rainfall in mm.

    Returns:
        str: The recommended crop label, or None if an error occurred.
    """
    # Ensure preprocessor, model, and label_encoder are loaded
    try:
        preprocessor = joblib.load('artifacts/preprocessor.joblib')
        model = joblib.load('artifacts/xgb_crop_classifier.joblib')
        # Access label_encoder from the global scope where it was trained
        label_encoder = globals().get('label_encoder')
        if label_encoder is None:
            raise FileNotFoundError("label_encoder not found. Please run the cell where the model was trained.")

    except FileNotFoundError as e:
        print(f"Error loading artifacts: {e}. Make sure you have run the previous cell to train and save the model.")
        return None
    except Exception as e:
        print(f"An error occurred during loading: {e}")
        return None


    # Create a DataFrame from the input
    input_data = pd.DataFrame([{
        'N': N,
        'P': P,
        'K': K,
        'temperature': temperature,
        'humidity': humidity,
        'ph': ph,
        'rainfall': rainfall
    }])

    # Preprocess the input data
    input_data_trans = preprocessor.transform(input_data)

    # Make a prediction
    prediction = model.predict(input_data_trans)

    # Convert the predicted label (numeric) back to the crop name (string)
    predicted_crop = label_encoder.inverse_transform(prediction)

    return predicted_crop[0]

[0]	validation_0-merror:0.05694


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[50]	validation_0-merror:0.01068
[100]	validation_0-merror:0.01068
[150]	validation_0-merror:0.01068
[200]	validation_0-merror:0.01423
[250]	validation_0-merror:0.01423
[300]	validation_0-merror:0.01423
[350]	validation_0-merror:0.01423
[400]	validation_0-merror:0.01423
[450]	validation_0-merror:0.01423
[500]	validation_0-merror:0.01423
[550]	validation_0-merror:0.01779
[600]	validation_0-merror:0.01779
[650]	validation_0-merror:0.01779
[700]	validation_0-merror:0.01779
[750]	validation_0-merror:0.01779
[800]	validation_0-merror:0.01779
[850]	validation_0-merror:0.01779
[900]	validation_0-merror:0.01779
[950]	validation_0-merror:0.01779
[999]	validation_0-merror:0.01779
Accuracy: 0.9848

Classification Report:


TypeError: object of type 'numpy.int64' has no len()

In [5]:
# Example usage:
# You can replace these values with input taken from the user
n_val = 90
p_val = 42
k_val = 43
temp_val = 20.88
humidity_val = 82.0
ph_val = 6.5
rainfall_val = 200

recommended_crop = predict_crop(n_val, p_val, k_val, temp_val, humidity_val, ph_val, rainfall_val)

if recommended_crop:
    print(f"\nBased on the input conditions, the recommended crop is: {recommended_crop}")


Based on the input conditions, the recommended crop is: rice
