In [None]:
# Install required packages
!pip install pandas scikit-learn xgboost imbalanced-learn seaborn matplotlib lightgbm category_encoders --quiet

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, GroupKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier
from sklearn.metrics import (accuracy_score, f1_score, confusion_matrix,
                           classification_report, mean_squared_error, r2_score)
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import joblib
from category_encoders import TargetEncoder

# =============================================
# DATA LOADING & CLEANING (ENHANCED)
# =============================================

# Read the crop dataset from the Colab working directory
dataset_path = '/content/cleaned_dataset.csv'
df = pd.read_csv(dataset_path)

# Standardize and clean categorical columns
def clean_crop_name(name):
    """Normalize a raw crop label to the spelling used by ``crop_categories``.

    The label is lower-cased and stripped, then known aliases are replaced by
    their canonical name. Canonical names are spelled exactly as they appear
    in the category table ('pigeon pea', 'chili pepper', ...) so aliased
    crops are not later discarded as uncategorized.

    Args:
        name: Raw crop label. Non-string values (e.g. NaN from an empty CSV
            cell) are returned unchanged so that the subsequent ``dropna``
            can remove them instead of this function raising.

    Returns:
        The canonical lower-case crop name, or ``name`` itself when it is
        not a string.
    """
    if not isinstance(name, str):
        return name
    name = name.lower().strip()
    replacements = {
        'chilli': 'chili pepper',
        'chili': 'chili pepper',
        'pearl millet (bajra)': 'pearl millet',
        'tur (pigeonpea)': 'pigeon pea',
        'pigeonpea': 'pigeon pea',
        'green gram': 'mung bean',
    }
    return replacements.get(name, name)

# Canonicalize crop labels
df['crop'] = df['crop'].map(clean_crop_name)

# Lower-case and trim the free-text location / soil / month columns
text_cols = ['state', 'district', 'soil_type', 'month']
for text_col in text_cols:
    df[text_col] = df[text_col].str.lower().str.strip()

# Coerce measurements to numbers; unparseable entries become NaN
num_cols = ['soil_ph', 'rainfall_mm', 'temperature_c']
for num_col in num_cols:
    df[num_col] = pd.to_numeric(df[num_col], errors='coerce')

# Remove exact duplicate rows, then rows missing a measurement or the crop
df = df.drop_duplicates()
df = df.dropna(subset=[*num_cols, 'crop'])

# =============================================
# CROP CLASS OPTIMIZATION (IMPROVED)
# =============================================

# Category name -> crops belonging to it. Crop names are lower-case to match
# the cleaned 'crop' column.
crop_categories = dict(
    cereals=[
        'wheat', 'rice', 'maize', 'barley',
        'sorghum', 'pearl millet', 'ragi', 'jowar',
    ],
    pulses=[
        'chickpea', 'urad dal', 'mung bean', 'lentil', 'pigeon pea',
        'rajma', 'black-eyed pea', 'adzuki bean', 'green bean', 'cowpea',
        'field bean', 'horse gram', 'lathyrus',
    ],
    oilseeds=[
        'mustard', 'sunflower', 'groundnut', 'sesame',
        'castor', 'linseed', 'soybean', 'peanut',
    ],
    vegetables=[
        'tomato', 'potato', 'onion', 'brinjal', 'cabbage', 'cauliflower',
        'pumpkin', 'okra', 'bottle gourd', 'cucumber', 'chili pepper',
        'spinach', 'amaranth', 'carrot', 'bitter gourd', 'broccoli',
        'lettuce', 'radish',
    ],
    fruits=[
        'mango', 'banana', 'orange', 'kinnow', 'lemon', 'lime', 'guava',
        'watermelon', 'papaya', 'grapes', 'apple', 'pear', 'plum',
        'litchi', 'pineapple', 'peach', 'cherry', 'jackfruit', 'grapefruit',
    ],
    spices_and_condiments=[
        'turmeric', 'coriander', 'ginger', 'garlic', 'cumin',
        'black pepper', 'cardamom', 'tamarind', 'fenugreek',
    ],
    plantation_crops=['tea', 'coffee', 'coconut', 'arecanut', 'cashew'],
    dryland_and_fiber_crops=['cotton', 'jute'],
    nuts=['almond', 'hazelnut', 'walnut'],
    medicinal_and_misc=['ashwagandha', 'moringa', 'betel leaf', 'tobacco'],
    tubers=['tapioca'],
)

# Build a crop -> category lookup. setdefault keeps the first category that
# lists a crop, which is what a first-match scan over the table would return.
crop_to_category = {}
for category_name, category_crops in crop_categories.items():
    for crop_name in category_crops:
        crop_to_category.setdefault(crop_name, category_name)

# Label every row; crops absent from the table get None and are discarded
df['crop_category'] = df['crop'].map(lambda crop: crop_to_category.get(crop))
df = df.dropna(subset=['crop_category'])

# Keep only categories that have at least `min_samples` rows
min_samples = 50
category_counts = df['crop_category'].value_counts()
valid_categories = category_counts[category_counts >= min_samples].index
df = df[df['crop_category'].isin(valid_categories)]

# =============================================
# FEATURE ENGINEERING (ENHANCED)
# =============================================

# Cyclical month encoding: parse the abbreviated month once and project the
# zero-based month number onto the unit circle
month_number = pd.to_datetime(df['month'], format='%b').dt.month
month_angle = 2 * np.pi * (month_number - 1)/12
df['month_sin'] = np.sin(month_angle)
df['month_cos'] = np.cos(month_angle)

# Soil fertility indicator: 1 when pH is strictly between 6 and 7.5 and
# rainfall exceeds 500 mm, otherwise 0
ph_above_min = df['soil_ph'] > 6
ph_below_max = df['soil_ph'] < 7.5
enough_rain = df['rainfall_mm'] > 500
df['fertility_index'] = np.where(ph_above_min & ph_below_max & enough_rain, 1, 0)

# Per (district, month) averages broadcast back onto every row
district_month = df.groupby(['district', 'month'])
df['district_month_avg_ph'] = district_month['soil_ph'].transform('mean')
df['district_month_avg_rain'] = district_month['rainfall_mm'].transform('mean')

# Growing Degree Days: degrees above the base temperature, floored at zero
base_temp = 10
degrees_above_base = df['temperature_c'] - base_temp
df['gdd'] = np.maximum(degrees_above_base, 0)

# Rainfall seasonality flag for June through September
monsoon_months = ['jun', 'jul', 'aug', 'sep']
df['is_monsoon'] = df['month'].isin(monsoon_months).astype(int)

# =============================================
# CROP CATEGORY RECOMMENDATION MODEL (IMPROVED)
# =============================================

# Features & Target
features = [
    'soil_type', 'soil_ph', 'temperature_c', 'rainfall_mm',
    'month_sin', 'month_cos', 'fertility_index',
    'district_month_avg_ph', 'district_month_avg_rain',
    'gdd', 'is_monsoon', 'district_encoded'
]
y = df['crop_category']

# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split row positions FIRST. 'district_encoded' is derived from the target,
# so it must be learned from training rows only; computing it on the whole
# frame would leak test labels into a feature and inflate the metrics.
row_positions = np.arange(len(df))
train_pos, test_pos, y_train, y_test = train_test_split(
    row_positions, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# District encoding (frequency-based): most common crop category per district,
# measured on the training rows only
train_rows = df.iloc[train_pos]
district_encoding = train_rows.groupby('district')['crop_category'].agg(
    lambda x: x.value_counts().index[0]
)
# Districts absent from the training rows (or missing) fall back to the
# overall most common training category
fallback_category = train_rows['crop_category'].value_counts().index[0]
df['district_encoded'] = df['district'].map(district_encoding).fillna(fallback_category)

# Materialize the feature matrices from the positions chosen above
X = df[features]
X_train, X_test = X.iloc[train_pos], X.iloc[test_pos]

# Preprocessing: target-encode the two categorical columns and standardize
# the numeric ones
categorical_features = ['soil_type', 'district_encoded']
numeric_features = [
    'soil_ph', 'temperature_c', 'rainfall_mm',
    'month_sin', 'month_cos', 'fertility_index',
    'district_month_avg_ph', 'district_month_avg_rain',
    'gdd', 'is_monsoon'
]
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', TargetEncoder(), categorical_features),
        ('num', StandardScaler(), numeric_features),
    ])

# Oversampler and classifier are built separately, then chained in an
# imbalanced-learn pipeline
oversampler = SMOTE(random_state=42, sampling_strategy='not majority')
lgbm_params = dict(
    random_state=42,
    n_estimators=300,
    max_depth=5,
    learning_rate=0.05,
    colsample_bytree=0.8,
    subsample=0.8,
    class_weight='balanced',
)
classifier = LGBMClassifier(**lgbm_params)
model = ImbPipeline([
    ('preprocessor', preprocessor),
    ('sampler', oversampler),
    ('classifier', classifier),
])

# Fit on the training split. NOTE: the fold splitter below is constructed but
# not used by this single fit; no cross-validation is actually run here.
print("Training Improved Crop Category Recommendation Model...")
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
model.fit(X_train, y_train)

# Evaluation on the held-out split
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
test_weighted_f1 = f1_score(y_test, y_pred, average='weighted')
print("\n--- Improved Crop Category Recommendation Model ---")
print(f"Accuracy: {test_accuracy:.2f}")
print(f"Weighted F1: {test_weighted_f1:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# Feature Importance
try:
    plt.figure(figsize=(12, 8))
    feature_importances = model.named_steps['classifier'].feature_importances_
    # The classifier receives columns in ColumnTransformer output order (all
    # columns of the first transformer, then the second, ...), which differs
    # from the order of `features`. Both transformers used here emit one
    # output column per input column, so concatenating their column lists
    # gives the correct label for each importance value.
    feature_names = [
        column
        for _, _, columns in model.named_steps['preprocessor'].transformers
        for column in columns
    ]
    pd.Series(feature_importances, index=feature_names).sort_values().plot(kind='barh')
    plt.title('Feature Importance')
    plt.show()
except Exception as e:
    # Plotting is best-effort; a failure here must not abort the run
    print(f"Could not plot feature importance: {str(e)}")

# Save artifacts: the fitted pipeline and the label encoder needed to decode
# its predictions
joblib.dump(model, 'improved_crop_category_model.pkl')
joblib.dump(label_encoder, 'crop_category_encoder.pkl')

# =============================================
# SOIL pH PREDICTION MODEL (VALIDATED)
# =============================================

# Check if soil pH prediction is actually needed
if df['soil_ph'].nunique() > 10:  # Only proceed if sufficient variability
    # Features & Target
    features_ph = ['state', 'district', 'month_sin', 'month_cos']
    X_ph = df[features_ph]
    y_ph = df['soil_ph']

    # Train-test split stratified by district, so districts are represented
    # proportionally in both splits. This is NOT a grouped split: the same
    # district can appear in train and test.
    groups = df['district']
    X_train_ph, X_test_ph, y_train_ph, y_test_ph = train_test_split(
        X_ph, y_ph, test_size=0.2, random_state=42, stratify=groups
    )

    # Preprocessing: one-hot encode the location columns and pass the
    # cyclical month features through unchanged. ColumnTransformer drops
    # unlisted columns by default, which would silently discard
    # month_sin / month_cos.
    preprocessor_ph = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore'), ['state', 'district'])
        ],
        remainder='passthrough')

    # Model Pipeline
    model_ph = Pipeline([
        ('preprocessor', preprocessor_ph),
        ('regressor', XGBRegressor(
            random_state=42,
            n_estimators=100,
            max_depth=3
        ))
    ])

    # Single fit on the training split. NOTE: the GroupKFold splitter below
    # is constructed but not used; no cross-validation is run here.
    print("\nTraining Soil pH Model...")
    cv = GroupKFold(n_splits=5)
    model_ph.fit(X_train_ph, y_train_ph)

    # Evaluation on the held-out split
    y_pred_ph = model_ph.predict(X_test_ph)
    rmse = np.sqrt(mean_squared_error(y_test_ph, y_pred_ph))
    print("\n--- Soil pH Prediction Model ---")
    print(f"RMSE: {rmse:.2f}")
    print(f"R²: {r2_score(y_test_ph, y_pred_ph):.2f}")

    # Save model
    joblib.dump(model_ph, 'soil_ph_model.pkl')
else:
    print("\nSoil pH shows insufficient variability - skipping regression model")

# =============================================
# PREDICTION EXAMPLES (FIXED)
# =============================================

# Example 1: Crop Prediction
# Raw observation for the example field. Derived features are computed with
# the same rules as the training features so they cannot contradict the raw
# values (rainfall of 15 mm does not satisfy the fertility rule, for example).
sample_month = 1  # January
sample_soil_ph = 6.5
sample_temperature_c = 22
sample_rainfall_mm = 15
sample_month_angle = 2 * np.pi * (sample_month - 1)/12

sample_data = pd.DataFrame({
    'soil_type': ['vertisols'],
    'soil_ph': [sample_soil_ph],
    'temperature_c': [sample_temperature_c],
    'rainfall_mm': [sample_rainfall_mm],
    'month_sin': [np.sin(sample_month_angle)],
    'month_cos': [np.cos(sample_month_angle)],
    'fertility_index': [int(6 < sample_soil_ph < 7.5 and sample_rainfall_mm > 500)],
    'district_month_avg_ph': [6.8],
    'district_month_avg_rain': [12],
    'gdd': [max(sample_temperature_c - base_temp, 0)],
    'is_monsoon': [int(sample_month in (6, 7, 8, 9))],
    'district_encoded': ['cereals']  # Most common in district
})

# Select the training columns in training order. Indexing raises KeyError if
# a feature is missing, instead of silently creating an all-NaN column.
sample_data_fixed = sample_data[list(X_train.columns)]

predicted_category = label_encoder.inverse_transform(model.predict(sample_data_fixed))[0]
print(f"\nRecommended crop category: {predicted_category}")
print(f"Potential crops in this category: {', '.join(crop_categories.get(predicted_category, []))}")

# Example 2: Soil pH Prediction (only when the pH model was trained)
if 'model_ph' in locals():
    sample_ph_data = pd.DataFrame({
        'state': ['chhattisgarh'],
        'district': ['durg'],
        'month_sin': [np.sin(sample_month_angle)],  # January
        'month_cos': [np.cos(sample_month_angle)]
    })
    # Same strict column selection as above
    sample_ph_data_fixed = sample_ph_data[list(X_train_ph.columns)]
    predicted_ph = model_ph.predict(sample_ph_data_fixed)[0]
    print(f"Predicted soil pH: {predicted_ph:.2f}")

In [None]:
# ============================================================
# 📦 Setup
# ============================================================
!pip install -q pandas scikit-learn joblib lightgbm imbalanced-learn

import re
import math
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from imblearn.over_sampling import SMOTENC

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, classification_report
from imblearn.over_sampling import SMOTE
from lightgbm import LGBMClassifier

RANDOM_STATE = 42
MIN_SAMPLES_PER_CLASS = 10  # more aggressive merging
CSV_PATH = "/content/crop_steps.csv"

# ============================================================
# 🧼 Load & Clean
# ============================================================
df = pd.read_csv(CSV_PATH)
# Column names: trimmed, spaces to underscores, lower-case
df = df.rename(columns=lambda c: c.strip().replace(" ", "_").lower())

cols_needed = [
    "crop_name","days","stage","irrigation","irrigation_type",
    "fertilizer_type","fertilizer_dosage"
]
# Create any expected column that the file lacks, filled with NaN
for needed in cols_needed:
    if needed not in df.columns:
        df[needed] = np.nan


def _normalize_text(value):
    """Return a trimmed lower-case string, or NaN for any non-string value."""
    if isinstance(value, str):
        return value.strip().lower()
    return np.nan


# Normalize strings, keep NaN
text_columns = ["crop_name","stage","irrigation","irrigation_type","fertilizer_type","fertilizer_dosage"]
for text_column in text_columns:
    df[text_column] = df[text_column].map(_normalize_text)

# Numeric day count; rows without a crop name or a day value are unusable
df["days"] = pd.to_numeric(df["days"], errors="coerce")
df = df.dropna(subset=["crop_name","days"])
df = df.drop_duplicates()
df = df.reset_index(drop=True)

# ============================================================
# 🔤 Stage grouping
# ============================================================
def map_stage_group(s):
    """Collapse a free-text growth-stage label into a coarse stage group.

    Groups are tested in priority order: harvest, maturity, reproductive,
    sowing, vegetative. The first group with a matching keyword wins.

    The cereal "ear" keyword is matched as a whole word ("ear", "ears",
    "earhead", "earing") rather than as a substring, so labels such as
    "early vegetative", "first year" or "earthing up" are no longer
    classified as reproductive. "bearing" is listed explicitly because it
    previously matched only through the "ear" substring.

    Args:
        s: Stage label; any non-string value maps to "other".

    Returns:
        One of "harvest", "maturity", "reproductive", "sowing",
        "vegetative" or "other".
    """
    if not isinstance(s, str):
        return "other"
    x = s.lower()
    if "harvest" in x:
        return "harvest"
    if any(k in x for k in ("maturity", "ripen", "mature")):
        return "maturity"
    reproductive_keywords = (
        "flower", "bloom", "anthesis", "tassel", "panicl", "silk", "fruit",
        "pod", "boll", "grain fill", "reproductive", "bearing",
    )
    if any(k in x for k in reproductive_keywords) or re.search(r"\bear(?:s|head|ing)?\b", x):
        return "reproductive"
    if any(k in x for k in ("sow", "seed", "nursery", "germination", "transplant")):
        return "sowing"
    if any(k in x for k in ("veg", "tillering", "elongation", "leaf", "stem", "v-stage")):
        return "vegetative"
    return "other"

# Coarse stage group for every row, derived from the free-text stage label
df["stage_grouped"] = df["stage"].map(map_stage_group)

# ============================================================
# 🚰 Irrigation type grouping
# ============================================================
def map_irrigation_type(t):
    """Map a free-text irrigation method to a coarse irrigation group.

    Keyword groups are checked in priority order (drip, sprinkler, furrow,
    flood, rainfed). After that, exact "no irrigation" tokens map to "none"
    and labels listing alternatives (" or ", "/") map to "mixed".

    Args:
        t: Irrigation method label; any non-string value maps to "other".

    Returns:
        One of "drip", "sprinkler", "furrow", "flood", "rainfed", "none",
        "mixed" or "other".
    """
    if not isinstance(t, str):
        return "other"
    text = t.lower()

    keyword_groups = (
        ("drip", ("drip",)),
        ("sprinkler", ("sprinkler",)),
        ("furrow", ("furrow",)),
        ("flood", ("flood", "basin", "surface")),
        ("rainfed", ("rainfed", "natural")),
    )
    for label, keywords in keyword_groups:
        for keyword in keywords:
            if keyword in text:
                return label

    if text in {"none", "-", "–", "na", "n/a", "no irrigation"}:
        return "none"
    if " or " in text or "/" in text:
        return "mixed"
    return "other"

# Coarse irrigation-method group for every row
df["irrigation_type_grouped"] = df["irrigation_type"].map(map_irrigation_type)

# ============================================================
# 🧪 Fertilizer type grouping + dosage
# ============================================================
def map_fert_type(t):
    """Map a free-text fertilizer label to a coarse fertilizer group.

    Groups are tested in priority order: none, npk, organic, nitrogen,
    phosphorus, potassium, micronutrients.

    Element symbols (n, p, k, fe, mn, zn) are matched as whole words only.
    In particular "fe" is no longer matched as a substring, which used to
    classify every otherwise-unmatched label containing the word
    "fertilizer" as micronutrients. Spelled-out iron/manganese names are
    listed explicitly so they keep mapping to micronutrients.

    Args:
        t: Fertilizer label; any non-string value maps to "other".

    Returns:
        One of "none", "npk", "organic", "nitrogen", "phosphorus",
        "potassium", "micronutrients" or "other".
    """
    if not isinstance(t, str):
        return "other"
    x = t.lower()
    if x in ("-", "–", "na", "n/a", "none", "no", "no fertilizer"):
        return "none"
    if "npk" in x or "starter" in x or "basal" in x:
        return "npk"
    if "fym" in x or "compost" in x or "manure" in x or "organic" in x:
        return "organic"
    if "nitrogen" in x or re.search(r"\bn\b", x):
        return "nitrogen"
    if "phosphorus" in x or re.search(r"\bp\b", x):
        return "phosphorus"
    if "potassium" in x or re.search(r"\bk\b", x):
        return "potassium"
    micronutrient_words = ("micro", "boron", "zinc", "ferrous", "manganese")
    if any(k in x for k in micronutrient_words) or re.search(r"\b(?:fe|mn|zn|iron)\b", x):
        return "micronutrients"
    return "other"

def extract_dosage(s):
    if not isinstance(s, str): return (np.nan, "unknown")
    txt = s.lower()
    num_match = re.findall(r"(\d+(?:\.\d+)?)\s*(?:-|to|–)\s*(\d+(?:\.\d+)?)", txt)
    if num_match:
        lo, hi = map(float, num_match[0])
        amount = (lo + hi) / 2
    else:
        nums = re.findall(r"\d+(?:\.\d+)?", txt)
        amount = float(nums[0]) if nums else np.nan
    unit = "unknown"
    if "per plant" in txt or "/ plant" in txt: unit = "per plant"
    elif "per acre" in txt or "/ acre" in txt: unit = "per acre"
    elif "per ha" in txt or "/ ha" in txt or "per hectare" in txt: unit = "per ha"
    return (amount, unit)

# Coarse fertilizer group per row
df["fertilizer_type_grouped"] = df["fertilizer_type"].map(map_fert_type)

# Parse each dosage text once into (amount, unit), then split into two columns
dos_pairs = df["fertilizer_dosage"].map(extract_dosage)
df["fert_dose_amount"] = [amount for amount, _unit in dos_pairs]
df["fert_dose_unit"] = [unit for _amount, unit in dos_pairs]

# ============================================================
# ⚖️ Merge rare classes
# ============================================================
def merge_rare(y, min_samples=MIN_SAMPLES_PER_CLASS):
    """Replace infrequent class labels with the catch-all label "other".

    Args:
        y: Series of class labels.
        min_samples: Labels occurring fewer than this many times are merged.

    Returns:
        A Series like ``y`` in which every rare label is "other".
    """
    frequencies = y.value_counts()
    rare_labels = set(frequencies.index[frequencies < min_samples])

    def relabel(label):
        return "other" if label in rare_labels else label

    return y.map(relabel)

# ============================================================
# 🧠 Training helper
# ============================================================
def train_classifier(X, y, model_name):
    """Train, evaluate and save a LightGBM classifier for one target.

    Steps: merge rare target classes into "other", make a stratified 80/20
    split, fill missing values using statistics learned on the training
    split, oversample the training split with SMOTENC, fit a one-hot +
    scaling + LightGBM pipeline, print test metrics, and dump the pipeline
    to ``/content/{model_name}_model.joblib``.

    The same fill values are applied to the training and the test split, so
    the reported metrics are measured on data prepared the same way as the
    data the model was fitted on.

    NOTE: the saved pipeline itself contains no imputation step; callers
    that predict with it should fill missing values the same way first.

    Args:
        X: Feature DataFrame. Object-dtype columns are treated as
            categorical, numeric columns as continuous.
        y: Target Series aligned with ``X``.
        model_name: Label used in the printed report and the output file name.

    Returns:
        The fitted ``Pipeline`` (preprocessing followed by the classifier).
    """
    y = merge_rare(y)

    # Split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
    )

    cat_cols = [c for c in X.columns if X[c].dtype == "object"]
    num_cols = [c for c in X.columns if np.issubdtype(X[c].dtype, np.number)]

    # Fill values are learned from the training split only
    train_medians = {col: X_train[col].median() for col in num_cols}

    def fill_missing(frame):
        """Return a copy of ``frame`` with NaNs replaced by the train-derived fills."""
        filled = frame.copy()
        for col in cat_cols:
            filled[col] = filled[col].fillna("missing")
        for col in num_cols:
            filled[col] = filled[col].fillna(train_medians[col])
        return filled

    X_train_filled = fill_missing(X_train)
    X_test_filled = fill_missing(X_test)

    # Categorical columns are cast to pandas 'category' for the resampler
    for col in cat_cols:
        X_train_filled[col] = X_train_filled[col].astype("category")

    # Categorical feature indices for SMOTENC
    cat_indices = [X_train_filled.columns.get_loc(c) for c in cat_cols]

    # SMOTENC needs k_neighbors < size of the smallest class. The merged
    # "other" class can be smaller than MIN_SAMPLES_PER_CLASS, so shrink k
    # when necessary and skip oversampling if a class has a single row.
    smallest_class = int(y_train.value_counts().min())
    k_neighbors = min(5, smallest_class - 1)
    if k_neighbors >= 1:
        smote = SMOTENC(
            categorical_features=cat_indices,
            random_state=RANDOM_STATE,
            k_neighbors=k_neighbors,
        )
        X_bal, y_bal = smote.fit_resample(X_train_filled, y_train)
    else:
        X_bal, y_bal = X_train_filled, y_train

    # Pipeline
    pre = ColumnTransformer([
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", StandardScaler(), num_cols)
    ])

    model = LGBMClassifier(
        random_state=RANDOM_STATE,
        class_weight="balanced",
        n_estimators=300,
        learning_rate=0.05
    )

    pipe = Pipeline([("pre", pre), ("clf", model)])
    pipe.fit(X_bal, y_bal)

    # Evaluate on the test split, imputed like the training data
    preds = pipe.predict(X_test_filled)
    acc = accuracy_score(y_test, preds)
    f1m = f1_score(y_test, preds, average="macro")

    print(f"\n=== {model_name.upper()} ===")
    print(f"Accuracy: {acc:.3f} | Macro F1: {f1m:.3f}")
    print(classification_report(y_test, preds, zero_division=0))

    joblib.dump(pipe, f"/content/{model_name}_model.joblib")
    print(f"💾 Saved -> /content/{model_name}_model.joblib")
    return pipe

# ============================================================
# 1) Irrigation requirement
# ============================================================
def map_irrig_need(v):
    """Reduce an irrigation flag to "yes", "no" or NaN.

    Args:
        v: Raw irrigation value. Strings are lower-cased and trimmed before
            being compared with the accepted tokens.

    Returns:
        "yes" for affirmative or partial tokens, "no" for negative tokens,
        NaN for non-strings and unrecognized text.
    """
    if isinstance(v, str):
        token = v.lower().strip()
        if token in ("yes", "y", "1", "true", "limited", "light", "partial"):
            return "yes"
        if token in ("no", "n", "0", "false", "skip"):
            return "no"
    return np.nan

# Binary irrigation target; rows with an unrecognized flag are left out
df["irrigation_bin"] = df["irrigation"].map(map_irrig_need)
df_irri = df.dropna(subset=["irrigation_bin"])
irrigation_model = train_classifier(
    df_irri[["crop_name","days"]],
    df_irri["irrigation_bin"],
    "irrigation_requirement",
)

# ============================================================
# 2) Stage prediction
# ============================================================
stage_features = ["crop_name","days","fert_dose_amount"]
stage_model = train_classifier(df[stage_features], df["stage_grouped"], "stage_prediction")

# ============================================================
# 3) Irrigation type
# ============================================================
irrig_type_features = ["crop_name","days","stage_grouped"]
irrig_type_model = train_classifier(
    df[irrig_type_features], df["irrigation_type_grouped"], "irrigation_type"
)

# ============================================================
# 4) Fertilizer type
# ============================================================
fert_type_features = ["crop_name","days","stage_grouped","fert_dose_amount"]
fert_type_model = train_classifier(
    df[fert_type_features], df["fertilizer_type_grouped"], "fertilizer_type_grouped"
)

# ============================================================
# Dosage table
# ============================================================
# Median parsed dose per (crop, stage group, fertilizer group); NaN keys are
# kept as their own groups
dose_keys = ["crop_name","stage_grouped","fertilizer_type_grouped"]
dose_table = (
    df.groupby(dose_keys, dropna=False)["fert_dose_amount"]
    .median()
    .reset_index()
    .rename(columns={"fert_dose_amount":"median_dose"})
)
dose_table.to_parquet("/content/fertilizer_median_dose_table.parquet", index=False)

print("\n✅ All models trained & saved.")
