In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, f1_score
import joblib
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.calibration import CalibratedClassifierCV


In [3]:
# Read the spoilage dataset from the working directory into `df`.
print("Loading dataset...")
df = pd.read_csv('synthetic_spoilage_data.csv')

# Overall size, then a preview of the first rows.
print(f"Dataset shape: {df.shape}")
print("\nFirst few rows:", df.head(), sep="\n")

# Per-column summaries: dtypes, null counts and descriptive statistics.
print("\nData types:", df.dtypes, sep="\n")
print("\nMissing values:", df.isna().sum(), sep="\n")
print("\nBasic statistics:", df.describe(), sep="\n")

# Row counts for each value of the target column.
print("\nClass distribution:", df['label_safe'].value_counts(), sep="\n")

Loading dataset...
Dataset shape: (92000, 8)

First few rows:
          crop_type category  temperature_c  humidity_percent  vpd_kpa  \
0  Mango (Alphonso)    Fruit           32.9              86.1     0.70   
1  Mango (Alphonso)    Fruit            8.0              36.3     0.68   
2  Mango (Alphonso)    Fruit           18.9              54.3     1.00   
3  Mango (Alphonso)    Fruit           27.4              93.6     0.23   
4  Mango (Alphonso)    Fruit           33.3              33.0     3.43   

   transit_hours  spoilage_risk  label_safe  
0          101.8          0.658           0  
1          159.7          1.000           0  
2          162.1          0.306           1  
3          137.0          0.545           0  
4          133.9          1.000           0  

Data types:
crop_type            object
category             object
temperature_c       float64
humidity_percent    float64
vpd_kpa             float64
transit_hours       float64
spoilage_risk       float64
label_sa

In [4]:
# Work on an independent (deep) copy so the columns added below never touch `df`.
data = df.copy(deep=True)

In [5]:
# Integer-encode the two string columns so they can be used as model features.
# Both fitted encoders are kept under their own names: the string -> integer
# mapping learned here must be reapplied unchanged to any data scored later.
label_encoder = LabelEncoder()  # crop_type encoder (name kept; later cells read it)
data['crop_type_encoded'] = label_encoder.fit_transform(data['crop_type'])

# This encoder used to be created inline and thrown away, which made the
# category mapping impossible to reuse or persist next to the model.
category_encoder = LabelEncoder()
data['category_encoded'] = category_encoder.fit_transform(data['category'])

In [6]:
# Show the crop names known to the fitted crop_type encoder, in encoder order.
print("\nCrop types:", list(label_encoder.classes_))


Crop types: ['Amaranthus (Chaulai)', 'Amla (Indian Gooseberry)', 'Ash Gourd (Petha)', 'Avocado', 'Banana (Robusta/Cavendish)', 'Banana (Yelakki/Elaichi)', 'Basil (Tulsi)', 'Beetroot (Chukandar)', 'Betel Leaf (Paan)', 'Bitter Gourd (Karela)', 'Black Pepper', 'Bottle Gourd (Lauki)', 'Brinjal (Eggplant/Baingan)', 'Cabbage (Patta Gobi)', 'Capsicum (Simla Mirch)', 'Cardamom (Green)', 'Carrot (Gajar - Red)', 'Cauliflower (Gobi)', 'Chana Dal (Bengal Gram)', 'Chrysanthemum', 'Clove', 'Cluster Beans (Gavar)', 'Coconut (Mature)', 'Coconut (Tender)', 'Coriander Leaves (Dhania)', 'Cucumber (Kheera)', 'Cumin Seeds', 'Curry Leaves (Kadi Patta)', 'Custard Apple (Sitaphal)', 'Dragon Fruit', 'Drumstick (Moringa)', 'Elephant Foot Yam (Suran)', 'Fenugreek Leaves (Methi)', 'Fig (Pune Anjeer)', 'Garlic (Lahsun)', 'Gerbera', 'Ginger (Adrak)', 'Gladiolus', 'Grapes (Bangalore Blue)', 'Grapes (Thompson Seedless)', 'Green Chili (Hari Mirch)', 'Green Peas (Matar)', 'Groundnut (Peanut)', 'Guava (Allahabad Safeda

In [7]:
# Model inputs: four numeric measurements plus the two encoded categoricals.
features = [
    'temperature_c',
    'humidity_percent',
    'vpd_kpa',
    'transit_hours',
    'crop_type_encoded',
    'category_encoded',
]

# Feature matrix and binary target.
X = data.loc[:, features]
y = data.loc[:, 'label_safe']

In [8]:
# Two-stage stratified split. Stage 1: hold out 15% of all rows as the test set.
X_temp, X_test, y_temp, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    stratify=y,
    random_state=42,
)

# Stage 2: carve the validation set out of the remainder. 0.176 of the
# remaining 85% is roughly 15% of the full dataset.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.176,
    stratify=y_temp,
    random_state=42,
)

In [9]:
# Report the number of rows in each split.
print(
    f"\nTrain set: {len(X_train)} samples",
    f"Validation set: {len(X_val)} samples",
    f"Test set: {len(X_test)} samples",
    sep="\n",
)


Train set: 64436 samples
Validation set: 13764 samples
Test set: 13800 samples


In [10]:
# Learn the scaling statistics from the training split only, then apply that
# same fitted transform to all three splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)


In [11]:
# Fit a class-weighted random forest on the scaled training data and score it
# on the validation split.
print("\nTraining Random Forest...")
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=15,
    min_samples_split=10,
    min_samples_leaf=4,
    class_weight='balanced',
    n_jobs=-1,
    random_state=42,
).fit(X_train_scaled, y_train)

# Probabilities come from the second predict_proba column (the second entry of
# rf_model.classes_); hard labels come from predict().
rf_val_pred_proba = rf_model.predict_proba(X_val_scaled)[:, 1]
rf_val_pred = rf_model.predict(X_val_scaled)

# AUC is computed from the probabilities, F1 from the hard labels.
rf_val_auc = roc_auc_score(y_val, rf_val_pred_proba)
rf_val_f1 = f1_score(y_val, rf_val_pred)
print(f"Random Forest - Validation AUC: {rf_val_auc:.4f}, F1: {rf_val_f1:.4f}")


Training Random Forest...
Random Forest - Validation AUC: 0.9623, F1: 0.8928


In [12]:
# Fit a gradient-boosted ensemble on the scaled training data and score it on
# the validation split.
print("Training Gradient Boosting...")
gb_model = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    subsample=0.8,
    max_depth=7,
    min_samples_split=10,
    min_samples_leaf=4,
    random_state=42,
).fit(X_train_scaled, y_train)

# Probabilities come from the second predict_proba column (the second entry of
# gb_model.classes_); hard labels come from predict().
gb_val_pred_proba = gb_model.predict_proba(X_val_scaled)[:, 1]
gb_val_pred = gb_model.predict(X_val_scaled)

# AUC is computed from the probabilities, F1 from the hard labels.
gb_val_auc = roc_auc_score(y_val, gb_val_pred_proba)
gb_val_f1 = f1_score(y_val, gb_val_pred)
print(f"Gradient Boosting - Validation AUC: {gb_val_auc:.4f}, F1: {gb_val_f1:.4f}")


Training Gradient Boosting...
Gradient Boosting - Validation AUC: 0.9906, F1: 0.9569


In [13]:
# Keep whichever model scored the higher validation AUC; a tie goes to the
# random forest.
if rf_val_auc >= gb_val_auc:
    best_model, best_name = rf_model, "Random Forest"
else:
    best_model, best_name = gb_model, "Gradient Boosting"
print(f"\n✓ Selected best model: {best_name}")



✓ Selected best model: Gradient Boosting


In [14]:
# Final evaluation of the selected model on the held-out test split.
print("\n" + "="*80, "TEST SET EVALUATION", "="*80, sep="\n")

# Probabilities (second predict_proba column) feed the AUC; hard labels feed
# the F1 score, the classification report and the confusion matrix.
y_test_pred_proba = best_model.predict_proba(X_test_scaled)[:, 1]
y_test_pred = best_model.predict(X_test_scaled)
test_auc = roc_auc_score(y_test, y_test_pred_proba)
test_f1 = f1_score(y_test, y_test_pred)

print(f"\nTest AUC: {test_auc:.4f}")
print(f"Test F1-Score: {test_f1:.4f}")

# target_names are applied to the sorted labels in order, so the lower label
# is reported as 'Spoiled' and the higher one as 'Safe'.
print(
    "\nClassification Report:",
    classification_report(y_test, y_test_pred, target_names=['Spoiled', 'Safe']),
    sep="\n",
)
print("\nConfusion Matrix:", confusion_matrix(y_test, y_test_pred), sep="\n")


TEST SET EVALUATION

Test AUC: 0.9898
Test F1-Score: 0.9543

Classification Report:
              precision    recall  f1-score   support

     Spoiled       0.96      0.93      0.95      6389
        Safe       0.94      0.96      0.95      7411

    accuracy                           0.95     13800
   macro avg       0.95      0.95      0.95     13800
weighted avg       0.95      0.95      0.95     13800


Confusion Matrix:
[[5971  418]
 [ 267 7144]]


In [None]:
# Pair each feature name with the selected model's importance score and list
# them from most to least important.
feature_importance = pd.DataFrame(
    {'Feature': features, 'Importance': best_model.feature_importances_}
)
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)
print("\nFeature Importance:", feature_importance, sep="\n")


Feature Importance:
             Feature  Importance
0      temperature_c    0.321962
3      transit_hours    0.288318
4  crop_type_encoded    0.251282
5   category_encoded    0.127642
2            vpd_kpa    0.006387
1   humidity_percent    0.004409


: 