In [1]:
# Import statements
import numpy as np
import pandas as pd

1. Preprocessing

In [2]:
# Read the raw CSV into `df`; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='../dataset/climate_soil_data.csv')

In [3]:
# Distinct crop labels, in order of first appearance in the file.
pd.unique(df['CROP'])

array(['barley', 'oats', 'spring_wheat', 'winter_wheat', 'fall_rye',
       'soybeans', 'corn', 'canola', 'canary_seed', 'durum_wheat',
       'flaxseed', 'lentils', 'mustard', 'peas'], dtype=object)

In [4]:
# Standardize the feature columns listed in `num_cols`
from sklearn.preprocessing import StandardScaler

# Columns to standardize: GSL, every <index>_<season> column, and the trailing
# T_PH_H2O / S_PH_H2O / AWC_CLASS columns. CROP, LATITUDE, LONGITUDE, YEAR and
# YIELD are not in this list and therefore keep their original values.
num_cols = [
    'GSL',
    *(
        f'{index}_{season}'
        for index in ('CWD', 'WW', 'CDD', 'CSDI', 'WSDI', 'CFD', 'CSU')
        for season in ('fall', 'spring', 'summer')
    ),
    'T_PH_H2O', 'S_PH_H2O', 'AWC_CLASS',
]

# Work on a copy so the raw frame `df` stays untouched.
df_ready = df.copy()

# NOTE(review): the scaler is fit on every row of `df`, i.e. before any
# train/test split, so held-out rows contribute to the mean/std used here.
scaler = StandardScaler()
scaler.fit(df[num_cols])
df_ready[num_cols] = scaler.transform(df[num_cols])

df_ready.head()

Unnamed: 0,CROP,LATITUDE,LONGITUDE,YEAR,YIELD,GSL,CWD_fall,CWD_spring,CWD_summer,WW_fall,...,WSDI_summer,CFD_fall,CFD_spring,CFD_summer,CSU_fall,CSU_spring,CSU_summer,T_PH_H2O,S_PH_H2O,AWC_CLASS
0,barley,46.25,-62.25,1987,59.1,0.558674,-0.05118,0.304782,-0.660209,0.381064,...,-0.330965,-1.278973,-0.590631,-0.43178,-0.630641,-1.044751,-0.694238,-1.458289,-1.434444,1.130886
1,oats,46.25,-62.25,1987,62.1,0.558674,-0.05118,0.304782,-0.660209,0.381064,...,-0.330965,-1.278973,-0.590631,-0.43178,-0.630641,-1.044751,-0.694238,-1.458289,-1.434444,1.130886
2,spring_wheat,46.25,-62.25,1987,52.2,0.558674,-0.05118,0.304782,-0.660209,0.381064,...,-0.330965,-1.278973,-0.590631,-0.43178,-0.630641,-1.044751,-0.694238,-1.458289,-1.434444,1.130886
3,winter_wheat,46.25,-62.25,1987,52.5,0.558674,-0.05118,0.304782,-0.660209,0.381064,...,-0.330965,-1.278973,-0.590631,-0.43178,-0.630641,-1.044751,-0.694238,-1.458289,-1.434444,1.130886
4,barley,46.25,-62.25,1988,55.7,0.825567,0.156479,0.589268,-0.660209,2.295238,...,-0.330965,-0.893973,-1.215961,-0.43178,-0.630641,-0.224984,-0.263296,-1.458289,-1.434444,1.130886


In [5]:
# One-vs-rest encode the CROP labels
from sklearn.preprocessing import LabelBinarizer

# Features are every column of the prepared frame except the label column.
target = df_ready['CROP']
data = df_ready.drop(columns='CROP')

# Learn the set of crop classes first, then encode each row as an indicator
# vector with one column per entry of `lb.classes_`.
lb = LabelBinarizer()
lb.fit(target)
binary_labels = lb.transform(target)

binary_labels

array([[1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0]])

In [6]:
# Train and Test Split
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffled split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(data, binary_labels,
                                                    shuffle=True,
                                                    test_size=0.2,
                                                    random_state=1)

# Show the Training and Testing Data
print('Shape of training feature:', X_train.shape)
print('Shape of testing feature:', X_test.shape)
print('Shape of training label:', y_train.shape)
# Fixed label: this line reports y_test, so it must say "testing", not "training".
print('Shape of testing label:', y_test.shape)


Shape of training feature: (9890, 29)
Shape of testing feature: (2473, 29)
Shape of training label: (9890, 14)
Shape of training label: (2473, 14)


2. Modelling & Evaluation

In [7]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# List of (display name, estimator) pairs to train.
# The tree-based estimators are randomized; they are seeded with the same
# value used for the train/test split (random_state=1) so that re-running the
# notebook reproduces the reported accuracies.
classifiers = [
    ('Decision Tree', DecisionTreeClassifier(random_state=1)),
    ('Random Forest', RandomForestClassifier(random_state=1)),
    ('Gaussian Naive Bayes', GaussianNB()),
    # max_iter caps the number of solver iterations for these two models.
    ('Logistic Regression', LogisticRegression(max_iter=2000)),
    ('K-Nearest Neighbors', KNeighborsClassifier()),
    ('Support Vector Machine', SVC(max_iter=2000))
]

In [8]:
from sklearn.metrics import accuracy_score

# Maps each crop class name to a list of (model name, test accuracy) tuples,
# in the same order as `classifiers`.
model_results = {}

# One binary (one-vs-rest) problem per column of the indicator matrix.
for class_idx, class_name in zip(range(binary_labels.shape[1]), lb.classes_):
    class_scores = model_results[class_name] = []
    print(f"\nTraining models for class: {class_name}")

    # Indicator column for this class, for the train and test rows.
    y_train_cls = y_train[:, class_idx]
    y_test_cls = y_test[:, class_idx]

    for model_name, clf in classifiers:
        # The same estimator object is refit from scratch for every class.
        predictions = clf.fit(X_train, y_train_cls).predict(X_test)
        accuracy = accuracy_score(y_test_cls, predictions)
        class_scores.append((model_name, accuracy))
        print(f"{model_name} accuracy: {accuracy}")


Training models for class: barley
Decision Tree accuracy: 0.8685806712494946
Random Forest accuracy: 0.819652244237768
Gaussian Naive Bayes accuracy: 0.6477961989486454
Logistic Regression accuracy: 0.8900121310149616
K-Nearest Neighbors accuracy: 0.8989082086534573
Support Vector Machine accuracy: 0.8900121310149616

Training models for class: canary_seed
Decision Tree accuracy: 0.9753336029114436
Random Forest accuracy: 0.9280226445612616
Gaussian Naive Bayes accuracy: 0.8871815608572584
Logistic Regression accuracy: 0.9470279013344116
K-Nearest Neighbors accuracy: 0.9830165790537808
Support Vector Machine accuracy: 0.9567327133036797

Training models for class: canola
Decision Tree accuracy: 0.8778811160533765
Random Forest accuracy: 0.8673675697533361
Gaussian Naive Bayes accuracy: 0.7149211484027497
Logistic Regression accuracy: 0.9078042862919531
K-Nearest Neighbors accuracy: 0.9073999191265669
Support Vector Machine accuracy: 0.9073999191265669

Training models for class: corn
