In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,confusion_matrix,log_loss

In [2]:
# Load the crop recommendation dataset; the relative path is resolved against
# the notebook's working directory.
df = pd.read_csv("Crop_recommendation.csv")

In [3]:
# Preview the loaded frame (pandas abbreviates long frames when rendering).
df

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label
0,90,42,43,20.879744,82.002744,6.502985,202.935536,rice
1,85,58,41,21.770462,80.319644,7.038096,226.655537,rice
2,60,55,44,23.004459,82.320763,7.840207,263.964248,rice
3,74,35,40,26.491096,80.158363,6.980401,242.864034,rice
4,78,42,42,20.130175,81.604873,7.628473,262.717340,rice
...,...,...,...,...,...,...,...,...
2195,107,34,32,26.774637,66.413269,6.780064,177.774507,coffee
2196,99,15,27,27.417112,56.636362,6.086922,127.924610,coffee
2197,118,33,30,24.131797,67.225123,6.362608,173.322839,coffee
2198,117,32,34,26.272418,52.127394,6.758793,127.175293,coffee


In [4]:
# Inspect column dtypes and non-null counts to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2200 entries, 0 to 2199
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   N            2200 non-null   int64  
 1   P            2200 non-null   int64  
 2   K            2200 non-null   int64  
 3   temperature  2200 non-null   float64
 4   humidity     2200 non-null   float64
 5   ph           2200 non-null   float64
 6   rainfall     2200 non-null   float64
 7   label        2200 non-null   object 
dtypes: float64(4), int64(3), object(1)
memory usage: 137.6+ KB


In [5]:
# Summary statistics of the numeric columns.
df.describe()

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall
count,2200.0,2200.0,2200.0,2200.0,2200.0,2200.0,2200.0
mean,50.551818,53.362727,48.149091,25.616244,71.481779,6.46948,103.463655
std,36.917334,32.985883,50.647931,5.063749,22.263812,0.773938,54.958389
min,0.0,5.0,5.0,8.825675,14.25804,3.504752,20.211267
25%,21.0,28.0,20.0,22.769375,60.261953,5.971693,64.551686
50%,37.0,51.0,32.0,25.598693,80.473146,6.425045,94.867624
75%,84.25,68.0,49.0,28.561654,89.948771,6.923643,124.267508
max,140.0,145.0,205.0,43.675493,99.981876,9.935091,298.560117


In [6]:
# Distinct values of the 'label' column, i.e. the classes the model predicts.
df['label'].unique()

array(['rice', 'maize', 'chickpea', 'kidneybeans', 'pigeonpeas',
       'mothbeans', 'mungbean', 'blackgram', 'lentil', 'pomegranate',
       'banana', 'mango', 'grapes', 'watermelon', 'muskmelon', 'apple',
       'orange', 'papaya', 'coconut', 'cotton', 'jute', 'coffee'],
      dtype=object)

Model Training

In [7]:
# Hold out 20% of the rows as a validation set. The remaining 80% is handed to
# model_training, which splits it once more into its own train and test parts.
df_train_test, df_val = train_test_split(df, test_size=0.2, random_state=42)

Training

In [8]:

def model_training(df, test_size=0.3, n_estimators=100, random_state=42):
    """
    Train a RandomForest classifier on the dataset and print a test report.

    The frame is split internally into a train part and a test part. The
    model is fitted on the train part only; the test part is used solely for
    the printed predictions, metrics and log-loss values.

    Parameters:
        df: The dataset containing the feature columns 'N', 'P', 'K',
            'temperature', 'humidity', 'ph', 'rainfall' and the target
            column 'label'.
        test_size: Share of rows held out for testing, forwarded to
            train_test_split. Defaults to 0.3.
        n_estimators: Number of trees in the forest. Defaults to 100.
        random_state: Seed used for both the split and the forest so that
            runs are reproducible. Defaults to 42.

    Returns: The trained RandomForest model.
    """
    feature_columns = ['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']

    X = df[feature_columns]
    y = df['label']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    model = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)

    ########## Predict on the test set
    y_test_pred = model.predict(X_test)

    # Create a DataFrame to store actual and predicted labels
    results_df = pd.DataFrame({'Actual': y_test.values, 'Predicted': y_test_pred})
    print(results_df)

    print("\n")
    print("#"*100)
    print("\n")

    # Weighted averaging accounts for per-class support in the multi-class scores.
    accuracy = accuracy_score(y_test, y_test_pred)
    precision = precision_score(y_test, y_test_pred, average='weighted')
    recall = recall_score(y_test, y_test_pred, average='weighted')
    f1 = f1_score(y_test, y_test_pred, average='weighted')

    ########## Display the metrics
    print("Testing Dataset : \n")
    for metric_name, metric_value in (
        ("Accuracy", accuracy),
        ("Precision", precision),
        ("Recall", recall),
        ("F1 Score", f1),
    ):
        print(f"{metric_name}: {metric_value * 100:.2f}%")

    print("Loss\n")
    print("#"*100)
    print("\n")
    y_train_prob = model.predict_proba(X_train)
    y_test_prob = model.predict_proba(X_test)

    # predict_proba returns one column per entry of model.classes_. Passing the
    # same labels to log_loss keeps the columns aligned and avoids a ValueError
    # when a split happens to contain fewer classes than the model was fitted on.
    train_loss = log_loss(y_train, y_train_prob, labels=model.classes_)
    test_loss = log_loss(y_test, y_test_prob, labels=model.classes_)

    print(f"Train Loss: {train_loss:.4f}")
    print(f"Test Loss: {test_loss:.4f}")

    return model




# Fit the classifier on the 80% train/test portion; the call also prints the
# test-set predictions, metrics and log-loss values.
model = model_training(df_train_test)

          Actual    Predicted
0         papaya       papaya
1           jute         jute
2         orange       orange
3         orange       orange
4     pigeonpeas   pigeonpeas
..           ...          ...
523  kidneybeans  kidneybeans
524        maize        maize
525    muskmelon    muskmelon
526  pomegranate  pomegranate
527       grapes       grapes

[528 rows x 2 columns]


####################################################################################################


Testing Dataset : 

Accuracy: 99.81%
Precision: 99.82%
Recall: 99.81%
F1 Score: 99.81%
Loss

####################################################################################################


Train Loss: 0.0227
Test Loss: 0.0792


Validation

In [9]:
# Score the fitted model on the held-out validation rows.
val_feature_columns = ['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']
X_val = df_val[val_feature_columns]
y_val = df_val['label']
y_val_pred = model.predict(X_val)

# Accuracy takes no averaging argument; the other three use weighted averaging.
accuracy = accuracy_score(y_val, y_val_pred)
precision, recall, f1 = (
    scorer(y_val, y_val_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

########## Display the metrics
print("Validation Dataset : \n")
for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"{metric_name}: {metric_value * 100:.2f}%")

Validation Dataset : 

Accuracy: 98.86%
Precision: 98.97%
Recall: 98.86%
F1 Score: 98.87%


XGBoost Model Training

Save Model

In [10]:
import pickle

In [11]:
# Serialize the fitted classifier to disk so it can be reused without retraining.
# Only unpickle this file from a trusted source: loading a pickle can run code.
with open('crop_recommendation_model_v1.pkl', mode='wb') as model_file:
    pickle.dump(model, model_file)

Validation