### Crop Recommendation

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv("indiancrop_dataset.csv")
df.head()

Unnamed: 0,N_SOIL,P_SOIL,K_SOIL,TEMPERATURE,HUMIDITY,ph,RAINFALL,STATE,CROP_PRICE,CROP
0,90,42,43,20.879744,82.002744,6.502985,202.935536,Andaman and Nicobar,7000,Rice
1,85,58,41,21.770462,80.319644,7.038096,226.655537,Andaman and Nicobar,5000,Rice
2,60,55,44,23.004459,82.320763,7.840207,263.964248,Andaman and Nicobar,7000,Rice
3,74,35,40,26.491096,80.158363,6.980401,242.864034,Andaman and Nicobar,7000,Rice
4,78,42,42,20.130175,81.604873,7.628473,262.71734,Andaman and Nicobar,120000,Rice


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2200 entries, 0 to 2199
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   N_SOIL       2200 non-null   int64  
 1   P_SOIL       2200 non-null   int64  
 2   K_SOIL       2200 non-null   int64  
 3   TEMPERATURE  2200 non-null   float64
 4   HUMIDITY     2200 non-null   float64
 5   ph           2200 non-null   float64
 6   RAINFALL     2200 non-null   float64
 7   STATE        2200 non-null   object 
 8   CROP_PRICE   2200 non-null   int64  
 9   CROP         2200 non-null   object 
dtypes: float64(4), int64(4), object(2)
memory usage: 172.0+ KB


In [4]:
df.describe()

Unnamed: 0,N_SOIL,P_SOIL,K_SOIL,TEMPERATURE,HUMIDITY,ph,RAINFALL,CROP_PRICE
count,2200.0,2200.0,2200.0,2200.0,2200.0,2200.0,2200.0,2200.0
mean,50.551818,53.362727,48.149091,25.616244,71.481779,6.46948,103.463655,2689.228182
std,36.917334,32.985883,50.647931,5.063749,22.263812,0.773938,54.958389,3710.361267
min,0.0,5.0,5.0,8.825675,14.25804,3.504752,20.211267,2.0
25%,21.0,28.0,20.0,22.769375,60.261953,5.971693,64.551686,950.0
50%,37.0,51.0,32.0,25.598693,80.473146,6.425045,94.867624,1825.0
75%,84.25,68.0,49.0,28.561654,89.948771,6.923643,124.267508,3500.0
max,140.0,145.0,205.0,43.675493,99.981876,9.935091,298.560117,120000.0


In [5]:
numerical_features = [feature for feature in df.columns if df[feature].dtype!='O']
numerical_features

['N_SOIL',
 'P_SOIL',
 'K_SOIL',
 'TEMPERATURE',
 'HUMIDITY',
 'ph',
 'RAINFALL',
 'CROP_PRICE']

In [6]:
discrete_features = [feature for feature in numerical_features if len(df[feature].unique())<15]
discrete_features

[]

In [7]:
continous_feautures = [feature for feature in numerical_features if len(df[feature].unique())>15]
continous_feautures

['N_SOIL',
 'P_SOIL',
 'K_SOIL',
 'TEMPERATURE',
 'HUMIDITY',
 'ph',
 'RAINFALL',
 'CROP_PRICE']

In [8]:
categorical_features = [feature for feature in df.columns if df[feature].dtype=='O']
categorical_features

['STATE', 'CROP']

In [9]:
for feature in categorical_features:
    print('The feature is {}'.format(feature))
    print('The number of unique labels are: {}'.format(df[feature].unique()))
    print()

The feature is STATE
The number of unique labels are: ['Andaman and Nicobar' 'Andhra Pradesh' 'Assam' 'Chattisgarh' 'Goa'
 'Gujarat' 'Haryana' 'Himachal Pradesh' 'Jammu and Kashmir' 'Karnataka'
 'Kerala' 'Madhya Pradesh' 'Maharashtra' 'Manipur' 'Meghalaya' 'Nagaland'
 'Odisha' 'Pondicherry' 'Punjab' 'Rajasthan' 'Tamil Nadu' 'Telangana'
 'Tripura' 'Uttar Pradesh' 'Uttrakhand' 'West Bengal']

The feature is CROP
The number of unique labels are: ['Rice' 'Maize' 'ChickPea' 'KidneyBeans' 'PigeonPeas' 'MothBeans'
 'MungBean' 'Blackgram' 'Lentil' 'Pomegranate' 'Banana' 'Mango' 'Grapes'
 'Watermelon' 'Muskmelon' 'Apple' 'Orange' 'Papaya' 'Coconut' 'Cotton'
 'Jute' 'Coffee']



In [10]:
df = df.drop('STATE',axis=1)

In [11]:
df = df.drop('CROP_PRICE',axis=1)

In [12]:
df.columns

Index(['N_SOIL', 'P_SOIL', 'K_SOIL', 'TEMPERATURE', 'HUMIDITY', 'ph',
       'RAINFALL', 'CROP'],
      dtype='object')

In [13]:
df.head()

Unnamed: 0,N_SOIL,P_SOIL,K_SOIL,TEMPERATURE,HUMIDITY,ph,RAINFALL,CROP
0,90,42,43,20.879744,82.002744,6.502985,202.935536,Rice
1,85,58,41,21.770462,80.319644,7.038096,226.655537,Rice
2,60,55,44,23.004459,82.320763,7.840207,263.964248,Rice
3,74,35,40,26.491096,80.158363,6.980401,242.864034,Rice
4,78,42,42,20.130175,81.604873,7.628473,262.71734,Rice


In [14]:
df['CROP'] = df['CROP'].str.lower()

In [15]:
df.head()

Unnamed: 0,N_SOIL,P_SOIL,K_SOIL,TEMPERATURE,HUMIDITY,ph,RAINFALL,CROP
0,90,42,43,20.879744,82.002744,6.502985,202.935536,rice
1,85,58,41,21.770462,80.319644,7.038096,226.655537,rice
2,60,55,44,23.004459,82.320763,7.840207,263.964248,rice
3,74,35,40,26.491096,80.158363,6.980401,242.864034,rice
4,78,42,42,20.130175,81.604873,7.628473,262.71734,rice


In [17]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

df['CROP'] = le.fit_transform(df['CROP'])

In [18]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [19]:
X.head()

Unnamed: 0,N_SOIL,P_SOIL,K_SOIL,TEMPERATURE,HUMIDITY,ph,RAINFALL
0,90,42,43,20.879744,82.002744,6.502985,202.935536
1,85,58,41,21.770462,80.319644,7.038096,226.655537
2,60,55,44,23.004459,82.320763,7.840207,263.964248
3,74,35,40,26.491096,80.158363,6.980401,242.864034
4,78,42,42,20.130175,81.604873,7.628473,262.71734


In [20]:
y.head

<bound method NDFrame.head of 0       20
1       20
2       20
3       20
4       20
        ..
2195     5
2196     5
2197     5
2198     5
2199     5
Name: CROP, Length: 2200, dtype: int32>

In [21]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X.values,y,test_size=0.2,random_state=0)

In [22]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [23]:
models = [RandomForestClassifier(), XGBClassifier(), CatBoostClassifier(), SVC()]
model_accuracies = []
best_model = None
best_accuracy = 0.0

for model in models:
    print(model)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    model_accuracies.append(score)

    if score > best_accuracy:
        best_accuracy = score
        best_model = model

print("Best Model:", best_model)
print("Best Accuracy:", best_accuracy)

RandomForestClassifier()
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)


<catboost.core.CatBoostClassifier object at 0x000001E0209F67F0>
Learning rate set to 0.081441
0:	learn: 2.4995979	total: 172ms	remaining: 2m 51s
1:	learn: 2.1460593	total: 193ms	remaining: 1m 36s
2:	learn: 1.8996771	total: 212ms	remaining: 1m 10s
3:	learn: 1.7079879	total: 231ms	remaining: 57.6s
4:	learn: 1.5364228	total: 249ms	remaining: 49.6s
5:	learn: 1.4070660	total: 267ms	remaining: 44.2s
6:	learn: 1.2877293	total: 285ms	remaining: 40.4s
7:	learn: 1.1942846	total: 304ms	remaining: 37.6s
8:	learn: 1.0949095	total: 323ms	remaining: 35.5s
9:	learn: 1.0181945	total: 342ms	remaining: 33.9s
10:	learn: 0.9443012	total: 362ms	remaining: 32.5s
11:	learn: 0.8791064	total: 381ms	remaining: 31.3s
12:	learn: 0.8202064	total: 399ms	remaining: 30.3s
13:	learn: 0.7700318	total: 417ms	remaining: 29.4s
14:	learn: 0.7196700	total: 435ms	remaining: 28.6s
15:	learn: 0.6778686	total: 454ms	remaining: 27.9s
16:	learn: 0.6362521	total: 473ms	remaining: 27.3s
17:	learn: 0.6020150	total: 492ms	remaining: 2

In [24]:
import joblib

joblib.dump(best_model, 'croprecommendation.pkl')

['croprecommendation.pkl']