## Crop Prediction

In this project, you will build multi-class classification models to predict the type of "crop" and identify the single most important feature for predictive performance.

In [1]:
#import libraries
import pickle
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
warnings.filterwarnings("ignore")

## Data Exploration

In [2]:
#load the data: read the CSV (path is relative to the working directory) into a DataFrame
data = pd.read_csv('soil_measures.csv')

In [3]:
#preview the first five rows of the loaded frame
data.head(n=5)

Unnamed: 0,N,P,K,ph,crop
0,90,42,43,6.502985,rice
1,85,58,41,7.038096,rice
2,60,55,44,7.840207,rice
3,74,35,40,6.980401,rice
4,78,42,42,7.628473,rice


In [4]:
#column names, non-null counts and dtypes of the loaded frame
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2200 entries, 0 to 2199
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   N       2200 non-null   int64  
 1   P       2200 non-null   int64  
 2   K       2200 non-null   int64  
 3   ph      2200 non-null   float64
 4   crop    2200 non-null   object 
dtypes: float64(1), int64(3), object(1)
memory usage: 86.1+ KB


In [5]:
#summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
data.describe()

Unnamed: 0,N,P,K,ph
count,2200.0,2200.0,2200.0,2200.0
mean,50.551818,53.362727,48.149091,6.46948
std,36.917334,32.985883,50.647931,0.773938
min,0.0,5.0,5.0,3.504752
25%,21.0,28.0,20.0,5.971693
50%,37.0,51.0,32.0,6.425045
75%,84.25,68.0,49.0,6.923643
max,140.0,145.0,205.0,9.935091


In [6]:
#count the missing entries in each column
data.isna().sum()

N       0
P       0
K       0
ph      0
crop    0
dtype: int64

## Splitting the Data

In [7]:
#separate the target column (crop) from the remaining feature columns
y = data['crop']
X = data.drop(columns=['crop'])

In [8]:
#hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Building the Model

### Logistic Regression

In [9]:
#create a logistic regression with default settings and train it on the training split
#NOTE(review): warnings are silenced globally in the import cell, so a
#ConvergenceWarning raised here would go unnoticed - worth confirming
log_model = LogisticRegression().fit(X_train, y_train)

#class predictions for the test split
log_pred = log_model.predict(X_test)

#mean accuracy on the test split
print('Accuracy', log_model.score(X_test, y_test))

Accuracy 0.5681818181818182


### SVC

In [10]:
#create a support-vector classifier with default settings and train it on the training split
svm_model = SVC().fit(X_train, y_train)

#class predictions for the test split
svm_pred = svm_model.predict(X_test)

#mean accuracy on the test split
print('Accuracy', svm_model.score(X_test, y_test))

Accuracy 0.6045454545454545


### Random Forest

In [11]:
#build the model; the fixed seed (same value as the train/test split) makes the
#forest, its reported accuracy and the predictions further down repeatable
#across Restart & Run All
rf_model = RandomForestClassifier(random_state=42)

#fit the model on the training split
rf_model.fit(X_train, y_train)

#predict the crop for every row of the test split
rf_pred = rf_model.predict(X_test)

#mean accuracy on the test split
print('Accuracy', rf_model.score(X_test, y_test))

Accuracy 0.8159090909090909


### Decision Tree

In [12]:
#build the model; the fixed seed (same value as the train/test split) makes the
#fitted tree and its reported accuracy repeatable across Restart & Run All
dt_model = DecisionTreeClassifier(random_state=42)

#fit the model on the training split
dt_model.fit(X_train, y_train)

#predict the crop for every row of the test split; named like the other
#models' predictions (log_pred, svm_pred, rf_pred)
dt_pred = dt_model.predict(X_test)
#previous name of the same predictions, kept so existing references still work
y_pred = dt_pred

#mean accuracy on the test split
print('Accuracy', dt_model.score(X_test, y_test))

Accuracy 0.7818181818181819


## Comparing the Models

In [13]:
#test-split accuracy of every fitted model, printed one per line
model_names = ['Logistic Regression', 'SVM', 'Random Forest', 'Decision Tree']
models = [log_model, svm_model, rf_model, dt_model]
#the loop variable keeps the name `model`; it stays bound after the loop ends
for name, model in zip(model_names, models):
    print(f'{name} Accuracy: {model.score(X_test, y_test)}')



Logistic Regression Accuracy: 0.5681818181818182
SVM Accuracy: 0.6045454545454545
Random Forest Accuracy: 0.8159090909090909
Decision Tree Accuracy: 0.7818181818181819


## Predict with the Model

In [14]:
#the forest was fitted on a DataFrame with named columns (N, P, K, ph); give the
#sample the same column names so each value is matched to its feature instead
#of relying on position alone (the resulting mismatch warning is otherwise
#hidden by the global warnings filter)
rf_model.predict(
    pd.DataFrame([[61, 56, 70, 5.386167788]], columns=X.columns)
)

array(['chickpea'], dtype=object)

In [15]:
#the forest was fitted on a DataFrame with named columns (N, P, K, ph); give the
#sample the same column names so each value is matched to its feature instead
#of relying on position alone (the resulting mismatch warning is otherwise
#hidden by the global warnings filter)
rf_model.predict(
    pd.DataFrame([[92, 33, 70, 6.236167548]], columns=X.columns)
)

array(['muskmelon'], dtype=object)

In [20]:
#save the model
#persist the Random Forest explicitly: `model` is only the leftover loop
#variable of the comparison cell, which ends on the Decision Tree
#build the path from parts so it is portable and free of the invalid `\c` escape
model_path = Path('model') / 'crop_model.pkl'
#create the target folder if needed so the write cannot fail on a missing directory
model_path.parent.mkdir(parents=True, exist_ok=True)
#the context manager closes the file handle as soon as the dump is done
with model_path.open('wb') as model_file:
    pickle.dump(rf_model, model_file)