In [2]:
"""
Crop Recommendations Data Science Project

The goal of this project is to provide assistance with Precision Farming
by recommending which crops to grow based on key defining variables found
in this dataset, such as Nitrogen, Phosphorus, and Potassium content found 
in the soil, temperature, humidity, ph value of the soil, and rainfall.
There are 22 crops that are the target variable we want to predict.

Precision Farming helps farmers make informed decisions about their farming
strategy and is the trend that is prevalent in the industry.

The dataset I will use is from Kaggle and can be found at the url below:

https://www.kaggle.com/datasets/atharvaingle/crop-recommendation-dataset
"""

'\nCrop Recommendations Data Science Project\n\nThe goal of this project is to provide assistance with Precision Farming\nby recommending which crops to grow based on key defining variables found\nin this dataset, such as Nitrogen, Phosphorus, and Potassium content found \nin the soil, temperature, humidity, ph value of the soil, and rainfall.\nThere are 22 crops that are the target variable we want to predict.\n\nPrecision Farming helps farmers make informed decisions about their farming\nstrategy and is the trend that is prevalent in the industry.\n\nThe dataset I will use is from Kaggle and can be found at the url below:\n\nhttps://www.kaggle.com/datasets/atharvaingle/crop-recommendation-dataset\n'

In [3]:
import pandas as pd
import numpy as np

path = "/Users/adamastor/Desktop/Data Science Projects/New/Crop Recommendations/Crop_recommendation.csv"

df = pd.read_csv(path)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2200 entries, 0 to 2199
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   N            2200 non-null   int64  
 1   P            2200 non-null   int64  
 2   K            2200 non-null   int64  
 3   temperature  2200 non-null   float64
 4   humidity     2200 non-null   float64
 5   ph           2200 non-null   float64
 6   rainfall     2200 non-null   float64
 7   label        2200 non-null   object 
dtypes: float64(4), int64(3), object(1)
memory usage: 137.6+ KB


In [10]:
df.head()

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label
0,90,42,43,20.879744,82.002744,6.502985,202.935536,rice
1,85,58,41,21.770462,80.319644,7.038096,226.655537,rice
2,60,55,44,23.004459,82.320763,7.840207,263.964248,rice
3,74,35,40,26.491096,80.158363,6.980401,242.864034,rice
4,78,42,42,20.130175,81.604873,7.628473,262.71734,rice


In [11]:
"""All numerical data here, minus out target variable 'label'

Let's look for null values."""

df.isnull()

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label
0,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...
2195,False,False,False,False,False,False,False,False
2196,False,False,False,False,False,False,False,False
2197,False,False,False,False,False,False,False,False
2198,False,False,False,False,False,False,False,False


In [12]:
"""Percentage of missing values per column"""

df.isnull().sum()*100/df.isnull().count()

N              0.0
P              0.0
K              0.0
temperature    0.0
humidity       0.0
ph             0.0
rainfall       0.0
label          0.0
dtype: float64

In [13]:
"""Nice, no null values. Let's check for zero values.

None of the variables should allow for zero values, unless maybe N, P, and K"""

"""Count of 0 values by column"""
print((df.N.values == 0).sum())
print((df.P.values == 0).sum())
print((df.K.values == 0).sum())
print((df.temperature.values == 0).sum())
print((df.humidity.values == 0).sum())
print((df.ph.values == 0).sum())
print((df.rainfall.values == 0).sum())

27
0
0
0
0
0
0


In [16]:
"""Nitrogen seems to have some zero values. That may be reasonable 
for soil to not contain any Nitrogen at all. We will leave them as they are.

Now let's test for outliers in the data. Only 22 types of crops in the 'labels'
variable, so a simple frequency table will suffice."""

frequency_table = df['label'].value_counts()

print(frequency_table)

rice           100
maize          100
jute           100
cotton         100
coconut        100
papaya         100
orange         100
apple          100
muskmelon      100
watermelon     100
grapes         100
mango          100
banana         100
pomegranate    100
lentil         100
blackgram      100
mungbean       100
mothbeans      100
pigeonpeas     100
kidneybeans    100
chickpea       100
coffee         100
Name: label, dtype: int64


In [17]:
"""Ok, 100's across the board. So no outliers to worry about. That makes things 
simple.


Let's now see if our data follows a normal distribution."""

"Ok, 100's across the board. So no outliers to worry about. That makes things \nsimple.\n\n\nLet's now see if our data follows a normal distribution."

In [19]:
from scipy.stats import jarque_bera
from scipy.stats import normaltest

jb_stats = jarque_bera(df["N"])
norm_stats = normaltest(df["N"])

print("Jarque-Bera test statistics is {0} and p value is {1}".format(jb_stats[0], jb_stats[1]))
print("Normality test statistics is {0} and p value is {1}".format(norm_stats[0], norm_stats[1]))

jb_stats = jarque_bera(df["P"])
norm_stats = normaltest(df["P"])

print("Jarque-Bera test statistics is {0} and p value is {1}".format(jb_stats[0], jb_stats[1]))
print("Normality test statistics is {0} and p value is {1}".format(norm_stats[0], norm_stats[1]))

jb_stats = jarque_bera(df["K"])
norm_stats = normaltest(df["K"])

print("Jarque-Bera test statistics is {0} and p value is {1}".format(jb_stats[0], jb_stats[1]))
print("Normality test statistics is {0} and p value is {1}".format(norm_stats[0], norm_stats[1]))

jb_stats = jarque_bera(df["humidity"])
norm_stats = normaltest(df["humidity"])

print("Jarque-Bera test statistics is {0} and p value is {1}".format(jb_stats[0], jb_stats[1]))
print("Normality test statistics is {0} and p value is {1}".format(norm_stats[0], norm_stats[1]))

jb_stats = jarque_bera(df["ph"])
norm_stats = normaltest(df["ph"])

print("Jarque-Bera test statistics is {0} and p value is {1}".format(jb_stats[0], jb_stats[1]))
print("Normality test statistics is {0} and p value is {1}".format(norm_stats[0], norm_stats[1]))

jb_stats = jarque_bera(df["rainfall"])
norm_stats = normaltest(df["rainfall"])

print("Jarque-Bera test statistics is {0} and p value is {1}".format(jb_stats[0], jb_stats[1]))
print("Normality test statistics is {0} and p value is {1}".format(norm_stats[0], norm_stats[1]))

Jarque-Bera test statistics is 197.8534047698708 and p value is 0.0
Normality test statistics is 760.001851557691 and p value is 9.283138203996376e-166
Jarque-Bera test statistics is 441.2028469232892 and p value is 0.0
Normality test statistics is 302.0868399340315 and p value is 2.527412799414875e-66
Jarque-Bera test statistics is 3869.9507139546195 and p value is 0.0
Normality test statistics is 1017.3540240028498 and p value is 1.2144527335503216e-221
Jarque-Bera test statistics is 444.5869595572963 and p value is 0.0
Normality test statistics is 305.47164349728394 and p value is 4.6523826301869605e-67
Jarque-Bera test statistics is 278.80772267280406 and p value is 0.0
Normality test statistics is 112.77233775047868 and p value is 3.249361285898636e-25
Jarque-Bera test statistics is 374.8463460956853 and p value is 0.0
Normality test statistics is 269.92702021787755 and p value is 2.4327209847393453e-59


In [23]:
"""None of the data is normally distributed

Let's take a look at what the averages for each column are that qualify our data 
to be labeled as being best suited for growing a particular type of crop."""

columns_to_include=df.columns.difference(['label'])

averages = df.groupby('label')[columns_to_include].mean()

print(averages)

                  K       N       P   humidity        ph    rainfall  \
label                                                                  
apple        199.89   20.80  134.22  92.333383  5.929663  112.654779   
banana        50.05  100.23   82.01  80.358123  5.983893  104.626980   
blackgram     19.24   40.02   67.47  65.118426  7.133952   67.884151   
chickpea      79.92   40.09   67.79  16.860439  7.336957   80.058977   
coconut       30.59   21.98   16.93  94.844272  5.976562  175.686646   
coffee        29.94  101.20   28.74  58.869846  6.790308  158.066295   
cotton        19.56  117.77   46.24  79.843474  6.912675   80.398043   
grapes       200.11   23.18  132.53  81.875228  6.025937   69.611829   
jute          39.99   78.40   46.86  79.639864  6.732778  174.792798   
kidneybeans   20.05   20.75   67.54  21.605357  5.749411  105.919778   
lentil        19.41   18.77   68.36  64.804785  6.927932   45.680454   
maize         19.79   77.76   48.44  65.092249  6.245190   84.76

In [30]:
"""Let's add some color to make that easier to look at."""

import seaborn as sns
import matplotlib.pyplot as plt

"""plt.figure(figsize=(14,14))
sns.heatmap(averages, cmap='viridis', annot=True)

plt.show()"""

styled_df = averages.style.background_gradient(cmap='viridis')
styled_df

Unnamed: 0_level_0,K,N,P,humidity,ph,rainfall,temperature
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
apple,199.89,20.8,134.22,92.333383,5.929663,112.654779,22.630942
banana,50.05,100.23,82.01,80.358123,5.983893,104.62698,27.376798
blackgram,19.24,40.02,67.47,65.118426,7.133952,67.884151,29.97334
chickpea,79.92,40.09,67.79,16.860439,7.336957,80.058977,18.872847
coconut,30.59,21.98,16.93,94.844272,5.976562,175.686646,27.409892
coffee,29.94,101.2,28.74,58.869846,6.790308,158.066295,25.540477
cotton,19.56,117.77,46.24,79.843474,6.912675,80.398043,23.988958
grapes,200.11,23.18,132.53,81.875228,6.025937,69.611829,23.849575
jute,39.99,78.4,46.86,79.639864,6.732778,174.792798,24.958376
kidneybeans,20.05,20.75,67.54,21.605357,5.749411,105.919778,20.115085


In [31]:
"""From the above, we can see the higher values in yellow and lower values in dark purple.

Conclusions we can draw from the visualization are as follows:
 - Crops that thrive in high Potassium soils are apples and grapes
 - Crops that thrive in hight Nitrogen soils are cotton, coffee, bananas, muskmelons, and watermelon
 - Crops that thrive in high Phosphorus soils are apples, and grapes
 - Crops that thrive in high humidity are coconuts, muskmelons, apples, oranges, papayas, and pomegranates
 - Crops that thrive in high pH soil are chickpeas, blackgrams, and oranges
 - Crops that thrive in high rainfall are rice, coconuts, and jutes
 - Crops that thrive in high temperatures are papayas, mangoes, and blackgrams"""

'From the above, we can see the higher values in yellow and lower values in dark purple.\n\nConclusions we can draw from the visualization are as follows:\n - Crops that thrive in high Potassium soils are apples and grapes\n - Crops that thrive in hight Nitrogen soils are cotton, coffee, bananas, muskmelons, and watermelon\n - Crops that thrive in high Phosphorus soils are apples, and grapes\n - Crops that thrive in high humidity are coconuts, muskmelons, apples, oranges, papayas, and pomegranates\n - Crops that thrive in high pH soil are chickpeas, blackgrams, and oranges\n - Crops that thrive in high rainfall are rice, coconuts, and jutes\n - Crops that thrive in high temperatures are papayas, mangoes, and blackgrams'

In [38]:
"""Since we have a multi-class categorical target variable, we will try 
Random Forest first."""

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, mean_squared_error

In [39]:
X = df.drop('label', axis=1)  # Features
y = df['label']               # Target variable

# Split the dataset into training set and test set (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [40]:
#Initialize the model
rfc = RandomForestClassifier(n_estimators=100, random_state=42)

#Train the model
rfc.fit(X_train, y_train)

RandomForestClassifier(random_state=42)

In [41]:
#Make predictions
y_pred_class = rfc.predict(X_test)

#Evaluate the model
print(f'Accuracy: {accuracy_score(y_test, y_pred_class)}')

Accuracy: 0.9931818181818182


In [1]:
"""Not bad for first go.

We can try to tune it, although I doubt it will improve much more than that."""

'Not bad for first go.\n\nWe can try to tune it, although I doubt it will improve much more than that.'