<a href="https://colab.research.google.com/github/andrey101010/ds-challenge-S-and-S/blob/main/S_and_S_Crop_yield.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# These libraries are needed for the project. Uncomment the lines below to install them.
# To use this notebook, the CSV files it reads (temp.csv, rainfall.csv, pesticides.csv, yield.csv)
# need to be dropped in the notebook's working directory (on Colab: the '/content' folder)!
# You need to load the images before proceeding with the script.
# NOTE(review): "images" presumably means the data files listed above -- confirm.

# %pip install pandas
# %pip install matplotlib
# %pip install scikit-learn  # PyPI name of the package imported as `sklearn`; `pip install sklearn` is deprecated

# Crop Yield Prediction

In [None]:
import os

import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_selection import mutual_info_regression

In [None]:
# Works if the files and the notebook are in the same folder.
# os.getcwd() replaces the `!pwd` shell magic so the cell also runs where no `pwd`
# command exists (e.g. Windows) and outside IPython. The value is kept as a
# one-element list because the loading cell reads it as pwd[0].
pwd = [os.getcwd()]

In [None]:
# Read the four raw tables from the directory stored in pwd[0]
temperature = pd.read_csv(f'{pwd[0]}/temp.csv')
rainfall = pd.read_csv(f'{pwd[0]}/rainfall.csv')
pesticide = pd.read_csv(f'{pwd[0]}/pesticides.csv')
crop_yield = pd.read_csv(f'{pwd[0]}/yield.csv')

In [None]:
# Preview the first rows of the crop-yield table
crop_yield.head()

In [None]:
# rainfall is presented as an average over multiple decades!
rainfall.head()

In [None]:
# Preview the first rows of the pesticide table
pesticide.head()

In [None]:
# Preview the first rows of the temperature table
temperature.head()

In [None]:
# Report the (rows, columns) shape of every raw table
for label, table in (
    ('DF shape crop yield: ', crop_yield),
    ('DF shape pesticide: ', pesticide),
    ('DF shape rainfall: ', rainfall),
    ('DF shape temperature: ', temperature),
):
    print(label, table.shape)

# EDA
crop yield

In [None]:
# info() prints dtypes and non-null counts; describe() is the cell's displayed output
crop_yield.info()
crop_yield.describe()

In [None]:
# Look at the raw columns before deciding which ones to drop
crop_yield.head()

In [None]:
crop_yield = crop_yield.drop(['Domain Code', 'Domain', 'Element Code', 'Element', 'Unit', 'Year Code'], axis = 1)

In [None]:
crop_yield.rename(columns={'Value': 'Crop_Yield'}, inplace=True)

In [None]:
# Distinct values of the Item column
crop_yield.Item.unique()

In [None]:
# Number of distinct areas in the crop-yield table
crop_yield.Area.nunique()

In [None]:
# info() prints dtypes and non-null counts; describe() is the cell's displayed output
pesticide.info()
pesticide.describe()

In [None]:
# Look at the raw pesticide columns
pesticide.head()

In [None]:
# The next four cells list the distinct values of the descriptive columns
# (Unit, Item, Element, Domain) that are dropped from the table afterwards.
pesticide.Unit.unique()

In [None]:
pesticide.Item.unique()

In [None]:
pesticide.Element.unique()

In [None]:
pesticide.Domain.unique()

In [None]:
# removing some columns not needed for ML analysis
pesticide = pesticide.drop(['Domain', 'Element', 'Item', 'Unit'], axis = 1)

In [None]:
pesticide.rename(columns={'Value': 'Pesticide_Amount'}, inplace=True)

In [None]:
# Show the pesticide table after the drop/rename above
pesticide

In [None]:
# info() prints dtypes and non-null counts; describe() is the cell's displayed output
rainfall.info()
rainfall.describe()

In [None]:
# Note: the rainfall column name is ' Area' with a leading space
rainfall[' Area'].nunique()

In [None]:
# info() prints dtypes and non-null counts; describe() is the cell's displayed output
temperature.info()
temperature.describe()

In [None]:
# Number of distinct countries in the temperature table
temperature.country.nunique()

In [None]:
# Left-join temperature onto rainfall by (year, country); rainfall rows without a match keep NaN temperature values
rainfall_temperature = pd.merge(rainfall, temperature,  how='left', left_on=['Year',' Area'], right_on = ['year','country'])

In [None]:
# Left-join the pesticide amounts by (Year, area). The left key is ' Area' (leading space), the right key is 'Area'.
rainfall_temperature_pesticide = pd.merge(rainfall_temperature, pesticide,  how='left', left_on=['Year',' Area'], right_on = ['Year','Area'])

In [None]:
# Show the crop-yield table that is merged in next
crop_yield

In [None]:
# Left-join crop yield by (Year, area). The left frame already carries an 'Area' column from the pesticide merge,
# so the clashing columns presumably come out as 'Area_x' / 'Area_y' (pandas' default suffixes); a later cell drops them.
merged_table = pd.merge(rainfall_temperature_pesticide, crop_yield,  how='left', left_on=['Year',' Area'], right_on = ['Year','Area'])

In [None]:
# The area code and the item code are purposely left in the table; they are later fed to an ML algorithm.
# Only the duplicated join keys are removed here.
final_df = merged_table.drop(columns=['year', 'Area_x', 'Area_y', 'country'])

In [None]:
# Mean crop yield per year, computed from the raw crop-yield table
crop_yield.groupby('Year')['Crop_Yield'].mean().plot()

In [None]:
# Same curve computed from the merged table, for comparison with the plot above
final_df.groupby('Year')['Crop_Yield'].mean().plot();

In [None]:
# Also some object columns need to be transformed to numerical columns
final_df.info()

In [None]:
# Convert the avg_temp column to a numeric dtype
final_df['avg_temp'] = pd.to_numeric(final_df['avg_temp'])

In [None]:
# Drop the rows whose average rainfall is the placeholder value '..'.
# .copy() makes the result an independent frame, so the column assignments in the
# following cells do not trigger pandas' SettingWithCopyWarning.
final_df = final_df[final_df.average_rain_fall_mm_per_year != '..'].copy()

In [None]:
def remove_commas(x):
    """Return the part of a string that precedes its first comma.

    Strings without a comma are returned unchanged. Non-string values
    (NaN, None, numbers) are passed through untouched; previously they were
    silently replaced by None, which would discard valid numeric entries.

    NOTE(review): everything after the first comma is discarded, it is not
    "removed" in the sense of joining the parts ('1,083' -> '1'). If commas in
    the source data are thousands separators this loses digits -- confirm
    against the CSV.
    """
    if isinstance(x, str):
        return x.split(',')[0]
    return x

In [None]:
# remove the commas in the strings for the column_>
final_df.average_rain_fall_mm_per_year = final_df.average_rain_fall_mm_per_year.apply(remove_commas)

In [None]:
final_df.average_rain_fall_mm_per_year = pd.to_numeric(final_df.average_rain_fall_mm_per_year)

In [None]:
final_df.info()

Graphical representation of  rainfall, temperature and crop yield

In [None]:
# Yearly mean of the rainfall column; the curve is more or less flat over the years
rain_ax = final_df.groupby('Year')['average_rain_fall_mm_per_year'].mean().plot()
rain_ax.set_ylabel('Precipitation mm*m-2')
rain_ax.set_ylim(0, 1300);

In [None]:
# Yearly mean of the average temperature
temp_ax = final_df.groupby('Year')['avg_temp'].mean().plot()
temp_ax.set_ylabel('Temperature [°C]')
temp_ax.set_ylim(0, 25);

In [None]:
# Yearly mean of the pesticide amount.
# The y-label previously read 'Crop amount [hg/ha]', which described a different quantity.
# NOTE(review): the unit is not shown because the Unit column was dropped earlier -- add it once confirmed.
final_df.groupby('Year').Pesticide_Amount.mean().plot();
plt.ylabel('Pesticide amount');

Cleaning up the Data

In [None]:
# Example of a country that could be dropped because of missing values: preview the Andorra rows
final_df[final_df[' Area'] == 'Andorra'].head()

In [None]:
# Some countries appear only because they occur in some of the datasets, while the other datasets provide no values for them.
# Hence those can be dropped from the table.
# numeric_only=True restricts the mean to numeric columns; without it pandas >= 2.0 raises a
# TypeError as soon as the frame contains a non-numeric column (e.g. 'Item', presumably text).
final_df.groupby(" Area").mean(numeric_only=True)

In [None]:
# Some missing values can be filled per country with that country's mean (fillna + mean inside each ' Area' group).
# The two columns filled here are 'avg_temp' and 'Area Code' (the original note said "item code", but the code fills 'Area Code').
# For 'Area Code' this is presumably exact, since the code should be constant within a country; for 'avg_temp' it is an approximation.
filling_missing_values = final_df.groupby(" Area")[['avg_temp', 'Area Code']].transform(lambda x: x.fillna(x.mean()))

In [None]:
# The filled columns replace the original 'avg_temp' and 'Area Code' columns; they are appended on the right
final_df = pd.concat([final_df.drop(['avg_temp', 'Area Code'], axis = 1), filling_missing_values], axis = 1)

In [None]:
# There are still missing values in the table. Count them per column before dropping;
# some (such as Area Code) were already replaced adequately above.
final_df.isnull().sum()

In [None]:
# Dropping all rows that contain a NaN. However, with more time, filling with appropriate values would be the better solution!
final_df = final_df.dropna()

In [None]:
# Author's observation: more than half of the data frame is missing now.
final_df.info()

In [None]:
# The ten countries with the most entries: count the Crop_Yield values per country,
# sort ascending and keep the last ten, so the bars rise from left to right.
final_df.groupby(' Area')['Crop_Yield'].count().sort_values().tail(10).plot(kind='bar', figsize =(15, 7));
plt.ylabel('Amount of Entries', fontsize = 18);
plt.yticks(fontsize = 18)
# Passing None as the x-axis label text
plt.xlabel(None);
plt.xticks(fontsize = 18);

# Machine learning approach

Here we separate two countries from the data frame: one with a high crop yield and one with a low crop yield. The models are trained on the remaining countries and are then used to predict the crop yield of the two held-out countries.

In [None]:
# High crop yield: list the rows above 300000 to find a candidate country.
# Here I pick the UK, but any other country should work.
final_df[final_df.Crop_Yield > 300000]

In [None]:
# Show the rows of the chosen high-yield country
final_df[final_df[' Area'] == 'United Kingdom']

In [None]:
# Keep the UK rows in their own frame for the later per-country evaluation
UK_DF = final_df[final_df[' Area'] == 'United Kingdom']

In [None]:
UK_DF

In [None]:
# The UK is dropped from the dataset so the models are not trained on it
final_df = final_df[final_df[' Area'] != 'United Kingdom']

In [None]:
# Low crop yield: list the rows below 100000 to find a candidate country.
# Here I pick Albania, but any other country should work.
final_df[final_df.Crop_Yield < 100000]

In [None]:
# Keep the Albania rows in their own frame for the later per-country evaluation
Albania_DF = final_df[final_df[' Area'] == 'Albania']

In [None]:
# Albania is dropped from the dataset so the models are not trained on it
final_df = final_df[final_df[' Area'] != 'Albania']

In [None]:
y = final_df['Crop_Yield']
X = final_df.drop(['Item', ' Area', 'Crop_Yield', 'Year'], axis = 1)

In [None]:
y_uk = UK_DF['Crop_Yield']
X_uk = UK_DF.drop(['Item', ' Area', 'Crop_Yield', 'Year'], axis = 1)

In [None]:
y_albania = Albania_DF['Crop_Yield']
X_albania = Albania_DF.drop(['Item', ' Area', 'Crop_Yield', 'Year'], axis = 1)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

ML Algorithms

Random Forest

In [None]:
# Random Forest
random_forest_regression = RandomForestRegressor()
random_forest_regression.fit(X_train, y_train)

# Calculate the estimated value for each data point
y_pred = random_forest_regression.predict(X_test)

In [None]:
# Calcualte the R-squared for our model
print("R-squared:", r2_score(y_test, y_pred).round(3))
print("Mean Squared Error", mean_squared_error(y_test, y_pred).round(2))
print("Root Mean Squared Error", (mean_squared_error(y_test, y_pred)**(1/2)).round(2))

UK - random Forest

In [None]:
y_pred = random_forest_regression.predict(X_uk)

In [None]:
# Calcualte the R-squared for our model
print("R-squared:", r2_score(y_uk, y_pred).round(3))
print("Mean Squared Error", mean_squared_error(y_uk, y_pred).round(2))
print("Root Mean Squared Error", (mean_squared_error(y_uk, y_pred)**(1/2)).round(2))

Albania - random forest

In [None]:
y_pred = random_forest_regression.predict(X_albania)

In [None]:
# Calcualte the R-squared for our model
print("R-squared:", r2_score(y_albania, y_pred).round(3))
print("Mean Squared Error", mean_squared_error(y_albania, y_pred).round(2))
print("Root Mean Squared Error", (mean_squared_error(y_albania, y_pred)**(1/2)).round(2))

Linear Regression

In [None]:
# Linear Regression
linear_regression = LinearRegression()
linear_regression.fit(X_train, y_train)

# Calculate the estimated value for each data point
y_pred = linear_regression.predict(X_test)

In [None]:
# Calcualte the R-squared for our model
print("R-squared:", r2_score(y_test, y_pred).round(3))
print("Mean Squared Error", mean_squared_error(y_test, y_pred).round(2))
print("Root Mean Squared Error", (mean_squared_error(y_test, y_pred)**(1/2)).round(2))

UK - linear regression

In [None]:
y_pred = random_forest_regression.predict(X_uk)

In [None]:
# Calcualte the R-squared for our model
print("R-squared:", r2_score(y_uk, y_pred).round(3))
print("Mean Squared Error", mean_squared_error(y_uk, y_pred).round(2))
print("Root Mean Squared Error", (mean_squared_error(y_uk, y_pred)**(1/2)).round(2))

Albania - linear regression

In [None]:
y_pred = random_forest_regression.predict(X_albania)

In [None]:
# Calcualte the R-squared for our model
print("R-squared:", r2_score(y_albania, y_pred).round(3))
print("Mean Squared Error", mean_squared_error(y_albania, y_pred).round(2))
print("Root Mean Squared Error", (mean_squared_error(y_albania, y_pred)**(1/2)).round(2))

Bayes Regression

In [None]:
# Bayes regression
bayes_regression = BayesianRidge()
bayes_regression.fit(X_train, y_train)

# Calculate the estimated value for each data point
y_pred = bayes_regression.predict(X_test)

In [None]:
# Calcualte the R-squared for our model
print("R-squared:", r2_score(y_test, y_pred).round(3))
print("Mean Squared Error", mean_squared_error(y_test, y_pred).round(2))
print("Root Mean Squared Error", (mean_squared_error(y_test, y_pred)**(1/2)).round(2))

This section investigates the feature importance

In [None]:
# Mutual-information feature scoring; for background please check out
# https://www.kaggle.com/code/ryanholbrook/mutual-information

# Boolean mask (one entry per feature column) marking integer-typed columns as discrete.
# is_integer_dtype is used instead of `dtypes == int` because the latter depends on
# which integer width the platform maps `int` to.
discrete_features = X.dtypes.apply(pd.api.types.is_integer_dtype)

def make_mi_scores(X, y, discrete_features, random_state=None):
    """Return the mutual-information score of every feature, highest first.

    Parameters
    ----------
    X : DataFrame of features; its column names become the index of the result.
    y : target values aligned with X.
    discrete_features : passed through to mutual_info_regression to mark which
        columns are treated as discrete.
    random_state : optional seed passed through to mutual_info_regression.
        The default (None) keeps the previous unseeded behaviour.

    Returns
    -------
    pandas Series named "MI Scores", indexed by feature name, sorted descending.
    """
    mi_scores = mutual_info_regression(
        X, y, discrete_features=discrete_features, random_state=random_state
    )
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores

# A fixed seed makes the displayed scores reproducible between runs
mi_scores = make_mi_scores(X_train, y_train, discrete_features, random_state=0)
mi_scores  # show the features with their MI scores