In [94]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error

In [95]:
# Load the four raw source tables (paths are relative to the notebook's working directory).
pesticides_df = pd.read_csv(r"./pesticides.csv")
rainfall_df = pd.read_csv(r"./rainfall.csv")
temperature_df = pd.read_csv(r"./temp.csv")
cropYield_df = pd.read_csv(r"./yield.csv")
# summarized_df (yield_df.csv) is the final pre-processed dataset, produced by cleaning
# and merging the four CSVs above; it is the frame the modelling cells work from.
# NOTE(review): the loaded frame contains an "Unnamed: 0" column, presumably an index that
# was written out with the CSV. Passing index_col=0 here would likely remove it — confirm
# that nothing later in the notebook relies on that column before changing it.
summarized_df = pd.read_csv(r"./yield_df.csv")

In [96]:
# Display the merged dataset; pandas truncates the rendering to the first and last rows.
summarized_df

Unnamed: 0.1,Unnamed: 0,Area,Item,Year,hg/ha_yield,average_rain_fall_mm_per_year,pesticides_tonnes,avg_temp
0,0,Albania,Maize,1990,36613,1485.0,121.00,16.37
1,1,Albania,Potatoes,1990,66667,1485.0,121.00,16.37
2,2,Albania,"Rice, paddy",1990,23333,1485.0,121.00,16.37
3,3,Albania,Sorghum,1990,12500,1485.0,121.00,16.37
4,4,Albania,Soybeans,1990,7000,1485.0,121.00,16.37
...,...,...,...,...,...,...,...,...
28237,28237,Zimbabwe,"Rice, paddy",2013,22581,657.0,2550.07,19.76
28238,28238,Zimbabwe,Sorghum,2013,3066,657.0,2550.07,19.76
28239,28239,Zimbabwe,Soybeans,2013,13142,657.0,2550.07,19.76
28240,28240,Zimbabwe,Sweet potatoes,2013,22222,657.0,2550.07,19.76


In [97]:
# Print the name of every column in the merged dataset, one per line.
for column_name in summarized_df.columns.tolist():
    print(column_name)

Unnamed: 0
Area
Item
Year
hg/ha_yield
average_rain_fall_mm_per_year
pesticides_tonnes
avg_temp


In [98]:
# Problem description:
# I wonder if we could train a simple neural net to predict hg/ha_yield
# from average_rain_fall_mm_per_year, pesticides_tonnes, and avg_temp.
# As a first step, the cells below fit a decision tree regressor instead.

In [99]:
# "hg/ha_yield" contains a '/', which rules out attribute-style access,
# so give the column an identifier-friendly name.
yield_column_renames = {"hg/ha_yield": "hg_ha_yield"}
summarized_df = summarized_df.rename(columns=yield_column_renames)

In [100]:
# Confirm the rename worked: the column must now be reachable under its new name.
summarized_df["hg_ha_yield"]

0        36613
1        66667
2        23333
3        12500
4         7000
         ...  
28237    22581
28238     3066
28239    13142
28240    22222
28241    22888
Name: hg_ha_yield, Length: 28242, dtype: int64

In [101]:
# Prediction target: the crop yield column (renamed to hg_ha_yield above).
y = summarized_df["hg_ha_yield"]

In [102]:
# Input features for the model: rainfall, pesticide use and temperature.
crop_yield_features = [
    "average_rain_fall_mm_per_year",
    "pesticides_tonnes",
    "avg_temp",
]
X = summarized_df.loc[:, crop_yield_features]

In [103]:
# Quick statistical summary of the three feature columns.
# NOTE: the statistics are computed over every row of the merged table, and the same
# rainfall/pesticide/temperature values are repeated once per crop for a given area and
# year, so these are row-weighted summaries rather than true world averages.
X.describe()

Unnamed: 0,average_rain_fall_mm_per_year,pesticides_tonnes,avg_temp
count,28242.0,28242.0,28242.0
mean,1149.05598,37076.909344,20.542627
std,709.81215,59958.784665,6.312051
min,51.0,0.04,1.3
25%,593.0,1702.0,16.7025
50%,1083.0,17529.44,21.51
75%,1668.0,48687.88,26.0
max,3240.0,367778.0,30.65


In [104]:
# Preview the first rows of the feature matrix as a sanity check on the column selection.
X.head()

Unnamed: 0,average_rain_fall_mm_per_year,pesticides_tonnes,avg_temp
0,1485.0,121.0,16.37
1,1485.0,121.0,16.37
2,1485.0,121.0,16.37
3,1485.0,121.0,16.37
4,1485.0,121.0,16.37


In [105]:
# Let's throw the data into a decision tree and see what we get.
#
# Keep in mind the 4 steps for building models:
#
#   Define:   What type of model will it be? A decision tree? Some other type of model?
#             Some other parameters of the model type are specified too.
#   Fit:      Capture patterns from provided data. This is the heart of modeling.
#   Predict:  Just what it sounds like.
#   Evaluate: Determine how accurate the model's predictions are.

# Define: a decision tree regressor with a fixed random_state.
crop_yield_decision_tree_model = DecisionTreeRegressor(random_state=1)

# Fit: train on the full feature matrix and target (no hold-out split in this cell).
crop_yield_decision_tree_model.fit(X, y)

DecisionTreeRegressor(random_state=1)

In [113]:
# Decision tree regressor results: spot-check the prediction for one row.
# X.sample() without a seed picks a different row on every run, so this cell's output
# (and the lookup of the actual value that follows) could not be reproduced after
# Restart & Run All. A fixed random_state pins the sampled row.
print("Making predictions for the following random sample:")
sample_data = X.sample(n=1, random_state=1)
print(sample_data)
print("The predictions are")
predicted_yield = crop_yield_decision_tree_model.predict(sample_data)
print(predicted_yield)

Making predictions for the following random sample:
       average_rain_fall_mm_per_year  pesticides_tonnes  avg_temp
25327                          636.0            33236.0     17.01
The predictions are
[88881.85714286]


In [114]:
# Check the actual value for the sampled row(s).
# sample_data.index holds index *labels*, so use label-based .loc. Positional .iloc only
# returns the right rows while summarized_df keeps its default 0..n-1 index, and would
# silently pick the wrong rows after any filtering or re-indexing.
summarized_df.loc[sample_data.index]

Unnamed: 0.1,Unnamed: 0,Area,Item,Year,hg_ha_yield,average_rain_fall_mm_per_year,pesticides_tonnes,avg_temp
25327,25327,Spain,Maize,1996,85308,636.0,33236.0,17.01


In [115]:
# Evaluate the model with MAE (mean absolute error): the average of |actual - predicted|.
# The predictions are made on the same rows the tree was fitted on, so this is an
# in-sample score — and it is still horrible.
# NOTE(review): rows for different crops appear to share identical feature values, which
# the tree cannot tell apart; that would explain a large error even in-sample — confirm.
in_sample_predictions = crop_yield_decision_tree_model.predict(X)
mean_absolute_error(y, in_sample_predictions)

60547.17674191764