In [None]:

from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
import plotly.express as px
import statsmodels.api as sm
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPRegressor
from IPython.display import display
from math import sqrt
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import learning_curve
from sklearn.inspection import permutation_importance

In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Location of the cleaned dataset, given relative to the notebook's working directory
DATA_PATH = '../dataset/cleaned_data.csv'

# Read the CSV into a DataFrame and preview its first rows
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Area,Item,Year,Yield,avg_temp,Pesticides,avg_precipitation,group,yield_normalized
0,Angola,Cassava,1990,41177,24.12,64.0,1010,0,0.087702
1,Angola,Cassava,1991,40295,24.02,79.0,1010,0,0.085821
2,Angola,Cassava,1992,42295,23.96,23.0,1010,0,0.090086
3,Angola,Cassava,1993,42295,24.15,169.0,1010,0,0.090086
4,Angola,Cassava,1994,58596,24.04,25.5,1010,0,0.124847


### Linear Regression Model

In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Predictors (three numeric columns plus the categorical 'Item' column) and the target
features = ['avg_temp', 'Pesticides', 'avg_precipitation', 'Item']
X, y = df[features], df['Yield']

# One-hot encode 'Item'; every other column is passed through untouched and is
# appended to the right of the encoded columns in the transformed matrix
preprocessor = ColumnTransformer(
    transformers=[('onehot', OneHotEncoder(), ['Item'])],
    remainder='passthrough',
)

# Chain the encoding step with a linear regression and fit on the full dataset
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LinearRegression()),
])
pipeline.fit(X, y)

# Rebuild the column labels in the order the transformer emits them:
# encoded 'Item' levels first, then the passthrough columns in their original order
ohe = preprocessor.named_transformers_['onehot']
feature_names = ohe.get_feature_names_out(input_features=['Item'])
other_features = [column for column in features if column != 'Item']
all_features = [*feature_names, *other_features]

# Fitted parameters: one coefficient per transformed column, plus the intercept
coefficients = pipeline.named_steps['model'].coef_
intercept = pipeline.named_steps['model'].intercept_

# Report every column's coefficient, followed by the intercept
feature_coefficients = dict(zip(all_features, coefficients))
for feature, coef in feature_coefficients.items():
    print(f'{feature}: {coef}')
print(f'Intercept: {intercept}')

Item_Cassava: 73691.38333381955
Item_Maize: -51269.26243665961
Item_Plantains and others: 36321.717322162825
Item_Potatoes: 110373.3098962319
Item_Rice, paddy: -42439.490406804936
Item_Sorghum: -65222.84743043374
Item_Soybeans: -70384.68987578891
Item_Sweet potatoes: 37017.510523288845
Item_Wheat: -60960.25673407328
Item_Yams: 32872.6258082854
avg_temp: -1881.8009102492074
Pesticides: 0.11303158925875323
avg_precipitation: -7.151592588650146
Intercept: 127913.37725739882


In [8]:
# This time with polynomial features
from sklearn.preprocessing import PolynomialFeatures

# Column roles for this model: one categorical column and three numeric predictors
categorical_features = ['Item']
numerical_features = ['avg_temp', 'Pesticides', 'avg_precipitation']
features = numerical_features + categorical_features
X = df[features]
y = df['Yield']

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# One-hot encode the categorical column(s); the numeric columns pass through unchanged.
# The encoder is fitted on the training split only, so handle_unknown='ignore' lets a
# category that appears only in the test split be encoded as all zeros instead of
# raising an error at predict time.
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough'
)

# Degree-2 interaction terms only: pairwise products of distinct columns,
# no squared terms and no constant bias column
poly_features = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)

# Pipeline: encode -> add interaction terms -> linear regression
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('polynomial_features', poly_features),
    ('linear_regression', LinearRegression())
])

# Fit every step on the training split only
pipeline.fit(X_train, y_train)

# Predict and evaluate on training data
y_train_pred = pipeline.predict(X_train)
r2_train = r2_score(y_train, y_train_pred)
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = np.sqrt(mse_train)
mae_train = mean_absolute_error(y_train, y_train_pred)

# Predict and evaluate on the held-out testing data
y_test_pred = pipeline.predict(X_test)
r2_test = r2_score(y_test, y_test_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = np.sqrt(mse_test)
mae_test = mean_absolute_error(y_test, y_test_pred)

# Print the results for training data (the three lines after the R-squared line
# also refer to the training split)
print(f'Training Data - R-squared: {r2_train}')
print(f'Mean Squared Error: {mse_train}')
print(f'Root Mean Squared Error: {rmse_train}')
print(f'Mean Absolute Error: {mae_train}')

# Print the results for testing data (same layout as above)
print(f'Testing Data - R-squared: {r2_test}')
print(f'Mean Squared Error: {mse_test}')
print(f'Root Mean Squared Error: {rmse_test}')
print(f'Mean Absolute Error: {mae_test}')

Training Data - R-squared: 0.7117227321699506
Mean Squared Error: 1979179822.1676779
Root Mean Squared Error: 44487.97390495186
Mean Absolute Error: 27297.063750659217
Testing Data - R-squared: 0.6835850746893448
Mean Squared Error: 2234526332.996134
Root Mean Squared Error: 47270.77673358175
Mean Absolute Error: 28285.205242170956


In [9]:
# Experiment: plain linear regression without the polynomial (interaction) features,
# for comparison with the previous model.
# Reuses X_train, X_test, y_train and y_test from the train/test split made in the
# polynomial-features cell above, so that cell must be run first.

# One-hot encode the 'Item' column; the remaining columns pass through unchanged.
# The encoder is fitted on the training split only, so handle_unknown='ignore' lets a
# category that appears only in the test split be encoded as all zeros instead of
# raising an error at predict time.
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'), ['Item'])
    ],
    remainder='passthrough'
)

# Create a pipeline with the preprocessor and the LinearRegression model
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LinearRegression())
])

# Fit the pipeline to the training split only
pipeline.fit(X_train, y_train)

# Predict and evaluate on training data
y_train_pred = pipeline.predict(X_train)
r2_train = r2_score(y_train, y_train_pred)
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = np.sqrt(mse_train)
mae_train = mean_absolute_error(y_train, y_train_pred)

# Predict and evaluate on the held-out testing data
y_test_pred = pipeline.predict(X_test)
r2_test = r2_score(y_test, y_test_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = np.sqrt(mse_test)
mae_test = mean_absolute_error(y_test, y_test_pred)

# Print the results for training data
print(f'Training Data - R-squared: {r2_train}')
print(f'Mean Squared Error: {mse_train}')
print(f'Root Mean Squared Error: {rmse_train}')
print(f'Mean Absolute Error: {mae_train}')

# Separator line between the training and testing metrics
print('test data start here \n')
# Print the results for testing data
print(f'Testing Data - R-squared: {r2_test}')
print(f'Mean Squared Error: {mse_test}')
print(f'Root Mean Squared Error: {rmse_test}')
print(f'Mean Absolute Error: {mae_test}')

Training Data - R-squared: 0.6461088192621558
Mean Squared Error: 2429654927.1181436
Root Mean Squared Error: 49291.52997339547
Mean Absolute Error: 30832.38565185251
test data start here 

Testing Data - R-squared: 0.6199842590081104
Mean Squared Error: 2683676123.5764027
Root Mean Squared Error: 51804.2095159882
Mean Absolute Error: 31714.73449654645


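Note: As the metrics above show, the model with polynomial (interaction) features performs better than the plain linear regression (test R-squared of about 0.68 versus about 0.62, with lower RMSE and MAE).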

### Data Preparation for Modeling

In [None]:
# Encoding categorical features: both categorical columns are one-hot encoded,
# the numeric columns are passed through unchanged
categorical_features = ['Item', 'Area']
numerical_features = ['Pesticides', 'avg_temp', 'avg_precipitation']

preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(), categorical_features)
    ],
    remainder='passthrough'
)

# Use the DataFrame loaded at the top of the notebook (`df`). The name used here
# before, `df_yield`, is not defined anywhere above this cell, so the cell raised
# NameError on a fresh kernel.
X = df[categorical_features + numerical_features]
y = df['Yield']

# Encode the full feature table, then split it 80/20 with a fixed seed.
# NOTE(review): the encoder is fitted on all rows before the split, so it has seen the
# categories of the test rows; fit it on the training split only if strict
# train/test isolation is required.
X_transformed = preprocessor.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, random_state=42)

# Scale features by their standard deviation. with_mean=False skips centering, which
# StandardScaler cannot perform on sparse input (the one-hot output may be sparse).
std_scaler = StandardScaler(with_mean=False)
X_train_scale = std_scaler.fit_transform(X_train)
# Scale the test data with the statistics learned from the training split
X_test_scale = std_scaler.transform(X_test)

### Random Forest