In [126]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import warnings
from datetime import datetime
import country_converter as coco
warnings.simplefilter('ignore')
from sklearn.linear_model import LinearRegression,Ridge,Lasso,LogisticRegression
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,PolynomialFeatures
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.impute import KNNImputer
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score,accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor,HistGradientBoostingRegressor,GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor

In [56]:
data = pd.read_csv("draft2.csv")
data

Unnamed: 0,Country,Item,Year,production_in_t,area_in_h,yield_in_kg_per_h
0,Afghanistan,"Almonds, in shell",2019,38205.0,29203.0,1308.0
1,Afghanistan,"Almonds, in shell",2020,39307.0,22134.0,1776.0
2,Afghanistan,"Almonds, in shell",2021,64256.0,36862.0,1743.0
3,Afghanistan,"Almonds, in shell",2022,63515.0,36462.0,1742.0
4,Afghanistan,"Almonds, in shell",2023,67000.0,37000.0,1811.0
...,...,...,...,...,...,...
89693,Zimbabwe,"Whole milk, evaporated",2019,5010.0,,
89694,Zimbabwe,"Whole milk, evaporated",2020,4796.0,,
89695,Zimbabwe,"Whole milk, evaporated",2021,4921.0,,
89696,Zimbabwe,"Whole milk, evaporated",2022,4966.0,,


In [57]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89698 entries, 0 to 89697
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Country            89698 non-null  object 
 1   Item               89698 non-null  object 
 2   Year               89698 non-null  int64  
 3   production_in_t    86181 non-null  float64
 4   area_in_h          49308 non-null  float64
 5   yield_in_kg_per_h  54888 non-null  float64
dtypes: float64(3), int64(1), object(2)
memory usage: 4.1+ MB


In [58]:
data.isnull().sum()

Country                  0
Item                     0
Year                     0
production_in_t       3517
area_in_h            40390
yield_in_kg_per_h    34810
dtype: int64

In [59]:
df = data.copy()

In [61]:
# Encoding
lb_en = LabelEncoder()
df['Item'] = lb_en.fit_transform(df['Item'])
df['Country'] = lb_en.fit_transform(df['Country'])

In [63]:
kni = KNNImputer(n_neighbors=1)
features = ['Country','Item','production_in_t','area_in_h','yield_in_kg_per_h']
df[features] = kni.fit_transform(df[features])

In [125]:
lr = LinearRegression()

x = df[['Country','Item','area_in_h','yield_in_kg_per_h']]
y = df['production_in_t']


x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=0)
area_model = lr.fit(x_train,y_train)
y_predict = lr.predict(x_test)
lr.score(x_test,y_test)

0.42153667698093367

In [82]:
poly_model = make_pipeline(PolynomialFeatures(degree=2),LinearRegression())
poly_model.fit(x_train,y_train)
y_poly_predict = poly_model.predict(x_test)

poly_score = poly_model.score(x_test,y_test)
mae = mean_absolute_error(y_test,y_poly_predict)
mse = mean_squared_error(y_test,y_poly_predict)
rs = r2_score(y_test,y_poly_predict)

print(f'the poly score is {poly_score}')
print(f'mae:{mae}, mse:{mse}, r2_score:{rs}')

the poly score is 0.7700411386460766
mae:996048.9910101872, mse:59302250269143.77, r2_score:0.7700411386460766


In [90]:
# Ridge model
poly_model = make_pipeline(PolynomialFeatures(degree=4),Ridge(alpha=0.1))
poly_model.fit(x_train,y_train)
y_poly_predict = poly_model.predict(x_test)

poly_score = poly_model.score(x_test,y_test)
mae = mean_absolute_error(y_test,y_poly_predict)
mse = mean_squared_error(y_test,y_poly_predict)
rs = r2_score(y_test,y_poly_predict)

print(f'the poly score is {poly_score}')
print(f'mae:{mae}, mse:{mse}, r2_score:{rs}')

the poly score is 0.7805214174531054
mae:1251397.7724168347, mse:56599575046951.37, r2_score:0.7805214174531054


In [92]:
# Lasso model
poly_model = make_pipeline(PolynomialFeatures(degree=4),Lasso(alpha=0.1))
poly_model.fit(x_train,y_train)
y_poly_predict = poly_model.predict(x_test)

poly_score = poly_model.score(x_test,y_test)
mae = mean_absolute_error(y_test,y_poly_predict)
mse = mean_squared_error(y_test,y_poly_predict)
rs = r2_score(y_test,y_poly_predict)

print(f'the poly score is {poly_score}')
print(f'mae:{mae}, mse:{mse}, r2_score:{rs}')

the poly score is 0.7790369931187736
mae:1092975.9962079069, mse:56982381357879.59, r2_score:0.7790369931187736


In [None]:
from sklearn.ensemble import RandomForestRegressor  # Why ? --> Dealing with outlier and non-linear

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=0)

rf = RandomForestRegressor(n_estimators=100, random_state=0, n_jobs=-1)
rf.fit(x_train,y_train)
y_rf_predict = rf.predict(x_test)

rf_score = rf.score(x_test,y_test)
mae = mean_absolute_error(y_test,y_rf_predict)
mse = mean_squared_error(y_test,y_rf_predict)
rs = r2_score(y_test,y_rf_predict)

print(f'the rf score is {rf_score}')
print(f'mae:{mae}, mse:{mse}, r2_score:{rs}')

the rf score is 0.7337278529535571
mae:399695.92956219654, mse:68666792881478.125, r2_score:0.7337278529535571


In [128]:
from sklearn.ensemble import HistGradientBoostingRegressor

hgbr = HistGradientBoostingRegressor(max_iter=510, learning_rate=0.1, max_depth=16,min_samples_leaf=25,random_state=0)

hgbr.fit(x_train,y_train)
y_hgbr_predict = hgbr.predict(x_test)

hgbr_score = hgbr.score(x_test,y_test)
mae = mean_absolute_error(y_test,y_hgbr_predict)
mse = mean_squared_error(y_test,y_hgbr_predict)
rs = r2_score(y_test,y_hgbr_predict)

print(f'the hgbr score is {hgbr_score}')
print(f'mae:{mae}, mse:{mse}, r2_score:{rs}')


the hgbr score is 0.8282517982643982
mae:643390.5604892316, mse:44290769151637.55, r2_score:0.8282517982643982


In [100]:
from xgboost import XGBRegressor

xgb = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=0
)

xgb.fit(x_train, y_train)
y_xgb_pred = xgb.predict(x_test)

xgb_score = xgb.score(x_test,y_test)
mae = mean_absolute_error(y_test,y_xgb_pred)
mse = mean_squared_error(y_test,y_xgb_pred)
rs = r2_score(y_test,y_xgb_pred)

print(f'the xgb score is {xgb_score}')
print(f'mae:{mae}, mse:{mse}, r2_score:{rs}')

the xgb score is 0.8111475643491063
mae:694484.9772581374, mse:48701643141596.59, r2_score:0.8111475643491063


In [112]:
gbr = GradientBoostingRegressor(
    n_estimators=30,
    learning_rate=0.1,
    max_depth=3,
    min_samples_split=2, 
    min_samples_leaf=1,
    random_state=0
)

gbr.fit(x_train, y_train)
y_gbr_pred = gbr.predict(x_test)

gbr_score = gbr.score(x_test,y_test)
mae = mean_absolute_error(y_test,y_gbr_pred)
mse = mean_squared_error(y_test,y_gbr_pred)
rs = r2_score(y_test,y_gbr_pred)

print(f'the gbr score is {gbr_score}')
print(f'mae:{mae}, mse:{mse}, r2_score:{rs}')

the gbr score is 0.8232671937814606
mae:1181519.6954807749, mse:45576208907250.695, r2_score:0.8232671937814606


In [124]:
kr = KNeighborsRegressor(n_neighbors=2,weights='uniform',algorithm='ball_tree')
kr.fit(x_train,y_train)
y_kr_predict = kr.predict(x_test)

kr_score = kr.score(x_test,y_test)
mae = mean_absolute_error(y_test,y_kr_predict)
mse = mean_squared_error(y_test,y_kr_predict)
rs = r2_score(y_test,y_kr_predict)

print(f'the kr score is {kr_score}')
print(f'mae:{mae}, mse:{mse}, r2_score:{rs}')

the kr score is 0.7030619218970595
mae:563705.1972686733, mse:76574984405569.22, r2_score:0.7030619218970595
