In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import warnings
from datetime import datetime
import country_converter as coco
warnings.simplefilter('ignore')
from sklearn.linear_model import LinearRegression,Ridge,Lasso,LogisticRegression
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score,accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor

In [5]:
df = pd.read_csv("draft2.csv")
df

Unnamed: 0,Country,Item,Year,production_in_t,area_in_h,yield_in_kg_per_h
0,Afghanistan,"Almonds, in shell",2019,38205.0,29203.0,1308.0
1,Afghanistan,"Almonds, in shell",2020,39307.0,22134.0,1776.0
2,Afghanistan,"Almonds, in shell",2021,64256.0,36862.0,1743.0
3,Afghanistan,"Almonds, in shell",2022,63515.0,36462.0,1742.0
4,Afghanistan,"Almonds, in shell",2023,67000.0,37000.0,1811.0
...,...,...,...,...,...,...
89693,Zimbabwe,"Whole milk, evaporated",2019,5010.0,,
89694,Zimbabwe,"Whole milk, evaporated",2020,4796.0,,
89695,Zimbabwe,"Whole milk, evaporated",2021,4921.0,,
89696,Zimbabwe,"Whole milk, evaporated",2022,4966.0,,


In [6]:
df = df.dropna(subset=['production_in_t','area_in_h','yield_in_kg_per_h'])

In [29]:
df

Unnamed: 0,Country,Item,Year,production_in_t,area_in_h,yield_in_kg_per_h,country_encoded,Item_encoded
0,Afghanistan,"Almonds, in shell",2019,38205.0,29203.0,1308.0,0,2
1,Afghanistan,"Almonds, in shell",2020,39307.0,22134.0,1776.0,0,2
2,Afghanistan,"Almonds, in shell",2021,64256.0,36862.0,1743.0,0,2
3,Afghanistan,"Almonds, in shell",2022,63515.0,36462.0,1742.0,0,2
4,Afghanistan,"Almonds, in shell",2023,67000.0,37000.0,1811.0,0,2
...,...,...,...,...,...,...,...,...
89680,Zimbabwe,Wheat,2019,94685.0,24186.0,3915.0,198,153
89681,Zimbabwe,Wheat,2020,212530.0,44466.0,4780.0,198,153
89682,Zimbabwe,Wheat,2021,337212.0,66434.0,5076.0,198,153
89683,Zimbabwe,Wheat,2022,250000.0,48504.0,5154.0,198,153


In [7]:
# Encoding
lb_en = LabelEncoder()
df['country_encoded'] = lb_en.fit_transform(df['Country'])
df['Item_encoded'] = lb_en.fit_transform(df['Item'])

In [8]:
# Predicting the area using Linear Regression
lr = LinearRegression()
x = df[['country_encoded','Item_encoded','Year','production_in_t','yield_in_kg_per_h']]
y = df['area_in_h']
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=0)
area_model = lr.fit(x_train,y_train)
y_predict = lr.predict(x_test)

std_score = lr.score(x_test,y_test)

mae = mean_absolute_error(y_test,y_predict)
mse = mean_squared_error(y_test,y_predict)
rs = r2_score(y_test,y_predict)

print(f'the std score is {std_score}')
print(f'mae:{mae}, mse:{mse}, r2_score:{rs}')

the std score is 0.5227346523968448
mae:284773.83348119614, mse:2129729408515.7488, r2_score:0.5227346523968448


In [9]:
# Polynomial Regression
poly_model = make_pipeline(PolynomialFeatures(degree=2),LinearRegression())
poly_model.fit(x_train,y_train)
y_poly_predict = poly_model.predict(x_test)

poly_score = poly_model.score(x_test,y_test)
mae = mean_absolute_error(y_test,y_poly_predict)
mse = mean_squared_error(y_test,y_poly_predict)
rs = r2_score(y_test,y_poly_predict)

print(f'the poly score is {poly_score}')
print(f'mae:{mae}, mse:{mse}, r2_score:{rs}')

the poly score is 0.8721994599959298
mae:179555.75842712625, mse:570291913791.2754, r2_score:0.8721994599959298


In [33]:
# Ridge model
ridge = Ridge(alpha=0.5)
ridge.fit(x_test,y_test)
y_ridge_predict = ridge.predict(x_test)

ridge_score = ridge.score(x_test,y_test)
mae = mean_absolute_error(y_test,y_ridge_predict)
mse = mean_squared_error(y_test,y_ridge_predict)
rs = r2_score(y_test,y_ridge_predict)

print(f'the ridge score is {ridge_score}')
print(f'mae:{mae}, mse:{mse}, r2_score:{rs}')

the ridge score is 0.5423277526512923
mae:289090.8146157273, mse:2042297957593.4182, r2_score:0.5423277526512923


In [34]:
# Lasso
lasso = Lasso(alpha=0.5)
lasso.fit(x_test,y_test)
y_lasso_predict = lasso.predict(x_test)

lasso_score = ridge.score(x_test,y_test)
mae = mean_absolute_error(y_test,y_lasso_predict)
mse = mean_squared_error(y_test,y_lasso_predict)
rs = r2_score(y_test,y_lasso_predict)

print(f'the lasso score is {lasso_score}')
print(f'mae:{mae}, mse:{mse}, r2_score:{rs}')

the lasso score is 0.5423277526512923
mae:289090.8185595613, mse:2042297957593.343, r2_score:0.5423277526513091


In [35]:
from sklearn.ensemble import RandomForestRegressor

x = df[['country_encoded','Item_encoded','Year','production_in_t','yield_in_kg_per_h']]
y = df['area_in_h']

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=0)

rf = RandomForestRegressor(n_estimators=100, random_state=0, n_jobs=-1)
rf.fit(x_train,y_train)
y_rf_predict = rf.predict(x_test)

rf_score = rf.score(x_test,y_test)
mae = mean_absolute_error(y_test,y_rf_predict)
mse = mean_squared_error(y_test,y_rf_predict)
rs = r2_score(y_test,y_rf_predict)

print(f'the rf score is {rf_score}')
print(f'mae:{mae}, mse:{mse}, r2_score:{rs}')


the rf score is 0.9992317875098884
mae:5681.2035143495195, mse:3428040062.8208957, r2_score:0.9992317875098884


In [10]:
from sklearn.neighbors import KNeighborsRegressor
kr = KNeighborsRegressor(n_neighbors=1, weights='distance',algorithm='ball_tree')
x = df[['Item_encoded','production_in_t','yield_in_kg_per_h']]
y = df['area_in_h']
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=0)
kr.fit(x_train,y_train)
y_kr_predict = kr.predict(x_test)


In [59]:
kr.score(x_test,y_test)

0.8925869707696962

In [12]:
kr_score = kr.score(x_test,y_test)
mae = mean_absolute_error(y_test,y_kr_predict)
mse = mean_squared_error(y_test,y_kr_predict)
rs = r2_score(y_test,y_kr_predict)

print(f'the kr score is {kr_score}')
print(f'mae:{mae}, mse:{mse}, r2_score:{rs}')

the kr score is 0.8925869707696962
mae:75329.28700906344, mse:479315517789.80945, r2_score:0.8925869707696962


In [13]:
df.to_csv('final_draft.csv')

In [17]:
len(df.Item.unique())

156