In [1]:
# Fetch the project repository; the CSV files loaded later are read from its data/modified directory.
!git clone https://github.com/aymanezz/Climate_Impact_Agriculture_Yield.git


Cloning into 'Climate_Impact_Agriculture_Yield'...
remote: Enumerating objects: 217, done.[K
remote: Counting objects: 100% (86/86), done.[K
remote: Compressing objects: 100% (67/67), done.[K
remote: Total 217 (delta 39), reused 49 (delta 18), pack-reused 131[K
Receiving objects: 100% (217/217), 64.49 MiB | 11.36 MiB/s, done.
Resolving deltas: 100% (95/95), done.
Updating files: 100% (43/43), done.


In [79]:
# Importing libraries
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import RobustScaler

In [109]:
# Load the cleaned data
# Every cleaned CSV lives in the same directory of the cloned repository, so the
# location is kept in one constant: only this line changes if the repository is
# cloned somewhere else.
DATA_DIR = '/content/Climate_Impact_Agriculture_Yield/data/modified'

production = pd.read_csv(f'{DATA_DIR}/clean_productions.csv')
temp_change = pd.read_csv(f'{DATA_DIR}/clean_temperature_change.csv')
atmospheric = pd.read_csv(f'{DATA_DIR}/clean_Atmospheric.csv')
precip = pd.read_csv(f'{DATA_DIR}/clean_precip.csv')

In [110]:
# Preview the first rows of the production table as loaded (one row per Area/Item/Year).
production.head()

Unnamed: 0,Area,Item,Year,Production
0,Afghanistan,"Almonds, in shell",1961,104244900.0
1,Afghanistan,Apples,1961,7127.0
2,Afghanistan,Apricots,1961,26210.0
3,Afghanistan,Barley,1961,73644.0
4,Afghanistan,Cantaloupes and other melons,1961,5890.0


In [112]:
# Collapse the per-item rows to one row per Area/Year holding the mean Production across items.
production = production.groupby(['Area', 'Year'], as_index=False)['Production'].mean()

In [114]:
# Preview the aggregated table (one row per Area/Year).
production.head()

Unnamed: 0,Area,Year,Production
0,Afghanistan,1961,14222590.0
1,Afghanistan,1962,15072720.0
2,Afghanistan,1963,14687670.0
3,Afghanistan,1964,15400820.0
4,Afghanistan,1965,15846840.0


In [115]:
# Align the precipitation key column with the other tables ('year' -> 'Year') so it can be used as a merge key.
precip = precip.rename(columns={'year': 'Year'})

In [116]:
# Preview the atmospheric table (keyed by Year only, no Area column).
atmospheric.head()

Unnamed: 0,Year,CO2concentration
0,1959,0.236
1,1960,0.293333
2,1961,0.2325
3,1962,0.255833
4,1963,0.168333


In [117]:
# Preview the temperature-change table (keyed by Area and Year).
temp_change.head()

Unnamed: 0,Area,Year,Temperature Change
0,Afghanistan,1961,-0.126
1,Albania,1961,0.635
2,Algeria,1961,0.155
3,American Samoa,1961,0.121
4,Andorra,1961,0.756


In [118]:
# Preview the precipitation table after the 'year' -> 'Year' rename.
precip.head()

Unnamed: 0,Year,Area,precip
0,1979,Afghanistan,1.024591
1,1979,Algeria,0.417374
2,1979,Angola,3.25293
3,1979,Armenia,2.161077
4,1979,Australia,1.683215


In [119]:
# Attach the climate covariates to each Area/Year production row.
# Left joins keep every production row; rows without a match get NaN in the new columns.
production = (
    production
    .merge(temp_change, how='left', on=['Area', 'Year'])  # temperature change, per Area/Year
    .merge(precip, how='left', on=['Area', 'Year'])       # precipitation, per Area/Year
    .merge(atmospheric, how='left', on='Year')            # CO2 concentration, keyed by Year only
)


In [120]:
# Preview the merged table; NaN marks Area/Year rows with no match in a joined table.
production.head()

Unnamed: 0,Area,Year,Production,Temperature Change,precip,CO2concentration
0,Afghanistan,1961,14222590.0,-0.126,,0.2325
1,Afghanistan,1962,15072720.0,-0.173,,0.255833
2,Afghanistan,1963,14687670.0,0.844,,0.168333
3,Afghanistan,1964,15400820.0,-0.751,,0.195
4,Afghanistan,1965,15846840.0,-0.22,,0.130833


In [121]:


# Report (rows, columns) of the merged table; nothing is dropped or modified in this cell.
production.shape

(15190, 6)

In [122]:
# Drop every row that has a NaN in any column. This is done in place, so `production` itself shrinks.
production.dropna(inplace = True)

In [125]:
# Preview the rows that remain after the NaN rows were removed.
production.head()

Unnamed: 0,Area,Year,Production,Temperature Change,precip,CO2concentration
18,Afghanistan,1979,9887619.0,0.38,1.024591,0.424167
19,Afghanistan,1980,10314180.0,0.655,0.986932,0.571667
20,Afghanistan,1981,11216040.0,0.558,1.046278,0.400833
21,Afghanistan,1982,11691620.0,-0.286,1.201458,0.399167
22,Afghanistan,1983,12052460.0,0.24,1.065508,0.490833


In [126]:
# Report (rows, columns) after the NaN rows were removed.
production.shape

(4180, 6)

In [129]:
# Creating lag features
# Both lags are shifted *within* each Area so that a country's first rows never
# receive values from the last rows of the previous country in the frame. Shifting
# the CO2 column without grouping would do exactly that.
# NOTE(review): shift() is positional, so a lag of N means "N rows earlier in this
# Area", which equals "N years earlier" only if each Area has one row per
# consecutive Year in ascending order. TODO confirm after the earlier dropna.
for lag in [1, 3, 6]:
    production[f'Temperature Change_lag_{lag}'] = production.groupby('Area')['Temperature Change'].shift(lag)
    production[f'CO2concentration_lag_{lag}'] = production.groupby('Area')['CO2concentration'].shift(lag)

# Creating interaction features
# Pairwise products of the three same-year climate variables.
production['Temperature Change_x_Precip'] = production['Temperature Change'] * production['precip']
production['Temperature Change_x_CO2concentration'] = production['Temperature Change'] * production['CO2concentration']
production['Precip_x_CO2concentration'] = production['precip'] * production['CO2concentration']


In [130]:
# Drop rows with any NaN, in place. This includes rows whose lag columns the shifts could not fill.
production.dropna(inplace = True)

In [131]:
# Preview the feature-engineered table (lag and interaction columns included).
production.head()

Unnamed: 0,Area,Year,Production,Temperature Change,precip,CO2concentration,Temperature Change_lag_1,CO2concentration_lag_1,Temperature Change_lag_3,CO2concentration_lag_3,Temperature Change_lag_6,CO2concentration_lag_6,Temperature Change_x_Precip,Temperature Change_x_CO2concentration,Precip_x_CO2concentration
24,Afghanistan,1985,13193510.0,0.397,0.760531,0.43,0.252,0.498333,-0.286,0.399167,0.38,0.424167,0.301931,0.17071,0.327028
25,Afghanistan,1986,13979170.0,-0.015,0.970171,0.364167,0.397,0.43,0.24,0.490833,0.655,0.571667,-0.014553,-0.005463,0.353304
26,Afghanistan,1987,14053300.0,0.493,0.793219,0.490833,-0.015,0.364167,0.252,0.498333,0.558,0.400833,0.391057,0.241981,0.389338
27,Afghanistan,1988,14333790.0,1.035,1.083792,0.680833,0.493,0.490833,0.397,0.43,-0.286,0.399167,1.121725,0.704662,0.737882
28,Afghanistan,1989,14915530.0,0.015,0.91031,0.43,1.035,0.680833,-0.015,0.364167,0.24,0.490833,0.013655,0.00645,0.391433


In [132]:
# Save the feature-engineered data
#production.to_csv('data/final/feature_engineered_production.csv', index=False)


In [135]:
# Take a copy of the feature-engineered frame for the preprocessing steps.
process_data_df = production.copy()


In [137]:
# One-hot encoding for 'Area'
# drop_first=True omits the indicator column of the first Area category,
# which therefore becomes the implicit baseline (all Area_* columns zero).
process_data_df = pd.get_dummies(process_data_df, columns=['Area'], drop_first=True)

In [138]:
# Scale every column of the encoded frame with RobustScaler and wrap the result
# back into a DataFrame with the same column names.
# NOTE(review): the scaler is fit on the whole frame, including the 'Production'
# target and the one-hot Area_* columns, and before any train/test split. The cell
# that builds X and y reads process_data_df, not process_data, so this scaled
# frame does not feed the model. Confirm which of the two is intended.
scaler = RobustScaler()
process_data = pd.DataFrame(
    scaler.fit_transform(process_data_df),
    columns=process_data_df.columns,
    # Keep the original row labels so the scaled frame stays aligned with
    # process_data_df in any label-based operation (join, assignment, comparison).
    index=process_data_df.index,
)
process_data.head()

Unnamed: 0,Year,Production,Temperature Change,precip,CO2concentration,Temperature Change_lag_1,CO2concentration_lag_1,Temperature Change_lag_3,CO2concentration_lag_3,Temperature Change_lag_6,...,Area_Syrian Arab Republic,Area_Tajikistan,Area_Thailand,Area_Turkmenistan,Area_United Kingdom of Great Britain and Northern Ireland,Area_United Republic of Tanzania,Area_Vanuatu,Area_Viet Nam,Area_Yemen,Area_Zambia
0,-0.973684,0.248688,-0.501736,-0.653082,-0.477011,-0.650465,0.020115,-1.26366,-0.6321839,-0.326679,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.921053,0.274747,-0.993915,-0.567934,-0.931034,-0.476448,-0.451149,-0.621762,-3.828355e-16,0.002393,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.868421,0.277205,-0.387053,-0.639806,-0.057471,-0.970897,-0.905172,-0.607118,0.05172414,-0.11368,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.815789,0.286509,0.260425,-0.521785,1.252874,-0.361236,-0.031609,-0.430169,-0.4195402,-1.123633,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.763158,0.305803,-0.958077,-0.592247,-0.477011,0.289229,1.278736,-0.932948,-0.8735632,-0.494207,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [141]:

# Target: the 'Production' column of the one-hot encoded frame (process_data_df,
# i.e. the values as they are before any scaling).
y = process_data_df.loc[:, 'Production']

# Features: every other column of that same frame.
X = process_data_df.drop('Production', axis=1)

In [142]:
# Hold out 20% of the rows for evaluation (random split, fixed seed for repeatability).
# NOTE(review): the split is random across years, so the lag features of a test row
# can coincide with same-Area values present in training rows. Consider a
# time-based split if out-of-time performance is what matters.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a random forest with default hyperparameters and predict the held-out rows.
model = RandomForestRegressor(random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)

# MSE is in squared Production units; score() on a regressor is R^2 on the test rows.
mse = mean_squared_error(y_test, y_pred)
score = model.score(X_test, y_test)
print(f'MSE: {mse}\nScore: {score}\n')


MSE: 2378627068660607.5
Score: 0.7895598380476383

