In [1]:
# Fetch the project repository; later cells read the cleaned CSV files from its data/modified folder.
!git clone https://github.com/aymanezz/Climate_Impact_Agriculture_Yield.git


Cloning into 'Climate_Impact_Agriculture_Yield'...
remote: Enumerating objects: 161, done.[K
remote: Counting objects: 100% (30/30), done.[K
remote: Compressing objects: 100% (27/27), done.[K
remote: Total 161 (delta 4), reused 21 (delta 2), pack-reused 131[K
Receiving objects: 100% (161/161), 61.20 MiB | 8.02 MiB/s, done.
Resolving deltas: 100% (60/60), done.
Updating files: 100% (39/39), done.


In [2]:
# Importing libraries
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [3]:
# Load the cleaned data
# Every input lives in the same folder of the cloned repository. Keeping that folder in a
# single constant lets the notebook be pointed at another checkout by editing one line.
DATA_DIR = '/content/Climate_Impact_Agriculture_Yield/data/modified'

production = pd.read_csv(f'{DATA_DIR}/clean_productions.csv')
temp_change = pd.read_csv(f'{DATA_DIR}/clean_temperature_change.csv')
land_cover = pd.read_csv(f'{DATA_DIR}/clean_land_cover.csv')
# NOTE(review): sea_level is loaded here but is not merged in the cells that follow - confirm it is still needed.
sea_level = pd.read_csv(f'{DATA_DIR}/clean_sea_level.csv')
atmospheric = pd.read_csv(f'{DATA_DIR}/clean_Atmospheric.csv')

In [4]:
# Attach the climate covariates to the production table, then discard the join key 'Year'.
# Left joins keep every production row even when a covariate has no matching entry.
production = (
    production
    # temperature change and land cover are matched on both 'Area' and 'Year'
    .merge(temp_change, how='left', on=['Area', 'Year'])
    .merge(land_cover, how='left', on=['Area', 'Year'])
    # atmospheric data is matched on 'Year' only
    .merge(atmospheric, how='left', on='Year')
    # 'Year' is removed once the joins are done
    .drop(columns=['Year'])
)

In [5]:
# List the columns of the merged frame to check the result of the joins.
production.columns

Index(['Area', 'Item', 'Production', 'Temperature Change', 'Indicator',
       'Covered', 'CO2concentration'],
      dtype='object')

In [6]:
# Map merge-suffixed column names back to their plain names.
# NOTE(review): the column listing printed by the previous cell shows no '_x'/'_y'
# suffixed columns, so this rename appears to be a no-op - confirm and remove if so.
production = production.rename(
    columns={
        'Temperature Change_x': 'Temperature Change',
        'Covered_y': 'Covered',
    }
)


In [7]:
# Creating lag features
# Temperature lags are shifted within each 'Area'; CO2 lags are shifted over the whole frame.
# NOTE(review): shift() moves values by row position. 'Year' has already been dropped and the
# frame holds several rows per Area (one per Item/Indicator), so these lags are row offsets,
# not year offsets - confirm this is intended.
temperature_by_area = production.groupby('Area')['Temperature Change']
co2 = production['CO2concentration']
for lag in (1, 3, 6):
    production[f'Temperature Change_lag_{lag}'] = temperature_by_area.shift(lag)
    production[f'CO2concentration_lag_{lag}'] = co2.shift(lag)

# Creating interaction features (element-wise products of pairs of covariates)
temperature = production['Temperature Change']
covered = production['Covered']
production['Temperature Change_x_Land Cover'] = temperature.mul(covered)
production['Temperature Change_x_CO2concentration'] = temperature.mul(co2)
production['Land Cover_x_CO2concentration'] = covered.mul(co2)


In [8]:
# Discard every row that has a missing value in any column
# (this includes rows whose lag features could not be filled by shift()).
production = production.dropna(axis='index', how='any')

In [12]:
# Preview the first rows of the feature-engineered frame.
production.head()

Unnamed: 0,Area,Item,Production,Temperature Change,Indicator,Covered,CO2concentration,Temperature Change_lag_1,CO2concentration_lag_1,Temperature Change_lag_3,CO2concentration_lag_3,Temperature Change_lag_6,CO2concentration_lag_6,Temperature Change_x_Land Cover,Temperature Change_x_CO2concentration,Land Cover_x_CO2concentration
170258,Albania,Apples,234360600.0,0.106,Climate Altering Land Cover Index,95.192667,0.2375,-0.28,0.2375,-0.28,0.2375,-0.28,0.2375,10.090423,0.025175,22.608258
170259,Albania,Apples,234360600.0,0.106,Artificial surfaces (including urban and assoc...,12.8347,0.2375,0.106,0.2375,-0.28,0.2375,-0.28,0.2375,1.360478,0.025175,3.048241
170260,Albania,Apples,234360600.0,0.106,Grassland,401.779,0.2375,0.106,0.2375,-0.28,0.2375,-0.28,0.2375,42.588574,0.025175,95.422512
170261,Albania,Apples,234360600.0,0.106,Herbaceous crops,985.5961,0.2375,0.106,0.2375,0.106,0.2375,-0.28,0.2375,104.473187,0.025175,234.079074
170262,Albania,Apples,234360600.0,0.106,Inland water bodies,55.6714,0.2375,0.106,0.2375,0.106,0.2375,-0.28,0.2375,5.901168,0.025175,13.221957


In [10]:
# Save the feature-engineered data
# Disabled: running it raised "OSError: Cannot save file into a non-existent directory: 'data/final'".
# Create the 'data/final' directory before re-enabling the line below.
#production.to_csv('data/final/feature_engineered_production.csv', index=False)


OSError: Cannot save file into a non-existent directory: 'data/final'

In [18]:
# Keep only the numeric modelling columns: raw covariates, their lags,
# the interaction terms, and the 'Production' target (last column).
process_data_df = production.loc[
    :,
    [
        # raw covariates
        'Temperature Change',
        'CO2concentration',
        'Covered',
        # lagged temperature change
        'Temperature Change_lag_1',
        'Temperature Change_lag_3',
        'Temperature Change_lag_6',
        # lagged CO2 concentration
        'CO2concentration_lag_1',
        'CO2concentration_lag_3',
        'CO2concentration_lag_6',
        # interaction terms
        'Temperature Change_x_Land Cover',
        'Temperature Change_x_CO2concentration',
        'Land Cover_x_CO2concentration',
        # target
        'Production',
    ],
]


In [20]:
from sklearn.preprocessing import RobustScaler

# Scale every selected column with RobustScaler.
# The scaler is fitted on `process_data_df` (built in the previous cell). The earlier version
# passed `process_data`, a name that no preceding cell defines, so the cell raised NameError
# on a fresh kernel (Restart & Run All).
# NOTE(review): the scaler is fitted on the full table before the train/test split and the
# 'Production' target is scaled together with the features, so downstream error metrics are
# in scaled units - confirm this is intended.
scaler = RobustScaler()
scaled_values = scaler.fit_transform(process_data_df)

# fit_transform returns a plain array; rebuild a DataFrame with the original column names
# (the row index is reset to 0..n-1).
process_data = pd.DataFrame(scaled_values, columns=process_data_df.columns)
process_data.head()

Unnamed: 0,Temperature Change,CO2concentration,Covered,Temperature Change_lag_1,Temperature Change_lag_3,Temperature Change_lag_6,CO2concentration_lag_1,CO2concentration_lag_3,CO2concentration_lag_6,Temperature Change_x_Land Cover,Temperature Change_x_CO2concentration,Land Cover_x_CO2concentration,Production
0,-1.045144,-2.0,-0.008095,-1.575342,-1.573187,-1.573973,-2.0,-2.0,-2.0,-0.076675,-0.88541,-0.052132,306.956127
1,-1.045144,-2.0,-0.065969,-1.046575,-1.573187,-1.573973,-2.0,-2.0,-2.0,-0.08469,-0.88541,-0.078039,306.956127
2,-1.045144,-2.0,0.207349,-1.046575,-1.573187,-1.573973,-2.0,-2.0,-2.0,-0.046836,-0.88541,0.04431,306.956127
3,-1.045144,-2.0,0.617606,-1.046575,-1.045144,-1.573973,-2.0,-2.0,-2.0,0.009983,-0.88541,0.22796,306.956127
4,-1.045144,-2.0,-0.035867,-1.046575,-1.045144,-1.573973,-2.0,-2.0,-2.0,-0.080521,-0.88541,-0.064564,306.956127


In [21]:

# Names of the model inputs and of the column to predict.
# Order: raw covariates, temperature lags (1, 3, 6), CO2 lags (1, 3, 6), interaction terms.
target_col = 'Production'
feature_cols = [
    'Temperature Change',
    'CO2concentration',
    'Covered',
    *(
        f'{base}_lag_{lag}'
        for base in ('Temperature Change', 'CO2concentration')
        for lag in (1, 3, 6)
    ),
    'Temperature Change_x_Land Cover',
    'Temperature Change_x_CO2concentration',
    'Land Cover_x_CO2concentration',
]

In [22]:
# Separate the scaled table into the feature matrix (X) and the target vector (y).
X, y = process_data.loc[:, feature_cols], process_data.loc[:, target_col]

In [23]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a random forest with default hyperparameters (seeded for repeatability).
model = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Evaluate on the held-out rows: mean squared error of the predictions and the
# estimator's own score() value.
# NOTE(review): y comes from the scaled table, so the MSE is presumably in scaled units - confirm.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
score = model.score(X_test, y_test)
print(f'MSE: {mse}\nScore: {score}\n')


MSE: 21213.130395519573
Score: 0.4724589899270417

