In [48]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit

In [49]:
#read file
df = pd.read_csv('global-data-on-sustainable-energy (1).csv')
df.head()

Unnamed: 0,Entity,Year,Access to electricity (% of population),Access to clean fuels for cooking,Renewable-electricity-generating-capacity-per-capita,Financial flows to developing countries (US $),Renewable energy share in the total final energy consumption (%),Electricity from fossil fuels (TWh),Electricity from nuclear (TWh),Electricity from renewables (TWh),...,Primary energy consumption per capita (kWh/person),Energy intensity level of primary energy (MJ/$2017 PPP GDP),Value_co2_emissions_kt_by_country,Renewables (% equivalent primary energy),gdp_growth,gdp_per_capita,Density\n(P/Km2),Land Area(Km2),Latitude,Longitude
0,Afghanistan,2000,1.613591,6.2,9.22,20000.0,44.99,0.16,0.0,0.31,...,302.59482,1.64,760.0,,,,60,652230.0,33.93911,67.709953
1,Afghanistan,2001,4.074574,7.2,8.86,130000.0,45.6,0.09,0.0,0.5,...,236.89185,1.74,730.0,,,,60,652230.0,33.93911,67.709953
2,Afghanistan,2002,9.409158,8.2,8.47,3950000.0,37.83,0.13,0.0,0.56,...,210.86215,1.4,1029.999971,,,179.426579,60,652230.0,33.93911,67.709953
3,Afghanistan,2003,14.738506,9.5,8.09,25970000.0,36.66,0.31,0.0,0.63,...,229.96822,1.4,1220.000029,,8.832278,190.683814,60,652230.0,33.93911,67.709953
4,Afghanistan,2004,20.064968,10.9,7.75,,44.24,0.33,0.0,0.56,...,204.23125,1.2,1029.999971,,1.414118,211.382074,60,652230.0,33.93911,67.709953


In [50]:
#check for missing values
df.isna().sum()

Entity                                                                 0
Year                                                                   0
Access to electricity (% of population)                               10
Access to clean fuels for cooking                                    169
Renewable-electricity-generating-capacity-per-capita                 931
Financial flows to developing countries (US $)                      2089
Renewable energy share in the total final energy consumption (%)     194
Electricity from fossil fuels (TWh)                                   21
Electricity from nuclear (TWh)                                       126
Electricity from renewables (TWh)                                     21
Low-carbon electricity (% electricity)                                42
Primary energy consumption per capita (kWh/person)                     0
Energy intensity level of primary energy (MJ/$2017 PPP GDP)          207
Value_co2_emissions_kt_by_country                  

In [51]:
#drop columns with many missing values > 50%
df = df.drop(columns = ['Financial flows to developing countries (US $)'])
df = df.drop(columns = ['Renewables (% equivalent primary energy)'])

In [52]:
# imputation
df['Access to electricity (% of population)'] = df['Access to electricity (% of population)'].fillna(df['Access to electricity (% of population)'].mean())
df['Access to clean fuels for cooking'] = df['Access to clean fuels for cooking'].fillna(df['Access to clean fuels for cooking'].mean())
df['Renewable-electricity-generating-capacity-per-capita'] = df['Renewable-electricity-generating-capacity-per-capita'].fillna(df['Renewable-electricity-generating-capacity-per-capita'].mean())
df['Renewable energy share in the total final energy consumption (%)'] = df['Renewable energy share in the total final energy consumption (%)'].fillna(df['Renewable energy share in the total final energy consumption (%)'].mean())
df['Electricity from fossil fuels (TWh)'] = df['Electricity from fossil fuels (TWh)'].fillna(df['Electricity from fossil fuels (TWh)'].mean())
df['Electricity from nuclear (TWh)'] = df['Electricity from nuclear (TWh)'].fillna(df['Electricity from nuclear (TWh)'].mean())
df['Electricity from renewables (TWh)'] = df['Electricity from renewables (TWh)'].fillna(df['Electricity from renewables (TWh)'].mean())
df['Low-carbon electricity (% electricity)'] = df['Low-carbon electricity (% electricity)'].fillna(df['Low-carbon electricity (% electricity)'].mean())
df['Energy intensity level of primary energy (MJ/$2017 PPP GDP)'] = df['Energy intensity level of primary energy (MJ/$2017 PPP GDP)'].fillna(df['Energy intensity level of primary energy (MJ/$2017 PPP GDP)'].mean())
df['Value_co2_emissions_kt_by_country'] = df['Value_co2_emissions_kt_by_country'].fillna(df['Value_co2_emissions_kt_by_country'].mean())
df['gdp_growth'] = df['gdp_growth'].fillna(df['gdp_growth'].mean())
df['gdp_per_capita'] = df['gdp_per_capita'].fillna(df['gdp_per_capita'].mean())


In [53]:
df = df.rename(columns={col: 'Density' for col in df.columns if 'Density' in col})
df['Density'] = df['Density'].astype(str).str.replace(',', '').astype(float)

df = df.dropna(subset=['Latitude', 'Longitude', 'Land Area(Km2)', 'Density'])

In [54]:
#sort values
df = df.sort_values(by=['Entity', 'Year']).reset_index(drop=True)

le = LabelEncoder()
df['Entity_Encoded'] = le.fit_transform(df['Entity'])

df.head()

Unnamed: 0,Entity,Year,Access to electricity (% of population),Access to clean fuels for cooking,Renewable-electricity-generating-capacity-per-capita,Renewable energy share in the total final energy consumption (%),Electricity from fossil fuels (TWh),Electricity from nuclear (TWh),Electricity from renewables (TWh),Low-carbon electricity (% electricity),Primary energy consumption per capita (kWh/person),Energy intensity level of primary energy (MJ/$2017 PPP GDP),Value_co2_emissions_kt_by_country,gdp_growth,gdp_per_capita,Density,Land Area(Km2),Latitude,Longitude,Entity_Encoded
0,Afghanistan,2000,1.613591,6.2,9.22,44.99,0.16,0.0,0.31,65.95744,302.59482,1.64,760.0,3.44161,13283.774348,60.0,652230.0,33.93911,67.709953,0
1,Afghanistan,2001,4.074574,7.2,8.86,45.6,0.09,0.0,0.5,84.745766,236.89185,1.74,730.0,3.44161,13283.774348,60.0,652230.0,33.93911,67.709953,0
2,Afghanistan,2002,9.409158,8.2,8.47,37.83,0.13,0.0,0.56,81.159424,210.86215,1.4,1029.999971,3.44161,179.426579,60.0,652230.0,33.93911,67.709953,0
3,Afghanistan,2003,14.738506,9.5,8.09,36.66,0.31,0.0,0.63,67.02128,229.96822,1.4,1220.000029,8.832278,190.683814,60.0,652230.0,33.93911,67.709953,0
4,Afghanistan,2004,20.064968,10.9,7.75,44.24,0.33,0.0,0.56,62.92135,204.23125,1.2,1029.999971,1.414118,211.382074,60.0,652230.0,33.93911,67.709953,0


In [55]:
target = 'Primary energy consumption per capita (kWh/person)'

#split data into train and test based on year
train_df = df[df['Year'] <= 2017]
test_df = df[df['Year'] > 2017]

train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3123 entries, 0 to 3644
Data columns (total 20 columns):
 #   Column                                                            Non-Null Count  Dtype  
---  ------                                                            --------------  -----  
 0   Entity                                                            3123 non-null   object 
 1   Year                                                              3123 non-null   int64  
 2   Access to electricity (% of population)                           3123 non-null   float64
 3   Access to clean fuels for cooking                                 3123 non-null   float64
 4   Renewable-electricity-generating-capacity-per-capita              3123 non-null   float64
 5   Renewable energy share in the total final energy consumption (%)  3123 non-null   float64
 6   Electricity from fossil fuels (TWh)                               3123 non-null   float64
 7   Electricity from nuclear (TWh)        

In [56]:
features_to_drop = ['Entity', 'Year', target]
X_train_raw = train_df.drop(columns=[c for c in features_to_drop if c in train_df.columns])
y_train = train_df[target]

X_test_raw = test_df.drop(columns=[c for c in features_to_drop if c in test_df.columns])
y_test = test_df[target]

*Part 2: Model Training*  

In [57]:
final_model = RandomForestRegressor(n_estimators=200, max_depth=12, random_state=42, n_jobs=-1)
final_model.fit(X_train_raw, y_train)

y_pred = final_model.predict(X_test_raw)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(f'Final Model R²: {r2:.4f}')
print(f'Final Model MAE: {mae:.4f}')

Final Model R²: 0.8764
Final Model MAE: 5157.7590
