In [71]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [72]:
import statsmodels.api as sm
from scipy import stats

In [73]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score

In [74]:
# Set visual display settings
sns.set(style='whitegrid')

<h2>Load dataset from csv</h2>

In [75]:
# load csv
df = pd.read_csv('complete_renewable_energy_dataset 2.csv')

In [76]:
# Get the first few rows of the data
df.head()

Unnamed: 0,Country,Year,Energy Type,Production (GWh),Installed Capacity (MW),Investments (USD),Population,GDP,Energy Consumption,Energy Exports,...,Economic Freedom Index,Ease of Doing Business,Innovation Index,Number of Research Institutions,Number of Renewable Energy Conferences,Number of Renewable Energy Publications,Energy Sector Workforce,Proportion of Energy from Renewables,Public-Private Partnerships in Energy,Regional Renewable Energy Cooperation
0,USA,2011,Solar,85404.690027,5549.794323,5604125000.0,1064007213,177527800000000.0,369654.644184,93087.198199,...,49.2168,57.678605,76.803228,851,75,6361,511658,25.907895,1,0
1,Australia,2008,Geothermal,22205.069382,43211.593798,636188600.0,1033255852,64353720000000.0,771781.636293,1752.536054,...,79.990942,1.761579,48.697825,590,39,9752,974948,95.003547,0,1
2,Canada,2013,Biomass,94102.732038,6139.117212,6158680000.0,14895124,156792600000000.0,342707.152899,65146.592498,...,94.37404,77.657334,67.334813,101,7,311,326318,67.354539,1,0
3,Japan,2010,Biomass,88771.93291,40323.63926,8526116000.0,1448827283,135046400000000.0,498839.574253,50257.591014,...,65.143472,44.987734,4.544289,327,3,7708,506078,33.754104,0,0
4,China,2018,Solar,93288.408581,30755.403056,5086237000.0,1499494307,86779560000000.0,819064.362785,63101.395562,...,86.213626,62.535223,10.287199,985,69,4919,187964,59.509228,1,0


In [77]:
# df = df.drop_duplicates(inplace=True)
# print(df.isnull().sum())
# data.fillna(method='ffill', inplace=True)  # Forward fill for missing values

df = df.drop_duplicates()

<h4>DataFrame Information</h4>

In [78]:
# DataFrame Info
# 2500 rows & 56 columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 56 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   Country                                  2500 non-null   object 
 1   Year                                     2500 non-null   int64  
 2   Energy Type                              2500 non-null   object 
 3   Production (GWh)                         2500 non-null   float64
 4   Installed Capacity (MW)                  2500 non-null   float64
 5   Investments (USD)                        2500 non-null   float64
 6   Population                               2500 non-null   int64  
 7   GDP                                      2500 non-null   float64
 8   Energy Consumption                       2500 non-null   float64
 9   Energy Exports                           2500 non-null   float64
 10  Energy Imports                           2500 no

<h4>Data Cleaning: Drop duplicate rows</h4>

In [79]:
# data.drop_duplicates(inplace=True)
# print(data.isnull().sum())
# data.fillna(method='ffill', inplace=True)  # Forward fill for missing values
df = df.drop_duplicates()

<h4>List of countries and energy types included in the dataset</h4>

In [80]:
energyTypes = df['Energy Type'].unique().tolist()
energyTypes

['Solar', 'Geothermal', 'Biomass', 'Wind', 'Hydro']

In [81]:
countriesIncluded = df['Country'].unique().tolist()
countriesIncluded

['USA',
 'Australia',
 'Canada',
 'Japan',
 'China',
 'India',
 'Germany',
 'France',
 'Brazil',
 'Russia']

<h4>Socioeconomic factors with highest mean scores</h4>

In [82]:
# collect all numeric columns
numeric_cols = df.select_dtypes(include='number')

# remove Year column
numeric_cols_without_year = numeric_cols.drop(['Year'], axis=1)

# Calculate the mean of all numeric columns
mean_values = numeric_cols_without_year.mean()

# Sort the means in descending order
sorted_means = mean_values.sort_values(ascending=False).head(22)

# Display the sorted means
print("Mean of numeric columns ranked in descending order:")
print(sorted_means)

Mean of numeric columns ranked in descending order:
GDP                                        1.010745e+14
R&D Expenditure                            5.004689e+09
Investments (USD)                          4.956155e+09
Population                                 7.458927e+08
International Aid for Renewables           5.089057e+08
Energy Subsidies                           5.003271e+08
Renewable Energy Jobs                      5.028961e+05
Energy Sector Workforce                    5.020409e+05
Energy Consumption                         4.957012e+05
CO2 Emissions                              4.912184e+05
Energy Imports                             5.080037e+04
Energy Exports                             5.014339e+04
Production (GWh)                           4.874320e+04
Installed Capacity (MW)                    2.488291e+04
Number of Renewable Energy Publications    4.978934e+03
Annual Rainfall                            1.581673e+03
Energy Storage Capacity                    5.150320e

<h4>Normalize Dataset</h4>

In [83]:
# Normalize the numeric columns
scaler = StandardScaler()
df_normalized = df.copy()
df_normalized.iloc[:, 3:] = scaler.fit_transform(df_normalized.iloc[:, 3:])


 -0.70700446]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df_normalized.iloc[:, 3:] = scaler.fit_transform(df_normalized.iloc[:, 3:])
  0.38838566]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df_normalized.iloc[:, 3:] = scaler.fit_transform(df_normalized.iloc[:, 3:])
  0.98255219]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df_normalized.iloc[:, 3:] = scaler.fit_transform(df_normalized.iloc[:, 3:])
 -0.98965352]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df_normalized.iloc[:, 3:] = scaler.fit_transform(df_normalized.iloc[:, 3:])
  0.98728088]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df_normalized.iloc[:, 3:] = scaler.fit_transform(df_normalized.iloc[:, 3:])
 -0.97316006]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first

<h4>Descriptive Statistics of Renewable Energy Jobs Column</h4>

In [68]:
# Group by 'Country' and describe the 'Renewable Energy Jobs'
descriptionByCountry = df.groupby('Country')['Renewable Energy Jobs'].describe(percentiles=[.25, .5, .75])

# Sort by mean in descending order
sortedByCountry = descriptionByCountry.sort_values(by='mean', ascending=False)

sortedByCountry 

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Brazil,244.0,523577.762295,288246.126218,5188.0,302472.75,511802.5,782022.75,998387.0
Australia,245.0,518076.065306,292766.553442,1641.0,255363.0,518296.0,771078.0,999456.0
Canada,234.0,515523.662393,271889.403224,3191.0,277420.5,529146.0,745679.25,995378.0
Japan,249.0,507847.586345,290822.67811,7470.0,264050.0,481232.0,774359.0,993024.0
China,262.0,501456.041985,291023.710296,4674.0,229083.75,493253.0,742584.5,999319.0
Germany,241.0,499940.705394,286647.450263,6389.0,268866.0,469219.0,779664.0,991498.0
USA,235.0,498057.714894,292372.329844,1694.0,238173.0,501933.0,742513.5,998044.0
Russia,245.0,495688.261224,297489.368648,3463.0,246940.0,476523.0,763479.0,999562.0
France,296.0,486579.246622,283601.443729,1320.0,244822.5,484186.5,722560.25,997429.0
India,249.0,486305.927711,274271.325621,2718.0,247235.0,455352.0,719163.0,991752.0


In [69]:
# Group by 'Year' and describe the 'Renewable Energy Jobs'
descriptionByYear = df.groupby('Year')['Renewable Energy Jobs'].describe(percentiles=[.25, .5, .75])

# Sort by mean in descending order
sortedByYear = descriptionByYear.sort_values(by='mean', ascending=False)

sortedByYear

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2021,101.0,559596.930693,268512.045285,12733.0,353560.0,568404.0,777569.0,991252.0
2006,119.0,541980.588235,256492.937724,33161.0,364632.0,535864.0,734299.5,982282.0
2002,113.0,530687.318584,301400.561458,1320.0,257407.0,539338.0,795601.0,998387.0
2023,110.0,522263.418182,272137.897508,1641.0,295195.5,499783.0,772739.0,980158.0
2010,130.0,519524.407692,307078.30886,9873.0,277765.5,492373.0,808156.75,993072.0
2016,124.0,519341.685484,293028.504939,7470.0,291643.5,503323.0,770658.0,986338.0
2014,121.0,517078.942149,293888.554644,2718.0,241186.0,510179.0,778597.0,993024.0
2015,88.0,516313.636364,284993.490208,29336.0,280665.25,537173.0,734862.0,999319.0
2001,109.0,515246.614679,292975.76179,8052.0,256042.0,528148.0,785839.0,994319.0
2020,97.0,511842.412371,312604.687595,6378.0,212980.0,550007.0,810742.0,965246.0
