In [107]:
%pip install pandas
%pip install numpy

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [155]:
import pandas as pd
import numpy as np

In [156]:
# read all data
ph = pd.read_csv('data/share-of-deaths-homicides.csv')
emis = pd.read_csv('data/GCB2022v27_MtCO2_flat.csv')
gdp = pd.read_csv('data/gdp.csv')
pop = pd.read_csv('data/population.csv')


In [157]:
# Standardize the column names
ph.rename(columns={'Entity' : 'Country'}, inplace=True)
ph.rename(columns={'Deaths - Interpersonal violence - Sex: Both - Age: All Ages (Percent)' : 'Homicide Rate'}, inplace=True)
gdp.rename(columns={'Country Name' : 'Country'}, inplace=True)
pop.rename(columns={'Country Name' : 'Country'}, inplace=True) 
emis.rename(columns={'Total' : 'Total Emissions'}, inplace=True)
emis.rename(columns={'Per Capita' : 'Emissions Per Capita'}, inplace=True)

In [158]:
# drop unwanted columns
emis = emis.drop(columns = ['Other'])
emis = emis.drop(columns=['ISO 3166-1 alpha-3'])
ph = ph.drop(columns=['Code'])
gdp = gdp.drop(columns=['Country Code', 'Indicator Name', 'Indicator Code'])

In [159]:
#debucketizing gdp
melted_gdp = gdp.melt(id_vars=['Country'], var_name="Year", value_name='GDP per capita')
#melted_gdp = melted_gdp.dropna()
melted_gdp = melted_gdp.sort_values(["Country","Year"])
melted_gdp = melted_gdp.reset_index(drop=True)
melted_gdp['Year'] = melted_gdp['Year'].astype(int)
melted_gdp['GDP per capita'] = melted_gdp['GDP per capita'].replace(np.nan,0)

#debucketizing population
melted_pop = pop.melt(id_vars=['Country'], var_name="Year", value_name='Population')
melted_pop = melted_pop.sort_values(["Country","Year"])
melted_pop = melted_pop.reset_index(drop=True)
melted_pop['Year'] = melted_pop['Year'].astype(int)
melted_pop['Population'] = melted_pop['Population'].replace(np.nan,0)



In [160]:
# drop rows with year < 1990 and year > 2019
emis = emis[emis['Year'] >= 1990]
emis = emis[emis['Year'] <= 2019]
#make sure debucketized gdp works first
gdp = melted_gdp[melted_gdp['Year'] >= 1990]
gdp = melted_gdp[melted_gdp['Year'] <= 2019]

pop = melted_pop[melted_pop['Year'] >= 1990]
pop = melted_pop[melted_pop['Year'] <= 2019]

In [161]:
# remove NaN values
emis.isnull().sum()

c = emis.groupby('Country')

c.apply(lambda x: x.isnull().sum())


Unnamed: 0_level_0,Country,Year,Total Emissions,Coal,Oil,Gas,Cement,Flaring,Emissions Per Capita
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Afghanistan,0,0,0,0,0,0,0,0,0
Albania,0,0,0,0,0,0,0,0,0
Algeria,0,0,0,0,0,0,0,0,0
Andorra,0,0,0,0,0,0,0,0,0
Angola,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
Viet Nam,0,0,0,0,0,0,0,0,0
Wallis and Futuna Islands,0,0,0,0,0,0,11,0,0
Yemen,0,0,0,0,0,0,0,0,0
Zambia,0,0,0,0,0,0,0,0,0


In [162]:
# set year column as index
emis.set_index('Year', inplace=True)

# fill missing values with previous and forward 5-year averages
emis.fillna(emis.rolling(window=11, min_periods=1).mean().shift(1).fillna(method='bfill'), inplace=True)

# reset index
emis.reset_index(inplace=True)

emis.isnull().sum()

  emis.fillna(emis.rolling(window=11, min_periods=1).mean().shift(1).fillna(method='bfill'), inplace=True)


Year                    0
Country                 0
Total Emissions         0
Coal                    0
Oil                     0
Gas                     0
Cement                  0
Flaring                 0
Emissions Per Capita    0
dtype: int64

In [163]:
# merging tables
merged = pd.merge(ph, emis, how='inner', on=['Country', 'Year'])
merged = pd.merge(merged, gdp, how="left", on=['Country','Year'])
merged = pd.merge(merged, pop, how='left', on=['Country','Year'])


In [164]:
# checking for duplicates
duplicates = merged.duplicated(subset=['Country', 'Year'], keep=False)

# print resulting dataframe
# if there are no duplicates, the dataframe will be empty
print(merged[duplicates])

Empty DataFrame
Columns: [Country, Year, Homicide Rate, Total Emissions, Coal, Oil, Gas, Cement, Flaring, Emissions Per Capita, GDP per capita, Population]
Index: []


In [165]:
merged.describe()

Unnamed: 0,Year,Homicide Rate,Total Emissions,Coal,Oil,Gas,Cement,Flaring,Emissions Per Capita,GDP per capita,Population
count,5610.0,5610.0,5610.0,5610.0,5610.0,5610.0,5610.0,5610.0,5610.0,4950.0,4950.0
mean,2004.5,1.095742,120.147593,50.909146,39.571093,22.19368,5.057668,1.462085,4.545368,10250.963517,33286410.0
std,8.656213,1.67867,543.209645,381.784589,105.091306,68.811459,37.316321,4.788171,6.223573,16763.887997,136238600.0
min,1990.0,0.04,0.0,0.0,0.003664,0.0,0.0,0.0,0.0,0.0,9182.0
25%,1997.0,0.26,1.233389,0.0,0.974387,0.0,9.7e-05,0.0,0.565035,801.488133,1990322.0
50%,2004.5,0.56,9.157214,0.124576,4.934715,0.230832,0.400382,0.0,2.37627,3040.517926,6868105.0
75%,2012.0,1.14,58.731783,8.010227,28.476452,12.57781,1.877339,0.347518,6.495875,11253.254178,19271740.0
max,2019.0,17.74,10740.996069,7543.157408,1559.72816,825.415088,826.876048,62.654678,66.817851,123678.7021,1407745000.0


In [166]:
#Generating measures
merged['Homicide per capita'] = (merged['Homicide Rate'] / merged['Population']) * 100000
merged['Homicide per emissions'] = merged['Homicide Rate'] / merged['Total Emissions']

In [167]:
merged[:25]

Unnamed: 0,Country,Year,Homicide Rate,Total Emissions,Coal,Oil,Gas,Cement,Flaring,Emissions Per Capita,GDP per capita,Population,Homicide per capita,Homicide per emissions
0,Afghanistan,1990,0.84,2.024326,0.278464,1.271408,0.40304,0.045766,0.025648,0.189281,0.0,10694796.0,0.007854,0.414953
1,Afghanistan,1991,1.04,1.914301,0.249627,1.204085,0.389125,0.045766,0.025697,0.178155,0.0,10745167.0,0.009679,0.543279
2,Afghanistan,1992,1.1,1.482054,0.021984,1.029584,0.362736,0.045766,0.021984,0.122916,0.0,12057433.0,0.009123,0.742213
3,Afghanistan,1993,1.15,1.486943,0.01832,1.047904,0.351744,0.046991,0.021984,0.106182,0.0,14003760.0,0.008212,0.773399
4,Afghanistan,1994,1.18,1.453829,0.014693,1.032171,0.337935,0.046991,0.022039,0.094065,0.0,15455555.0,0.007635,0.81165
5,Afghanistan,1995,1.2,1.417327,0.014656,1.011264,0.322432,0.046991,0.021984,0.086323,0.0,16418912.0,0.007309,0.846664
6,Afghanistan,1996,1.33,1.370104,0.007328,0.985616,0.307776,0.0474,0.021984,0.080092,0.0,17106595.0,0.007775,0.970729
7,Afghanistan,1997,1.17,1.304152,0.003664,0.948976,0.282128,0.0474,0.021984,0.073313,0.0,17788819.0,0.006577,0.897135
8,Afghanistan,1998,1.13,1.278504,0.003664,0.941648,0.263808,0.0474,0.021984,0.069134,0.0,18493132.0,0.00611,0.883845
9,Afghanistan,1999,1.13,1.09164,0.003664,0.776768,0.241824,0.0474,0.021984,0.056671,0.0,19262847.0,0.005866,1.03514


In [168]:
# replacing any missing values / infinities with 0
merged = merged.replace(np.nan, 0)
merged = merged.fillna(0)
merged['Homicide per emissions'].replace([np.inf, -np.inf], 0, inplace=True)

In [169]:
merged.sum()

Country                   AfghanistanAfghanistanAfghanistanAfghanistanAf...
Year                                                               11245245
Homicide Rate                                                       6147.11
Total Emissions                                                674027.99492
Coal                                                          285600.306574
Oil                                                           221993.830495
Gas                                                           124506.542015
Cement                                                           28373.5179
Flaring                                                         8202.295118
Emissions Per Capita                                           25499.513081
GDP per capita                                              50742269.408442
Population                                                   164767741468.0
Homicide per capita                                             1310.349168
Homicide per

In [170]:
merged.dtypes

Country                    object
Year                        int64
Homicide Rate             float64
Total Emissions           float64
Coal                      float64
Oil                       float64
Gas                       float64
Cement                    float64
Flaring                   float64
Emissions Per Capita      float64
GDP per capita            float64
Population                float64
Homicide per capita       float64
Homicide per emissions    float64
dtype: object

In [171]:
merged.to_csv("~/Desktop/datascience/phase2/CSI4142-Project/out.csv", index=False)