In [1]:
import pandas as pd
import numpy as np
import warnings
#silence UserWarning messages attributed to the openpyxl module for the rest of the session
warnings.filterwarnings('ignore', category=UserWarning, module='openpyxl')

In [2]:
#load the EU country list (presumably the 27 member states, going by the file name)
eu_country_list_path = './additional_data/EU27_COUNTRY_LIST.csv'
eu_df = pd.read_csv(eu_country_list_path)

In [3]:
#load the cleaned datasets (each carries its own index values); the order of the paths below
#must match the order of the names they are unpacked into
cleaned_dataset_paths = [
    './datasets_cleaned/masterviolence_pht_df.csv',
    './datasets_cleaned/master_care_df.csv',
    './datasets_cleaned/Economic sector representation 2013-2022.csv',
    './datasets_cleaned/Employment by sex and age.csv',
    './datasets_cleaned/Gender Pay Gap 2009-2020.csv',
    './datasets_cleaned/Members of national parliaments.csv',
    './datasets_cleaned/Pension gap 2012-2021.csv',
]
violence, care, eco_sector, employment, pay, decision, pension = (
    pd.read_csv(dataset_path) for dataset_path in cleaned_dataset_paths
)

In [4]:
#the overall index can only be calculated for years present in every dataset
#print the distinct 'Year' values of each dataset to find the overlap (2013 - 2020)
for dataset in (violence, care, eco_sector, employment, pay, decision, pension):
    print(dataset['Year'].unique())

[2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020]
[2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013
 2014 2015 2016 2017 2018 2019 2020 2021]
[2013 2014 2015 2016 2017 2018 2019 2020 2021 2022]
[2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021]
[2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020]
[2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020
 2021 2022]
[2012 2013 2014 2015 2016 2017 2018 2019 2020 2021]


In [5]:
#restrict every dataset to the chosen timeframe - 2013 to 2020 inclusive
def _in_timeframe(frame):
    """Return the rows of `frame` whose 'Year' is at least 2013 and below 2021."""
    year = frame['Year']
    return frame[(year >= 2013) & (year < 2021)]

violence_date_filter = _in_timeframe(violence)
care_date_filter = _in_timeframe(care)
eco_sector_date_filter = _in_timeframe(eco_sector)
employment_date_filter = _in_timeframe(employment)
pay_date_filter = _in_timeframe(pay)
decision_date_filter = _in_timeframe(decision)
pension_date_filter = _in_timeframe(pension)

#employment: keep only the 'Y15-64' age group, then remove the aggregate 'European Union' rows
employment_date_filter = employment_date_filter[employment_date_filter['Age'] == 'Y15-64']
employment_date_filter = employment_date_filter[employment_date_filter['Country'] != 'European Union']

#decision: remove the aggregate 'European Union' rows too
decision_date_filter = decision_date_filter[decision_date_filter['Country'] != 'European Union']

In [6]:
#prepare every filtered frame for the positional merge: optionally drop the extra leading index column,
#sort by country then year, and renumber the rows from 0
#NOTE: the leading column is dropped by position, so re-running this cell removes a further column
#from the four frames flagged with drop_first_column=True
def _prepare_for_merge(frame, drop_first_column=False):
    """Return `frame` sorted by 'Country' and 'Year' with a fresh 0-based index.

    When `drop_first_column` is True the first column of `frame` is removed before sorting.
    """
    if drop_first_column:
        frame = frame.drop(frame.columns[0], axis=1)
    ordered = frame.sort_values(by=['Country', 'Year'], ascending=True)
    return ordered.reset_index(drop=True)

violence_date_filter = _prepare_for_merge(violence_date_filter, drop_first_column=True)
care_date_filter = _prepare_for_merge(care_date_filter, drop_first_column=True)
eco_sector_date_filter = _prepare_for_merge(eco_sector_date_filter)
employment_date_filter = _prepare_for_merge(employment_date_filter, drop_first_column=True)
pay_date_filter = _prepare_for_merge(pay_date_filter)
decision_date_filter = _prepare_for_merge(decision_date_filter, drop_first_column=True)
pension_date_filter = _prepare_for_merge(pension_date_filter)



In [7]:
#ensure number of rows is equal in each df - filtered correctly
#the merge that follows matches rows by index position with an inner join, so a frame with a different
#number of rows would silently drop or misalign rows; fail loudly here instead of only printing
filtered_frames = {
    'violence': violence_date_filter,
    'care': care_date_filter,
    'eco_sector': eco_sector_date_filter,
    'employment': employment_date_filter,
    'pay': pay_date_filter,
    'decision': decision_date_filter,
    'pension': pension_date_filter,
}
for filtered_frame in filtered_frames.values():
    print(filtered_frame.shape)

row_counts = {name: len(filtered_frame) for name, filtered_frame in filtered_frames.items()}
assert len(set(row_counts.values())) == 1, f'filtered frames differ in row count: {row_counts}'

(216, 7)
(216, 4)
(216, 5)
(216, 6)
(216, 4)
(216, 4)
(216, 4)


In [8]:
#give the generic 'Index' column of these three frames a dataset-specific name ahead of the merge
eco_sector_date_filter = eco_sector_date_filter.rename(columns={'Index': 'IndexValueEcoSector'})
pay_date_filter = pay_date_filter.rename(columns={'Index': 'IndexValuePay'})
pension_date_filter = pension_date_filter.rename(columns={'Index': 'IndexValuePension'})

In [9]:
#build the master table: start from the violence frame and attach the index column of every other dataset
#NOTE: rows are matched purely on the row index, not on Country/Year, so this relies on every frame
#having been sorted by Country and Year and re-indexed beforehand
index_value_series = [
    care_date_filter['IndexValueCare'],
    eco_sector_date_filter['IndexValueEcoSector'],
    employment_date_filter['IndexValueEmployment'],
    pay_date_filter['IndexValuePay'],
    decision_date_filter['IndexValueDecisionMakers'],
    pension_date_filter['IndexValuePension'],
]
master_index_df = violence_date_filter
for index_values in index_value_series:
    master_index_df = master_index_df.merge(index_values, left_index=True, right_index=True)

In [10]:
#keep only the identifying columns and the seven index columns
#selecting by name (rather than dropping by position) keeps this cell idempotent - re-running it
#cannot strip further columns from the master frame
index_value_columns = [
    'IndexValueViolence',
    'IndexValueCare',
    'IndexValueEcoSector',
    'IndexValueEmployment',
    'IndexValuePay',
    'IndexValueDecisionMakers',
    'IndexValuePension',
]
master_index_df = master_index_df[['Country', 'Year'] + index_value_columns]
master_index_df

Unnamed: 0,Country,Year,IndexValueViolence,IndexValueCare,IndexValueEcoSector,IndexValueEmployment,IndexValuePay,IndexValueDecisionMakers,IndexValuePension
0,Austria,2013,0.885280,0.734,0.700564,0.91,0.769500,0.58,0.591
1,Austria,2014,0.889364,0.751,0.698389,0.92,0.795316,0.64,0.576
2,Austria,2015,0.891987,0.751,0.704653,0.92,0.799158,0.60,0.586
3,Austria,2016,0.804257,0.749,0.702406,0.93,0.807842,0.62,0.595
4,Austria,2017,0.765287,0.795,0.692097,0.92,0.812579,0.62,0.595
...,...,...,...,...,...,...,...,...,...
211,Sweden,2016,0.145063,0.000,0.733925,0.97,0.916619,0.90,0.734
212,Sweden,2017,0.097412,0.000,0.736059,0.97,0.916571,0.92,0.753
213,Sweden,2018,0.082916,0.000,0.725338,0.97,0.928524,0.90,0.732
214,Sweden,2019,0.070909,0.000,0.726577,0.97,0.933524,0.92,0.731


In [11]:
#generating an overall index - the geometric mean of the non-zero individual index values in each row
#inspired by https://ourworldindata.org/human-development-index#:~:text=The%20HDI%20is%20calculated%20as,and%20expected%20years%20of%20schooling).
#NOTE: the result stays on the same scale as the inputs - no 0-100 scaling is applied in this cell
#zeros are swapped for NaN so they are skipped by both the product and the count, then restored afterwards
#(the final replace also turns any other NaN in the frame into 0)
master_index_df = master_index_df.replace(0, np.nan)

index_values = master_index_df.iloc[:, 2:9]          #the seven index columns, selected by position
row_product = index_values.prod(axis=1)              #product of the non-NaN values in each row
valid_count = index_values.notna().sum(axis=1)       #number of values that entered the product
master_index_df['IndexTotal'] = np.exp(np.log(row_product) / valid_count)

master_index_df = master_index_df.replace(np.nan, 0)

master_index_df

Unnamed: 0,Country,Year,IndexValueViolence,IndexValueCare,IndexValueEcoSector,IndexValueEmployment,IndexValuePay,IndexValueDecisionMakers,IndexValuePension,IndexTotal
0,Austria,2013,0.885280,0.734,0.700564,0.91,0.769500,0.58,0.591,0.728856
1,Austria,2014,0.889364,0.751,0.698389,0.92,0.795316,0.64,0.576,0.743691
2,Austria,2015,0.891987,0.751,0.704653,0.92,0.799158,0.60,0.586,0.740443
3,Austria,2016,0.804257,0.749,0.702406,0.93,0.807842,0.62,0.595,0.736251
4,Austria,2017,0.765287,0.795,0.692097,0.92,0.812579,0.62,0.595,0.735220
...,...,...,...,...,...,...,...,...,...,...
211,Sweden,2016,0.145063,0.000,0.733925,0.97,0.916619,0.90,0.734,0.630016
212,Sweden,2017,0.097412,0.000,0.736059,0.97,0.916571,0.92,0.753,0.594531
213,Sweden,2018,0.082916,0.000,0.725338,0.97,0.928524,0.90,0.732,0.573786
214,Sweden,2019,0.070909,0.000,0.726577,0.97,0.933524,0.92,0.731,0.561605


In [12]:
#persist the combined index table; the row index is written as an extra leading column (pandas default)
master_index_df.to_csv(path_or_buf='./datasets_cleaned/master_index_df.csv', index=True)