In [1]:
#read in packages
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore', category=UserWarning, module='openpyxl')

In [2]:
#read in list of EU countries
#later cells left-merge each care sheet onto this frame by its 'Country' column and
#then drop its 'Initial' column, so both columns are expected to exist in the csv
eu_df = pd.read_csv('./additional_data/EU27_COUNTRY_LIST.csv')

In [3]:
#load the raw excel export: 'Sheet 3' is read as the female series, 'Sheet 2' as the male series
#no header handling is done here - the year labels sit inside the body of each sheet and are
#promoted to column names by the cleaning cells
raw_care_workbook = './datasets_raw/eurostat_inactive_population_caring.xlsx'
female_care_df = pd.read_excel(raw_care_workbook, sheet_name='Sheet 3')
male_care_df = pd.read_excel(raw_care_workbook, sheet_name='Sheet 2')
#preview the first rows of both raw sheets (female first, then male)
for raw_sheet in (female_care_df, male_care_df):
    print(raw_sheet.head())

  Data extracted on 26/12/2022 20:32:46 from [ESTAT]  \
0                                          Dataset:    
1                                     Last updated:    
2                                                NaN   
3                                     Time frequency   
4                                             Reason   

                                          Unnamed: 1  \
0  Inactive population due to caring responsibili...   
1                                   15/12/2022 23:00   
2                                                NaN   
3                                                NaN   
4                                                NaN   

                                          Unnamed: 2 Unnamed: 3 Unnamed: 4  \
0                                                NaN        NaN        NaN   
1                                                NaN        NaN        NaN   
2                                                NaN        NaN        NaN   
3             

In [4]:
#tidy the raw female sheet into one row per (Country, Year)
#NOTE: female_care_df is rebound to the cleaned frame in this cell, so the loading cell has to be
#re-run before this cell can be run a second time

#skip the first 9 rows of the export; the row that follows holds the column labels
female_body = female_care_df.iloc[9:]
#use that row as the column names, then skip it together with the 4 rows below it
female_labelled = female_body.rename(columns=female_body.iloc[0]).iloc[5:]
female_care_df = (
    female_labelled
    #keep only the columns that received a real label (empty columns end up titled NaN)
    .loc[:, female_labelled.columns.notna()]
    #first column becomes 'Country' so it matches the tidy format used across dfs
    .rename(columns={"TIME": "Country"})
    #standardise the two country names that are spelled differently across sheets
    .replace({'Country' : { "Czechia" : "Czech Republic", "Germany (until 1990 former territory of the FRG)" : "Germany"}})
)
merged_female_care_df = (
    eu_df
    #left merge keeps exactly the countries listed in the EU sheet
    .merge(female_care_df, on='Country', how='left')
    #the 'Initial' column of the EU sheet is not needed here
    .drop(columns='Initial')
    #wide -> long: one row per country/year, with the observation stored in 'Value'
    .set_index('Country')
    .rename_axis('Year', axis='columns')
    .stack()
    .rename('Value')
    .reset_index()
)
merged_female_care_df

Unnamed: 0,Country,Year,Value
0,Belgium,2000,:
1,Belgium,2001,20.6
2,Belgium,2002,25.5
3,Belgium,2003,29.8
4,Belgium,2004,22
...,...,...,...
589,Sweden,2017,9.8
590,Sweden,2018,6.9
591,Sweden,2019,10
592,Sweden,2020,7.5


In [5]:
#tidy the raw male sheet into one row per (Country, Year) - same steps as for the female sheet
#NOTE: male_care_df is rebound to the cleaned frame in this cell, so the loading cell has to be
#re-run before this cell can be run a second time

#skip the first 9 rows of the export; the row that follows holds the column labels
male_body = male_care_df.iloc[9:]
#use that row as the column names, then skip it together with the 4 rows below it
male_labelled = male_body.rename(columns=male_body.iloc[0]).iloc[5:]
male_care_df = (
    male_labelled
    #keep only the columns that received a real label (empty columns end up titled NaN)
    .loc[:, male_labelled.columns.notna()]
    #first column becomes 'Country' so it matches the tidy format used across dfs
    .rename(columns={"TIME": "Country"})
    #standardise the two country names that are spelled differently across sheets
    .replace({'Country' : { "Czechia" : "Czech Republic", "Germany (until 1990 former territory of the FRG)" : "Germany"}})
)
merged_male_care_df = (
    eu_df
    #left merge keeps exactly the countries listed in the EU sheet
    .merge(male_care_df, on='Country', how='left')
    #the 'Initial' column of the EU sheet is not needed here
    .drop(columns='Initial')
    #wide -> long: one row per country/year, with the observation stored in 'Value'
    .set_index('Country')
    .rename_axis('Year', axis='columns')
    .stack()
    .rename('Value')
    .reset_index()
)
merged_male_care_df

Unnamed: 0,Country,Year,Value
0,Belgium,2000,:
1,Belgium,2001,:
2,Belgium,2002,:
3,Belgium,2003,:
4,Belgium,2004,:
...,...,...,...
589,Sweden,2017,:
590,Sweden,2018,:
591,Sweden,2019,:
592,Sweden,2020,:


In [6]:
#sanity-check the tidy layout of both frames: print the first rows and 5 randomly
#sampled rows of each (female first, then male) to confirm the formatting is consistent
for tidy_frame in (merged_female_care_df, merged_male_care_df):
    print(tidy_frame.head())
    print(tidy_frame.sample(5))

   Country  Year Value
0  Belgium  2000     :
1  Belgium  2001  20.6
2  Belgium  2002  25.5
3  Belgium  2003  29.8
4  Belgium  2004    22
      Country  Year Value
150   Ireland  2018  52.5
7     Belgium  2007  20.6
375     Malta  2001  84.6
593    Sweden  2021  14.9
507  Slovenia  2001  24.1
   Country  Year Value
0  Belgium  2000     :
1  Belgium  2001     :
2  Belgium  2002     :
3  Belgium  2003     :
4  Belgium  2004     :
        Country  Year Value
506    Slovenia  2000   9.9
339  Luxembourg  2009     :
572      Sweden  2000     :
316   Lithuania  2008     :
135     Ireland  2003     :


In [7]:
#count number of null values in each df. Since null values are stored as ':' sum across the values column
#there are many across the male category - this will impact how much data we can use
#info() prints its report itself and returns None, so it is called directly instead of being
#wrapped in print() (which would add a stray 'None' line to the output)

merged_female_care_df.info()
print((merged_female_care_df['Value']==':').sum())

merged_male_care_df.info()
print((merged_male_care_df['Value']==':').sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 594 entries, 0 to 593
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Country  594 non-null    object
 1   Year     594 non-null    object
 2   Value    594 non-null    object
dtypes: object(3)
memory usage: 14.0+ KB
None
33
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 594 entries, 0 to 593
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Country  594 non-null    object
 1   Year     594 non-null    object
 2   Value    594 non-null    object
dtypes: object(3)
memory usage: 14.0+ KB
None
245


In [8]:
#there are a number of null values in female, but many more in male.
#since this is time series data, we may be ok to include these data points, but we
#will change their value to 0, as this can then be filtered out in our visual representation.
#we will ensure that values containing a 0 will not be included in the overall 'gap'
#i.e. we will replace them with just a zero, and index values will be calculated
#without including 0's
#the conversion is limited to the 'Value' column and made numeric explicitly: a bare
#replace(':', 0) only yields a numeric column through implicit object downcasting, which
#pandas 2.2 deprecates. to_numeric also fails loudly if any other non-numeric text is left over.
missing_marker = ':'
merged_female_care_df = merged_female_care_df.assign(
    Value=pd.to_numeric(
        merged_female_care_df['Value'].map(lambda v: 0 if isinstance(v, str) and v == missing_marker else v)
    )
)
merged_male_care_df = merged_male_care_df.assign(
    Value=pd.to_numeric(
        merged_male_care_df['Value'].map(lambda v: 0 if isinstance(v, str) and v == missing_marker else v)
    )
)
merged_female_care_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 594 entries, 0 to 593
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Country  594 non-null    object 
 1   Year     594 non-null    object 
 2   Value    594 non-null    float64
dtypes: float64(1), object(2)
memory usage: 14.0+ KB


In [9]:
#for every country in the EU list, print how many rows it has in the female frame and
#then in the male frame, so the two counts can be compared line by line
countries = eu_df['Country'].tolist()
for c in countries:
    for care_frame in (merged_female_care_df, merged_male_care_df):
        n_rows = care_frame['Country'].eq(c).sum()
        print(f'Country: {c}: {n_rows}')

Country: European Union: 0
Country: European Union: 0
Country: Belgium: 22
Country: Belgium: 22
Country: Bulgaria: 22
Country: Bulgaria: 22
Country: Czech Republic: 22
Country: Czech Republic: 22
Country: Denmark: 22
Country: Denmark: 22
Country: Germany: 22
Country: Germany: 22
Country: Estonia: 22
Country: Estonia: 22
Country: Ireland: 22
Country: Ireland: 22
Country: Greece: 22
Country: Greece: 22
Country: Spain: 22
Country: Spain: 22
Country: France: 22
Country: France: 22
Country: Croatia: 22
Country: Croatia: 22
Country: Italy: 22
Country: Italy: 22
Country: Cyprus: 22
Country: Cyprus: 22
Country: Latvia: 22
Country: Latvia: 22
Country: Lithuania: 22
Country: Lithuania: 22
Country: Luxembourg: 22
Country: Luxembourg: 22
Country: Hungary: 22
Country: Hungary: 22
Country: Malta: 22
Country: Malta: 22
Country: Netherlands: 22
Country: Netherlands: 22
Country: Austria: 22
Country: Austria: 22
Country: Poland: 22
Country: Poland: 22
Country: Portugal: 22
Country: Portugal: 22
Country:

In [10]:
#convert the Year labels of both dataframes to integer calendar years and check that each
#year has the same number of rows - 27
#both frames go through the same path (cast to int, parse with an explicit '%Y' format):
#calling pd.to_datetime on integers without a format reads them as nanoseconds since the epoch,
#which would turn every year into 1970 - e.g. when this cell is run a second time
merged_female_care_df['Year'] = pd.to_datetime(merged_female_care_df['Year'].astype(int), format='%Y').dt.year
merged_male_care_df['Year'] = pd.to_datetime(merged_male_care_df['Year'].astype(int), format='%Y').dt.year

for i in range(2000,2025):
    print('Year: ' + str(i) + ': '+ str((merged_female_care_df['Year'] == i).sum()))
    print('Year: ' + str(i) + ': '+ str((merged_male_care_df['Year'] == i).sum()))
    

Year: 2000: 27
Year: 2000: 27
Year: 2001: 27
Year: 2001: 27
Year: 2002: 27
Year: 2002: 27
Year: 2003: 27
Year: 2003: 27
Year: 2004: 27
Year: 2004: 27
Year: 2005: 27
Year: 2005: 27
Year: 2006: 27
Year: 2006: 27
Year: 2007: 27
Year: 2007: 27
Year: 2008: 27
Year: 2008: 27
Year: 2009: 27
Year: 2009: 27
Year: 2010: 27
Year: 2010: 27
Year: 2011: 27
Year: 2011: 27
Year: 2012: 27
Year: 2012: 27
Year: 2013: 27
Year: 2013: 27
Year: 2014: 27
Year: 2014: 27
Year: 2015: 27
Year: 2015: 27
Year: 2016: 27
Year: 2016: 27
Year: 2017: 27
Year: 2017: 27
Year: 2018: 27
Year: 2018: 27
Year: 2019: 27
Year: 2019: 27
Year: 2020: 27
Year: 2020: 27
Year: 2021: 27
Year: 2021: 27
Year: 2022: 0
Year: 2022: 0
Year: 2023: 0
Year: 2023: 0
Year: 2024: 0
Year: 2024: 0


In [11]:
#summary statistics of each df (female first, then male) - clear that men do less carework than women
#comparability is good, see
#https://ec.europa.eu/eurostat/cache/metadata/en/sdg_05_40_esmsip2.htm#coverage_comparability1652978116353
#  'Data are comparable between all EU Member States respectively other presented countries.'
#  'Length of comparable time series without methodological break is less than 3 data points.'
#there is a break in the time series in 2021 due to the entry into force of a new social
#statistics framework; this may not be a problem as our likely focal point is until 2020
for care_frame in (merged_female_care_df, merged_male_care_df):
    print(care_frame.describe())

              Year       Value
count   594.000000  594.000000
mean   2010.500000   33.071886
std       6.349636   18.058252
min    2000.000000    0.000000
25%    2005.000000   20.325000
50%    2010.500000   33.300000
75%    2016.000000   43.975000
max    2021.000000   96.700000
              Year       Value
count   594.000000  594.000000
mean   2010.500000    5.425758
std       6.349636    6.751985
min    2000.000000    0.000000
25%    2005.000000    0.000000
50%    2010.500000    3.400000
75%    2016.000000    8.800000
max    2021.000000   41.400000


In [12]:
#the gap between female and male care will be stored in a new frame: start from an
#independent copy of the male frame without its 'Value' column, leaving the remaining
#columns as the rows the gap gets attached to
master_care_df = merged_male_care_df.drop(columns='Value').copy()

In [13]:
#calculating the gap (female value minus male value). to handle nan/zero values we are very
#conservative: if either the male or the female care value is missing or zero, the gap is
#set to zero; only when both are non-zero is the gap calculated.
#the female value is looked up by (Country, Year) instead of by row position, so the result
#stays correct even if the two frames ever differ in row order or row count.
#NOTE: zero doubles as the 'missing' marker, so a genuine gap of exactly 0 cannot be told
#apart from a missing one afterwards.
gap_keys = ['Country', 'Year']
female_lookup = merged_female_care_df[gap_keys + ['Value']].rename(columns={'Value': 'female_value'})
#validate raises if the female frame holds duplicate (Country, Year) rows, which would
#otherwise silently multiply rows in the merge
female_matched = master_care_df[gap_keys].merge(female_lookup, on=gap_keys, how='left', validate='many_to_one')
#a left merge keeps the row order of master_care_df, so the values are re-attached positionally
female_value = pd.Series(female_matched['female_value'].to_numpy(), index=master_care_df.index).astype(float)
#master_care_df was copied from the male frame, so the male values line up with it by index
male_value = merged_male_care_df['Value'].reindex(master_care_df.index).astype(float)

has_both_values = female_value.notna() & male_value.notna() & female_value.ne(0) & male_value.ne(0)
master_care_df['care_gap_%_active_population'] = (female_value - male_value).where(has_both_values, 0.0)

In [14]:
#display the frame to inspect the newly added care_gap_%_active_population column
master_care_df

Unnamed: 0,Country,Year,care_gap_%_active_population
0,Belgium,2000,0.0
1,Belgium,2001,0.0
2,Belgium,2002,0.0
3,Belgium,2003,0.0
4,Belgium,2004,0.0
...,...,...,...
589,Sweden,2017,0.0
590,Sweden,2018,0.0
591,Sweden,2019,0.0
592,Sweden,2020,0.0


In [15]:
#index calculation - our assumptions are the minimum and maximum values that make up the scale
#for calculating an index: a gap of worst_value maps to 0 and a gap of best_value maps to 1
worst_value = 100
best_value = 0

#computed for the whole column at once instead of row by row
#rows whose gap is exactly 0 are treated as missing and get an index of 0 rather than 1
#NOTE(review): a negative gap (male value above female value) scores above 1 on this scale -
#confirm that this is intended
care_gap = master_care_df['care_gap_%_active_population']
scaled_gap = (care_gap - worst_value) / (best_value - worst_value)
master_care_df['IndexValueCare'] = scaled_gap.where(care_gap != 0, 0)

In [16]:
#display the frame to inspect the newly added IndexValueCare column
master_care_df

Unnamed: 0,Country,Year,care_gap_%_active_population,IndexValueCare
0,Belgium,2000,0.0,0.000
1,Belgium,2001,0.0,0.000
2,Belgium,2002,0.0,0.000
3,Belgium,2003,0.0,0.000
4,Belgium,2004,0.0,0.000
...,...,...,...,...
589,Sweden,2017,0.0,0.000
590,Sweden,2018,0.0,0.000
591,Sweden,2019,0.0,0.000
592,Sweden,2020,0.0,0.000


In [17]:
#write the final frame to the cleaned datasets folder
#the row index is written as well (to_csv default), so the csv starts with an unnamed index column
master_care_df.to_csv('./datasets_cleaned/master_care_df.csv')