# Loading mortality and demographic data from Eurostat

In [1]:
import pandas as pd
import numpy as np

## Loading and setting up mortality data

In [2]:
# load raw data on mortality from Eurostat
mortality = pd.read_csv('demo_r_mwk2_ts_1_Data.csv', sep=',', encoding='latin-1')
# missing values to -1
mortality = mortality.replace(':','-1')
mortality = mortality.replace('inf','-2')
# remove comma from figures
mortality['Value'] = mortality['Value'].str.replace(',','')
# drop unnecessary columns
mortality = mortality.drop(['UNIT', 'Flag and Footnotes', 'SEX'], axis=1)
# extract country code week number and year
mortality['country'] = mortality['GEO'].astype(str).str[0:2]
mortality['week_nr'] = mortality['TIME'].astype(str).str[4:7]
mortality['year'] = mortality['TIME'].astype(str).str[0:4]
# rename column to deatgs
mortality = mortality.rename(columns={'Value': 'deaths'})
# converting deaths to integer
mortality['deaths'] = mortality['deaths'].astype(int)
# set missing values previously set to -1 to Nan
mortality['deaths'] = np.where(mortality['deaths'] < 0, np.NaN, mortality['deaths'])
mortality.head()

Unnamed: 0,TIME,GEO,GEO_LABEL,deaths,country,week_nr,year
0,2000W01,BE,Belgium,2814.0,BE,W01,2000
1,2000W01,BE1,Région de Bruxelles-Capitale/Brussels Hoofdste...,298.0,BE,W01,2000
2,2000W01,BE10,Région de Bruxelles-Capitale/Brussels Hoofdste...,298.0,BE,W01,2000
3,2000W01,BE2,Vlaams Gewest,1443.0,BE,W01,2000
4,2000W01,BE21,Prov. Antwerpen,439.0,BE,W01,2000


In [3]:
deaths = pd.DataFrame(mortality['deaths'])
deaths.head()

Unnamed: 0,deaths
0,2814.0
1,298.0
2,298.0
3,1443.0
4,439.0


In [4]:
deaths[~deaths.applymap(np.isreal).all(1)]

Unnamed: 0,deaths


In [5]:
mortality.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 484992 entries, 0 to 484991
Data columns (total 7 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   TIME       484992 non-null  object 
 1   GEO        484992 non-null  object 
 2   GEO_LABEL  484992 non-null  object 
 3   deaths     335330 non-null  float64
 4   country    484992 non-null  object 
 5   week_nr    484992 non-null  object 
 6   year       484992 non-null  object 
dtypes: float64(1), object(6)
memory usage: 25.9+ MB


In [6]:
mortality.shape

(484992, 7)

In [7]:
# load mortality on Nuts 3 for exceptionally large NUTS2 regions: https://ec.europa.eu/eurostat/databrowser/view/DEMO_R_MWK3_T__custom_2768883/default/table?lang=en
mortality_NUTS3 = pd.read_csv('demo_r_mweek3_1_Data.csv', sep=',', encoding='latin-1')
# missing values to -1
mortality_NUTS3 = mortality_NUTS3.replace(':','-1')
mortality_NUTS3 = mortality_NUTS3.replace('inf','-2')
# remove comma from figures
mortality_NUTS3['Value'] = mortality_NUTS3['Value'].str.replace(',','')
# drop unnecessary columns
mortality_NUTS3 = mortality_NUTS3.drop(['UNIT', 'SEX', 'AGE'], axis=1)
# extract country code week number and year
mortality_NUTS3['country'] = mortality_NUTS3['GEO'].astype(str).str[0:2]
mortality_NUTS3['week_nr'] = mortality_NUTS3['TIME'].astype(str).str[4:7]
mortality_NUTS3['year'] = mortality_NUTS3['TIME'].astype(str).str[0:4]
# rename column to deatgs
mortality_NUTS3 = mortality_NUTS3.rename(columns={'Value': 'deaths'})
# converting deaths to integer
mortality_NUTS3['deaths'] = mortality_NUTS3['deaths'].astype(float)
# set missing values previously set to -1 to Nan
mortality_NUTS3['deaths'] = np.where(mortality_NUTS3['deaths'] < 0, np.NaN, mortality_NUTS3['deaths'])
mortality_NUTS3.head()
mortality_NUTS3.head()

Unnamed: 0,TIME,GEO,deaths,country,week_nr,year
0,2000W01,FI1D1,46.0,FI,W01,2000
1,2000W01,FI1D2,66.0,FI,W01,2000
2,2000W01,FI1D3,45.0,FI,W01,2000
3,2000W01,FI1D5,14.0,FI,W01,2000
4,2000W01,FI1D7,44.0,FI,W01,2000


In [8]:
# concatenate overall mortality with NUTS3 mortality
frames = [mortality,mortality_NUTS3]
mortality = pd.concat(frames)

## loading and setting up population figure

### population density

In [9]:
# read density data from Eurostat
density = pd.read_csv('demo_r_d3dens_1_Data.csv', sep=',', encoding='latin-1')
# missing values to -1
density = density.replace(':','-1')
# remove comma from figures
density['Value'] = density['Value'].str.replace(',','')
# rename columns
density = density.rename(columns={'TIME': 'year', 'Value': 'density'})
# convert density to integer
density['year'] = density['year'].astype(str)
density['density'] = density['density'].astype(float)
# set missing values previously set to -1 to Nan
density['density'] = np.where(density['density'] <= 0, np.NaN, density['density'])
density = density.drop(['UNIT'],1)
density.head(3)

Unnamed: 0,year,GEO,density
0,2000,EU27_2020,104.2
1,2000,EU28,111.9
2,2000,EU27_2007,112.3


No data for 2020 and 2021. Hence, data for 2020 and 2021 will be added to the 'density' dataframe. This is done by subsetting the data for 2019 and setting the year to 2020 and 2021, and then concatenating the subset for 2020 with the original dataframe. That way data for the year 2021 is included in the data with the population figures of 2020.

In [10]:
# use values of 2019 for 2020 and 2021
dens_2020 = density[density['year'] == '2019']
dens_2020['year'] = '2020'
dens_2021 = density[density['year'] == '2019']
dens_2021['year'] = '2021'
dens_2020.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dens_2020['year'] = '2020'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dens_2021['year'] = '2021'


Unnamed: 0,year,GEO,density
38418,2020,EU27_2020,109.0
38419,2020,EU28,
38420,2020,EU27_2007,
38421,2020,BE,377.3
38422,2020,BE1,7526.7


In [11]:
# concatenate all frames
frames = [density, dens_2020,dens_2021]
density = pd.concat(frames)
density

Unnamed: 0,year,GEO,density
0,2000,EU27_2020,104.2
1,2000,EU28,111.9
2,2000,EU27_2007,112.3
3,2000,BE,338.0
4,2000,BE1,5974.3
...,...,...,...
40435,2021,TRC3,89.2
40436,2021,TRC31,95.9
40437,2021,TRC32,131.1
40438,2021,TRC33,75.5


In [12]:
# inspect code length as indicator of NUTS regions in the data
density['code_length'] = density.GEO.str.len()
density['code_length'].value_counts()

5    33528
4     7348
3     2750
2      814
9       44
Name: code_length, dtype: int64

In [13]:
# load raw population data
population = pd.read_csv('demo_r_d2jan_1_Data.csv', sep=',', encoding='latin-1')
# missing values to -1
population = population.replace(':','-1')
# remove comma from figures
population['Value'] = population['Value'].str.replace(',','')
# rename columns
population = population.rename(columns={'TIME': 'year', 'Value': 'population'})
# convert population to integer
population['year'] = population['year'].astype(str)
population['population'] = population['population'].astype(int)
# set missing values previously set to -1 to Nan
population['population'] = np.where(population['population'] <= 0, np.NaN, population['population'])
population.head(3)

Unnamed: 0,year,GEO,GEO_LABEL,SEX,AGE,UNIT,population
0,2000,EU27_2020,European Union - 27 countries (from 2020),Total,Total,Number,428473834.0
1,2000,EU28,European Union - 28 countries (2013-2020),Total,Total,Number,487259080.0
2,2000,EU27_2007,European Union - 27 countries (2007-2013),Total,Total,Number,482761345.0


In [14]:
# download for NUTS 3 regions (https://appsso.eurostat.ec.europa.eu/nui/show.do?dataset=demo_r_pjanaggr3&lang=en)
population_NUTS3 = pd.read_csv('demo_r_pjanaggr3_1_Data.csv', sep=',', encoding='latin-1')
# missing values to -1
population_NUTS3 = population_NUTS3.replace(':','-1')
# remove comma from figures
population_NUTS3['Value'] = population_NUTS3['Value'].str.replace(',','')
# rename columns
population_NUTS3 = population_NUTS3.rename(columns={'TIME': 'year', 'Value': 'population'})
# convert population to integer
population_NUTS3['year'] = population_NUTS3['year'].astype(str)
population_NUTS3['population'] = population_NUTS3['population'].astype(int)
# set missing values previously set to -1 to Nan
population_NUTS3['population'] = np.where(population_NUTS3['population'] <= 0, np.NaN, population_NUTS3['population'])
population_NUTS3.head(3)

Unnamed: 0,year,GEO,SEX,AGE,UNIT,population
0,2000,FI1D1,Total,Total,Number,167958.0
1,2000,FI1D2,Total,Total,Number,255456.0
2,2000,FI1D3,Total,Total,Number,172551.0


In [15]:
population.shape

(10647, 7)

In [16]:
# concatenate overall and NUTS3 population
frames = [population, population_NUTS3]
population = pd.concat(frames)
population.shape

(11043, 7)

In [18]:
# get value counts for year
population['year'].value_counts()

2000    525
2003    525
2005    525
2019    525
2002    525
2020    525
2009    525
2004    525
2016    525
2013    525
2010    525
2001    525
2014    525
2012    525
2006    525
2011    525
2007    525
2017    525
2018    525
2008    525
2015    525
2021     18
Name: year, dtype: int64

No data for  2021. Hence, data for 2021 will be added to the 'population' dataframe. This is done by subsetting the data for 2019 and setting the year to 2021, and then concatenating the subset for 2020 with the original dataframe. That way data for the year 2021 is included in the data with the population figures of 2020.

In [19]:
# use figures of 2020 for 2021
pop_2020 = population[population['year'] == '2020']
pop_2020['year'] = '2021'
pop_2020.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pop_2020['year'] = '2021'


Unnamed: 0,year,GEO,GEO_LABEL,SEX,AGE,UNIT,population
10140,2021,EU27_2020,European Union - 27 countries (from 2020),Total,Total,Number,447319916.0
10141,2021,EU28,European Union - 28 countries (2013-2020),Total,Total,Number,
10142,2021,EU27_2007,European Union - 27 countries (2007-2013),Total,Total,Number,
10143,2021,BE,Belgium,Total,Total,Number,11522440.0
10144,2021,BE1,Région de Bruxelles-Capitale/Brussels Hoofdste...,Total,Total,Number,1223364.0


In [22]:
# get shape
pop_2020.shape

(525, 7)

In [23]:
# concatenate all frames
frames = [population, pop_2020]
population = pd.concat(frames)
population['year'].value_counts()

2021    543
2000    525
2010    525
2005    525
2019    525
2002    525
2020    525
2009    525
2004    525
2016    525
2013    525
2001    525
2003    525
2014    525
2012    525
2006    525
2011    525
2007    525
2017    525
2018    525
2008    525
2015    525
Name: year, dtype: int64

In [24]:
# merge population and density figures
population = population.merge(density, on=['year', 'GEO'])
population

Unnamed: 0,year,GEO,GEO_LABEL,SEX,AGE,UNIT,population,density,code_length
0,2000,EU27_2020,European Union - 27 countries (from 2020),Total,Total,Number,428473834.0,104.2,9
1,2000,EU28,European Union - 28 countries (2013-2020),Total,Total,Number,487259080.0,111.9,4
2,2000,EU27_2007,European Union - 27 countries (2007-2013),Total,Total,Number,482761345.0,112.3,9
3,2000,BE,Belgium,Total,Total,Number,10239085.0,338.0,2
4,2000,BE1,Région de Bruxelles-Capitale/Brussels Hoofdste...,Total,Total,Number,959318.0,5974.3,3
...,...,...,...,...,...,...,...,...,...
11342,2021,TRB2,"Van, Mus, Bitlis, Hakkari",Total,Total,Number,2174672.0,53.7,4
11343,2021,TRC,Güneydogu Anadolu,Total,Total,Number,8975618.0,119.1,3
11344,2021,TRC1,"Gaziantep, Adiyaman, Kilis",Total,Total,Number,2838319.0,185.7,4
11345,2021,TRC2,"Sanliurfa, Diyarbakir",Total,Total,Number,3829967.0,111.9,4


In [25]:
# value count of code lebgth
population['code_length'] = population.GEO.str.len()
population['code_length'].value_counts()

4    7348
3    2750
2     814
5     391
9      44
Name: code_length, dtype: int64

The new dataframe consists data from 2020 and 2021.

In [26]:
# get shape
population.shape

(11347, 9)

In [27]:
# transform variables to numeric
population['population'] = pd.to_numeric(population['population'], errors='coerce')
population['year'] = pd.to_numeric(population['year'], errors='coerce')

In [28]:
# exclude non-EU countries, unknown regions and oversees regions
non_EU = ['UK', 'TR']
non_EU_GEO = ['MKX', 'FRY', 'FRX', 'HUX']
# get country code
population['country'] = population['GEO'].str[:2]
population = population[~population['country'].isin(non_EU)]
# remove French oversees departments and other redundant regions
population = population[~population['GEO'].isin(non_EU_GEO)]
population.head()

Unnamed: 0,year,GEO,GEO_LABEL,SEX,AGE,UNIT,population,density,code_length,country
0,2000,EU27_2020,European Union - 27 countries (from 2020),Total,Total,Number,428473834.0,104.2,9,EU
1,2000,EU28,European Union - 28 countries (2013-2020),Total,Total,Number,487259080.0,111.9,4,EU
2,2000,EU27_2007,European Union - 27 countries (2007-2013),Total,Total,Number,482761345.0,112.3,9,EU
3,2000,BE,Belgium,Total,Total,Number,10239085.0,338.0,2,BE
4,2000,BE1,Région de Bruxelles-Capitale/Brussels Hoofdste...,Total,Total,Number,959318.0,5974.3,3,BE


## Checking for size of NUTS1 regions in weather data

#weather_final = pd.read_csv('Heat_wave.csv')
#weather_final['NUTS1'].value_counts().head(10)

In [29]:
#weather_final = pd.read_csv('Heat_wave.csv')
#weather_final['NUTS2'].value_counts().head(10)

### For the Netherlands, mortality data is only available on the national level for a long time period. Therefore, the Netherlands will be kept in the data as a whole nation instead of NUTS 1 regions.

In [30]:
mortality_NUTS3['GEO'].unique()

array(['FI1D1', 'FI1D2', 'FI1D3', 'FI1D5', 'FI1D7', 'FI1D8', 'FI1D9',
       'SE311', 'SE312', 'SE313', 'SE321', 'SE322', 'SE331', 'SE332',
       'NO071', 'NO072', 'NO073', 'NO074'], dtype=object)

## Keeping only NUTS1 and NUTS 2 regions except for the Netherlands (NL), mainland Finland (FI1), Norway (NO0), Northern Sweden (SE3). For these regions NUTS3 regions will be used

In [31]:
regions = ['FI1D1', 'FI1D2', 'FI1D3', 'FI1D5', 'FI1D7', 'FI1D8', 'FI1D9','SE311', 'SE312', 'SE313', 'SE321', 'SE322',
           'SE331', 'SE332','NO071', 'NO072', 'NO073', 'NO074', 'NL']
regions

['FI1D1',
 'FI1D2',
 'FI1D3',
 'FI1D5',
 'FI1D7',
 'FI1D8',
 'FI1D9',
 'SE311',
 'SE312',
 'SE313',
 'SE321',
 'SE322',
 'SE331',
 'SE332',
 'NO071',
 'NO072',
 'NO073',
 'NO074',
 'NL']

In [34]:
# filter NUTS 1, NUTS 2 mortality OR in one of the specified special regions
population['code_length'] = population.GEO.str.len()
# keep only NUTS1 regions
population = population[(population['code_length'] == 4) | (population['code_length'] == 3) | (population['GEO'].isin(regions))]
population.head()

Unnamed: 0,year,GEO,GEO_LABEL,SEX,AGE,UNIT,population,density,code_length,country
1,2000,EU28,European Union - 28 countries (2013-2020),Total,Total,Number,487259080.0,111.9,4,EU
4,2000,BE1,Région de Bruxelles-Capitale/Brussels Hoofdste...,Total,Total,Number,959318.0,5974.3,3,BE
5,2000,BE10,Région de Bruxelles-Capitale/Brussels Hoofdste...,Total,Total,Number,959318.0,5974.3,4,BE
6,2000,BE2,Vlaams Gewest,Total,Total,Number,5940251.0,445.0,3,BE
7,2000,BE21,Prov. Antwerpen,Total,Total,Number,1643972.0,589.1,4,BE


In [35]:
# check value counts
population['code_length'].value_counts()

4    5874
3    2200
5     391
2      22
Name: code_length, dtype: int64

### checking for missing values

In [36]:
# filter missing values
filtered_df = population[population['population'].isnull()]
filtered_df.head(5)

Unnamed: 0,year,GEO,GEO_LABEL,SEX,AGE,UNIT,population,density,code_length,country
39,2000,DK01,Hovedstaden,Total,Total,Number,,,4,DK
40,2000,DK02,Sjælland,Total,Total,Number,,,4,DK
41,2000,DK03,Syddanmark,Total,Total,Number,,,4,DK
42,2000,DK04,Midtjylland,Total,Total,Number,,,4,DK
43,2000,DK05,Nordjylland,Total,Total,Number,,,4,DK
...,...,...,...,...,...,...,...,...,...,...
10541,2004,NO072,,Total,Total,Number,,6.1,5,NO
10542,2004,NO073,,Total,Total,Number,,1.6,5,NO
10845,2021,NO072,,Total,Total,Number,,6.6,5,NO
10847,2021,NO073,,Total,Total,Number,,1.6,5,NO


In [37]:
# sort by NUTS region and year and replace missing values with the previous row
population = population.sort_values(by=['GEO', 'year'])
population['population'] = population['population'].fillna(method='bfill')
population['density'] = population['density'].fillna(method='bfill')

In [38]:
# all missing values for region PL84 have been removed for instance
population[population['GEO'] == 'PL84']

Unnamed: 0,year,GEO,GEO_LABEL,SEX,AGE,UNIT,population,density,code_length,country
304,2000,PL84,Podlaskie,Total,Total,Number,1165447.0,58.6,4,PL
802,2001,PL84,Podlaskie,Total,Total,Number,1165447.0,58.6,4,PL
1300,2002,PL84,Podlaskie,Total,Total,Number,1165447.0,58.6,4,PL
1798,2003,PL84,Podlaskie,Total,Total,Number,1165447.0,58.6,4,PL
2296,2004,PL84,Podlaskie,Total,Total,Number,1165447.0,58.6,4,PL
2794,2005,PL84,Podlaskie,Total,Total,Number,1165447.0,58.6,4,PL
3292,2006,PL84,Podlaskie,Total,Total,Number,1165447.0,58.6,4,PL
3790,2007,PL84,Podlaskie,Total,Total,Number,1165447.0,58.6,4,PL
4288,2008,PL84,Podlaskie,Total,Total,Number,1165447.0,58.6,4,PL
4786,2009,PL84,Podlaskie,Total,Total,Number,1165447.0,58.6,4,PL


In [39]:
# now missing values left
filtered_df1 = population[population['population'].isnull()]
filtered_df1

Unnamed: 0,year,GEO,GEO_LABEL,SEX,AGE,UNIT,population,density,code_length,country


## loading data on population over 65

In [40]:
# loading data separately for different age groups that will be added to obtain the population number of 65 or older
age65_69 = pd.read_csv('demo_r_pjangroup_1_Data_65_69.csv', sep=',', encoding='latin-1')
age70_74 = pd.read_csv('demo_r_pjangroup_1_Data_70_74.csv', sep=',', encoding='latin-1')
over_74 = pd.read_csv('demo_r_pjangroup_1_Data_75+.csv', sep=',', encoding='latin-1')

# rename column
age65_69 = age65_69.rename(columns={'Value': 'age_65_69'})
# set missing values to NaN
age65_69 = age65_69.replace(':','-1')
age65_69['age_65_69'] = np.where(age65_69['age_65_69'] == '-1', np.NaN, age65_69['age_65_69'])

# rename column
age70_74 = age70_74.rename(columns={'Value': 'age_70_74'})
# keep only relevant columns to facilitate the merging
age70_74 = age70_74[['TIME', 'GEO', 'age_70_74']]
# set missing values to NaN
age70_74 = age70_74.replace(':','-1')
age70_74['age_70_74'] = np.where(age70_74['age_70_74'] == '-1', np.NaN, age70_74['age_70_74'])

# rename column
over_74 = over_74.rename(columns={'Value': 'age_over_74'})
# keep only relevant columns to facilitate the merging
over_74 = over_74[['TIME', 'GEO', 'age_over_74']]
# set missing values to NaN
over_74 = over_74.replace(':','-1')
over_74['age_over_74'] = np.where(over_74['age_over_74'] == '-1', np.NaN, over_74['age_over_74'])

In [41]:
# merge data into one dataframe
final_age = age65_69.merge(age70_74, how='inner', on=['GEO','TIME'])
final_age = final_age.merge(over_74, how='inner', on=['GEO','TIME'])
final_age.head()

Unnamed: 0,TIME,GEO,GEO_LABEL,SEX,AGE,UNIT,age_65_69,age_70_74,age_over_74
0,2000,BE,Belgium,Total,Y65-69,Number,521416,457815,735862
1,2000,BE1,Région de Bruxelles-Capitale/Brussels Hoofdste...,Total,Y65-69,Number,42109,39907,78892
2,2000,BE10,Région de Bruxelles-Capitale/Brussels Hoofdste...,Total,Y65-69,Number,42109,39907,78892
3,2000,BE2,Vlaams Gewest,Total,Y65-69,Number,313903,264867,415046
4,2000,BE21,Prov. Antwerpen,Total,Y65-69,Number,85669,73830,118415


In [42]:
final_age.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10563 entries, 0 to 10562
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   TIME         10563 non-null  int64 
 1   GEO          10563 non-null  object
 2   GEO_LABEL    10563 non-null  object
 3   SEX          10563 non-null  object
 4   AGE          10563 non-null  object
 5   UNIT         10563 non-null  object
 6   age_65_69    9562 non-null   object
 7   age_70_74    9562 non-null   object
 8   age_over_74  9537 non-null   object
dtypes: int64(1), object(8)
memory usage: 825.2+ KB


In [43]:
# remove comma from figures and convert to numeric
final_age['age_65_69'] = final_age['age_65_69'].str.replace(',','')
final_age['age_65_69'] = pd.to_numeric(final_age['age_65_69'], errors='coerce')

# remove comma from figures and convert to numeric
final_age['age_70_74'] = final_age['age_70_74'].str.replace(',','')
final_age['age_70_74'] = pd.to_numeric(final_age['age_70_74'], errors='coerce')

# remove comma from figures and convert to numeric
final_age['age_over_74'] = final_age['age_over_74'].str.replace(',','')
final_age['age_over_74'] = pd.to_numeric(final_age['age_over_74'], errors='coerce')

# add up the numbers to obtain the population of 65 years or older
final_age['population_65+'] = final_age['age_65_69'] + final_age['age_70_74'] + final_age['age_over_74']
final_age

Unnamed: 0,TIME,GEO,GEO_LABEL,SEX,AGE,UNIT,age_65_69,age_70_74,age_over_74,population_65+
0,2000,BE,Belgium,Total,Y65-69,Number,521416.0,457815.0,735862.0,1715093.0
1,2000,BE1,Région de Bruxelles-Capitale/Brussels Hoofdste...,Total,Y65-69,Number,42109.0,39907.0,78892.0,160908.0
2,2000,BE10,Région de Bruxelles-Capitale/Brussels Hoofdste...,Total,Y65-69,Number,42109.0,39907.0,78892.0,160908.0
3,2000,BE2,Vlaams Gewest,Total,Y65-69,Number,313903.0,264867.0,415046.0,993816.0
4,2000,BE21,Prov. Antwerpen,Total,Y65-69,Number,85669.0,73830.0,118415.0,277914.0
...,...,...,...,...,...,...,...,...,...,...
10558,2020,TRB2,"Van, Mus, Bitlis, Hakkari",Total,Y65-69,Number,35236.0,25679.0,33085.0,94000.0
10559,2020,TRC,Güneydogu Anadolu,Total,Y65-69,Number,158526.0,120257.0,170472.0,449255.0
10560,2020,TRC1,"Gaziantep, Adiyaman, Kilis",Total,Y65-69,Number,63033.0,47843.0,63399.0,174275.0
10561,2020,TRC2,"Sanliurfa, Diyarbakir",Total,Y65-69,Number,59965.0,44552.0,63080.0,167597.0


In [44]:
# download for NUTS 3 regions (https://appsso.eurostat.ec.europa.eu/nui/show.do?dataset=demo_r_pjanaggr3&lang=en)
population65_NUTS3 = pd.read_csv('demo_r_pjanaggr3_1_Data_65.csv', sep=',', encoding='latin-1')
# missing values to -1
population65_NUTS3 = population65_NUTS3.replace(':','-1')
# remove comma from figures
population65_NUTS3['Value'] = population65_NUTS3['Value'].str.replace(',','')
# rename columns
population65_NUTS3 = population65_NUTS3.rename(columns={'Value': 'population_65+'})
# convert population to integer
population65_NUTS3['TIME'] = population65_NUTS3['TIME'].astype(str)
population65_NUTS3['population_65+'] = population65_NUTS3['population_65+'].astype(int)
# set missing values previously set to -1 to Nan
population65_NUTS3['population_65+'] = np.where(population65_NUTS3['population_65+'] <= 0, np.NaN, population65_NUTS3['population_65+'])
population65_NUTS3.head(3)

Unnamed: 0,TIME,GEO,SEX,AGE,UNIT,population_65+
0,2000,FI1D1,Total,65 years or over,Number,31344.0
1,2000,FI1D2,Total,65 years or over,Number,41986.0
2,2000,FI1D3,Total,65 years or over,Number,28613.0


In [45]:
final_age.shape

(10563, 10)

In [46]:
# drop not needed variables
final_age = final_age.drop(['GEO_LABEL', 'age_65_69', 'age_70_74', 'age_over_74'],1)
frames = [final_age,population65_NUTS3]
final_age = pd.concat(frames)
#final_age = final_age.reset_index()
final_age.shape

(10959, 6)

In [47]:
final_age.head()

Unnamed: 0,TIME,GEO,SEX,AGE,UNIT,population_65+
0,2000,BE,Total,Y65-69,Number,1715093.0
1,2000,BE1,Total,Y65-69,Number,160908.0
2,2000,BE10,Total,Y65-69,Number,160908.0
3,2000,BE2,Total,Y65-69,Number,993816.0
4,2000,BE21,Total,Y65-69,Number,277914.0


In [48]:
# exclude non-EU countries
non_EU = ['UK', 'TR']
non_EU_GEO = ['MKX', 'FRY', 'FRX', 'HUX']
# get country code
final_age['country'] = final_age['GEO'].str[:2]
final_age = final_age[~final_age['country'].isin(non_EU)]
# remove French oversees departments and other redundant regions
final_age = final_age[~final_age['GEO'].isin(non_EU_GEO)]
final_age.head()

Unnamed: 0,TIME,GEO,SEX,AGE,UNIT,population_65+,country
0,2000,BE,Total,Y65-69,Number,1715093.0,BE
1,2000,BE1,Total,Y65-69,Number,160908.0,BE
2,2000,BE10,Total,Y65-69,Number,160908.0,BE
3,2000,BE2,Total,Y65-69,Number,993816.0,BE
4,2000,BE21,Total,Y65-69,Number,277914.0,BE


In [50]:
# filter NUTS 1, NUTS 2 mortality OR in one of the specified special regions
final_age['code_length'] = final_age.GEO.str.len()
# keep only NUTS1 regions
final_age = final_age[(final_age['code_length'] == 4) | (final_age['code_length'] == 3) | (final_age['GEO'].isin(regions))]
final_age.head()

Unnamed: 0,TIME,GEO,SEX,AGE,UNIT,population_65+,country,code_length
1,2000,BE1,Total,Y65-69,Number,160908.0,BE,3
2,2000,BE10,Total,Y65-69,Number,160908.0,BE,4
3,2000,BE2,Total,Y65-69,Number,993816.0,BE,3
4,2000,BE21,Total,Y65-69,Number,277914.0,BE,4
5,2000,BE22,Total,Y65-69,Number,109464.0,BE,4


### checking for missing values

In [51]:
# check missing data
filtered_df = final_age[final_age['population_65+'].isnull()]
filtered_df.head()

Unnamed: 0,TIME,GEO,SEX,AGE,UNIT,population_65+,country,code_length
36,2000,DK01,Total,Y65-69,Number,,DK,4
37,2000,DK02,Total,Y65-69,Number,,DK,4
38,2000,DK03,Total,Y65-69,Number,,DK,4
39,2000,DK04,Total,Y65-69,Number,,DK,4
40,2000,DK05,Total,Y65-69,Number,,DK,4
...,...,...,...,...,...,...,...,...
341,2018,NO074,Total,65 years or over,Number,,NO,5
359,2019,NO074,Total,65 years or over,Number,,NO,5
377,2020,NO074,Total,65 years or over,Number,,NO,5
393,2021,NO072,Total,65 years or over,Number,,NO,5


In [52]:
# NaN removed with values of following year in same NUTS region
final_age = final_age.sort_values(by=['GEO', 'TIME'])
final_age['population_65+'] = final_age['population_65+'].fillna(method='bfill')

In [53]:
# all missing values have been imputed
filtered_df = final_age[final_age['population_65+'].isnull()]
filtered_df

Unnamed: 0,TIME,GEO,SEX,AGE,UNIT,population_65+,country,code_length


In [54]:
# deleting redundant columns
final_age = final_age.drop(['SEX', 'AGE', 'UNIT', 'code_length'], axis=1)
final_age = final_age.rename(columns={'TIME': 'year'})
final_age.head()

Unnamed: 0,year,GEO,population_65+,country
451,2000,AL0,237042.0,AL
954,2001,AL0,237042.0,AL
1457,2002,AL0,237042.0,AL
1960,2003,AL0,245608.0,AL
2463,2004,AL0,253974.0,AL


In [55]:
# year value count
final_age['year'] = final_age['year'].astype(str)
final_age['year'].value_counts()

2006    390
2000    390
2015    390
2005    390
2007    390
2019    390
2002    390
2017    390
2020    390
2009    390
2004    390
2016    390
2013    390
2018    390
2010    390
2001    390
2014    390
2008    390
2012    390
2003    390
2011    390
2021     18
Name: year, dtype: int64

Not much data for 2021. Hence, data for 2021 will be added to the 'population' dataframe. This is done by subsetting the data for 2020 and setting the year the 2021, and then concatenating the subset for 2020 with the original dataframe. That way data for the year 2021 is included in the data with the population figures of 2020.

In [56]:
# data for 2021
age_2021 = final_age[final_age['year'] == '2021']
age_2021.head()

Unnamed: 0,year,GEO,population_65+,country
378,2021,FI1D1,44806.0,FI
379,2021,FI1D2,62890.0,FI
380,2021,FI1D3,43343.0,FI
381,2021,FI1D5,16194.0,FI
382,2021,FI1D7,45909.0,FI


In [57]:
# drop few figures for 2021 to simplify merging, values from 2020 will be used
final_age = final_age[final_age['year'] != '2021']
final_age['year'].value_counts()

2006    390
2004    390
2015    390
2005    390
2007    390
2019    390
2002    390
2017    390
2020    390
2009    390
2016    390
2000    390
2013    390
2018    390
2010    390
2001    390
2014    390
2008    390
2012    390
2003    390
2011    390
Name: year, dtype: int64

In [58]:
age_2020 = final_age[final_age['year'] == '2020']
age_2020.head()

Unnamed: 0,year,GEO,population_65+,country
10511,2020,AL0,420036.0,AL
10512,2020,AL01,119241.0,AL
10513,2020,AL02,170897.0,AL
10514,2020,AL03,129898.0,AL
10515,2020,ALX,0.0,AL


In [59]:
# use values of 2020 for 2021
age_2020['year'] = '2021'
age_2020.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_2020['year'] = '2021'


Unnamed: 0,year,GEO,population_65+,country
10511,2021,AL0,420036.0,AL
10512,2021,AL01,119241.0,AL
10513,2021,AL02,170897.0,AL
10514,2021,AL03,129898.0,AL
10515,2021,ALX,0.0,AL


In [60]:
# inspect shape
age_2020.shape

(390, 4)

In [61]:
# concatenate the frames containing information for population 65+
frames = [final_age, age_2020]
final_age = pd.concat(frames)
final_age['year'].value_counts()

2006    390
2000    390
2015    390
2005    390
2007    390
2019    390
2002    390
2017    390
2020    390
2009    390
2004    390
2016    390
2013    390
2018    390
2021    390
2010    390
2001    390
2014    390
2008    390
2012    390
2003    390
2011    390
Name: year, dtype: int64

The new dataframe consists data fro 2020 and 2021.

## Merge population dataframes

In [63]:
# change type of column
population['year'] = population['year'].astype(str)
# merge population 65+ with overall population and density dataframe
population = population.merge(final_age, how='inner', on=['year','GEO'])
population

Unnamed: 0,year,GEO,GEO_LABEL,SEX,AGE,UNIT,population,density,code_length,country_x,population_65+,country_y
0,2000,AL0,Shqipëria,Total,Total,Number,3063318.0,100.7,3,AL,237042.0,AL
1,2001,AL0,Shqipëria,Total,Total,Number,3063318.0,100.7,3,AL,237042.0,AL
2,2002,AL0,Shqipëria,Total,Total,Number,3057018.0,100.7,3,AL,237042.0,AL
3,2003,AL0,Shqipëria,Total,Total,Number,3044993.0,100.7,3,AL,245608.0,AL
4,2004,AL0,Shqipëria,Total,Total,Number,3034231.0,100.7,3,AL,253974.0,AL
...,...,...,...,...,...,...,...,...,...,...,...,...
8460,2017,SK04,Východné Slovensko,Total,Total,Number,1620413.0,103.6,4,SK,216823.0,SK
8461,2018,SK04,Východné Slovensko,Total,Total,Number,1623043.0,103.8,4,SK,225172.0,SK
8462,2019,SK04,Východné Slovensko,Total,Total,Number,1625436.0,103.9,4,SK,233178.0,SK
8463,2020,SK04,Východné Slovensko,Total,Total,Number,1627704.0,103.9,4,SK,241682.0,SK


In [64]:
# rename columns
population = population.drop(['SEX', 'AGE', 'UNIT', 'code_length', 'country_y'], axis=1)
population = population.rename(columns={'country_x': 'country'})
population.head()

Unnamed: 0,year,GEO,GEO_LABEL,population,density,country,population_65+
0,2000,AL0,Shqipëria,3063318.0,100.7,AL,237042.0
1,2001,AL0,Shqipëria,3063318.0,100.7,AL,237042.0
2,2002,AL0,Shqipëria,3057018.0,100.7,AL,237042.0
3,2003,AL0,Shqipëria,3044993.0,100.7,AL,245608.0
4,2004,AL0,Shqipëria,3034231.0,100.7,AL,253974.0


In [65]:
# compute the share of population over the age of 65 in %
population['share_over_65'] = population['population_65+']/population['population']*100
population.head()

Unnamed: 0,year,GEO,GEO_LABEL,population,density,country,population_65+,share_over_65
0,2000,AL0,Shqipëria,3063318.0,100.7,AL,237042.0,7.73808
1,2001,AL0,Shqipëria,3063318.0,100.7,AL,237042.0,7.73808
2,2002,AL0,Shqipëria,3057018.0,100.7,AL,237042.0,7.754027
3,2003,AL0,Shqipëria,3044993.0,100.7,AL,245608.0,8.065963
4,2004,AL0,Shqipëria,3034231.0,100.7,AL,253974.0,8.370292


In [66]:
# find regions with highest share of old people
population.sort_values(by='share_over_65', ascending=False)

Unnamed: 0,year,GEO,GEO_LABEL,population,density,country,population_65+,share_over_65
3717,2021,FI1D1,,140587.0,10.1,FI,44123.0,31.384836
3718,2021,FI1D1,,142335.0,10.1,FI,44123.0,30.999403
3716,2020,FI1D1,,142335.0,10.1,FI,44123.0,30.999403
3715,2019,FI1D1,,144615.0,10.1,FI,43508.0,30.085399
2287,2021,DED4,Chemnitz,1426380.0,219.9,DE,418482.0,29.338746
...,...,...,...,...,...,...,...,...
4762,2003,FRY5,Mayotte,223713.0,624.5,FR,5836.0,2.608700
4761,2002,FRY5,Mayotte,223713.0,624.5,FR,5836.0,2.608700
4760,2001,FRY5,Mayotte,223713.0,624.5,FR,5836.0,2.608700
4759,2000,FRY5,Mayotte,223713.0,624.5,FR,5836.0,2.608700


In [67]:
# no missing values recorded
filtered_df = population[population['share_over_65'].isnull()]
filtered_df.head()

Unnamed: 0,year,GEO,GEO_LABEL,population,density,country,population_65+,share_over_65


In [68]:
# inspect mortality data
mortality

Unnamed: 0,TIME,GEO,GEO_LABEL,deaths,country,week_nr,year
0,2000W01,BE,Belgium,2814.0,BE,W01,2000
1,2000W01,BE1,Région de Bruxelles-Capitale/Brussels Hoofdste...,298.0,BE,W01,2000
2,2000W01,BE10,Région de Bruxelles-Capitale/Brussels Hoofdste...,298.0,BE,W01,2000
3,2000W01,BE2,Vlaams Gewest,1443.0,BE,W01,2000
4,2000W01,BE21,Prov. Antwerpen,439.0,BE,W01,2000
...,...,...,...,...,...,...,...
21019,2022W19,SE332,,,SE,W19,2022
21020,2022W19,NO071,,,NO,W19,2022
21021,2022W19,NO072,,,NO,W19,2022
21022,2022W19,NO073,,,NO,W19,2022


In [69]:
# inspect population data
population

Unnamed: 0,year,GEO,GEO_LABEL,population,density,country,population_65+,share_over_65
0,2000,AL0,Shqipëria,3063318.0,100.7,AL,237042.0,7.738080
1,2001,AL0,Shqipëria,3063318.0,100.7,AL,237042.0,7.738080
2,2002,AL0,Shqipëria,3057018.0,100.7,AL,237042.0,7.754027
3,2003,AL0,Shqipëria,3044993.0,100.7,AL,245608.0,8.065963
4,2004,AL0,Shqipëria,3034231.0,100.7,AL,253974.0,8.370292
...,...,...,...,...,...,...,...,...
8460,2017,SK04,Východné Slovensko,1620413.0,103.6,SK,216823.0,13.380725
8461,2018,SK04,Východné Slovensko,1623043.0,103.8,SK,225172.0,13.873446
8462,2019,SK04,Východné Slovensko,1625436.0,103.9,SK,233178.0,14.345566
8463,2020,SK04,Východné Slovensko,1627704.0,103.9,SK,241682.0,14.848031


In [70]:
# change colkumn type
population['year'] = population['year'].astype(str)
# merge mortality with population
mort = mortality.merge(population, how='inner', on=['year','GEO'])
mort.head()

Unnamed: 0,TIME,GEO,GEO_LABEL_x,deaths,country_x,week_nr,year,GEO_LABEL_y,population,density,country_y,population_65+,share_over_65
0,2000W01,BE1,Région de Bruxelles-Capitale/Brussels Hoofdste...,298.0,BE,W01,2000,Région de Bruxelles-Capitale/Brussels Hoofdste...,959318.0,5974.3,BE,160908.0,16.773166
1,2000W02,BE1,Région de Bruxelles-Capitale/Brussels Hoofdste...,260.0,BE,W02,2000,Région de Bruxelles-Capitale/Brussels Hoofdste...,959318.0,5974.3,BE,160908.0,16.773166
2,2000W03,BE1,Région de Bruxelles-Capitale/Brussels Hoofdste...,281.0,BE,W03,2000,Région de Bruxelles-Capitale/Brussels Hoofdste...,959318.0,5974.3,BE,160908.0,16.773166
3,2000W04,BE1,Région de Bruxelles-Capitale/Brussels Hoofdste...,242.0,BE,W04,2000,Région de Bruxelles-Capitale/Brussels Hoofdste...,959318.0,5974.3,BE,160908.0,16.773166
4,2000W05,BE1,Région de Bruxelles-Capitale/Brussels Hoofdste...,245.0,BE,W05,2000,Région de Bruxelles-Capitale/Brussels Hoofdste...,959318.0,5974.3,BE,160908.0,16.773166


In [71]:
# drop and rename columns
mort = mort.drop(['GEO_LABEL_y', 'country_y'], axis=1)
mort = mort.rename(columns={'GEO_LABEL_x': 'GEO_LABEL', 'country_x': 'country'})
mort.head()

Unnamed: 0,TIME,GEO,GEO_LABEL,deaths,country,week_nr,year,population,density,population_65+,share_over_65
0,2000W01,BE1,Région de Bruxelles-Capitale/Brussels Hoofdste...,298.0,BE,W01,2000,959318.0,5974.3,160908.0,16.773166
1,2000W02,BE1,Région de Bruxelles-Capitale/Brussels Hoofdste...,260.0,BE,W02,2000,959318.0,5974.3,160908.0,16.773166
2,2000W03,BE1,Région de Bruxelles-Capitale/Brussels Hoofdste...,281.0,BE,W03,2000,959318.0,5974.3,160908.0,16.773166
3,2000W04,BE1,Région de Bruxelles-Capitale/Brussels Hoofdste...,242.0,BE,W04,2000,959318.0,5974.3,160908.0,16.773166
4,2000W05,BE1,Région de Bruxelles-Capitale/Brussels Hoofdste...,245.0,BE,W05,2000,959318.0,5974.3,160908.0,16.773166


In [72]:
# inspect shape
mort.shape

(386965, 11)

In [73]:
# getting relevant weeks (week 18 to week 39)
relevant_weeks = []
for i in range(18,41):
    strings = 'W{}'.format(i)
    relevant_weeks.append(strings)

In [74]:
# only keeping data from relevant weeks
mort = mort[mort['week_nr'].isin(relevant_weeks)]

In [75]:
# inspect shape
mort.shape

(170407, 11)

In [76]:
# get mortality per 100,000 per week
mort['death_p_100k'] = mort['deaths']/mort['population']*100000

In [77]:
# inspect data
mort.head(3)

Unnamed: 0,TIME,GEO,GEO_LABEL,deaths,country,week_nr,year,population,density,population_65+,share_over_65,death_p_100k
17,2000W18,BE1,Région de Bruxelles-Capitale/Brussels Hoofdste...,196.0,BE,W18,2000,959318.0,5974.3,160908.0,16.773166,20.431181
18,2000W19,BE1,Région de Bruxelles-Capitale/Brussels Hoofdste...,172.0,BE,W19,2000,959318.0,5974.3,160908.0,16.773166,17.929404
19,2000W20,BE1,Région de Bruxelles-Capitale/Brussels Hoofdste...,158.0,BE,W20,2000,959318.0,5974.3,160908.0,16.773166,16.470034


In [78]:
# sort values
mort.sort_values(by=['death_p_100k'],ascending=False).head(10)

Unnamed: 0,TIME,GEO,GEO_LABEL,deaths,country,week_nr,year,population,density,population_65+,share_over_65,death_p_100k
356758,2021W33,FRY1,Guadeloupe,311.0,FR,W33,2021,412682.0,247.0,84319.0,20.431955,75.360689
356759,2021W34,FRY1,Guadeloupe,290.0,FR,W34,2021,412682.0,247.0,84319.0,20.431955,70.272025
356811,2021W33,FRY2,Martinique,219.0,FR,W33,2021,359821.0,328.6,80327.0,22.324156,60.863596
356810,2021W32,FRY2,Martinique,206.0,FR,W32,2021,359821.0,328.6,80327.0,22.324156,57.250689
363337,2021W40,RO31,Sud - Muntenia,1648.0,RO,W40,2021,2901376.0,86.1,607480.0,20.937652,56.800635
350508,2021W37,BG32,Severen tsentralen,439.0,BG,W37,2021,773450.0,53.1,188599.0,24.384123,56.758679
363178,2021W40,RO21,Nord-Est,1762.0,RO,W40,2021,3184215.0,87.5,582256.0,18.2857,55.335459
350511,2021W40,BG32,Severen tsentralen,427.0,BG,W40,2021,773450.0,53.1,188599.0,24.384123,55.207189
363496,2021W40,RO41,Sud-Vest Oltenia,1039.0,RO,W40,2021,1910409.0,66.9,400107.0,20.943526,54.38626
366517,2021W40,RS22,Region Juzne i Istocne Srbije,802.0,RS,W40,2021,1482810.0,57.2,334023.0,22.526352,54.086498


In [80]:
# describe
mort['death_p_100k'].describe()

count    130874.000000
mean         18.517425
std           4.478693
min           0.000000
25%          15.753118
50%          18.092744
75%          20.906112
max          75.360689
Name: death_p_100k, dtype: float64

In [82]:
mort['year'].value_counts()

2021    8119
2010    7728
2008    7728
2016    7728
2012    7728
2013    7728
2019    7728
2014    7728
2017    7728
2018    7728
2005    7728
2004    7728
2015    7728
2007    7728
2020    7728
2001    7728
2009    7728
2006    7728
2002    7728
2003    7728
2000    7728
2011    7728
Name: year, dtype: int64

In [83]:
# find rows with missing values in mortality
filtered_df = mort[mort['death_p_100k'].isnull()]
filtered_df

Unnamed: 0,TIME,GEO,GEO_LABEL,deaths,country,week_nr,year,population,density,population_65+,share_over_65,death_p_100k
1161,2000W18,CZ0,Cesko,,CZ,W18,2000,10278098.0,132.7,1418078.0,13.797086,
1162,2000W19,CZ0,Cesko,,CZ,W19,2000,10278098.0,132.7,1418078.0,13.797086,
1163,2000W20,CZ0,Cesko,,CZ,W20,2000,10278098.0,132.7,1418078.0,13.797086,
1164,2000W21,CZ0,Cesko,,CZ,W21,2000,10278098.0,132.7,1418078.0,13.797086,
1165,2000W22,CZ0,Cesko,,CZ,W22,2000,10278098.0,132.7,1418078.0,13.797086,
...,...,...,...,...,...,...,...,...,...,...,...,...
386934,2021W38,NO073,,,NO,W38,2021,75472.0,1.6,13963.0,18.500901,
386935,2021W39,NO073,,,NO,W39,2021,75472.0,1.6,13963.0,18.500901,
386936,2021W39,NO073,,,NO,W39,2021,75472.0,1.6,13963.0,18.500901,
386937,2021W40,NO073,,,NO,W40,2021,75472.0,1.6,13963.0,18.500901,


In [84]:
filtered_df['GEO'].unique()

array(['CZ0', 'CZ01', 'CZ02', 'CZ03', 'CZ04', 'CZ05', 'CZ06', 'CZ07',
       'CZ08', 'DK0', 'DK01', 'DK02', 'DK03', 'DK04', 'DK05', 'IE0',
       'EL3', 'EL30', 'EL4', 'EL41', 'EL42', 'EL43', 'EL5', 'EL51',
       'EL52', 'EL53', 'EL54', 'EL6', 'EL61', 'EL62', 'EL63', 'EL64',
       'EL65', 'FR1', 'FR10', 'FRB', 'FRB0', 'FRC', 'FRC1', 'FRC2', 'FRD',
       'FRD1', 'FRD2', 'FRE', 'FRE1', 'FRE2', 'FRF', 'FRF1', 'FRF2',
       'FRF3', 'FRG', 'FRG0', 'FRH', 'FRH0', 'FRI', 'FRI1', 'FRI2',
       'FRI3', 'FRJ', 'FRJ1', 'FRJ2', 'FRK', 'FRK1', 'FRK2', 'FRL',
       'FRL0', 'FRM', 'FRM0', 'FRY1', 'FRY2', 'FRY3', 'FRY4', 'FRY5',
       'ITC', 'ITC1', 'ITC2', 'ITC3', 'ITC4', 'ITH', 'ITH1', 'ITH2',
       'ITH3', 'ITH4', 'ITH5', 'ITI', 'ITI1', 'ITI2', 'ITI3', 'ITI4',
       'ITF', 'ITF1', 'ITF2', 'ITF3', 'ITF4', 'ITF5', 'ITF6', 'ITG',
       'ITG1', 'ITG2', 'CY0', 'CY00', 'MT0', 'MT00', 'NL1', 'NL11',
       'NL12', 'NL13', 'NL2', 'NL21', 'NL22', 'NL23', 'NL3', 'NL31',
       'NL32', 'NL33', 'NL34

In [85]:
filtered_df['code_length'] = filtered_df.GEO.str.len()
filtered_df['code_length'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['code_length'] = filtered_df.GEO.str.len()


4    28360
3    11089
5       84
Name: code_length, dtype: int64

In [89]:
# show all columns with missing values
mort.columns[mort.isna().any()].tolist()

['GEO_LABEL', 'deaths', 'death_p_100k']

In [90]:
# filter all instances with missing mortality figures
null_data = mort[mort['deaths'].isnull()]
null_data.shape

(39533, 12)

In [91]:
# unique regions
null_data['GEO'].unique()

array(['CZ0', 'CZ01', 'CZ02', 'CZ03', 'CZ04', 'CZ05', 'CZ06', 'CZ07',
       'CZ08', 'DK0', 'DK01', 'DK02', 'DK03', 'DK04', 'DK05', 'IE0',
       'EL3', 'EL30', 'EL4', 'EL41', 'EL42', 'EL43', 'EL5', 'EL51',
       'EL52', 'EL53', 'EL54', 'EL6', 'EL61', 'EL62', 'EL63', 'EL64',
       'EL65', 'FR1', 'FR10', 'FRB', 'FRB0', 'FRC', 'FRC1', 'FRC2', 'FRD',
       'FRD1', 'FRD2', 'FRE', 'FRE1', 'FRE2', 'FRF', 'FRF1', 'FRF2',
       'FRF3', 'FRG', 'FRG0', 'FRH', 'FRH0', 'FRI', 'FRI1', 'FRI2',
       'FRI3', 'FRJ', 'FRJ1', 'FRJ2', 'FRK', 'FRK1', 'FRK2', 'FRL',
       'FRL0', 'FRM', 'FRM0', 'FRY1', 'FRY2', 'FRY3', 'FRY4', 'FRY5',
       'ITC', 'ITC1', 'ITC2', 'ITC3', 'ITC4', 'ITH', 'ITH1', 'ITH2',
       'ITH3', 'ITH4', 'ITH5', 'ITI', 'ITI1', 'ITI2', 'ITI3', 'ITI4',
       'ITF', 'ITF1', 'ITF2', 'ITF3', 'ITF4', 'ITF5', 'ITF6', 'ITG',
       'ITG1', 'ITG2', 'CY0', 'CY00', 'MT0', 'MT00', 'NL1', 'NL11',
       'NL12', 'NL13', 'NL2', 'NL21', 'NL22', 'NL23', 'NL3', 'NL31',
       'NL32', 'NL33', 'NL34

In [92]:
# inspect data
null_data

Unnamed: 0,TIME,GEO,GEO_LABEL,deaths,country,week_nr,year,population,density,population_65+,share_over_65,death_p_100k
1161,2000W18,CZ0,Cesko,,CZ,W18,2000,10278098.0,132.7,1418078.0,13.797086,
1162,2000W19,CZ0,Cesko,,CZ,W19,2000,10278098.0,132.7,1418078.0,13.797086,
1163,2000W20,CZ0,Cesko,,CZ,W20,2000,10278098.0,132.7,1418078.0,13.797086,
1164,2000W21,CZ0,Cesko,,CZ,W21,2000,10278098.0,132.7,1418078.0,13.797086,
1165,2000W22,CZ0,Cesko,,CZ,W22,2000,10278098.0,132.7,1418078.0,13.797086,
...,...,...,...,...,...,...,...,...,...,...,...,...
386934,2021W38,NO073,,,NO,W38,2021,75472.0,1.6,13963.0,18.500901,
386935,2021W39,NO073,,,NO,W39,2021,75472.0,1.6,13963.0,18.500901,
386936,2021W39,NO073,,,NO,W39,2021,75472.0,1.6,13963.0,18.500901,
386937,2021W40,NO073,,,NO,W40,2021,75472.0,1.6,13963.0,18.500901,


If data is missing on NUTS2 level, it will be imputed by data on the NUTS1 level for small countries

In [93]:
# get code length
mort['code_length'] = mort.GEO.str.len()

In [94]:
# filter for NUTS 1
NUTS1 = mort[mort['code_length'] == 3]
NUTS1 = NUTS1[['death_p_100k', 'GEO', 'TIME']]

In [95]:
# filter for NUTS2
NUTS2 = mort[mort['code_length'] == 4]
NUTS2['NUTS1'] = NUTS2['GEO'].str[:3]
# merge NUTS1 and NUTS2 subdata
NUTS2 = NUTS2.merge(NUTS1, left_on=['NUTS1', 'TIME'], right_on=['GEO', 'TIME'])
NUTS2 = NUTS2.drop(['NUTS1', 'GEO_y'],1)
NUTS2 = NUTS2.rename(columns={'death_p_100k_x': 'death_p_100k', 'death_p_100k_y': 'death_p_100k_NUTS1'})
NUTS2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  NUTS2['NUTS1'] = NUTS2['GEO'].str[:3]


Unnamed: 0,TIME,GEO_x,GEO_LABEL,deaths,country,week_nr,year,population,density,population_65+,share_over_65,death_p_100k,code_length,death_p_100k_NUTS1
0,2000W18,BE10,Région de Bruxelles-Capitale/Brussels Hoofdste...,196.0,BE,W18,2000,959318.0,5974.3,160908.0,16.773166,20.431181,4,20.431181
1,2000W19,BE10,Région de Bruxelles-Capitale/Brussels Hoofdste...,172.0,BE,W19,2000,959318.0,5974.3,160908.0,16.773166,17.929404,4,17.929404
2,2000W20,BE10,Région de Bruxelles-Capitale/Brussels Hoofdste...,158.0,BE,W20,2000,959318.0,5974.3,160908.0,16.773166,16.470034,4,16.470034
3,2000W21,BE10,Région de Bruxelles-Capitale/Brussels Hoofdste...,193.0,BE,W21,2000,959318.0,5974.3,160908.0,16.773166,20.118459,4,20.118459
4,2000W22,BE10,Région de Bruxelles-Capitale/Brussels Hoofdste...,170.0,BE,W22,2000,959318.0,5974.3,160908.0,16.773166,17.720923,4,17.720923
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108279,2021W38,RS22,Region Juzne i Istocne Srbije,540.0,RS,W38,2021,1482810.0,57.2,334023.0,22.526352,36.417343,4,40.109493
108280,2021W39,RS21,Region Sumadije i Zapadne Srbije,999.0,RS,W39,2021,1900429.0,72.4,412962.0,21.729936,52.567078,4,51.755138
108281,2021W39,RS22,Region Juzne i Istocne Srbije,752.0,RS,W39,2021,1482810.0,57.2,334023.0,22.526352,50.714522,4,51.755138
108282,2021W40,RS21,Region Sumadije i Zapadne Srbije,1006.0,RS,W40,2021,1900429.0,72.4,412962.0,21.729936,52.935416,4,53.439914


In [96]:
# show which NUTS2 regions do not have mortality figures whilst the NUTS 1 regions has
filtered_df = NUTS2[(NUTS2['death_p_100k'].isnull()) & (NUTS2['death_p_100k_NUTS1'] > 0)]
filtered_df

Unnamed: 0,TIME,GEO_x,GEO_LABEL,deaths,country,week_nr,year,population,density,population_65+,share_over_65,death_p_100k,code_length,death_p_100k_NUTS1
107792,2021W20,NO01,Oslo og Akershus (statistical region 2016),,NO,W20,2021,1326682.0,259.8,188663.0,14.220665,,4,13.767843
107794,2021W20,NO03,Sør-Østlandet (statistical region 2016),,NO,W20,2021,1011472.0,29.4,198487.0,19.623578,,4,13.767843
107795,2021W20,NO04,Agder og Rogaland (statistical region 2016),,NO,W20,2021,787123.0,32.2,126309.0,16.046920,,4,13.767843
107796,2021W20,NO05,Vestlandet (statistical region 2016),,NO,W20,2021,905335.0,19.1,164606.0,18.181778,,4,13.767843
107799,2021W21,NO01,Oslo og Akershus (statistical region 2016),,NO,W21,2021,1326682.0,259.8,188663.0,14.220665,,4,13.618800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107929,2021W39,NO05,Vestlandet (statistical region 2016),,NO,W39,2021,905335.0,19.1,164606.0,18.181778,,4,14.978817
107932,2021W40,NO01,Oslo og Akershus (statistical region 2016),,NO,W40,2021,1326682.0,259.8,188663.0,14.220665,,4,15.016078
107934,2021W40,NO03,Sør-Østlandet (statistical region 2016),,NO,W40,2021,1011472.0,29.4,198487.0,19.623578,,4,15.016078
107935,2021W40,NO04,Agder og Rogaland (statistical region 2016),,NO,W40,2021,787123.0,32.2,126309.0,16.046920,,4,15.016078


Regions in Souther Norway do not have NUTS2 mortality but NUTS 1 mortality.

In [97]:
# impute NaN for regions in Norway with no NUTS 2 mortality with NUTS 1 mortality
NUTS2['death_p_100k'] = NUTS2['death_p_100k'].fillna(NUTS2['death_p_100k_NUTS1'])
filtered_df = NUTS2[(NUTS2['death_p_100k'].isnull()) & (NUTS2['death_p_100k_NUTS1'] > 0)]
filtered_df

Unnamed: 0,TIME,GEO_x,GEO_LABEL,deaths,country,week_nr,year,population,density,population_65+,share_over_65,death_p_100k,code_length,death_p_100k_NUTS1


In [98]:
# drop unneccessary columns
NUTS2 = NUTS2.drop(['death_p_100k_NUTS1', 'code_length'],1)
NUTS2 = NUTS2.rename(columns={'GEO_x': 'GEO'})
NUTS2

Unnamed: 0,TIME,GEO,GEO_LABEL,deaths,country,week_nr,year,population,density,population_65+,share_over_65,death_p_100k
0,2000W18,BE10,Région de Bruxelles-Capitale/Brussels Hoofdste...,196.0,BE,W18,2000,959318.0,5974.3,160908.0,16.773166,20.431181
1,2000W19,BE10,Région de Bruxelles-Capitale/Brussels Hoofdste...,172.0,BE,W19,2000,959318.0,5974.3,160908.0,16.773166,17.929404
2,2000W20,BE10,Région de Bruxelles-Capitale/Brussels Hoofdste...,158.0,BE,W20,2000,959318.0,5974.3,160908.0,16.773166,16.470034
3,2000W21,BE10,Région de Bruxelles-Capitale/Brussels Hoofdste...,193.0,BE,W21,2000,959318.0,5974.3,160908.0,16.773166,20.118459
4,2000W22,BE10,Région de Bruxelles-Capitale/Brussels Hoofdste...,170.0,BE,W22,2000,959318.0,5974.3,160908.0,16.773166,17.720923
...,...,...,...,...,...,...,...,...,...,...,...,...
108279,2021W38,RS22,Region Juzne i Istocne Srbije,540.0,RS,W38,2021,1482810.0,57.2,334023.0,22.526352,36.417343
108280,2021W39,RS21,Region Sumadije i Zapadne Srbije,999.0,RS,W39,2021,1900429.0,72.4,412962.0,21.729936,52.567078
108281,2021W39,RS22,Region Juzne i Istocne Srbije,752.0,RS,W39,2021,1482810.0,57.2,334023.0,22.526352,50.714522
108282,2021W40,RS21,Region Sumadije i Zapadne Srbije,1006.0,RS,W40,2021,1900429.0,72.4,412962.0,21.729936,52.935416


#### No indication of any invalid symbols in any of the numeric column is found.

In [99]:
# inspect shape
mort.shape

(170407, 13)

In [100]:
mort['death_p_100k'].isna().sum()

39533

In [101]:
# remove all NUTS2 regions from mort dataframe
mort = mort[mort['code_length'] != 4]
# cancatenate mort dataframe with all NUTS2 regions including those that have been imputed
frames = [mort,NUTS2]
mort = pd.concat(frames)
mort.shape

(167877, 13)

In [102]:
# slightly reduced number of NaN
mort['death_p_100k'].isna().sum()

37931

In [103]:
# count of NUTS regions
mort['GEO'].value_counts().head(25)

SE331    529
FI1D1    529
FI1D2    529
SE321    529
FI1D5    529
FI1D8    529
FI1D9    529
SE313    529
NO073    529
SE311    529
SE332    529
FI1D3    529
NO072    529
NO071    529
SE322    529
SE312    529
FI1D7    529
CH06     506
ES6      506
FRI2     506
AT31     506
BE1      506
ITC4     506
EL3      506
CH03     506
Name: GEO, dtype: int64

In [104]:
# dropping potential duplicates
mort = mort.drop_duplicates(subset=['GEO', 'TIME'], keep='first')
mort['GEO'].value_counts()

ITH     506
ES12    506
SK0     506
RS22    506
DE1     506
       ... 
NL34    506
NL4     506
BE10    506
PL43    506
PL71    506
Name: GEO, Length: 331, dtype: int64

In [107]:
# write to csv
mort.to_csv('Mortality1.csv')