# Merging all data from different sources and writing csv files for descriptive and modelling parts

Install required packages for geopa

In [1]:
# import needed libraries
import pandas as pd
import numpy as np

In [2]:
# read all data obtained from the weather_data and MRT_calculation notebooks
weather_final_01 = pd.read_csv('FINAL_WEATHER_MERGED_01', index_col=0)
weather_final_05 = pd.read_csv('FINAL_WEATHER_MERGED_05', index_col=0)
weather_final_09 = pd.read_csv('FINAL_WEATHER_MERGED_09', index_col=0)
weather_final_13 = pd.read_csv('FINAL_WEATHER_MERGED_13', index_col=0)
weather_final_17 = pd.read_csv('FINAL_WEATHER_MERGED_17', index_col=0)
weather_final_21 = pd.read_csv('FINAL_WEATHER_MERGED_21', index_col=0)

In [3]:
# concatenating all weather dfs to a large df
frames = [weather_final_01, weather_final_05, weather_final_09, weather_final_13, weather_final_17, weather_final_21]

weather_df = pd.concat(frames)
weather_df.shape

(3891096, 33)

In [4]:
# writing concatenated df to csv. This df is used for descriptive statistics
weather_df.to_csv('weather_df.csv')

## Used in Notebook of the descriptive part --> Descriptive.ipnyb

## From here on, the final dataset for the modelling will be set up

In [5]:
# showing unique latitudes
weather_df['latitude'].unique()

array([54.25, 53.5 , 52.  , 52.75, 43.  , 39.25, 55.  , 42.25, 41.5 ,
       40.75, 40.  , 38.5 , 37.75, 58.  , 57.25, 55.75, 37.  , 56.5 ,
       50.5 , 48.25, 51.25, 47.5 , 46.75, 49.  , 46.  , 45.25, 44.5 ,
       43.75, 49.75, 61.  , 61.75, 60.25, 59.5 , 58.75, 62.5 , 63.25,
       64.  , 64.75, 66.25, 65.5 , 67.  , 68.5 , 67.75, 69.25, 70.  ,
       70.75])

In [6]:
# show first instances
weather_df.head()

Unnamed: 0,longitude,latitude,wind_2AM,dew_point_2AM,temperature_2AM,uvb_2AM,lat_long,date,apparent_temperature_2AM,wind_11AM,...,apparent_temperature_11PM,country,NUTS1,NUTS2,NUTS3,MRT_2AM,MRT_11AM,MRT_4PM,MRT_11PM,Week_Number
0,-10.0,54.25,-3.17,7.57,10.66,-0.02,"('54.25', '-10.0')",2000-05-01,8.82,0.16,...,8.6,IE,IE0,IE04,IE042,2.96,23.1,15.57,3.84,18
1,-10.0,54.25,-1.2,8.22,9.81,-0.02,"('54.25', '-10.0')",2000-05-02,8.13,-1.53,...,7.14,IE,IE0,IE04,IE042,3.03,23.49,25.09,5.6,18
2,-10.0,54.25,-4.29,6.97,8.52,-0.02,"('54.25', '-10.0')",2000-05-03,6.56,-4.15,...,6.43,IE,IE0,IE04,IE042,3.64,27.86,23.63,0.11,18
3,-10.0,54.25,-4.35,6.38,8.56,-0.02,"('54.25', '-10.0')",2000-05-04,6.47,-4.6,...,7.81,IE,IE0,IE04,IE042,1.9,21.78,17.08,0.12,18
4,-10.0,54.25,-3.99,7.33,9.03,-0.02,"('54.25', '-10.0')",2000-05-05,7.15,-3.51,...,8.86,IE,IE0,IE04,IE042,-0.44,12.57,14.37,5.4,18


# Merge with temperature threshold to find heat wave days

In [7]:
# read the threshold dataframe obtained from the Threshold_Computation.ipnyb
thresholds = pd.read_csv('quantile_thresholds.csv', index_col=0)
thresholds.head(2)

Unnamed: 0,latitude,longitude,d2m,t2m,apparent_temperature
0,37.0,-6.25,20.096707,38.9716,39.232187
1,37.0,-5.5,19.063138,38.051736,38.677543


In [8]:
# rename apparent_temperature to 99th percentile
thresholds = thresholds.rename(columns={"apparent_temperature": "99th_percentile"})

In [9]:
# merge the thresold with the weather data
weather_final = weather_df.merge(thresholds,on=['longitude','latitude'], how='left').drop(columns = ['d2m','t2m'])

In [10]:
# inspect the datframe
weather_final.tail()

Unnamed: 0,longitude,latitude,wind_2AM,dew_point_2AM,temperature_2AM,uvb_2AM,lat_long,date,apparent_temperature_2AM,wind_11AM,...,country,NUTS1,NUTS2,NUTS3,MRT_2AM,MRT_11AM,MRT_4PM,MRT_11PM,Week_Number,99th_percentile
3891091,29.75,61.75,2.28,5.96,6.96,0.0,"('61.75', '29.75')",2021-09-26,4.81,1.17,...,FI,FI1,FI1C,FI1C5,4.13,18.18,6.6,0.1,38,26.917881
3891092,29.75,61.75,-0.01,2.83,3.14,0.0,"('61.75', '29.75')",2021-09-27,0.6,0.18,...,FI,FI1,FI1C,FI1C5,0.34,20.55,3.04,0.14,39,26.917881
3891093,29.75,61.75,-1.48,3.6,3.85,0.0,"('61.75', '29.75')",2021-09-28,1.37,-1.15,...,FI,FI1,FI1C,FI1C5,3.95,17.04,6.69,5.08,39,26.917881
3891094,29.75,61.75,-1.2,5.68,6.56,0.0,"('61.75', '29.75')",2021-09-29,4.36,-0.11,...,FI,FI1,FI1C,FI1C5,4.56,16.11,6.52,5.48,39,26.917881
3891095,29.75,61.75,-1.08,5.78,6.76,0.0,"('61.75', '29.75')",2021-09-30,4.58,-0.67,...,FI,FI1,FI1C,FI1C5,4.37,15.13,6.3,3.14,39,26.917881


In [11]:
# transform date to datetime and get day month and year of date
weather_final['date'] = pd.to_datetime(weather_final['date'])
weather_final['day'] = weather_final.date.dt.day
weather_final['month'] = weather_final.date.dt.month
weather_final['year'] = weather_final.date.dt.year

In [12]:
# inspect info
weather_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3891096 entries, 0 to 3891095
Data columns (total 37 columns):
 #   Column                     Dtype         
---  ------                     -----         
 0   longitude                  float64       
 1   latitude                   float64       
 2   wind_2AM                   float64       
 3   dew_point_2AM              float64       
 4   temperature_2AM            float64       
 5   uvb_2AM                    float64       
 6   lat_long                   object        
 7   date                       datetime64[ns]
 8   apparent_temperature_2AM   float64       
 9   wind_11AM                  float64       
 10  dew_point_11AM             float64       
 11  temperature_11AM           float64       
 12  uvb_11AM                   float64       
 13  apparent_temperature_11AM  float64       
 14  wind_4PM                   float64       
 15  dew_point_4PM              float64       
 16  temperature_4PM            float64  

In [13]:
# new binary column indicating if threshold was exceeded
weather_final['threshold_exceeded'] = np.where(weather_final['apparent_temperature_4PM'] > weather_final['99th_percentile'],1,0)

In [14]:
# inspect data
weather_final.head(5)

Unnamed: 0,longitude,latitude,wind_2AM,dew_point_2AM,temperature_2AM,uvb_2AM,lat_long,date,apparent_temperature_2AM,wind_11AM,...,MRT_2AM,MRT_11AM,MRT_4PM,MRT_11PM,Week_Number,99th_percentile,day,month,year,threshold_exceeded
0,-10.0,54.25,-3.17,7.57,10.66,-0.02,"('54.25', '-10.0')",2000-05-01,8.82,0.16,...,2.96,23.1,15.57,3.84,18,20.898722,1,5,2000,0
1,-10.0,54.25,-1.2,8.22,9.81,-0.02,"('54.25', '-10.0')",2000-05-02,8.13,-1.53,...,3.03,23.49,25.09,5.6,18,20.898722,2,5,2000,0
2,-10.0,54.25,-4.29,6.97,8.52,-0.02,"('54.25', '-10.0')",2000-05-03,6.56,-4.15,...,3.64,27.86,23.63,0.11,18,20.898722,3,5,2000,0
3,-10.0,54.25,-4.35,6.38,8.56,-0.02,"('54.25', '-10.0')",2000-05-04,6.47,-4.6,...,1.9,21.78,17.08,0.12,18,20.898722,4,5,2000,0
4,-10.0,54.25,-3.99,7.33,9.03,-0.02,"('54.25', '-10.0')",2000-05-05,7.15,-3.51,...,-0.44,12.57,14.37,5.4,18,20.898722,5,5,2000,0


In [15]:
# show how often the threshold was exceeded
weather_final['threshold_exceeded'].value_counts()

0    3808602
1      82494
Name: threshold_exceeded, dtype: int64

In [16]:
# check if there are duplicates in the data
duplicate = weather_df[weather_final.duplicated(['wind_2AM', 'dew_point_2AM', 'temperature_2AM', 'uvb_2AM', 'apparent_temperature_2AM', 'wind_11AM'])]
  
print("Duplicate Rows based on Name and Age :")
  
# Print the resultant Dataframe
duplicate

  duplicate = weather_df[weather_final.duplicated(['wind_2AM', 'dew_point_2AM', 'temperature_2AM', 'uvb_2AM', 'apparent_temperature_2AM', 'wind_11AM'])]


Duplicate Rows based on Name and Age :


Unnamed: 0,longitude,latitude,wind_2AM,dew_point_2AM,temperature_2AM,uvb_2AM,lat_long,date,apparent_temperature_2AM,wind_11AM,...,apparent_temperature_11PM,country,NUTS1,NUTS2,NUTS3,MRT_2AM,MRT_11AM,MRT_4PM,MRT_11PM,Week_Number


In [17]:
# set coordinates and date as index
weather_final = weather_final.set_index(['lat_long', 'date'])

In [18]:
# show value counts per NUTS 2
weather_final['NUTS2'].value_counts()

FI1D    255816
SE33    191862
NO07    148104
SE32     84150
SE31     77418
         ...  
UKG1      3366
NL23      3366
UKK4      3366
UKF2      3366
DK02      3366
Name: NUTS2, Length: 249, dtype: int64

#### Some NUTS 2 regions appear to be very big.  For the four largest NUTS 2 regions, NUTS 3 regions will be used to reduce the size of regions.

In [20]:
changed_regions = ['FI1D', 'SE31', 'SE32', 'SE33', 'NO07']

weather_final['NUTS2'] = np.where(weather_final['NUTS2'].isin(changed_regions), weather_final['NUTS3'], weather_final['NUTS2'])
weather_final['NUTS2'].value_counts()

FI1D7    127908
SE332    124542
NO074    100980
NO0A      67320
SE331     67320
          ...  
UKG1       3366
UKK4       3366
UKF2       3366
UKH3       3366
DK02       3366
Name: NUTS2, Length: 260, dtype: int64

In [22]:
# exception for the netherlands as data is only available on the national level for a long time period
exception = ['NL']

weather_final['NUTS2'] = np.where(weather_final['country'].isin(exception), weather_final['country'], weather_final['NUTS2'])
weather_final['NUTS2'].value_counts()


FI1D7    127908
SE332    124542
NO074    100980
NO0A      67320
SE331     67320
          ...  
DEA5       3366
UKJ3       3366
ITI2       3366
BE32       3366
DK02       3366
Name: NUTS2, Length: 253, dtype: int64

#### After the change, Finnish, Norwegian and Swedish NUTS 3 regions still remain big, but the improvement is considered sufficient.

In [23]:
weather_final.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,longitude,latitude,wind_2AM,dew_point_2AM,temperature_2AM,uvb_2AM,apparent_temperature_2AM,wind_11AM,dew_point_11AM,temperature_11AM,...,MRT_2AM,MRT_11AM,MRT_4PM,MRT_11PM,Week_Number,99th_percentile,day,month,year,threshold_exceeded
lat_long,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
"('54.25', '-10.0')",2000-05-01,-10.0,54.25,-3.17,7.57,10.66,-0.02,8.82,0.16,8.74,13.7,...,2.96,23.1,15.57,3.84,18,20.898722,1,5,2000,0
"('54.25', '-10.0')",2000-05-02,-10.0,54.25,-1.2,8.22,9.81,-0.02,8.13,-1.53,9.74,11.66,...,3.03,23.49,25.09,5.6,18,20.898722,2,5,2000,0
"('54.25', '-10.0')",2000-05-03,-10.0,54.25,-4.29,6.97,8.52,-0.02,6.56,-4.15,7.06,10.37,...,3.64,27.86,23.63,0.11,18,20.898722,3,5,2000,0
"('54.25', '-10.0')",2000-05-04,-10.0,54.25,-4.35,6.38,8.56,-0.02,6.47,-4.6,7.83,11.56,...,1.9,21.78,17.08,0.12,18,20.898722,4,5,2000,0
"('54.25', '-10.0')",2000-05-05,-10.0,54.25,-3.99,7.33,9.03,-0.02,7.15,-3.51,7.76,11.78,...,-0.44,12.57,14.37,5.4,18,20.898722,5,5,2000,0


In [24]:
# define function to count number of consecutive days of threshold exceeded, 
# source: https://stackoverflow.com/questions/43616174/pandas-dataframe-how-to-count-the-number-of-1-rows-in-a-binary-column
def cumsum_bincount(a):  
    # Append 0 & look for a [0,1] pattern. Form a binned array based off 1s groups
    ids = a*(np.diff(np.r_[0,a])==1).cumsum()

    # Get the bincount, index into the count with ids and  mask out 0s
    return a*np.bincount(ids)[ids]

In [25]:
# apply function
weather_final['consecutive_HW_days'] = cumsum_bincount(weather_final.threshold_exceeded.values)

In [26]:
# sort by longest heat wave recorded, it lasted 22 days
pd.set_option('display.max_columns', None)
weather_final.sort_values(by=['consecutive_HW_days'], ascending=False).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,longitude,latitude,wind_2AM,dew_point_2AM,temperature_2AM,uvb_2AM,apparent_temperature_2AM,wind_11AM,dew_point_11AM,temperature_11AM,uvb_11AM,apparent_temperature_11AM,wind_4PM,dew_point_4PM,temperature_4PM,uvb_4PM,apparent_temperature_4PM,wind_11PM,dew_point_11PM,temperature_11PM,uvb_11PM,apparent_temperature_11PM,country,NUTS1,NUTS2,NUTS3,MRT_2AM,MRT_11AM,MRT_4PM,MRT_11PM,Week_Number,99th_percentile,day,month,year,threshold_exceeded,consecutive_HW_days
lat_long,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1
"('60.25', '20.75')",2014-07-25,20.75,60.25,0.54,19.43,21.59,-0.02,24.58,0.76,19.86,21.54,298751.06,24.8,-0.43,19.19,21.82,142510.39,24.67,0.06,18.55,22.29,-0.02,24.77,FI,FI2,FI20,FI200,13.11,23.14,20.64,12.94,30,21.44443,25,7,2014,1,22
"('60.25', '20.75')",2014-08-03,20.75,60.25,-5.97,17.31,21.21,-0.02,23.01,-7.78,19.09,21.53,124010.25,24.33,-7.96,19.76,23.15,129143.94,26.33,-6.19,20.21,23.5,-0.02,26.96,FI,FI2,FI20,FI200,14.74,30.85,22.95,16.04,31,21.44443,3,8,2014,1,22
"('60.25', '20.75')",2014-07-24,20.75,60.25,1.52,17.63,20.97,-0.02,22.94,1.1,17.23,21.33,299977.22,23.09,0.72,19.55,21.0,136514.34,24.07,-0.16,19.29,21.68,-0.02,24.59,FI,FI2,FI20,FI200,12.5,23.87,21.39,13.33,30,21.44443,24,7,2014,1,22
"('60.25', '20.75')",2014-08-10,20.75,60.25,4.66,18.36,20.49,-0.02,22.87,2.32,16.16,20.19,240710.72,21.41,1.56,17.04,19.99,85197.67,21.66,-0.65,15.9,19.31,-0.02,20.41,FI,FI2,FI20,FI200,15.93,26.51,25.36,11.13,32,21.44443,10,8,2014,1,22
"('60.25', '20.75')",2014-08-09,20.75,60.25,3.75,17.6,20.05,-0.02,22.02,0.99,18.6,20.2,278398.2,22.72,2.23,18.07,20.47,119759.12,22.69,2.99,18.65,20.43,-0.02,22.98,FI,FI2,FI20,FI200,13.87,22.16,19.67,14.72,32,21.44443,9,8,2014,1,22


In [27]:
# show value counts, each number in the right column should be divided by the value in the left column to obtain the unque count
# of heat wave lengths, e.g. 22/22 = 1 --> 1 heat wave with length 22 days
weather_final['consecutive_HW_days'].value_counts()

0     3808602
1       25863
2       20830
3       12780
4        8784
5        5805
6        3318
7        1582
11       1023
8         880
9         540
10        390
12        360
13         91
16         64
20         60
17         34
22         22
21         21
18         18
15         15
14         14
Name: consecutive_HW_days, dtype: int64

In [28]:
# indicate whether day is a heat wave day
weather_final['heat_wave_day'] = np.where((weather_final['threshold_exceeded'] == 1) & (weather_final['consecutive_HW_days'] >= 4),1,0)

In [29]:
# show frequency of heat wave days
weather_final['heat_wave_day'].value_counts()

0    3868075
1      23021
Name: heat_wave_day, dtype: int64

In [30]:
# group number of heat wave days by year
weather_final.groupby('year')['heat_wave_day'].value_counts()

year  heat_wave_day
2000  0                176554
      1                   314
2001  0                176797
      1                    71
2002  0                176550
      1                   318
2003  0                173429
      1                  3439
2004  0                176710
      1                   158
2005  0                176598
      1                   270
2006  0                176405
      1                   463
2007  0                175732
      1                  1136
2008  0                176511
      1                   357
2009  0                176864
      1                     4
2010  0                175391
      1                  1477
2011  0                176487
      1                   381
2012  0                176030
      1                   838
2013  0                176182
      1                   686
2014  0                175345
      1                  1523
2015  0                174848
      1                  2020
2016  0             

## Recorded heat waves per region and year

In [31]:
# write concatenated data in csv
#weather_final.to_csv('Heat_wave.csv')

In [32]:
# show all NUTS regions
weather_final['NUTS2'].unique()

array(['IE04', 'IE05', 'ES11', 'PT16', 'PT11', 'PT18', 'UKN0', 'IE06',
       'UKM6', 'ES43', 'ES61', 'ES41', 'UKM9', 'UKL1', 'UKK3', 'ES42',
       'UKM7', 'UKK4', 'FRH0', 'ES13', 'ES30', 'UKD1', 'UKL2', 'UKM5',
       'UKC2', 'UKD3', 'UKG2', 'UKG1', 'UKK2', 'ES21', 'ES23', 'UKE2',
       'UKE3', 'UKK1', 'FRG0', 'ES22', 'ES24', 'ES62', 'UKF2', 'UKJ1',
       'UKJ3', 'FRD1', 'FRI3', 'FRI1', 'ES52', 'UKF3', 'UKH2', 'UKJ2',
       'FRJ2', 'UKH1', 'UKH3', 'UKJ4', 'FRD2', 'FRB0', 'ES51', 'FRI2',
       'FRE1', 'FRE2', 'FR10', 'FRJ1', 'FRK1', 'NL', 'FRC1', 'BE23',
       'BE32', 'FRF2', 'FRK2', 'NO0A', 'BE21', 'BE35', 'FRL0', 'BE33',
       'BE34', 'FRF3', 'FRC2', 'DEA1', 'DEA2', 'LU00', 'CH01', 'NO09',
       'DE94', 'DEA3', 'DEB1', 'FRF1', 'CH02', 'ITC1', 'NO02', 'NO08',
       'DEA5', 'DE72', 'DEB3', 'DE13', 'CH03', 'DK04', 'DK03', 'DE93',
       'DE92', 'DEA4', 'DE73', 'DE71', 'DE12', 'DE14', 'CH04', 'CH06',
       'ITC4', 'ITC3', 'FRM0', 'ITG2', 'NO06', 'DEF0', 'DE11', 'CH05',
       '

# Checkpoint 1: Reading full weather data to avoid data overload

In [1]:
# import needed libraries again and load data obtained from the steps before
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
#final_df = pd.read_csv('Heat_wave.csv')
final_df.tail()

Unnamed: 0,lat_long,date,longitude,latitude,wind_2AM,dew_point_2AM,temperature_2AM,uvb_2AM,apparent_temperature_2AM,wind_11AM,...,MRT_4PM,MRT_11PM,Week_Number,99th_percentile,day,month,year,threshold_exceeded,consecutive_HW_days,heat_wave_day
3891091,"('61.75', '29.75')",2021-09-26,29.75,61.75,2.28,5.96,6.96,0.0,4.81,1.17,...,6.6,0.1,38,26.917881,26,9,2021,0,0,0
3891092,"('61.75', '29.75')",2021-09-27,29.75,61.75,-0.01,2.83,3.14,0.0,0.6,0.18,...,3.04,0.14,39,26.917881,27,9,2021,0,0,0
3891093,"('61.75', '29.75')",2021-09-28,29.75,61.75,-1.48,3.6,3.85,0.0,1.37,-1.15,...,6.69,5.08,39,26.917881,28,9,2021,0,0,0
3891094,"('61.75', '29.75')",2021-09-29,29.75,61.75,-1.2,5.68,6.56,0.0,4.36,-0.11,...,6.52,5.48,39,26.917881,29,9,2021,0,0,0
3891095,"('61.75', '29.75')",2021-09-30,29.75,61.75,-1.08,5.78,6.76,0.0,4.58,-0.67,...,6.3,3.14,39,26.917881,30,9,2021,0,0,0


In [2]:
# inspect shape
final_df.shape

(3891096, 40)

# Merging with health expenditure and disease prevalence data

In [3]:
# read health expenditure csv
expend = pd.read_csv('health_expenditure.csv', index_col=0)
expend.head(2)

Unnamed: 0,country,year,health_expend_p_capita
0,BE,2000,2176.507157
1,BG,2000,270.016329


In [4]:
# merge health expenditure with weather data on country and year
final_df = final_df.merge(expend, how='left', on=['country', 'year'])#.drop(['TIME_y'], axis = 1)
final_df.head(2)

Unnamed: 0,lat_long,date,longitude,latitude,wind_2AM,dew_point_2AM,temperature_2AM,uvb_2AM,apparent_temperature_2AM,wind_11AM,...,MRT_11PM,Week_Number,99th_percentile,day,month,year,threshold_exceeded,consecutive_HW_days,heat_wave_day,health_expend_p_capita
0,"('54.25', '-10.0')",2000-05-01,-10.0,54.25,-3.17,7.57,10.66,-0.02,8.82,0.16,...,3.84,18,20.898722,1,5,2000,0,0,0,3760.622022
1,"('54.25', '-10.0')",2000-05-02,-10.0,54.25,-1.2,8.22,9.81,-0.02,8.13,-1.53,...,5.6,18,20.898722,2,5,2000,0,0,0,3760.622022


In [5]:
# read disease prevalence csv
prevalence = pd.read_csv('prevalence.csv', index_col=0)
prevalence = prevalence.rename(columns={'Unnamed: 10': 'country'})
prevalence.index.names = ['c']
prevalence.head(2)

Unnamed: 0_level_0,Asthma,Chronic_depression,Chronic_lower_respiratory_diseases,Coronary_heart_disease_or_angina_pectoris,Diabetes,Heart_attack_or_chronic_consequences_of_heart_attack,High_blood_pressure,Kidney_problems,Obesity,Stroke_or_chronic_consequences_of_stroke,country
c,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,4.35,7.6,4.4,2.7,5.45,1.35,21.45,1.95,15.7,1.15,AT
1,5.05,7.0,4.0,1.5,5.55,0.9,16.95,0.95,15.0,0.9,BE


In [6]:
# merge on country
final_df = final_df.merge(prevalence, how='left', on=['country'])#.drop(['TIME_y'], axis = 1)
final_df.head(2)

Unnamed: 0,lat_long,date,longitude,latitude,wind_2AM,dew_point_2AM,temperature_2AM,uvb_2AM,apparent_temperature_2AM,wind_11AM,...,Asthma,Chronic_depression,Chronic_lower_respiratory_diseases,Coronary_heart_disease_or_angina_pectoris,Diabetes,Heart_attack_or_chronic_consequences_of_heart_attack,High_blood_pressure,Kidney_problems,Obesity,Stroke_or_chronic_consequences_of_stroke
0,"('54.25', '-10.0')",2000-05-01,-10.0,54.25,-3.17,7.57,10.66,-0.02,8.82,0.16,...,8.15,8.0,2.5,2.4,4.05,1.2,13.55,2.8,22.05,0.7
1,"('54.25', '-10.0')",2000-05-02,-10.0,54.25,-1.2,8.22,9.81,-0.02,8.13,-1.53,...,8.15,8.0,2.5,2.4,4.05,1.2,13.55,2.8,22.05,0.7


# Merging with mortality data

In [8]:
# read mortality data
mortality = pd.read_csv('Mortality1.csv', index_col=0)
mortality.head()

Unnamed: 0,TIME,GEO,GEO_LABEL,deaths,country,week_nr,year,population,density,population_65+,share_over_65,death_p_100k,code_length
17,2000W18,BE1,Région de Bruxelles-Capitale/Brussels Hoofdste...,196.0,BE,W18,2000,959318.0,5974.3,160908.0,16.773166,20.431181,3.0
18,2000W19,BE1,Région de Bruxelles-Capitale/Brussels Hoofdste...,172.0,BE,W19,2000,959318.0,5974.3,160908.0,16.773166,17.929404,3.0
19,2000W20,BE1,Région de Bruxelles-Capitale/Brussels Hoofdste...,158.0,BE,W20,2000,959318.0,5974.3,160908.0,16.773166,16.470034,3.0
20,2000W21,BE1,Région de Bruxelles-Capitale/Brussels Hoofdste...,193.0,BE,W21,2000,959318.0,5974.3,160908.0,16.773166,20.118459,3.0
21,2000W22,BE1,Région de Bruxelles-Capitale/Brussels Hoofdste...,170.0,BE,W22,2000,959318.0,5974.3,160908.0,16.773166,17.720923,3.0


In [9]:
# describe deaths per 100,000
mortality['death_p_100k'].describe()

count    129597.000000
mean         18.561734
std           4.409888
min           0.000000
25%          15.789557
50%          18.111807
75%          20.923414
max          56.800635
Name: death_p_100k, dtype: float64

In [10]:
# inspect shape
mortality.shape

(167486, 13)

In [11]:
# drop unnecessary column and rename others
mortality = mortality.drop('week_nr', 1)
mortality = mortality.rename(columns={'TIME': 'week_nr', 'GEO': 'NUTS2'})
mortality.head()

Unnamed: 0,week_nr,NUTS2,GEO_LABEL,deaths,country,year,population,density,population_65+,share_over_65,death_p_100k,code_length
17,2000W18,BE1,Région de Bruxelles-Capitale/Brussels Hoofdste...,196.0,BE,2000,959318.0,5974.3,160908.0,16.773166,20.431181,3.0
18,2000W19,BE1,Région de Bruxelles-Capitale/Brussels Hoofdste...,172.0,BE,2000,959318.0,5974.3,160908.0,16.773166,17.929404,3.0
19,2000W20,BE1,Région de Bruxelles-Capitale/Brussels Hoofdste...,158.0,BE,2000,959318.0,5974.3,160908.0,16.773166,16.470034,3.0
20,2000W21,BE1,Région de Bruxelles-Capitale/Brussels Hoofdste...,193.0,BE,2000,959318.0,5974.3,160908.0,16.773166,20.118459,3.0
21,2000W22,BE1,Région de Bruxelles-Capitale/Brussels Hoofdste...,170.0,BE,2000,959318.0,5974.3,160908.0,16.773166,17.720923,3.0


In [12]:
# change and create new variables that match the variables in the mortality data for merging
final_df['year'] = final_df['year'].astype(str)
final_df['Week_Number'] = final_df['Week_Number'].astype(str)
final_df['Week_Number'] = 'W' + final_df['Week_Number']
final_df["week_nr"] = final_df['year'] + final_df['Week_Number']
final_df.head()

Unnamed: 0,lat_long,date,longitude,latitude,wind_2AM,dew_point_2AM,temperature_2AM,uvb_2AM,apparent_temperature_2AM,wind_11AM,...,Chronic_depression,Chronic_lower_respiratory_diseases,Coronary_heart_disease_or_angina_pectoris,Diabetes,Heart_attack_or_chronic_consequences_of_heart_attack,High_blood_pressure,Kidney_problems,Obesity,Stroke_or_chronic_consequences_of_stroke,week_nr
0,"('54.25', '-10.0')",2000-05-01,-10.0,54.25,-3.17,7.57,10.66,-0.02,8.82,0.16,...,8.0,2.5,2.4,4.05,1.2,13.55,2.8,22.05,0.7,2000W18
1,"('54.25', '-10.0')",2000-05-02,-10.0,54.25,-1.2,8.22,9.81,-0.02,8.13,-1.53,...,8.0,2.5,2.4,4.05,1.2,13.55,2.8,22.05,0.7,2000W18
2,"('54.25', '-10.0')",2000-05-03,-10.0,54.25,-4.29,6.97,8.52,-0.02,6.56,-4.15,...,8.0,2.5,2.4,4.05,1.2,13.55,2.8,22.05,0.7,2000W18
3,"('54.25', '-10.0')",2000-05-04,-10.0,54.25,-4.35,6.38,8.56,-0.02,6.47,-4.6,...,8.0,2.5,2.4,4.05,1.2,13.55,2.8,22.05,0.7,2000W18
4,"('54.25', '-10.0')",2000-05-05,-10.0,54.25,-3.99,7.33,9.03,-0.02,7.15,-3.51,...,8.0,2.5,2.4,4.05,1.2,13.55,2.8,22.05,0.7,2000W18


In [13]:
# inspect shape
final_df.shape

(3891096, 52)

# Merge weather data with mortality

In [14]:
# merge the weather data with mortality on week number and NUTS2
final_df = final_df.merge(mortality, how='left', on=['NUTS2','week_nr']).drop(['country_y', 'year_y'], axis = 1)
final_df = final_df.rename(columns={'GEO_LABEL': 'NUTS2_Label', 'country_x': 'country', 'year_x': 'year'})
final_df.shape

(3891096, 60)

In [15]:
# inspect data
pd.set_option('display.max_columns', None)
final_df.head(2)

Unnamed: 0,lat_long,date,longitude,latitude,wind_2AM,dew_point_2AM,temperature_2AM,uvb_2AM,apparent_temperature_2AM,wind_11AM,dew_point_11AM,temperature_11AM,uvb_11AM,apparent_temperature_11AM,wind_4PM,dew_point_4PM,temperature_4PM,uvb_4PM,apparent_temperature_4PM,wind_11PM,dew_point_11PM,temperature_11PM,uvb_11PM,apparent_temperature_11PM,country,NUTS1,NUTS2,NUTS3,MRT_2AM,MRT_11AM,MRT_4PM,MRT_11PM,Week_Number,99th_percentile,day,month,year,threshold_exceeded,consecutive_HW_days,heat_wave_day,health_expend_p_capita,Asthma,Chronic_depression,Chronic_lower_respiratory_diseases,Coronary_heart_disease_or_angina_pectoris,Diabetes,Heart_attack_or_chronic_consequences_of_heart_attack,High_blood_pressure,Kidney_problems,Obesity,Stroke_or_chronic_consequences_of_stroke,week_nr,NUTS2_Label,deaths,population,density,population_65+,share_over_65,death_p_100k,code_length
0,"('54.25', '-10.0')",2000-05-01,-10.0,54.25,-3.17,7.57,10.66,-0.02,8.82,0.16,8.74,13.7,247851.27,12.13,-1.44,8.74,14.12,232877.31,12.55,-1.94,8.07,10.32,-0.02,8.6,IE,IE0,IE04,IE042,2.96,23.1,15.57,3.84,W18,20.898722,1,5,2000,0,0,0,3760.622022,8.15,8.0,2.5,2.4,4.05,1.2,13.55,2.8,22.05,0.7,2000W18,,,,,,,,
1,"('54.25', '-10.0')",2000-05-02,-10.0,54.25,-1.2,8.22,9.81,-0.02,8.13,-1.53,9.74,11.66,244230.16,10.39,-2.43,8.64,11.02,180512.1,9.44,-3.55,7.47,8.99,-0.02,7.14,IE,IE0,IE04,IE042,3.03,23.49,25.09,5.6,W18,20.898722,2,5,2000,0,0,0,3760.622022,8.15,8.0,2.5,2.4,4.05,1.2,13.55,2.8,22.05,0.7,2000W18,,,,,,,,


## Dropping days with NaN for mortality

In [21]:
# dropping observations with NaN for mortality
final_df = final_df.dropna(subset=['death_p_100k'])
final_df.shape

(2550190, 60)

The number of rows with available mortality data equals 2,555,190, down from 3,891,096

# Checkpoint 2: Writing dataframe without NaN for mortality to CSV

In [2]:
# 2nd checkpoint aimed to prevent data overload
# read and load data
#final_df.to_csv('data_no_NaN.csv')
final_df = pd.read_csv('data_no_NaN.csv')
final_df = final_df.iloc[: , 1:]

In [3]:
# inspect data
final_df.head()

Unnamed: 0,lat_long,date,longitude,latitude,wind_2AM,dew_point_2AM,temperature_2AM,uvb_2AM,apparent_temperature_2AM,wind_11AM,...,Stroke_or_chronic_consequences_of_stroke,week_nr,NUTS2_Label,deaths,population,density,population_65+,share_over_65,death_p_100k,code_length
0,"('43.0', '-9.25')",2000-05-01,-9.25,43.0,-2.12,11.14,12.81,-0.02,11.98,-3.62,...,0.9,2000W18,Galicia,541.0,2702471.0,91.3,544528.0,20.149263,20.018716,
1,"('43.0', '-9.25')",2000-05-02,-9.25,43.0,-1.48,12.8,13.52,-0.02,13.29,1.25,...,0.9,2000W18,Galicia,541.0,2702471.0,91.3,544528.0,20.149263,20.018716,
2,"('43.0', '-9.25')",2000-05-03,-9.25,43.0,-3.79,12.36,13.44,-0.02,13.05,-4.98,...,0.9,2000W18,Galicia,541.0,2702471.0,91.3,544528.0,20.149263,20.018716,
3,"('43.0', '-9.25')",2000-05-04,-9.25,43.0,-1.34,13.01,14.0,-0.02,13.86,0.34,...,0.9,2000W18,Galicia,541.0,2702471.0,91.3,544528.0,20.149263,20.018716,
4,"('43.0', '-9.25')",2000-05-05,-9.25,43.0,-2.68,12.54,13.75,-0.02,13.42,0.05,...,0.9,2000W18,Galicia,541.0,2702471.0,91.3,544528.0,20.149263,20.018716,


In [4]:
# # To reducte the number of features, wind and dew_point will be kept as a daily average instead of hourly

# wind
final_df['wind_2AM'] = abs(final_df['wind_2AM'])
final_df['wind_11AM'] = abs(final_df['wind_11AM'])
final_df['wind_4PM'] = abs(final_df['wind_4PM'])
final_df['wind_11PM'] = abs(final_df['wind_11PM'])
final_df['wind_day'] = (final_df['wind_2AM'] + final_df['wind_11AM'] + final_df['wind_4PM'] + final_df['wind_11PM']) / 4
final_df = final_df.drop(['wind_2AM', 'wind_11AM', 'wind_4PM', 'wind_11PM'],1)

# dew_point
final_df['dew_point_2AM'] = final_df['dew_point_2AM']
final_df['dew_point_11AM'] = final_df['dew_point_11AM']
final_df['dew_point_4PM'] = final_df['dew_point_4PM']
final_df['dew_point_11PM'] = final_df['dew_point_11PM']
final_df['dew_point_day'] = (final_df['dew_point_2AM'] + final_df['dew_point_11AM'] + final_df['dew_point_4PM'] + final_df['dew_point_11PM']) / 4
final_df = final_df.drop(['dew_point_2AM', 'dew_point_11AM', 'dew_point_4PM', 'dew_point_11PM'],1)


In [5]:
# uvb radiation measures during the evening, night and before noon are considered irrelevant and dropped
final_df = final_df.drop(['uvb_2AM', 'uvb_11AM', 'uvb_11PM'],1)

## Computing three-year rolling average in mortality

In [6]:
# for each location, compute the yearly mean
avg_demo = pd.DataFrame(final_df.groupby(['lat_long', 'year'])['death_p_100k'].mean())
avg_demo = avg_demo.add_suffix('_avg')
avg_demo = avg_demo.reset_index(level=['lat_long', 'year'])
avg_demo = avg_demo.rename(columns={'NUTS1_avg': 'NUTS1'})
#df.groupby('A').mean()
avg_demo.head()

Unnamed: 0,lat_long,year,death_p_100k_avg
0,"('37.0', '-2.5')",2000,14.380536
1,"('37.0', '-2.5')",2001,15.033509
2,"('37.0', '-2.5')",2002,14.396432
3,"('37.0', '-2.5')",2003,15.871519
4,"('37.0', '-2.5')",2004,14.475298


In [7]:
avg_demo.shape

(16704, 3)

In [8]:
# get three-year rolling average of weekly mortality centered at year per locations
avg_demo['death_p_100k_3_avg'] = avg_demo.groupby('lat_long')['death_p_100k_avg'].rolling(window=3, center=True).mean().reset_index(0,drop=True)
avg_demo.head(25)

Unnamed: 0,lat_long,year,death_p_100k_avg,death_p_100k_3_avg
0,"('37.0', '-2.5')",2000,14.380536,
1,"('37.0', '-2.5')",2001,15.033509,14.603492
2,"('37.0', '-2.5')",2002,14.396432,15.100487
3,"('37.0', '-2.5')",2003,15.871519,14.914416
4,"('37.0', '-2.5')",2004,14.475298,14.755544
5,"('37.0', '-2.5')",2005,13.919816,14.247227
6,"('37.0', '-2.5')",2006,14.346568,14.144199
7,"('37.0', '-2.5')",2007,14.166212,14.198117
8,"('37.0', '-2.5')",2008,14.081571,13.975246
9,"('37.0', '-2.5')",2009,13.677954,13.889492


Missing values for 2021 and 2000 as no average centered at these year can be computed due to data from 1999 and 2022 not being available.

In [9]:
# impute NaN of years 2000 with the average centered at 2001
df_21 = avg_demo[avg_demo['year'] == 2021]
avg_demo = avg_demo[avg_demo['year'] != 2021]
avg_demo['death_p_100k_3_avg'].fillna(method='bfill', inplace=True)
avg_demo.head(5)

Unnamed: 0,lat_long,year,death_p_100k_avg,death_p_100k_3_avg
0,"('37.0', '-2.5')",2000,14.380536,14.603492
1,"('37.0', '-2.5')",2001,15.033509,14.603492
2,"('37.0', '-2.5')",2002,14.396432,15.100487
3,"('37.0', '-2.5')",2003,15.871519,14.914416
4,"('37.0', '-2.5')",2004,14.475298,14.755544


In [10]:
# impute NaN for 2021 with the averages for only 2021 to account for higher death rates because of Covid
df_21['death_p_100k_3_avg'] = df_21['death_p_100k_avg']
frames = [avg_demo,df_21]
avg_demo = pd.concat(frames)
avg_demo.sort_values(by=['lat_long', 'year'], inplace=True)
avg_demo.head(25)

Unnamed: 0,lat_long,year,death_p_100k_avg,death_p_100k_3_avg
0,"('37.0', '-2.5')",2000,14.380536,14.603492
1,"('37.0', '-2.5')",2001,15.033509,14.603492
2,"('37.0', '-2.5')",2002,14.396432,15.100487
3,"('37.0', '-2.5')",2003,15.871519,14.914416
4,"('37.0', '-2.5')",2004,14.475298,14.755544
5,"('37.0', '-2.5')",2005,13.919816,14.247227
6,"('37.0', '-2.5')",2006,14.346568,14.144199
7,"('37.0', '-2.5')",2007,14.166212,14.198117
8,"('37.0', '-2.5')",2008,14.081571,13.975246
9,"('37.0', '-2.5')",2009,13.677954,13.889492


In [11]:
# keep only the three-year averages for each year for each location
avg_demo = avg_demo.drop(['death_p_100k_avg'],1)
avg_demo = avg_demo.rename(columns={'death_p_100k_3_avg': 'death_p_100k_avg'})
avg_demo.head(1)

Unnamed: 0,lat_long,year,death_p_100k_avg
0,"('37.0', '-2.5')",2000,14.603492


In [12]:
# write data to csv
avg_demo.to_csv('avg_deaths_100k.csv')

## Get minimum and maximum temperature

In [13]:
temperatures = final_df[['apparent_temperature_2AM', 'apparent_temperature_11AM', 'apparent_temperature_4PM', 'apparent_temperature_11PM']]
# find the maximum and minimum values of each row
temperatures['max_apparent_temperature'] = temperatures.max(axis = 1)
temperatures['min_apparent_temperature'] = temperatures.min(axis = 1)
temperatures = temperatures[['max_apparent_temperature', 'min_apparent_temperature']]
temperatures

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temperatures['max_apparent_temperature'] = temperatures.max(axis = 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temperatures['min_apparent_temperature'] = temperatures.min(axis = 1)


Unnamed: 0,max_apparent_temperature,min_apparent_temperature
0,15.87,11.98
1,15.83,12.95
2,18.53,13.05
3,15.94,13.86
4,16.34,13.42
...,...,...
2550185,8.36,1.22
2550186,9.65,0.60
2550187,6.55,1.37
2550188,6.85,4.36


In [14]:
# drop hourly apparent temperature features and other ones that are deemed irrelevant for the modelling
final_df = final_df.drop(['apparent_temperature_2AM', 'apparent_temperature_11AM', 'apparent_temperature_4PM', 
                          'apparent_temperature_11PM', 'temperature_2AM', 'temperature_4PM', 'MRT_11AM', 'MRT_11PM',
                          'temperature_11AM','temperature_11PM'],1)
# join with obtained max and minimum apparent temperature 
final_df = final_df.join(temperatures)
final_df

Unnamed: 0,lat_long,date,longitude,latitude,uvb_4PM,country,NUTS1,NUTS2,NUTS3,MRT_2AM,...,population,density,population_65+,share_over_65,death_p_100k,code_length,wind_day,dew_point_day,max_apparent_temperature,min_apparent_temperature
0,"('43.0', '-9.25')",2000-05-01,-9.25,43.00,165578.40,ES,ES1,ES11,ES111,6.35,...,2702471.0,91.3,544528.0,20.149263,20.018716,,2.2075,12.3350,15.87,11.98
1,"('43.0', '-9.25')",2000-05-02,-9.25,43.00,164847.47,ES,ES1,ES11,ES111,8.06,...,2702471.0,91.3,544528.0,20.149263,20.018716,,1.7075,12.9650,15.83,12.95
2,"('43.0', '-9.25')",2000-05-03,-9.25,43.00,244458.16,ES,ES1,ES11,ES111,5.84,...,2702471.0,91.3,544528.0,20.149263,20.018716,,3.4325,12.8600,18.53,13.05
3,"('43.0', '-9.25')",2000-05-04,-9.25,43.00,150456.94,ES,ES1,ES11,ES111,9.93,...,2702471.0,91.3,544528.0,20.149263,20.018716,,1.2650,13.1650,15.94,13.86
4,"('43.0', '-9.25')",2000-05-05,-9.25,43.00,100901.43,ES,ES1,ES11,ES111,6.88,...,2702471.0,91.3,544528.0,20.149263,20.018716,,1.2750,12.7725,16.34,13.42
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2550185,"('61.75', '29.75')",2021-09-26,29.75,61.75,4301.75,FI,FI1,FI1C,FI1C5,4.13,...,1148794.0,36.6,291137.0,25.342838,24.982721,,1.4200,5.4525,8.36,1.22
2550186,"('61.75', '29.75')",2021-09-27,29.75,61.75,4378.33,FI,FI1,FI1C,FI1C5,0.34,...,1148794.0,36.6,291137.0,25.342838,23.067669,,0.5250,4.5175,9.65,0.60
2550187,"('61.75', '29.75')",2021-09-28,29.75,61.75,2498.92,FI,FI1,FI1C,FI1C5,3.95,...,1148794.0,36.6,291137.0,25.342838,23.067669,,1.1925,5.4500,6.55,1.37
2550188,"('61.75', '29.75')",2021-09-29,29.75,61.75,2234.41,FI,FI1,FI1C,FI1C5,4.56,...,1148794.0,36.6,291137.0,25.342838,23.067669,,1.0125,5.2575,6.85,4.36


### Compute averages of weather variables per location as the deviation from the average per location might be an informative feature

In [15]:
# get average of weather variables per locations
avg = pd.DataFrame(final_df.groupby('lat_long')['wind_day', 'dew_point_day', 'uvb_4PM','max_apparent_temperature', 'min_apparent_temperature',
                                                'MRT_2AM', 'MRT_4PM'].mean())
avg = avg.add_suffix('_avg')
avg = avg.reset_index(level=['lat_long'])
avg = avg.rename(columns={'NUTS1_avg': 'NUTS1'})
#df.groupby('A').mean()
avg.head()

  avg = pd.DataFrame(final_df.groupby('lat_long')['wind_day', 'dew_point_day', 'uvb_4PM','max_apparent_temperature', 'min_apparent_temperature',


Unnamed: 0,lat_long,wind_day_avg,dew_point_day_avg,uvb_4PM_avg,max_apparent_temperature_avg,min_apparent_temperature_avg,MRT_2AM_avg,MRT_4PM_avg
0,"('37.0', '-2.5')",1.359658,11.427329,230179.699985,27.170423,17.870845,10.19106,35.969705
1,"('37.0', '-3.25')",0.940888,8.217627,232642.714719,23.415918,13.653343,6.793954,32.01799
2,"('37.0', '-4.0')",0.852583,12.090778,238715.081325,26.272641,16.18479,8.756076,34.119667
3,"('37.0', '-4.75')",1.172296,12.823071,243280.345409,28.053876,16.880438,8.883504,34.742236
4,"('37.0', '-5.5')",1.696267,12.777655,247985.496707,30.056937,18.812245,10.98181,37.806311


In [16]:
# store columns that can be compared to the average in a list
b = final_df.columns.values
b = b.tolist()
b = [e for e in b if e not in ('lat_long', 'date', 'longitude', 'latitude','Week_Number', '99th_percentile', 'day', 'month', 'year',
         'threshold_exceeded', 'consecutive_HW_days', 'heat_wave_day','week_nr', 'NUTS1_Label', 'deaths', 'population', 'population_65+', 'share_over_65', 'density',
         'country', 'NUTS1', 'NUTS2', 'NUTS3', 'health_expend_p_capita', 'Asthma', 'Chronic_depression', 'Chronic_lower_respiratory_diseases',
                               'Coronary_heart_disease_or_angina_pectoris', 'Diabetes', 'Heart_attack_or_chronic_consequences_of_heart_attack', 
                               'High_blood_pressure', 'Kidney_problems', 'Stroke_or_chronic_consequences_of_stroke', 'Obesity', 'NUTS2_Label','code_length')]

print(b)

['uvb_4PM', 'MRT_2AM', 'MRT_4PM', 'death_p_100k', 'wind_day', 'dew_point_day', 'max_apparent_temperature', 'min_apparent_temperature']


In [17]:
# adding averages per location to dataframe
final_df = final_df.merge(avg, how='inner', on=['lat_long'])
final_df.head()

Unnamed: 0,lat_long,date,longitude,latitude,uvb_4PM,country,NUTS1,NUTS2,NUTS3,MRT_2AM,...,dew_point_day,max_apparent_temperature,min_apparent_temperature,wind_day_avg,dew_point_day_avg,uvb_4PM_avg,max_apparent_temperature_avg,min_apparent_temperature_avg,MRT_2AM_avg,MRT_4PM_avg
0,"('43.0', '-9.25')",2000-05-01,-9.25,43.0,165578.4,ES,ES1,ES11,ES111,6.35,...,12.335,15.87,11.98,2.809025,13.979091,225349.668363,18.963855,15.72365,10.002745,24.7651
1,"('43.0', '-9.25')",2000-05-02,-9.25,43.0,164847.47,ES,ES1,ES11,ES111,8.06,...,12.965,15.83,12.95,2.809025,13.979091,225349.668363,18.963855,15.72365,10.002745,24.7651
2,"('43.0', '-9.25')",2000-05-03,-9.25,43.0,244458.16,ES,ES1,ES11,ES111,5.84,...,12.86,18.53,13.05,2.809025,13.979091,225349.668363,18.963855,15.72365,10.002745,24.7651
3,"('43.0', '-9.25')",2000-05-04,-9.25,43.0,150456.94,ES,ES1,ES11,ES111,9.93,...,13.165,15.94,13.86,2.809025,13.979091,225349.668363,18.963855,15.72365,10.002745,24.7651
4,"('43.0', '-9.25')",2000-05-05,-9.25,43.0,100901.43,ES,ES1,ES11,ES111,6.88,...,12.7725,16.34,13.42,2.809025,13.979091,225349.668363,18.963855,15.72365,10.002745,24.7651


In [18]:
# merge final_df with average mortality per year and location
final_df = final_df.merge(avg_demo, how='inner', on=['lat_long', 'year'])
final_df.head()

Unnamed: 0,lat_long,date,longitude,latitude,uvb_4PM,country,NUTS1,NUTS2,NUTS3,MRT_2AM,...,max_apparent_temperature,min_apparent_temperature,wind_day_avg,dew_point_day_avg,uvb_4PM_avg,max_apparent_temperature_avg,min_apparent_temperature_avg,MRT_2AM_avg,MRT_4PM_avg,death_p_100k_avg
0,"('43.0', '-9.25')",2000-05-01,-9.25,43.0,165578.4,ES,ES1,ES11,ES111,6.35,...,15.87,11.98,2.809025,13.979091,225349.668363,18.963855,15.72365,10.002745,24.7651,18.451122
1,"('43.0', '-9.25')",2000-05-02,-9.25,43.0,164847.47,ES,ES1,ES11,ES111,8.06,...,15.83,12.95,2.809025,13.979091,225349.668363,18.963855,15.72365,10.002745,24.7651,18.451122
2,"('43.0', '-9.25')",2000-05-03,-9.25,43.0,244458.16,ES,ES1,ES11,ES111,5.84,...,18.53,13.05,2.809025,13.979091,225349.668363,18.963855,15.72365,10.002745,24.7651,18.451122
3,"('43.0', '-9.25')",2000-05-04,-9.25,43.0,150456.94,ES,ES1,ES11,ES111,9.93,...,15.94,13.86,2.809025,13.979091,225349.668363,18.963855,15.72365,10.002745,24.7651,18.451122
4,"('43.0', '-9.25')",2000-05-05,-9.25,43.0,100901.43,ES,ES1,ES11,ES111,6.88,...,16.34,13.42,2.809025,13.979091,225349.668363,18.963855,15.72365,10.002745,24.7651,18.451122


In [19]:
# compute the deviation of each value from the average value for the location (done for all weather and mortality variables)
pd.set_option('display.max_columns', None)
for i in b:                      
    final_df['{}_vs_avg'.format(i)] = (final_df[i] / final_df['{}_avg'.format(i)])
final_df.head() 

Unnamed: 0,lat_long,date,longitude,latitude,uvb_4PM,country,NUTS1,NUTS2,NUTS3,MRT_2AM,MRT_4PM,Week_Number,99th_percentile,day,month,year,threshold_exceeded,consecutive_HW_days,heat_wave_day,health_expend_p_capita,Asthma,Chronic_depression,Chronic_lower_respiratory_diseases,Coronary_heart_disease_or_angina_pectoris,Diabetes,Heart_attack_or_chronic_consequences_of_heart_attack,High_blood_pressure,Kidney_problems,Obesity,Stroke_or_chronic_consequences_of_stroke,week_nr,NUTS2_Label,deaths,population,density,population_65+,share_over_65,death_p_100k,code_length,wind_day,dew_point_day,max_apparent_temperature,min_apparent_temperature,wind_day_avg,dew_point_day_avg,uvb_4PM_avg,max_apparent_temperature_avg,min_apparent_temperature_avg,MRT_2AM_avg,MRT_4PM_avg,death_p_100k_avg,uvb_4PM_vs_avg,MRT_2AM_vs_avg,MRT_4PM_vs_avg,death_p_100k_vs_avg,wind_day_vs_avg,dew_point_day_vs_avg,max_apparent_temperature_vs_avg,min_apparent_temperature_vs_avg
0,"('43.0', '-9.25')",2000-05-01,-9.25,43.0,165578.4,ES,ES1,ES11,ES111,6.35,28.64,W18,25.658925,1,5,2000,0,0,0,1164.962074,4.3,6.75,3.0,0.75,7.15,0.85,19.0,2.6,16.1,0.9,2000W18,Galicia,541.0,2702471.0,91.3,544528.0,20.149263,20.018716,,2.2075,12.335,15.87,11.98,2.809025,13.979091,225349.668363,18.963855,15.72365,10.002745,24.7651,18.451122,0.734762,0.634826,1.156466,1.084959,0.78586,0.882389,0.836855,0.76191
1,"('43.0', '-9.25')",2000-05-02,-9.25,43.0,164847.47,ES,ES1,ES11,ES111,8.06,24.69,W18,25.658925,2,5,2000,0,0,0,1164.962074,4.3,6.75,3.0,0.75,7.15,0.85,19.0,2.6,16.1,0.9,2000W18,Galicia,541.0,2702471.0,91.3,544528.0,20.149263,20.018716,,1.7075,12.965,15.83,12.95,2.809025,13.979091,225349.668363,18.963855,15.72365,10.002745,24.7651,18.451122,0.731519,0.805779,0.996968,1.084959,0.607862,0.927457,0.834746,0.8236
2,"('43.0', '-9.25')",2000-05-03,-9.25,43.0,244458.16,ES,ES1,ES11,ES111,5.84,23.74,W18,25.658925,3,5,2000,0,0,0,1164.962074,4.3,6.75,3.0,0.75,7.15,0.85,19.0,2.6,16.1,0.9,2000W18,Galicia,541.0,2702471.0,91.3,544528.0,20.149263,20.018716,,3.4325,12.86,18.53,13.05,2.809025,13.979091,225349.668363,18.963855,15.72365,10.002745,24.7651,18.451122,1.084795,0.58384,0.958607,1.084959,1.221954,0.919945,0.977122,0.82996
3,"('43.0', '-9.25')",2000-05-04,-9.25,43.0,150456.94,ES,ES1,ES11,ES111,9.93,27.42,W18,25.658925,4,5,2000,0,0,0,1164.962074,4.3,6.75,3.0,0.75,7.15,0.85,19.0,2.6,16.1,0.9,2000W18,Galicia,541.0,2702471.0,91.3,544528.0,20.149263,20.018716,,1.265,13.165,15.94,13.86,2.809025,13.979091,225349.668363,18.963855,15.72365,10.002745,24.7651,18.451122,0.66766,0.992728,1.107203,1.084959,0.450334,0.941764,0.840546,0.881475
4,"('43.0', '-9.25')",2000-05-05,-9.25,43.0,100901.43,ES,ES1,ES11,ES111,6.88,26.17,W18,25.658925,5,5,2000,0,0,0,1164.962074,4.3,6.75,3.0,0.75,7.15,0.85,19.0,2.6,16.1,0.9,2000W18,Galicia,541.0,2702471.0,91.3,544528.0,20.149263,20.018716,,1.275,12.7725,16.34,13.42,2.809025,13.979091,225349.668363,18.963855,15.72365,10.002745,24.7651,18.451122,0.447755,0.687811,1.056729,1.084959,0.453894,0.913686,0.861639,0.853491


The results are new column that indicate the deviation from the mean with suffix 'vs_avg'. If the columns 'vs_avg' = 1, they are equal to the average, if > 1, they are larger than the average.

In [21]:
# combine coordinates with week number in new variable to get a unique identifier for each calendar week at each location
final_df['unique_week'] = final_df['lat_long'] + final_df['week_nr']
final_df.head(1)

Unnamed: 0,lat_long,date,longitude,latitude,uvb_4PM,country,NUTS1,NUTS2,NUTS3,MRT_2AM,MRT_4PM,Week_Number,99th_percentile,day,month,year,threshold_exceeded,consecutive_HW_days,heat_wave_day,health_expend_p_capita,Asthma,Chronic_depression,Chronic_lower_respiratory_diseases,Coronary_heart_disease_or_angina_pectoris,Diabetes,Heart_attack_or_chronic_consequences_of_heart_attack,High_blood_pressure,Kidney_problems,Obesity,Stroke_or_chronic_consequences_of_stroke,week_nr,NUTS2_Label,deaths,population,density,population_65+,share_over_65,death_p_100k,code_length,wind_day,dew_point_day,max_apparent_temperature,min_apparent_temperature,wind_day_avg,dew_point_day_avg,uvb_4PM_avg,max_apparent_temperature_avg,min_apparent_temperature_avg,MRT_2AM_avg,MRT_4PM_avg,death_p_100k_avg,uvb_4PM_vs_avg,MRT_2AM_vs_avg,MRT_4PM_vs_avg,death_p_100k_vs_avg,wind_day_vs_avg,dew_point_day_vs_avg,max_apparent_temperature_vs_avg,min_apparent_temperature_vs_avg,unique_week
0,"('43.0', '-9.25')",2000-05-01,-9.25,43.0,165578.4,ES,ES1,ES11,ES111,6.35,28.64,W18,25.658925,1,5,2000,0,0,0,1164.962074,4.3,6.75,3.0,0.75,7.15,0.85,19.0,2.6,16.1,0.9,2000W18,Galicia,541.0,2702471.0,91.3,544528.0,20.149263,20.018716,,2.2075,12.335,15.87,11.98,2.809025,13.979091,225349.668363,18.963855,15.72365,10.002745,24.7651,18.451122,0.734762,0.634826,1.156466,1.084959,0.78586,0.882389,0.836855,0.76191,"('43.0', '-9.25')2000W18"


In [22]:
# get the sum of consecutive days of the threshold exceeded starting from the previous instance
final_df['consecutive_HW_days_prev'] = final_df['consecutive_HW_days'].shift()
final_df.head()

Unnamed: 0,lat_long,date,longitude,latitude,uvb_4PM,country,NUTS1,NUTS2,NUTS3,MRT_2AM,MRT_4PM,Week_Number,99th_percentile,day,month,year,threshold_exceeded,consecutive_HW_days,heat_wave_day,health_expend_p_capita,Asthma,Chronic_depression,Chronic_lower_respiratory_diseases,Coronary_heart_disease_or_angina_pectoris,Diabetes,Heart_attack_or_chronic_consequences_of_heart_attack,High_blood_pressure,Kidney_problems,Obesity,Stroke_or_chronic_consequences_of_stroke,week_nr,NUTS2_Label,deaths,population,density,population_65+,share_over_65,death_p_100k,code_length,wind_day,dew_point_day,max_apparent_temperature,min_apparent_temperature,wind_day_avg,dew_point_day_avg,uvb_4PM_avg,max_apparent_temperature_avg,min_apparent_temperature_avg,MRT_2AM_avg,MRT_4PM_avg,death_p_100k_avg,uvb_4PM_vs_avg,MRT_2AM_vs_avg,MRT_4PM_vs_avg,death_p_100k_vs_avg,wind_day_vs_avg,dew_point_day_vs_avg,max_apparent_temperature_vs_avg,min_apparent_temperature_vs_avg,unique_week,consecutive_HW_days_prev
0,"('43.0', '-9.25')",2000-05-01,-9.25,43.0,165578.4,ES,ES1,ES11,ES111,6.35,28.64,W18,25.658925,1,5,2000,0,0,0,1164.962074,4.3,6.75,3.0,0.75,7.15,0.85,19.0,2.6,16.1,0.9,2000W18,Galicia,541.0,2702471.0,91.3,544528.0,20.149263,20.018716,,2.2075,12.335,15.87,11.98,2.809025,13.979091,225349.668363,18.963855,15.72365,10.002745,24.7651,18.451122,0.734762,0.634826,1.156466,1.084959,0.78586,0.882389,0.836855,0.76191,"('43.0', '-9.25')2000W18",
1,"('43.0', '-9.25')",2000-05-02,-9.25,43.0,164847.47,ES,ES1,ES11,ES111,8.06,24.69,W18,25.658925,2,5,2000,0,0,0,1164.962074,4.3,6.75,3.0,0.75,7.15,0.85,19.0,2.6,16.1,0.9,2000W18,Galicia,541.0,2702471.0,91.3,544528.0,20.149263,20.018716,,1.7075,12.965,15.83,12.95,2.809025,13.979091,225349.668363,18.963855,15.72365,10.002745,24.7651,18.451122,0.731519,0.805779,0.996968,1.084959,0.607862,0.927457,0.834746,0.8236,"('43.0', '-9.25')2000W18",0.0
2,"('43.0', '-9.25')",2000-05-03,-9.25,43.0,244458.16,ES,ES1,ES11,ES111,5.84,23.74,W18,25.658925,3,5,2000,0,0,0,1164.962074,4.3,6.75,3.0,0.75,7.15,0.85,19.0,2.6,16.1,0.9,2000W18,Galicia,541.0,2702471.0,91.3,544528.0,20.149263,20.018716,,3.4325,12.86,18.53,13.05,2.809025,13.979091,225349.668363,18.963855,15.72365,10.002745,24.7651,18.451122,1.084795,0.58384,0.958607,1.084959,1.221954,0.919945,0.977122,0.82996,"('43.0', '-9.25')2000W18",0.0
3,"('43.0', '-9.25')",2000-05-04,-9.25,43.0,150456.94,ES,ES1,ES11,ES111,9.93,27.42,W18,25.658925,4,5,2000,0,0,0,1164.962074,4.3,6.75,3.0,0.75,7.15,0.85,19.0,2.6,16.1,0.9,2000W18,Galicia,541.0,2702471.0,91.3,544528.0,20.149263,20.018716,,1.265,13.165,15.94,13.86,2.809025,13.979091,225349.668363,18.963855,15.72365,10.002745,24.7651,18.451122,0.66766,0.992728,1.107203,1.084959,0.450334,0.941764,0.840546,0.881475,"('43.0', '-9.25')2000W18",0.0
4,"('43.0', '-9.25')",2000-05-05,-9.25,43.0,100901.43,ES,ES1,ES11,ES111,6.88,26.17,W18,25.658925,5,5,2000,0,0,0,1164.962074,4.3,6.75,3.0,0.75,7.15,0.85,19.0,2.6,16.1,0.9,2000W18,Galicia,541.0,2702471.0,91.3,544528.0,20.149263,20.018716,,1.275,12.7725,16.34,13.42,2.809025,13.979091,225349.668363,18.963855,15.72365,10.002745,24.7651,18.451122,0.447755,0.687811,1.056729,1.084959,0.453894,0.913686,0.861639,0.853491,"('43.0', '-9.25')2000W18",0.0


In [23]:
# get a unique identifier for periods with constant value in 'consecutive_HW_days column'
final_df['cumsum'] = (final_df['consecutive_HW_days'] != final_df['consecutive_HW_days_prev']).cumsum()
final_df

Unnamed: 0,lat_long,date,longitude,latitude,uvb_4PM,country,NUTS1,NUTS2,NUTS3,MRT_2AM,MRT_4PM,Week_Number,99th_percentile,day,month,year,threshold_exceeded,consecutive_HW_days,heat_wave_day,health_expend_p_capita,Asthma,Chronic_depression,Chronic_lower_respiratory_diseases,Coronary_heart_disease_or_angina_pectoris,Diabetes,Heart_attack_or_chronic_consequences_of_heart_attack,High_blood_pressure,Kidney_problems,Obesity,Stroke_or_chronic_consequences_of_stroke,week_nr,NUTS2_Label,deaths,population,density,population_65+,share_over_65,death_p_100k,code_length,wind_day,dew_point_day,max_apparent_temperature,min_apparent_temperature,wind_day_avg,dew_point_day_avg,uvb_4PM_avg,max_apparent_temperature_avg,min_apparent_temperature_avg,MRT_2AM_avg,MRT_4PM_avg,death_p_100k_avg,uvb_4PM_vs_avg,MRT_2AM_vs_avg,MRT_4PM_vs_avg,death_p_100k_vs_avg,wind_day_vs_avg,dew_point_day_vs_avg,max_apparent_temperature_vs_avg,min_apparent_temperature_vs_avg,unique_week,consecutive_HW_days_prev,cumsum
0,"('43.0', '-9.25')",2000-05-01,-9.25,43.00,165578.40,ES,ES1,ES11,ES111,6.35,28.64,W18,25.658925,1,5,2000,0,0,0,1164.962074,4.30,6.75,3.0,0.75,7.15,0.85,19.0,2.60,16.1,0.9,2000W18,Galicia,541.0,2702471.0,91.3,544528.0,20.149263,20.018716,,2.2075,12.3350,15.87,11.98,2.809025,13.979091,225349.668363,18.963855,15.723650,10.002745,24.765100,18.451122,0.734762,0.634826,1.156466,1.084959,0.785860,0.882389,0.836855,0.761910,"('43.0', '-9.25')2000W18",,1
1,"('43.0', '-9.25')",2000-05-02,-9.25,43.00,164847.47,ES,ES1,ES11,ES111,8.06,24.69,W18,25.658925,2,5,2000,0,0,0,1164.962074,4.30,6.75,3.0,0.75,7.15,0.85,19.0,2.60,16.1,0.9,2000W18,Galicia,541.0,2702471.0,91.3,544528.0,20.149263,20.018716,,1.7075,12.9650,15.83,12.95,2.809025,13.979091,225349.668363,18.963855,15.723650,10.002745,24.765100,18.451122,0.731519,0.805779,0.996968,1.084959,0.607862,0.927457,0.834746,0.823600,"('43.0', '-9.25')2000W18",0.0,1
2,"('43.0', '-9.25')",2000-05-03,-9.25,43.00,244458.16,ES,ES1,ES11,ES111,5.84,23.74,W18,25.658925,3,5,2000,0,0,0,1164.962074,4.30,6.75,3.0,0.75,7.15,0.85,19.0,2.60,16.1,0.9,2000W18,Galicia,541.0,2702471.0,91.3,544528.0,20.149263,20.018716,,3.4325,12.8600,18.53,13.05,2.809025,13.979091,225349.668363,18.963855,15.723650,10.002745,24.765100,18.451122,1.084795,0.583840,0.958607,1.084959,1.221954,0.919945,0.977122,0.829960,"('43.0', '-9.25')2000W18",0.0,1
3,"('43.0', '-9.25')",2000-05-04,-9.25,43.00,150456.94,ES,ES1,ES11,ES111,9.93,27.42,W18,25.658925,4,5,2000,0,0,0,1164.962074,4.30,6.75,3.0,0.75,7.15,0.85,19.0,2.60,16.1,0.9,2000W18,Galicia,541.0,2702471.0,91.3,544528.0,20.149263,20.018716,,1.2650,13.1650,15.94,13.86,2.809025,13.979091,225349.668363,18.963855,15.723650,10.002745,24.765100,18.451122,0.667660,0.992728,1.107203,1.084959,0.450334,0.941764,0.840546,0.881475,"('43.0', '-9.25')2000W18",0.0,1
4,"('43.0', '-9.25')",2000-05-05,-9.25,43.00,100901.43,ES,ES1,ES11,ES111,6.88,26.17,W18,25.658925,5,5,2000,0,0,0,1164.962074,4.30,6.75,3.0,0.75,7.15,0.85,19.0,2.60,16.1,0.9,2000W18,Galicia,541.0,2702471.0,91.3,544528.0,20.149263,20.018716,,1.2750,12.7725,16.34,13.42,2.809025,13.979091,225349.668363,18.963855,15.723650,10.002745,24.765100,18.451122,0.447755,0.687811,1.056729,1.084959,0.453894,0.913686,0.861639,0.853491,"('43.0', '-9.25')2000W18",0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2550185,"('45.25', '29.0')",2021-09-26,29.00,45.25,9097.72,RO,RO2,RO22,RO225,4.35,16.80,W38,33.860439,26,9,2021,0,0,0,814.707973,1.75,1.25,1.6,2.45,4.90,1.15,16.4,1.55,10.0,1.2,2021W38,Sud-Est,819.0,2377101.0,70.9,476792.0,20.057709,34.453732,,1.8575,10.4225,22.73,10.54,1.590740,14.227184,65180.684485,26.355047,17.306311,10.900262,23.424316,28.978199,0.139577,0.399073,0.717203,1.188954,1.167696,0.732576,0.862453,0.609026,"('45.25', '29.0')2021W38",0.0,61803
2550186,"('45.25', '29.0')",2021-09-27,29.00,45.25,10893.59,RO,RO2,RO22,RO225,6.33,10.38,W39,33.860439,27,9,2021,0,0,0,814.707973,1.75,1.25,1.6,2.45,4.90,1.15,16.4,1.55,10.0,1.2,2021W39,Sud-Est,949.0,2377101.0,70.9,476792.0,20.057709,39.922578,,2.0000,10.5525,20.49,10.17,1.590740,14.227184,65180.684485,26.355047,17.306311,10.900262,23.424316,28.978199,0.167129,0.580720,0.443129,1.377676,1.257277,0.741714,0.777460,0.587647,"('45.25', '29.0')2021W39",0.0,61803
2550187,"('45.25', '29.0')",2021-09-28,29.00,45.25,3932.83,RO,RO2,RO22,RO225,5.44,12.95,W39,33.860439,28,9,2021,0,0,0,814.707973,1.75,1.25,1.6,2.45,4.90,1.15,16.4,1.55,10.0,1.2,2021W39,Sud-Est,949.0,2377101.0,70.9,476792.0,20.057709,39.922578,,1.9050,6.7975,15.37,8.58,1.590740,14.227184,65180.684485,26.355047,17.306311,10.900262,23.424316,28.978199,0.060337,0.499071,0.552844,1.377676,1.197556,0.477783,0.583190,0.495773,"('45.25', '29.0')2021W39",0.0,61803
2550188,"('45.25', '29.0')",2021-09-29,29.00,45.25,7232.23,RO,RO2,RO22,RO225,5.20,10.11,W39,33.860439,29,9,2021,0,0,0,814.707973,1.75,1.25,1.6,2.45,4.90,1.15,16.4,1.55,10.0,1.2,2021W39,Sud-Est,949.0,2377101.0,70.9,476792.0,20.057709,39.922578,,2.5125,7.7650,15.73,8.35,1.590740,14.227184,65180.684485,26.355047,17.306311,10.900262,23.424316,28.978199,0.110957,0.477053,0.431603,1.377676,1.579454,0.545786,0.596850,0.482483,"('45.25', '29.0')2021W39",0.0,61803


In [24]:
# inspect shape
final_df.shape

(2550190, 62)

In [25]:
# drop the averages
final_df = final_df.drop(['wind_day_avg', 'dew_point_day_avg', 'uvb_4PM_avg', 'MRT_2AM_avg', 'MRT_4PM_avg', 'max_apparent_temperature_avg',
                          'min_apparent_temperature_avg', 'death_p_100k_avg'], 1)


In [26]:
# filter only instances that are part of a period with more than 3 consecutive days exceeding the 99th percentile
heatwave = final_df[final_df['consecutive_HW_days'] > 3]
# rename the previously obtained column comsum to heatwave_id
heatwave = heatwave.rename(columns={'cumsum': 'heatwave_id'})
heatwave['heatwave_id'] = heatwave['heatwave_id'].astype(str)
heatwave.head()

Unnamed: 0,lat_long,date,longitude,latitude,uvb_4PM,country,NUTS1,NUTS2,NUTS3,MRT_2AM,MRT_4PM,Week_Number,99th_percentile,day,month,year,threshold_exceeded,consecutive_HW_days,heat_wave_day,health_expend_p_capita,Asthma,Chronic_depression,Chronic_lower_respiratory_diseases,Coronary_heart_disease_or_angina_pectoris,Diabetes,Heart_attack_or_chronic_consequences_of_heart_attack,High_blood_pressure,Kidney_problems,Obesity,Stroke_or_chronic_consequences_of_stroke,week_nr,NUTS2_Label,deaths,population,density,population_65+,share_over_65,death_p_100k,code_length,wind_day,dew_point_day,max_apparent_temperature,min_apparent_temperature,uvb_4PM_vs_avg,MRT_2AM_vs_avg,MRT_4PM_vs_avg,death_p_100k_vs_avg,wind_day_vs_avg,dew_point_day_vs_avg,max_apparent_temperature_vs_avg,min_apparent_temperature_vs_avg,unique_week,consecutive_HW_days_prev,heatwave_id
991,"('43.0', '-9.25')",2006-07-14,-9.25,43.0,295581.34,ES,ES1,ES11,ES111,12.59,29.4,W28,25.658925,14,7,2006,1,4,1,1769.92,4.3,6.75,3.0,0.75,7.15,0.85,19.0,2.6,16.1,0.9,2006W28,Galicia,558.0,2730107.0,92.5,585324.0,21.439599,20.438759,,4.64,17.075,26.6,19.76,1.311656,1.258655,1.187155,1.082837,1.651819,1.221467,1.402668,1.256706,"('43.0', '-9.25')2006W28",0.0,28
992,"('43.0', '-9.25')",2006-07-15,-9.25,43.0,288340.84,ES,ES1,ES11,ES111,14.2,28.87,W28,25.658925,15,7,2006,1,4,1,1769.92,4.3,6.75,3.0,0.75,7.15,0.85,19.0,2.6,16.1,0.9,2006W28,Galicia,558.0,2730107.0,92.5,585324.0,21.439599,20.438759,,2.89,17.7875,26.05,21.49,1.279526,1.41961,1.165753,1.082837,1.028827,1.272436,1.373666,1.366731,"('43.0', '-9.25')2006W28",4.0,28
993,"('43.0', '-9.25')",2006-07-16,-9.25,43.0,302635.12,ES,ES1,ES11,ES111,14.23,27.08,W28,25.658925,16,7,2006,1,4,1,1769.92,4.3,6.75,3.0,0.75,7.15,0.85,19.0,2.6,16.1,0.9,2006W28,Galicia,558.0,2730107.0,92.5,585324.0,21.439599,20.438759,,3.69,18.3775,26.57,23.08,1.342958,1.42261,1.093474,1.082837,1.313623,1.314642,1.401086,1.467853,"('43.0', '-9.25')2006W28",4.0,28
994,"('43.0', '-9.25')",2006-07-17,-9.25,43.0,249648.77,ES,ES1,ES11,ES111,13.51,32.36,W29,25.658925,17,7,2006,1,4,1,1769.92,4.3,6.75,3.0,0.75,7.15,0.85,19.0,2.6,16.1,0.9,2006W29,Galicia,607.0,2730107.0,92.5,585324.0,21.439599,22.233561,,2.0625,17.26,28.31,21.71,1.107828,1.350629,1.306678,1.177925,0.734241,1.234701,1.49284,1.380723,"('43.0', '-9.25')2006W29",4.0,28
2050,"('43.0', '-9.25')",2013-07-05,-9.25,43.0,304197.66,ES,ES1,ES11,ES111,9.85,24.1,W27,25.658925,5,7,2013,1,5,1,1984.38,4.3,6.75,3.0,0.75,7.15,0.85,19.0,2.6,16.1,0.9,2013W27,Galicia,665.0,2761989.0,93.9,638250.0,23.108347,24.076852,,5.3,16.6975,26.65,17.73,1.349892,0.98473,0.973144,1.250805,1.886776,1.194463,1.405305,1.127601,"('43.0', '-9.25')2013W27",0.0,44


In [27]:
# aggregate data by heatwave_id to get mean values of the numerical variables over that heat wave
heatwave = heatwave.groupby('heatwave_id').agg({'heatwave_id': 'first',
                                         'country': 'first',
                                         'NUTS1': 'first',
                                         'NUTS2': 'first',
                                         'NUTS2_Label': 'first',
                                         'lat_long': 'first',
                                         'date': ['min', 'max'],
                                         'latitude': 'mean',
                                         'longitude': 'mean',
                                         'min_apparent_temperature': 'mean',
                                         'min_apparent_temperature_vs_avg': 'mean',
                                         'max_apparent_temperature': 'mean',
                                         'max_apparent_temperature_vs_avg': 'mean',
                                         'uvb_4PM': 'mean',
                                         'uvb_4PM_vs_avg': 'mean',
                                         'wind_day': 'mean',
                                         'wind_day_vs_avg': 'mean',
                                         'dew_point_day': 'mean',
                                         'dew_point_day_vs_avg': 'mean',
                                         'MRT_2AM': 'mean',
                                         'MRT_2AM_vs_avg': 'mean',
                                         'MRT_4PM': 'mean',
                                         'MRT_4PM_vs_avg': 'mean',
                                         'NUTS2': 'first',
                                         'NUTS3': 'first',
                                         'Week_Number': 'first',
                                         '99th_percentile': 'mean',
                                         'month': 'mean',
                                         'year': 'first',
                                         'threshold_exceeded': 'sum',
                                         'consecutive_HW_days': 'max',
                                         'heat_wave_day': 'sum',
                                         'health_expend_p_capita': 'mean',
                                         'Asthma': 'mean',
                                         'Chronic_depression': 'mean',
                                         'Chronic_lower_respiratory_diseases': 'mean',
                                         'Coronary_heart_disease_or_angina_pectoris': 'mean',
                                         'Diabetes': 'mean',
                                         'Heart_attack_or_chronic_consequences_of_heart_attack': 'mean',
                                         'High_blood_pressure': 'mean',
                                         'Kidney_problems': 'mean',
                                         'Stroke_or_chronic_consequences_of_stroke': 'mean',
                                         'Obesity': 'mean',
                                         'week_nr': 'first',
                                         'deaths': 'mean',
                                         'population': 'mean',
                                         'share_over_65': 'mean',
                                         'density': 'mean',
                                         'death_p_100k': 'mean',
                                         'death_p_100k_vs_avg': 'mean',})
# inspect data
heatwave.head(7)

Unnamed: 0_level_0,heatwave_id,country,NUTS1,NUTS2,NUTS2_Label,lat_long,date,date,latitude,longitude,min_apparent_temperature,min_apparent_temperature_vs_avg,max_apparent_temperature,max_apparent_temperature_vs_avg,uvb_4PM,uvb_4PM_vs_avg,wind_day,wind_day_vs_avg,dew_point_day,dew_point_day_vs_avg,MRT_2AM,MRT_2AM_vs_avg,MRT_4PM,MRT_4PM_vs_avg,NUTS3,Week_Number,99th_percentile,month,year,threshold_exceeded,consecutive_HW_days,heat_wave_day,health_expend_p_capita,Asthma,Chronic_depression,Chronic_lower_respiratory_diseases,Coronary_heart_disease_or_angina_pectoris,Diabetes,Heart_attack_or_chronic_consequences_of_heart_attack,High_blood_pressure,Kidney_problems,Stroke_or_chronic_consequences_of_stroke,Obesity,week_nr,deaths,population,share_over_65,density,death_p_100k,death_p_100k_vs_avg
Unnamed: 0_level_1,first,first,first,first,first,first,min,max,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,first,first,mean,mean,first,sum,max,sum,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,first,mean,mean,mean,mean,mean,mean
heatwave_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2,Unnamed: 33_level_2,Unnamed: 34_level_2,Unnamed: 35_level_2,Unnamed: 36_level_2,Unnamed: 37_level_2,Unnamed: 38_level_2,Unnamed: 39_level_2,Unnamed: 40_level_2,Unnamed: 41_level_2,Unnamed: 42_level_2,Unnamed: 43_level_2,Unnamed: 44_level_2,Unnamed: 45_level_2,Unnamed: 46_level_2,Unnamed: 47_level_2,Unnamed: 48_level_2,Unnamed: 49_level_2,Unnamed: 50_level_2
100,100,PT,PT1,PT16,Centro (PT),"('39.25', '-9.25')",2018-08-02,2018-08-05,39.25,-9.25,23.6575,1.397669,32.34,1.454426,286961.135,1.117221,0.97625,0.564086,16.53875,1.148659,15.0375,1.497314,36.3975,1.235334,PT16B,W31,30.174927,8.0,2018,4,4,4,1878.02,5.55,12.05,5.65,4.15,9.65,1.55,25.95,4.45,1.9,16.9,2018W31,540.0,2231346.0,23.987539,79.5,24.200639,1.124589
10008,10008,CH,CH0,CH03,Nordwestschweiz,"('47.5', '8.0')",2011-08-20,2011-08-23,47.5,8.0,19.965,1.789727,31.7425,1.589615,147769.05,1.184928,0.926875,0.616491,16.796875,1.488345,12.29,1.876485,31.2375,1.24574,CH033,W33,30.511531,8.0,2011,4,4,4,6587.09,7.675,8.95,5.775,3.1,8.3,1.6,21.4,2.45,1.475,16.275,2011W33,162.5,1070820.0,17.454568,551.6,15.175286,1.035444
10030,10030,CH,CH0,CH03,Nordwestschweiz,"('47.5', '8.0')",2015-07-01,2015-07-07,47.5,8.0,21.122857,1.893521,33.584286,1.681849,186158.305714,1.492763,1.0725,0.71335,17.035714,1.509508,12.742857,1.945629,35.944286,1.433445,CH033,W27,30.511531,7.0,2015,7,7,7,8410.01,7.675,8.95,5.775,3.1,8.3,1.6,21.4,2.45,1.475,16.275,2015W27,202.0,1117158.0,18.370812,575.9,18.081596,1.210127
1004,1004,PT,PT1,PT11,Norte,"('41.5', '-7.0')",2018-08-02,2018-08-06,41.5,-7.0,20.998,1.561944,35.66,1.448005,281444.72,1.206473,1.604,0.903057,12.047,1.212355,14.262,1.95374,40.842,1.252998,PT11E,W31,34.298939,8.0,2018,5,5,5,1878.02,5.55,12.05,5.65,4.15,9.65,1.55,25.95,4.45,1.9,16.9,2018W31,676.2,3576205.0,20.024859,169.3,18.908312,1.166277
10058,10058,CH,CH0,CH03,Nordwestschweiz,"('47.5', '8.0')",2018-08-02,2018-08-06,47.5,8.0,21.196,1.900078,32.0,1.60251,163035.102,1.307343,1.0765,0.716011,17.3395,1.536426,13.73,2.09635,33.454,1.334134,CH033,W31,30.511531,8.0,2018,5,5,5,8159.59,7.675,8.95,5.775,3.1,8.3,1.6,21.4,2.45,1.475,16.275,2018W31,168.6,1151919.0,18.871987,598.0,14.636446,1.010846
10064,10064,CH,CH0,CH03,Nordwestschweiz,"('47.5', '8.0')",2019-07-23,2019-07-26,47.5,8.0,18.3975,1.649211,32.8,1.642573,187458.6925,1.503191,1.184375,0.787761,14.92375,1.32237,12.06,1.841368,36.0875,1.439157,CH033,W30,30.511531,7.0,2019,4,4,4,8604.56,7.675,8.95,5.775,3.1,8.3,1.6,21.4,2.45,1.475,16.275,2019W30,172.0,1161105.0,19.087852,603.0,14.813475,1.016646
10092,10092,CH,CH0,CH02,Espace Mittelland,"('46.75', '8.0')",2003-08-03,2003-08-13,46.75,8.0,10.272727,2.284291,23.063636,1.788616,164366.635455,1.306543,0.512273,0.851105,10.199318,1.434151,3.120909,2.834497,24.615455,1.251404,CH021,W31,22.016499,8.0,2003,11,11,11,4106.4393,7.675,8.95,5.775,3.1,8.3,1.6,21.4,2.45,1.475,16.275,2003W31,290.727273,1675111.0,16.54571,171.6,17.355702,1.062025


In [28]:
# reduce column index levels to 1
heatwave.columns = heatwave.columns.get_level_values(0)
heatwave.head(1)

Unnamed: 0_level_0,heatwave_id,country,NUTS1,NUTS2,NUTS2_Label,lat_long,date,date,latitude,longitude,min_apparent_temperature,min_apparent_temperature_vs_avg,max_apparent_temperature,max_apparent_temperature_vs_avg,uvb_4PM,uvb_4PM_vs_avg,wind_day,wind_day_vs_avg,dew_point_day,dew_point_day_vs_avg,MRT_2AM,MRT_2AM_vs_avg,MRT_4PM,MRT_4PM_vs_avg,NUTS3,Week_Number,99th_percentile,month,year,threshold_exceeded,consecutive_HW_days,heat_wave_day,health_expend_p_capita,Asthma,Chronic_depression,Chronic_lower_respiratory_diseases,Coronary_heart_disease_or_angina_pectoris,Diabetes,Heart_attack_or_chronic_consequences_of_heart_attack,High_blood_pressure,Kidney_problems,Stroke_or_chronic_consequences_of_stroke,Obesity,week_nr,deaths,population,share_over_65,density,death_p_100k,death_p_100k_vs_avg
heatwave_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1
100,100,PT,PT1,PT16,Centro (PT),"('39.25', '-9.25')",2018-08-02,2018-08-05,39.25,-9.25,23.6575,1.397669,32.34,1.454426,286961.135,1.117221,0.97625,0.564086,16.53875,1.148659,15.0375,1.497314,36.3975,1.235334,PT16B,W31,30.174927,8.0,2018,4,4,4,1878.02,5.55,12.05,5.65,4.15,9.65,1.55,25.95,4.45,1.9,16.9,2018W31,540.0,2231346.0,23.987539,79.5,24.200639,1.124589


In [29]:
# inspect shape
heatwave.shape

(3350, 50)

In [29]:
# write data to csv
heatwave.to_csv('heatwave1.csv')

## Used in Notebook of the modelling part --> Modelling.ipnyb

# END