## DS4A Concat Final Health Datasets and Merge to Pollution Data

In [307]:
import pandas as pd
pd.set_option("display.max_rows", None, "display.max_columns", None)
import os
pd.set_option('display.max_colwidth', -1)    # stop the column text truncating... 
pd.set_option("display.max_rows", None, "display.max_columns", None)
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

  pd.set_option('display.max_colwidth', -1)    # stop the column text truncating...


### 1. Load in the Datasets

In [308]:
admissions_df = pd.read_csv('../../data_output/hospital_admissions_resp_crdv/admissions_resp_crdv_inc_strk.csv')
prevalence_df = pd.read_csv('../../data_output/nhs_qof_dataset/nhs_qof_dataset_ccg.csv')
admissions_df = admissions_df.rename(columns={'Time period Sortable':'Year'})
prevalence_df = prevalence_df.rename(columns={'Time period Sortable':'Year'})
mortality_df = pd.read_csv('mortality_data_final.csv')


In [309]:
# remove the erroneous indicator 
admissions_df = admissions_df[admissions_df['Area Name']!= 'NHS Basildon And Brentwood CCG']
prevalence_df = prevalence_df[prevalence_df['Area Name']!= 'NHS Basildon And Brentwood CCG']

### 2. Rename Indicators 

In [310]:
indicator_dict = {
       'Emergency hospital admissions for COPD, all ages': 'COPD Admissions',
       'Emergency hospital admissions for respiratory disease': 'Respiratory Admissions',
       'Admissions for lower respiratory tract infections in infants aged 1 year': 'Respiratory Tract Admissions 1yr',
       'Admissions for lower respiratory tract infections in infants aged under 1 year': 'Respiratory Tract Admissions U1yr',
       'Emergency hospital admissions for asthma in adults (aged 19 years and over)': 'Asthma Admissions Over 19yr',
       'Hospital admissions for asthma (under 19 years)':'Asthma Admissions Under 19yr',
       'Emergency hospital admissions for pneumonia': 'Pneumonia Admissions',
       'CHD admissions (all ages)': 'CHD Admissions', 
       'Heart failure admissions (all ages)': 'Heart Failure Admissions',
       'Stroke admissions (Sentinel Stroke National Audit Programme)': 'Stroke Admissions',
       'Stroke admissions with history of atrial fibrillation not prescribed anticoagulation prior to stroke': 'Stroke Atrial Fib Admissions'   
}

respiratory = [
    'COPD Admissions',
    'Respiratory Admissions',
    'Respiratory Tract Admissions 1yr',
    'Respiratory Tract Admissions U1yr',
    'Asthma Admissions Over 19yr',
    'Asthma Admissions Under 19yr',
    'Pneumonia Admissions',
]
cardiov_ = [
    'CHD Admissions',
    'Heart Failure Admissions',
    'Stroke Admissions',
    'Stroke Atrial Fib Admissions' 
]

admissions_df['Indicator Name'] = admissions_df['Indicator Name'].map(indicator_dict)
admissions_df['Indicator Name'].unique()

array(['COPD Admissions', 'Respiratory Admissions',
       'Respiratory Tract Admissions 1yr',
       'Respiratory Tract Admissions U1yr', 'Asthma Admissions Over 19yr',
       'Asthma Admissions Under 19yr', 'Pneumonia Admissions',
       'CHD Admissions', 'Heart Failure Admissions', 'Stroke Admissions',
       'Stroke Atrial Fib Admissions'], dtype=object)

In [311]:
indicator_dict_q = {
       'Hypertension: QOF prevalence (all ages)': 'Hypertension Prevalence',
       'Diabetes: QOF prevalence (17+)': 'Diabetes Prevalence', 
       'CHD: QOF prevalence (all ages)': 'CHD Prevalence',
       'Heart Failure: QOF prevalence (all ages)': 'Heart Failure Prevalence' ,
       'CKD: QOF prevalence (18+)': 'CKD Prevalence', 
       'Stroke: QOF prevalence (all ages)': 'Stroke: Prevalence',
       'Atrial fibrillation: QOF prevalence': 'Atrial Fibrillation Prevalence',
       'COPD: QOF prevalence (all ages)': 'COPD Prevalence',
       'Asthma: QOF prevalence (all ages)': 'Asthma Prevalence'
}

resp_p = [
    'COPD Prevalence',
    'Asthma Prevalence'  
]

cardio_p = [
    'Hypertension Prevalence',
    'Diabetes Prevalence', 
    'CHD Prevalence',
    'Heart Failure Prevalence',
    'CKD Prevalence', 
    'Stroke: Prevalence',
    'Atrial Fibrillation Prevalence',
]

prevalence_df['Indicator Name'] = prevalence_df['Indicator Name'].map(indicator_dict_q)
prevalence_df['Indicator Name'].unique()

array(['Hypertension Prevalence', 'Diabetes Prevalence', 'CHD Prevalence',
       'Heart Failure Prevalence', 'CKD Prevalence', 'Stroke: Prevalence',
       'Atrial Fibrillation Prevalence', 'COPD Prevalence',
       'Asthma Prevalence'], dtype=object)

In [312]:
# remove locations not supported by pollution data
not_supported_poll = [
    'NHS Brent CCG',
    'NHS Croydon CCG',
    'NHS Guildford and Waverley CCG',
    'NHS Harrow CCG',
    'NHS Basildon And Brentwood CCG'
]

prevalence_df = prevalence_df[~prevalence_df['Area Name'].isin(not_supported_poll)]
admissions_df = admissions_df[~admissions_df['Area Name'].isin(not_supported_poll)]

# restrict prevalence_df and admissions_df to PERSONS only (NOT male/female, as we do not have the right indicators)
prevalence_df = prevalence_df[prevalence_df['Sex'] == 'Persons']
admissions_df = admissions_df[admissions_df['Sex'] == 'Persons']

#### Join the health data sets up (treat mortality first)

In [313]:
prevalence_df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Indicator ID,Indicator Name,Parent Code,Parent Name,Area Code,Area Name,Area Type,Sex,Age,Category Type,Category,Time period,Value,Lower CI 95.0 limit,Upper CI 95.0 limit,Lower CI 99.8 limit,Upper CI 99.8 limit,Count,Denominator,Value note,Recent Trend,Compared to England value or percentiles,Compared to percentiles,Year,New data,Compared to goal,CCG Location
3,19852,19852,219,Hypertension Prevalence,E92000001,England,E38000048,NHS Ealing CCG,CCGs (2019/20),Persons,All ages,,,2009/10,12.02698,11.92071,12.134068,11.859797,12.196195,42954.0,357147.0,Aggregated from all known lower geography values,,Lower,Not compared,2009,,,Inner Radius
4,19869,19869,219,Hypertension Prevalence,E92000001,England,E38000070,NHS Hammersmith And Fulham CCG,CCGs (2019/20),Persons,All ages,,,2009/10,9.423858,9.290173,9.559265,9.213858,9.638136,17070.0,181136.0,Aggregated from all known lower geography values,,Lower,Not compared,2009,,,Outer Radius
6,19877,19877,219,Hypertension Prevalence,E92000001,England,E38000079,NHS Herts Valleys CCG,CCGs (2019/20),Persons,All ages,,,2009/10,12.546181,12.457431,12.635472,12.406497,12.68721,66730.0,531875.0,Aggregated from all known lower geography values,,Lower,Not compared,2009,,,Outer Radius
7,19880,19880,219,Hypertension Prevalence,E92000001,England,E38000082,NHS Hillingdon CCG,CCGs (2019/20),Persons,All ages,,,2009/10,12.690711,12.564328,12.818179,12.491938,12.89218,33530.0,264209.0,Aggregated from all known lower geography values,,Lower,Not compared,2009,,,Inner Radius
8,19882,19882,219,Hypertension Prevalence,E92000001,England,E38000084,NHS Hounslow CCG,CCGs (2019/20),Persons,All ages,,,2009/10,11.489722,11.359422,11.621321,11.28487,11.697803,26176.0,227821.0,Aggregated from all known lower geography values,,Lower,Not compared,2009,,,Inner Radius


In [314]:
mortality_df = mortality_df.rename(columns={'Indicator': 'Indicator Name', 'Radius': 'CCG Location', 'Mortality Rate': 'Value'})

# set the indicator type field... 
mortality_df['Indicator Type'] = 'Health - Mortality'
prevalence_df['Indicator Type'] = 'Health - Prevalence'
admissions_df['Indicator Type'] = 'Health- Admission'


In [315]:
concat_cols = ['Year', 'Indicator Type', 'Indicator Name', 'Area Name', 'CCG Location', 'Value']


print((mortality_df[concat_cols]).shape)
print((prevalence_df[concat_cols]).shape)
print((admissions_df[concat_cols]).shape)

health_df = pd.concat([mortality_df[concat_cols], prevalence_df[concat_cols], admissions_df[concat_cols]], axis=0)


print(health_df.shape)

(2260, 6)
(1529, 6)
(1333, 6)
(5122, 6)


In [316]:
health_df['Indicator Name'].unique()

array(['Under 75 Resp Disease', 'Pneumonia (all mentions)',
       'COPD (contributory cause)', '<75 Coronary Heart Disease',
       'Asthma', 'Hypertension Prevalence', 'Diabetes Prevalence',
       'CHD Prevalence', 'Heart Failure Prevalence', 'CKD Prevalence',
       'Stroke: Prevalence', 'Atrial Fibrillation Prevalence',
       'COPD Prevalence', 'Asthma Prevalence', 'COPD Admissions',
       'Respiratory Admissions', 'Respiratory Tract Admissions 1yr',
       'Respiratory Tract Admissions U1yr', 'Asthma Admissions Over 19yr',
       'Asthma Admissions Under 19yr', 'Pneumonia Admissions',
       'CHD Admissions', 'Heart Failure Admissions', 'Stroke Admissions',
       'Stroke Atrial Fib Admissions'], dtype=object)

### 3. Confirm CCG Region Location with Shape files and Get NHS CCG Distances from Heathrow Airport

The team would like to redefine the NHS CCG region location assumptions (inner/outer), intially made visually, with measured distance from heathrow in metres. 

In [317]:
# load in the shape file, with distance calculated in the geography notebook.
geog = pd.read_csv('../../../geography/nhs_ccg_geometry.csv', encoding='utf-8')
geog.head(2)
# fix naming convention
geog['Area Name'] = geog['Area Name'].apply(lambda x: 'NHS Hammersmith And Fulham CCG' if x=='NHS Hammersmith and Fulham CCG' else x)

In [318]:
health_df_dist = pd.merge(health_df, geog[['Area Name', 'heathrow_distance', 'LAT', 'LONG']], how='left', on='Area Name')
print(health_df.shape)
health_df_dist.shape

(5122, 6)


(5122, 9)

In [319]:
health_df_dist.head(5)

Unnamed: 0,Year,Indicator Type,Indicator Name,Area Name,CCG Location,Value,heathrow_distance,LAT,LONG
0,2006,Health - Mortality,Under 75 Resp Disease,NHS Ealing CCG,Inner,38.388443,11465.530929,51.5244,-0.31405
1,2006,Health - Mortality,Under 75 Resp Disease,NHS Hillingdon CCG,Inner,36.176786,7458.185529,51.5366,-0.4418
2,2006,Health - Mortality,Under 75 Resp Disease,NHS Hounslow CCG,Inner,39.003001,5336.984712,51.4624,-0.37847
3,2006,Health - Mortality,Under 75 Resp Disease,NHS Buckinghamshire CCG,Inner,25.653613,38818.061527,51.7414,-0.80644
4,2006,Health - Mortality,Under 75 Resp Disease,NHS East Berkshire CCG,Inner,31.198377,15803.713434,51.4541,-0.68026


In [320]:
health_df_dist[health_df_dist['Value'].isna()]
# the team will address the two points at the same time as the pollutant nans... 

Unnamed: 0,Year,Indicator Type,Indicator Name,Area Name,CCG Location,Value,heathrow_distance,LAT,LONG
3991,2015,Health- Admission,Respiratory Tract Admissions 1yr,NHS Surrey Heath CCG,Outer Radius,,23497.239589,51.3281,-0.70434
4007,2016,Health- Admission,Respiratory Tract Admissions 1yr,NHS Surrey Heath CCG,Outer Radius,,23497.239589,51.3281,-0.70434


In [321]:
health_df_dist['CCG Location'] = health_df_dist['CCG Location'].apply(
    lambda x: x.replace('Radius', '').strip() 
)

In [322]:
# re-define the CCG locations
health_df_dist['CCG Location'] = np.where(
health_df_dist['heathrow_distance'] < 15000, 'Inner', 'Outer'
)
health_df_dist.rename(columns={'CCG Location': 'Radius Location'}, inplace=True)

In [324]:
# output to CSV 
health_df_dist.to_csv('health_data_final.csv', encoding='utf-8')

In [282]:
# Buckinghamshire, Harrow, Guildford and Waverley, Surrey Heath, Brent, Croydon


# • Buckinghamshire (excluded from pollution (no data) but keep for health analysis)
# • Harrow to be (excluded from pollution (no data) but keep for health analysis)
# • Brent (de-prioritised, only if time)  
# • Guildford/waverley - (excluded from pollution (no data) but keep for health analysis)
# • Surrey Heath CCG (excluded from pollution (no data) but keep for health analysis) 


In [325]:
# we need to create a pollution CCG mapping (as some of the CCG locations have shared pollution indicators)
pollution_map = {
    'NHS Buckinghamshire CCG': 'No Pollution Readings',
    'NHS Ealing CCG': 'Ealing', 
    'NHS East Berkshire CCG': 'East Berkshire', 
    'NHS Hillingdon CCG': 'Hillingdon',
    'NHS Hounslow CCG': 'Hounslow', 
    'NHS North West Surrey CCG': 'Surrey Heartlands',
    'NHS Berkshire West CCG': 'Reading', 
    'NHS Brent CCG': 'No Pollution Readings',
    'NHS Croydon CCG': 'No Pollution Readings',
    'NHS Guildford and Waverley CCG': 'No Pollution Readings',
    'NHS Hammersmith And Fulham CCG': 'Hammersmith and Fulham', 
    'NHS Harrow CCG': 'No Pollution Readings',
    'NHS Herts Valleys CCG': 'Watford', 
    'NHS Kingston CCG': 'South West London', 
    'NHS Merton CCG': 'South West London', 
    'NHS Oxfordshire CCG': 'Oxford', 
    'NHS Richmond CCG': 'South West London', 
    'NHS Surrey Heath CCG': 'No Pollution Readings',
    'NHS Sutton CCG': 'South West London', 
    'NHS Wandsworth CCG': 'South West London'
}


health_df_dist['CCG_Poll_Map'] = health_df_dist['Area Name'].map(pollution_map)


In [326]:
health_df_dist['CCG_Poll_Map'].unique()

array(['Ealing', 'Hillingdon', 'Hounslow', 'No Pollution Readings',
       'East Berkshire', 'Surrey Heartlands', 'Hammersmith and Fulham',
       'Watford', 'Oxford', 'Reading', 'South West London'], dtype=object)

### 4. Transform Health DF into Multi-Variate format

In [329]:
health_df_pivot.head()

Indicator Name,Year,Area Name,Radius Location,heathrow_distance,CCG_Poll_Map,LAT,LONG,<75 Coronary Heart Disease,Asthma,Asthma Admissions Over 19yr,Asthma Admissions Under 19yr,Asthma Prevalence,Atrial Fibrillation Prevalence,CHD Admissions,CHD Prevalence,CKD Prevalence,COPD (contributory cause),COPD Admissions,COPD Prevalence,Diabetes Prevalence,Heart Failure Admissions,Heart Failure Prevalence,Hypertension Prevalence,Pneumonia (all mentions),Pneumonia Admissions,Respiratory Admissions,Respiratory Tract Admissions 1yr,Respiratory Tract Admissions U1yr,Stroke Admissions,Stroke Atrial Fib Admissions,Stroke: Prevalence,Under 75 Resp Disease
0,2003,NHS Berkshire West CCG,Outer,49791.742293,Reading,51.4458,-1.16963,57.82,,,,,,639.353,,,,,,,118.793,,,,,,,,,,,
1,2003,NHS Brent CCG,Outer,16247.542449,No Pollution Readings,51.5644,-0.27568,76.04,,,,,,,,,,,,,,,,,,,,,,,,
2,2003,NHS Buckinghamshire CCG,Outer,38818.061527,No Pollution Readings,51.7414,-0.80644,51.21,,,,,,730.138,,,,,,,107.011,,,,,,,,,,,
3,2003,NHS Croydon CCG,Outer,28645.840703,No Pollution Readings,51.366,-0.07761,65.63,,,,,,,,,,,,,,,,,,,,,,,,
4,2003,NHS Ealing CCG,Inner,11465.530929,Ealing,51.5244,-0.31405,82.91,,,,,,954.994,,,,,,,193.386,,,,,,,,,,,


In [330]:
# restrict the columns to those requried 
health_df_dist_wide = health_df_dist.copy()
health_df_dist_wide = health_df_dist_wide[['Year', 'Area Name','Indicator Name', 'Value', 'Radius Location', 'heathrow_distance', 'CCG_Poll_Map', 'LAT', 'LONG']]

health_df_pivot = health_df_dist_wide.pivot_table(
    values='Value',
    index= ['Year', 'Area Name', 'Radius Location', 'heathrow_distance', 'CCG_Poll_Map', 'LAT', 'LONG'],
    columns='Indicator Name'
).reset_index().rename(columns={'Indicator Name': 'Index'})

health_df_pivot.to_csv('health_data_final_wide_format.csv', encoding='utf-8')

### 5. Combine with Pollution Data 

In [331]:
outer_poll = pd.read_csv('../../../../pollution_data/yearlyOuterCCGgrouped.csv', encoding='utf-8')
inner_poll = pd.read_csv('../../../../pollution_data/inner_ccg_agg.csv', encoding='utf-8')

In [332]:
# return only the year from the full date string
inner_poll['Date'] = inner_poll['Date'].apply(lambda x: x[:4])

# rename columns to match health data 
inner_poll.rename(columns={'CCG':'CCG_Poll_Map', 'Date':'Year'}, inplace=1)
outer_poll.rename(columns={'CCG':'CCG_Poll_Map', 'Date':'Year'}, inplace=1)

# append reading ccg name for some values 
outer_poll['CCG_Poll_Map']  = outer_poll['CCG_Poll_Map'].apply(lambda x: 'Reading' if 'Reading' in x else x)

# fix the year column
inner_poll['Year'] =  pd.to_datetime(
    (inner_poll['Year']), 
    format='%Y').dt.year

# fix the year column
outer_poll['Year'] =  pd.to_datetime(
    (inner_poll['Year']), 
    format='%Y').dt.year


In [333]:

# pivot the table so it is the same format as health and outer pollution
inner_poll_p = inner_poll[['CCG_Poll_Map', 'Year', 'Pollutant', 'Indicator Value (R µg/m3)']].pivot_table(
    index = ['CCG_Poll_Map', 'Year',],
    columns= 'Pollutant',
    values = 'Indicator Value (R µg/m3)'
).reset_index()

In [289]:
inner_poll_p.head(5)

Pollutant,CCG_Poll_Map,Year,Carbon monoxide,Nitric Oxide,Nitrogen dioxide,Oxides of Nitrogen,Ozone,PM10 Particulate matter,PM10 particulate matter (Hourly measured),PM2.5 particulate matter (Hourly measured),Sulphur dioxide
0,Ealing,2004,0.770049,,57.882938,,33.99208,,23.333576,15.391342,10.043232
1,Ealing,2005,0.76215,,52.862974,,28.594453,,25.916647,21.009842,8.370425
2,Ealing,2006,0.730095,,55.035635,,32.527539,,25.443425,18.450045,6.206311
3,Ealing,2007,0.651978,,54.247691,,28.473891,,24.401769,,6.530478
4,Ealing,2008,0.502517,,58.848371,,29.085305,,22.141475,11.795321,5.656652


we will drop the indicators that are present in closer regions and not present in further away regions (as we need the same indicators bewteen both regions for comparisons) 

In [334]:
set(inner_poll_p.columns.tolist()) - set(outer_poll.columns.tolist())

{'Carbon monoxide',
 'Nitric Oxide',
 'Oxides of Nitrogen',
 'PM10 Particulate matter'}

In [335]:
inner_poll_p.drop(columns=['Carbon monoxide',
 'Nitric Oxide',
 'Oxides of Nitrogen',
 'PM10 Particulate matter',], inplace=True)

In [336]:
inner_poll_p.head()

Pollutant,CCG_Poll_Map,Year,Nitrogen dioxide,Ozone,PM10 particulate matter (Hourly measured),PM2.5 particulate matter (Hourly measured),Sulphur dioxide
0,Ealing,2004,57.882938,33.99208,23.333576,15.391342,10.043232
1,Ealing,2005,52.862974,28.594453,25.916647,21.009842,8.370425
2,Ealing,2006,55.035635,32.527539,25.443425,18.450045,6.206311
3,Ealing,2007,54.247691,28.473891,24.401769,,6.530478
4,Ealing,2008,58.848371,29.085305,22.141475,11.795321,5.656652


concat the inner/outer pollution datasets

In [337]:
poll_df = pd.concat([inner_poll_p, outer_poll], axis=0)

In [338]:
poll_df.head()

Unnamed: 0,CCG_Poll_Map,Year,Nitrogen dioxide,Ozone,PM10 particulate matter (Hourly measured),PM2.5 particulate matter (Hourly measured),Sulphur dioxide
0,Ealing,2004,57.882938,33.99208,23.333576,15.391342,10.043232
1,Ealing,2005,52.862974,28.594453,25.916647,21.009842,8.370425
2,Ealing,2006,55.035635,32.527539,25.443425,18.450045,6.206311
3,Ealing,2007,54.247691,28.473891,24.401769,,6.530478
4,Ealing,2008,58.848371,29.085305,22.141475,11.795321,5.656652


In [339]:
print(inner_poll_p.shape)
print(outer_poll.shape)

print(poll_df.shape)
poll_df.isna().sum()

(102, 7)
(65, 7)
(167, 7)


CCG_Poll_Map                                  0  
Year                                          0  
Nitrogen dioxide                              0  
Ozone                                         97 
PM10 particulate matter (Hourly measured)     18 
PM2.5 particulate matter (Hourly measured)    92 
Sulphur dioxide                               133
dtype: int64

As the health inidcators (admissions, mortality) are on CCG level, we will need to aggregate one level up (on CCG) 

Finally, rename the CCGs to match the health indicators

In [340]:
# shorten pollution indicator names
poll_df.rename(columns= {
    'PM10 particulate matter (Hourly measured)': 'PM10',
    'PM2.5 particulate matter (Hourly measured)': 'PM2.5',
}, inplace=True)


and merge pollution and health data


In [341]:
health_poll_df = pd.merge(health_df_pivot, poll_df, how='left', on=['Year','CCG_Poll_Map' ])

In [342]:
health_poll_df.head()

Unnamed: 0,Year,Area Name,Radius Location,heathrow_distance,CCG_Poll_Map,LAT,LONG,<75 Coronary Heart Disease,Asthma,Asthma Admissions Over 19yr,Asthma Admissions Under 19yr,Asthma Prevalence,Atrial Fibrillation Prevalence,CHD Admissions,CHD Prevalence,CKD Prevalence,COPD (contributory cause),COPD Admissions,COPD Prevalence,Diabetes Prevalence,Heart Failure Admissions,Heart Failure Prevalence,Hypertension Prevalence,Pneumonia (all mentions),Pneumonia Admissions,Respiratory Admissions,Respiratory Tract Admissions 1yr,Respiratory Tract Admissions U1yr,Stroke Admissions,Stroke Atrial Fib Admissions,Stroke: Prevalence,Under 75 Resp Disease,Nitrogen dioxide,Ozone,PM10,PM2.5,Sulphur dioxide
0,2003,NHS Berkshire West CCG,Outer,49791.742293,Reading,51.4458,-1.16963,57.82,,,,,,639.353,,,,,,,118.793,,,,,,,,,,,,,,,,
1,2003,NHS Brent CCG,Outer,16247.542449,No Pollution Readings,51.5644,-0.27568,76.04,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2003,NHS Buckinghamshire CCG,Outer,38818.061527,No Pollution Readings,51.7414,-0.80644,51.21,,,,,,730.138,,,,,,,107.011,,,,,,,,,,,,,,,,
3,2003,NHS Croydon CCG,Outer,28645.840703,No Pollution Readings,51.366,-0.07761,65.63,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,2003,NHS Ealing CCG,Inner,11465.530929,Ealing,51.5244,-0.31405,82.91,,,,,,954.994,,,,,,,193.386,,,,,,,,,,,,,,,,


In [343]:
# output datasets to CSV in WIDE and long format.
# (also remember to include the INdicator TYPE (get manually from the column names))
health_poll_df_long = health_poll_df.melt(
    id_vars = ['Year', 'Area Name', 'heathrow_distance', 'CCG_Poll_Map', 'Radius Location', 'LAT', 'LONG'],
    var_name = 'Indicator Name',
    value_name = 'Value',
)

In [344]:
health_poll_df_long['Indicator Name'].unique()

array(['<75 Coronary Heart Disease', 'Asthma',
       'Asthma Admissions Over 19yr', 'Asthma Admissions Under 19yr',
       'Asthma Prevalence', 'Atrial Fibrillation Prevalence',
       'CHD Admissions', 'CHD Prevalence', 'CKD Prevalence',
       'COPD (contributory cause)', 'COPD Admissions', 'COPD Prevalence',
       'Diabetes Prevalence', 'Heart Failure Admissions',
       'Heart Failure Prevalence', 'Hypertension Prevalence',
       'Pneumonia (all mentions)', 'Pneumonia Admissions',
       'Respiratory Admissions', 'Respiratory Tract Admissions 1yr',
       'Respiratory Tract Admissions U1yr', 'Stroke Admissions',
       'Stroke Atrial Fib Admissions', 'Stroke: Prevalence',
       'Under 75 Resp Disease', 'Nitrogen dioxide', 'Ozone', 'PM10',
       'PM2.5', 'Sulphur dioxide'], dtype=object)

In [345]:
print([x for x in health_poll_df_long['Indicator Name'].unique() if 'admission' in x.lower()])
print('\n')
print([x for x in health_poll_df_long['Indicator Name'].unique() if 'prevalence' in x.lower()])


admissions = ['Asthma Admissions Over 19yr', 'Asthma Admissions Under 19yr', 'CHD Admissions', 'COPD Admissions', 'Heart Failure Admissions', 'Pneumonia Admissions', 'Respiratory Admissions', 'Respiratory Tract Admissions 1yr', 'Respiratory Tract Admissions U1yr', 'Stroke Admissions', 'Stroke Atrial Fib Admissions']
prev = ['Asthma Prevalence', 'Atrial Fibrillation Prevalence', 'CHD Prevalence', 'CKD Prevalence', 'COPD Prevalence', 'Diabetes Prevalence', 'Heart Failure Prevalence', 'Hypertension Prevalence', 'Stroke: Prevalence']
pollution = ['Nitrogen dioxide', 'Ozone', 'PM10', 'PM2.5', 'Sulphur dioxide']


health_poll_df_long['Indicator Type'] = np.where(
health_poll_df_long['Indicator Name'].isin(admissions), 'Health - Admissions',
    np.where(health_poll_df_long['Indicator Name'].isin(prev), 'Health - Prevalence',    
             (np.where(health_poll_df_long['Indicator Name'].isin(pollution), 'Air Pollutant', 'Health - Mortality'))))



['Asthma Admissions Over 19yr', 'Asthma Admissions Under 19yr', 'CHD Admissions', 'COPD Admissions', 'Heart Failure Admissions', 'Pneumonia Admissions', 'Respiratory Admissions', 'Respiratory Tract Admissions 1yr', 'Respiratory Tract Admissions U1yr', 'Stroke Admissions', 'Stroke Atrial Fib Admissions']


['Asthma Prevalence', 'Atrial Fibrillation Prevalence', 'CHD Prevalence', 'CKD Prevalence', 'COPD Prevalence', 'Diabetes Prevalence', 'Heart Failure Prevalence', 'Hypertension Prevalence', 'Stroke: Prevalence']


In [346]:
health_poll_df_long.head(5)

Unnamed: 0,Year,Area Name,heathrow_distance,CCG_Poll_Map,Radius Location,LAT,LONG,Indicator Name,Value,Indicator Type
0,2003,NHS Berkshire West CCG,49791.742293,Reading,Outer,51.4458,-1.16963,<75 Coronary Heart Disease,57.82,Health - Mortality
1,2003,NHS Brent CCG,16247.542449,No Pollution Readings,Outer,51.5644,-0.27568,<75 Coronary Heart Disease,76.04,Health - Mortality
2,2003,NHS Buckinghamshire CCG,38818.061527,No Pollution Readings,Outer,51.7414,-0.80644,<75 Coronary Heart Disease,51.21,Health - Mortality
3,2003,NHS Croydon CCG,28645.840703,No Pollution Readings,Outer,51.366,-0.07761,<75 Coronary Heart Disease,65.63,Health - Mortality
4,2003,NHS Ealing CCG,11465.530929,Ealing,Inner,51.5244,-0.31405,<75 Coronary Heart Disease,82.91,Health - Mortality


In [347]:
#check that all indicators have been classified
mo = health_poll_df_long[health_poll_df_long['Indicator Type'] == 'Health - Mortality']['Indicator Name'].unique()
pv = health_poll_df_long[health_poll_df_long['Indicator Type'] == 'Health - Prevalence']['Indicator Name'].unique()
ad = health_poll_df_long[health_poll_df_long['Indicator Type'] == 'Health - Admissions']['Indicator Name'].unique()
po = health_poll_df_long[health_poll_df_long['Indicator Type'] == 'Air Pollutant']['Indicator Name'].unique()

# empty set means well done
print(set(list(health_poll_df_long['Indicator Name'])) - set(list(mo) + list(pv) + list(ad) + list(po)))

set()


Output the data and be happy! 

In [348]:
# output datasets to CSV in WIDE and long format.
# (also remember to include the INdicator TYPE (get manually from the column names))
health_poll_df.to_csv('health_pollution_final_wide.csv', encoding='utf-8')
health_poll_df_long.to_csv('health_pollution_final.csv', encoding='utf-8')



### 6. Run Multivariate EDA analysis 

In [305]:
# # lets make a correlation heatmap matrix!
# sns.set(rc={'figure.figsize':(12,9)})

# # https://seaborn.pydata.org/generated/seaborn.diverging_palette.html
# cmap= sns.diverging_palette(240, 10, as_cmap=True)

# admission_corr = admission_df_poll.corr()
# sns.heatmap(admission_corr, cmap=cmap)

# plt.title('Correlation Heatmap of Hospital Admissions and Air Pollution Indicators \n on Selected Areas Around Heathrow Airport')
# plt.show()

NameError: name 'admission_df_poll' is not defined

Observations of interest:

* Various hospital admissions are negatively correlated with heathrow distance, supporting our theory that residents living closer to heathrow experience more health ailments
* Observed pollutants appear to be (weakly) positively correlated with distance, further investigation will be required
* Nitrogen Dioxide is positively correlated with 6 hospital admissions
* PM10 particulate matter is positively correlated with with 4 hospital admissions

In [None]:
# # lets make a correlation heatmap matrix!
# sns.set(rc={'figure.figsize':(12,9)})

# # https://seaborn.pydata.org/generated/seaborn.diverging_palette.html
# # cmap = sns.diverging_palette(220, 20, as_cmap=True)
# cmap= sns.diverging_palette(240, 10, as_cmap=True)

# prev_corr = prevalence_df_poll.corr()
# sns.heatmap(prev_corr, cmap=cmap)

# plt.title('Correlation Heatmap of Disease Prevalence and Air Pollution Indicators \n on Selected Areas Around Heathrow Airport')
# plt.show()

Observations of interest:
* Distance from the airport is positively corrleated with most disease prevalence indicators), which contradicts our theory that that residents living closer to heathrow experience more health ailments we would have expected the opposite
* PM10 particulate matter is positively correlated with Asthma, Hypertension 
* Ozone is positively correlated with COPD prevalence (Further investigation required)
    

We will use pair plots to give an alternative visualisation

In [None]:
# g = sns.pairplot(
#     data=prevalence_df_poll,
#     x_vars=['Year', 'heathrow_distance','Nitrogen dioxide', 'Ozone', 'PM10', 'PM2.5', 'Sulphur dioxide'],
#     y_vars=['Year','heathrow_distance', 'Asthma Prevalence', 'Atrial Fibrillation Prevalence', 'CHD Prevalence', 'CKD Prevalence', 'COPD Prevalence', 'Diabetes Prevalence', 'Heart Failure Prevalence', 'Hypertension Prevalence', 'Stroke: Prevalence',]
# )

# g.fig.suptitle("Multivariate Correlation Plots for London Disease Prevalence and Air Pollution (2009-2019)", y=1.02, size=16) # y= some height>1

# plt.show()

In [None]:
# sns.pairplot(
#     data=admission_df_poll,
#     x_vars=['Year', 'heathrow_distance','Nitrogen dioxide', 'Ozone', 'PM10', 'PM2.5', 'Sulphur dioxide'],
#     y_vars=['Year','heathrow_distance', 'Asthma Admissions Over 19yr', 'Asthma Admissions Under 19yr', 'CHD Admissions', 'COPD Admissions', 'Heart Failure Admissions', 'Pneumonia Admissions', 'Respiratory Admissions', 'Respiratory Tract Admissions 1yr', 'Respiratory Tract Admissions U1yr', 'Stroke Admissions', 'Stroke Atrial Fib Admissions',],
#     height=3, 
# #     aspect=0.8
# )
    
    
# g.fig.suptitle("Multivariate Correlation Plots for London Hospital Admissions and Air Pollution (2009-2019/2013-2018)", y=1.02, size=16) # y= some height>1
# plt.tight_layout()
# plt.show()

### Export file (Used for Front End and Model)

In [None]:
# geog.head()

In [None]:
# # get lat long (for front end design)
# admission_df_poll_latlong = pd.merge(admission_df_poll.copy(), geog_ccg[['Area Name', 'LAT', 'LONG']], how='inner', on='Area Name') 
# prevalence_df_poll_latlong = pd.merge(prevalence_df_poll.copy(), geog_ccg[['Area Name', 'LAT', 'LONG']], how='inner', on='Area Name')


# # output the data files - to be cleaned up this week... 
# admission_df_poll_latlong.to_csv('../data_output/joined_health_pollution/admission_pollution.csv', encoding='utf-8')
# prevalence_df_poll_latlong.to_csv('../data_output/joined_health_pollution/prevalence_pollution.csv', encoding='utf-8')

# print(admission_df_poll_latlong.shape)
# print(prevalence_df_poll_latlong.shape)

Output melted DF files to CSV (preferred format for front end)

In [None]:
# admission_df_poll.head()

# pd.melt(admission_df_poll_latlong, id_vars=['Year', 'Area Name', 'CCG Location', 'CCG_Poll_Map', 'LAT', 'LONG'],
#         var_name='Indicator Name', value_name='Value').to_csv('../data_output/joined_health_pollution/admission_pollution_melt.csv')


# pd.melt(prevalence_df_poll_latlong, id_vars=['Year', 'Area Name', 'CCG Location', 'CCG_Poll_Map', 'LAT', 'LONG'],
#         var_name='Indicator Name', value_name='Value').to_csv('../data_output/joined_health_pollution/prevalence_pollution_melt.csv')


In [None]:
# dff = pd.melt(prevalence_df_poll_latlong, id_vars=['Year', 'Area Name', 'CCG Location', 'CCG_Poll_Map'],
#         var_name='Indicator Name', value_name='Value')
# dff.head()

In [None]:
# dff[dff['Area Name']=='NHS Oxfordshire CCG' ]

**Conclusions**


**The correlation heatmaps indicate:**

**Disease Prevalence Vs Pollution**
* Distance from the airport is positively corrleated with most disease prevalence indicators), which contradicts our theory that that residents living closer to heathrow experience more health ailments, we would have expected the opposite - further investigation will be required
* PM10 particulate matter is positively correlated with Asthma, Hypertension
* Ozone is positively correlated with COPD prevalence (Further investigation required)


**Hospital Admissions Vs Pollution**
* Various hospital admissions are negatively correlated with heathrow distance, supporting our theory that residents living closer to heathrow experience more health ailments
* Nitrogen Dioxide is positively correlated with 6 hospital admissions
* PM10 particulate matter is positively correlated with with 4 hospital admissions

**Extra**
* Observed pollutants appear to be (weakly) positively correlated with distance, further investigation will be required

**The Multivariate Correlation Plots indicate:**
* There are not many points avaialble for the Sulphur Dioxdide indicator - follow on actions will be given to assess the feasibility of continuing with this indicator. 
* Similarly, action will be undetaken to assess the 2.5OPM indicator (low points)