In [95]:
import pandas as pd
import os

# Notebook display settings: show every row and column, and never truncate
# long cell text (indicator names and value notes are long strings).
pd.set_option("display.max_rows", None, "display.max_columns", None)
# `None` is the supported "no limit" value; the former `-1` sentinel was
# deprecated in pandas 1.0 and is rejected by current pandas releases.
pd.set_option('display.max_colwidth', None)
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

### 1. Load in the Datasets

In [96]:
# Read each extract and expose its 'Time period Sortable' column as 'Year'.
admissions_df = (
    pd.read_csv('../data_output/hospital_admissions_resp_crdv/admissions_resp_crdv_inc_strk.csv')
    .rename(columns={'Time period Sortable': 'Year'})
)
prevalence_df = (
    pd.read_csv('../data_output/nhs_qof_dataset/nhs_qof_dataset_ccg.csv')
    .rename(columns={'Time period Sortable': 'Year'})
)

### 2. Rename Indicators 

In [97]:
# Short display labels for the admissions indicators, keyed by the full
# indicator name as it appears in the 'Indicator Name' column.
indicator_dict = {
    # respiratory
    'Emergency hospital admissions for COPD, all ages': 'COPD Admissions',
    'Emergency hospital admissions for respiratory disease': 'Respiratory Admissions',
    'Admissions for lower respiratory tract infections in infants aged 1 year': 'Respiratory Tract Admissions 1yr',
    'Admissions for lower respiratory tract infections in infants aged under 1 year': 'Respiratory Tract Admissions U1yr',
    'Emergency hospital admissions for asthma in adults (aged 19 years and over)': 'Asthma Admissions Over 19yr',
    'Hospital admissions for asthma (under 19 years)': 'Asthma Admissions Under 19yr',
    'Emergency hospital admissions for pneumonia': 'Pneumonia Admissions',
    # cardiovascular
    'CHD admissions (all ages)': 'CHD Admissions',
    'Heart failure admissions (all ages)': 'Heart Failure Admissions',
    'Stroke admissions (Sentinel Stroke National Audit Programme)': 'Stroke Admissions',
    'Stroke admissions with history of atrial fibrillation not prescribed anticoagulation prior to stroke': 'Stroke Atrial Fib Admissions',
}

# Short labels grouped by disease family.
respiratory = [
    'COPD Admissions', 'Respiratory Admissions',
    'Respiratory Tract Admissions 1yr', 'Respiratory Tract Admissions U1yr',
    'Asthma Admissions Over 19yr', 'Asthma Admissions Under 19yr',
    'Pneumonia Admissions',
]
cardiov_ = [
    'CHD Admissions', 'Heart Failure Admissions',
    'Stroke Admissions', 'Stroke Atrial Fib Admissions',
]

# Swap the long indicator names for their short labels.
# `replace` is used instead of `map` because `map` turns every value that is
# not a key of indicator_dict into NaN: an unexpected indicator would vanish
# silently, and re-running this cell would blank the already-renamed column.
# `replace` leaves non-matching values untouched, so the cell is idempotent.
admissions_df['Indicator Name'] = admissions_df['Indicator Name'].replace(indicator_dict)
admissions_df['Indicator Name'].unique()

array(['COPD Admissions', 'Respiratory Admissions',
       'Respiratory Tract Admissions 1yr',
       'Respiratory Tract Admissions U1yr', 'Asthma Admissions Over 19yr',
       'Asthma Admissions Under 19yr', 'Pneumonia Admissions',
       'CHD Admissions', 'Heart Failure Admissions', 'Stroke Admissions',
       'Stroke Atrial Fib Admissions'], dtype=object)

In [98]:
# Short display labels for the QOF prevalence indicators, keyed by the full
# indicator name as it appears in the 'Indicator Name' column.
# NOTE(review): 'Stroke: Prevalence' keeps a colon, unlike the other labels;
# it is left as-is because later cells may refer to that exact label.
indicator_dict_q = {
    # cardiovascular
    'Hypertension: QOF prevalence (all ages)': 'Hypertension Prevalence',
    'Diabetes: QOF prevalence (17+)': 'Diabetes Prevalence',
    'CHD: QOF prevalence (all ages)': 'CHD Prevalence',
    'Heart Failure: QOF prevalence (all ages)': 'Heart Failure Prevalence',
    'CKD: QOF prevalence (18+)': 'CKD Prevalence',
    'Stroke: QOF prevalence (all ages)': 'Stroke: Prevalence',
    'Atrial fibrillation: QOF prevalence': 'Atrial Fibrillation Prevalence',
    # respiratory
    'COPD: QOF prevalence (all ages)': 'COPD Prevalence',
    'Asthma: QOF prevalence (all ages)': 'Asthma Prevalence',
}

# Short labels grouped by disease family.
resp_p = ['COPD Prevalence', 'Asthma Prevalence']

cardio_p = [
    'Hypertension Prevalence', 'Diabetes Prevalence',
    'CHD Prevalence', 'Heart Failure Prevalence',
    'CKD Prevalence', 'Stroke: Prevalence',
    'Atrial Fibrillation Prevalence',
]

# Swap the long QOF indicator names for their short labels.
# `replace` is used instead of `map` because `map` turns every value that is
# not a key of indicator_dict_q into NaN: an unexpected indicator would vanish
# silently, and re-running this cell would blank the already-renamed column.
# `replace` leaves non-matching values untouched, so the cell is idempotent.
prevalence_df['Indicator Name'] = prevalence_df['Indicator Name'].replace(indicator_dict_q)
prevalence_df['Indicator Name'].unique()

array(['Hypertension Prevalence', 'Diabetes Prevalence', 'CHD Prevalence',
       'Heart Failure Prevalence', 'CKD Prevalence', 'Stroke: Prevalence',
       'Atrial Fibrillation Prevalence', 'COPD Prevalence',
       'Asthma Prevalence'], dtype=object)

In [99]:
# Preview the first five rows to check the renamed indicator labels.
prevalence_df.head(n=5)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Indicator ID,Indicator Name,Parent Code,Parent Name,Area Code,Area Name,Area Type,Sex,Age,Category Type,Category,Time period,Value,Lower CI 95.0 limit,Upper CI 95.0 limit,Lower CI 99.8 limit,Upper CI 99.8 limit,Count,Denominator,Value note,Recent Trend,Compared to England value or percentiles,Compared to percentiles,Year,New data,Compared to goal,CCG Location
0,19819,19819,219,Hypertension Prevalence,E92000001,England,E38000007,NHS Basildon And Brentwood CCG,CCGs (2019/20),Persons,All ages,,,2009/10,14.259531,14.11577,14.404511,14.03342,14.488671,32132.0,225337.0,Aggregated from all known lower geography values,,Higher,Not compared,2009,,,Outer Radius
1,19829,19829,219,Hypertension Prevalence,E92000001,England,E38000020,NHS Brent CCG,CCGs (2019/20),Persons,All ages,,,2009/10,11.175806,11.062693,11.289928,10.997922,11.3562,33014.0,295406.0,Aggregated from all known lower geography values,,Lower,Not compared,2009,,,Outer Radius
2,19845,19845,219,Hypertension Prevalence,E92000001,England,E38000040,NHS Croydon CCG,CCGs (2019/20),Persons,All ages,,,2009/10,11.892499,11.782284,12.003604,11.71913,12.068081,39091.0,328703.0,Aggregated from all known lower geography values,,Lower,Not compared,2009,,,Outer Radius
3,19852,19852,219,Hypertension Prevalence,E92000001,England,E38000048,NHS Ealing CCG,CCGs (2019/20),Persons,All ages,,,2009/10,12.02698,11.92071,12.134068,11.859797,12.196195,42954.0,357147.0,Aggregated from all known lower geography values,,Lower,Not compared,2009,,,Inner Radius
4,19869,19869,219,Hypertension Prevalence,E92000001,England,E38000070,NHS Hammersmith And Fulham CCG,CCGs (2019/20),Persons,All ages,,,2009/10,9.423858,9.290173,9.559265,9.213858,9.638136,17070.0,181136.0,Aggregated from all known lower geography values,,Lower,Not compared,2009,,,Outer Radius


### 3. Get NHS CCG Distances

In [100]:
# CCGs included in the study, split into two groups.
# NOTE(review): the groups presumably mirror the 'Inner Radius' / 'Outer Radius'
# values of the 'CCG Location' column - confirm.
inner_ccgs = [
    'NHS Buckinghamshire CCG',
    'NHS Ealing CCG',
    'NHS East Berkshire CCG',
    'NHS Hillingdon CCG',
    'NHS Hounslow CCG',
    'NHS North West Surrey CCG',
]

outer_ccgs = [
    'NHS Basildon And Brentwood CCG',
    'NHS Berkshire West CCG',
    'NHS Brent CCG',
    'NHS Croydon CCG',
    'NHS Guildford and Waverley CCG',
    'NHS Hammersmith And Fulham CCG',
    'NHS Harrow CCG',
    'NHS Herts Valleys CCG',
    'NHS Kingston CCG',
    'NHS Merton CCG',
    'NHS Oxfordshire CCG',
    'NHS Richmond CCG',
    'NHS Surrey Heath CCG',
    'NHS Sutton CCG',
    'NHS Wandsworth CCG',
]

# Load the CCG geometry table and keep only the rows for the study CCGs.
# Area names are matched exactly (case-sensitive). Per the original note,
# Hammersmith And Fulham and one other entry (presumably Basildon And
# Brentwood - confirm) were renamed so their spelling matches the lists above;
# lower-casing both sides before the join is the alternative.
geog = pd.read_csv('../../geography/nhs_ccg_geometry.csv', encoding='utf-8')
geog_ccg = geog.loc[
    geog['Area Name'].isin(inner_ccgs + outer_ccgs)
]

In [101]:
# Attach each CCG's 'heathrow_distance' to both health frames. A left join
# keeps every health row; areas missing from geog_ccg get NaN for the distance.
prevalence_df_dist = prevalence_df.merge(
    geog_ccg[['Area Name', 'heathrow_distance']],
    how='left',
    on='Area Name',
)
admissions_df_dist = admissions_df.merge(
    geog_ccg[['Area Name', 'heathrow_distance']],
    how='left',
    on='Area Name',
)

### 4. Transform Health DF into Multi-Variate format

In [111]:
def _to_multivariate(health_df):
    """Build the long and wide (one column per indicator) views of a health frame.

    Parameters
    ----------
    health_df : pandas.DataFrame
        Must contain the columns 'Year', 'Area Name', 'Indicator Name',
        'Value', 'CCG Location' and 'heathrow_distance'.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(long_df, wide_df)``: ``long_df`` is an independent copy restricted
        to the required columns; ``wide_df`` has one row per
        (Year, Area Name, CCG Location, heathrow_distance) and one column per
        indicator, with the index levels restored as ordinary columns.

    Notes
    -----
    ``pivot_table`` aggregates with its default (mean), so duplicate rows for
    the same key/indicator are averaged rather than raising.
    NOTE(review): rows whose key columns are NaN (e.g. an Area Name with no
    distance match) are presumably dropped by the pivot - confirm.
    """
    # restrict the columns to those required
    long_df = health_df[
        ['Year', 'Area Name', 'Indicator Name', 'Value', 'CCG Location', 'heathrow_distance']
    ].copy()

    # The columns axis keeps the name 'Indicator Name' after reset_index().
    wide_df = long_df.pivot_table(
        values='Value',
        index=['Year', 'Area Name', 'CCG Location', 'heathrow_distance'],
        columns='Indicator Name',
    ).reset_index()
    return long_df, wide_df


# Both frames go through the same transformation. The former
# `.rename(columns={'Indicator Name': 'Index'})` on the prevalence pivot was
# dropped: 'Indicator Name' is the name of the columns axis, not a column
# label, so that rename never changed anything.
prevalence_df_multi, prevalence_df_pivot = _to_multivariate(prevalence_df_dist)
admissions_df_multi, admissions_df_pivot = _to_multivariate(admissions_df_dist)

### 5. Transform Health DF into Multi-Variate format