In [11]:
import pandas as pd

In [12]:
# Input tables (one per year) and the locations the reduced tables are written to
file_2011_path = 'Data/Raw/Health/2011.csv'
file_2021_path = 'Data/Raw/Health/2021.csv'
df_2011_final_path = 'Data/Revised/Health/2011.csv'
df_2021_final_path = 'Data/Revised/Health/2021.csv'

# Weight applied to each health column, from 5 ('Very good health')
# down to 1 ('Very bad health')
HEALTH_WEIGHTS = {
    'Very good health': 5,
    'Good health': 4,
    'Fair health': 3,
    'Bad health': 2,
    'Very bad health': 1,
}


def weighted_health_score(table):
    """Return the row-wise weighted sum of the health columns of ``table``.

    Every column named in ``HEALTH_WEIGHTS`` is multiplied by its weight and
    the products are added together in the order the columns are listed.
    """
    weighted_columns = iter(HEALTH_WEIGHTS.items())
    first_column, first_weight = next(weighted_columns)
    score = table[first_column] * first_weight
    for column, weight in weighted_columns:
        score = score + table[column] * weight
    return score


# Read both years
df_2011 = pd.read_csv(file_2011_path)
df_2021 = pd.read_csv(file_2021_path)

# Add the year-specific score column to each table
df_2011['Score_2011'] = weighted_health_score(df_2011)
df_2021['Score_2021'] = weighted_health_score(df_2021)

# Only the LSOA code and the score are exported
df_2011_final = df_2011[['LSOA code', 'Score_2011']]
df_2021_final = df_2021[['LSOA code', 'Score_2021']]

# Write the reduced tables without the DataFrame index
df_2011_final.to_csv(df_2011_final_path, index=False)
df_2021_final.to_csv(df_2021_final_path, index=False)

In [14]:
# Load the LSOA/MSOA lookup table and the 2021 population tables
ml_data = pd.read_csv('Data/Revised/Geography/Lookup/11-21-ML.csv')
lsoa_data = pd.read_csv('Data/Revised/Population/LSOA_2021.csv')
msoa_data = pd.read_csv('Data/Revised/Population/MSOA_2021.csv')


def population_to_float(totals):
    """Convert a population 'Total' column to float.

    Text values have their ',' separators removed before the cast. A column
    that pandas already parsed as numeric is cast directly, because the
    ``.str`` accessor is only available on text columns and would raise
    ``AttributeError`` otherwise.
    """
    if pd.api.types.is_numeric_dtype(totals):
        return totals.astype(float)
    # regex=False: the comma is a literal character, not a pattern
    return totals.str.replace(',', '', regex=False).astype(float)


# Attach each LSOA's total to the lookup table as column 'LSOA'
ml_data_merged = pd.merge(
    ml_data,
    lsoa_data[['LSOA 2021 Code', 'Total']],
    left_on='LSOA21CD',
    right_on='LSOA 2021 Code',
    how='left',
).rename(columns={'Total': 'LSOA'})

# Attach the total of the containing MSOA as column 'MSOA', then drop the two
# key columns that were only needed for the merges
ml_data_final = (
    pd.merge(
        ml_data_merged,
        msoa_data[['MSOA 2021 Code', 'Total']],
        left_on='MSOA21CD',
        right_on='MSOA 2021 Code',
        how='left',
    )
    .rename(columns={'Total': 'MSOA'})
    .drop(columns=['LSOA 2021 Code', 'MSOA 2021 Code'])
)

# Make both totals numeric so they can be divided
ml_data_final['LSOA'] = population_to_float(ml_data_final['LSOA'])
ml_data_final['MSOA'] = population_to_float(ml_data_final['MSOA'])

# Share of the MSOA total that belongs to each LSOA.
# NOTE(review): the share is rounded to 2 decimals here and later multiplied
# with the health scores, so the rounding error carries into those products --
# confirm this precision is intended.
ml_data_final['Proportion'] = (ml_data_final['LSOA'] / ml_data_final['MSOA']).round(2)

In [15]:
# ---- Processing for 2011 ----
# Scale every LSOA's 2011 score by its 'Proportion', total the scaled scores
# per MSOA and export one row per MSOA.

# Load the 2011 LSOA scores
data_2011 = pd.read_csv('Data/Revised/Health/2011.csv')

# Attach Score_2011 to the proportion table. The right-hand key 'LSOA code'
# is dropped immediately; 'LSOA21CD' remains as the LSOA identifier.
# NOTE(review): the 2011 scores are joined on the 2021 LSOA code -- confirm
# that 'LSOA code' in the 2011 file is comparable to 'LSOA21CD'.
merged_data_with_2011 = pd.merge(
    ml_data_final,
    data_2011,
    left_on='LSOA21CD',
    right_on='LSOA code',
    how='left',
).drop(columns=['LSOA code'])

# Scaled score per LSOA, before rounding
scaled_score_2011 = merged_data_with_2011['Proportion'] * merged_data_with_2011['Score_2011']

# The left joins leave NaN for rows without a match, and NaN cannot be cast to
# int. Fail with a message that names the affected rows instead of pandas'
# generic cast error.
rows_without_score = scaled_score_2011.isna()
if rows_without_score.any():
    example_codes = merged_data_with_2011.loc[rows_without_score, 'LSOA21CD'].head(5).tolist()
    raise ValueError(
        "{} row(s) have no 'Proportion' or 'Score_2011' after merging, so 'Score' "
        "cannot be computed; first LSOA21CD values: {}".format(
            int(rows_without_score.sum()), example_codes
        )
    )

# Round the scaled score to the nearest whole number
merged_data_with_2011['Score'] = scaled_score_2011.round(0).astype(int)

# Total 'Score' for each 'MSOA21CD'
msoa_score_sum = merged_data_with_2011.groupby('MSOA21CD')['Score'].sum().reset_index()

# Add the MSOA total to every LSOA row as column '2011'. The total is renamed
# before merging so it does not collide with the per-LSOA 'Score' column.
merged_data_with_msoa_sum = pd.merge(
    merged_data_with_2011,
    msoa_score_sum.rename(columns={'Score': '2011'}),
    on='MSOA21CD',
    how='left',
)

# Keep the MSOA identifiers and the total only
filtered_data = merged_data_with_msoa_sum[['MSOA21CD', 'MSOA21NM', '2011']]

# Collapse the repeated LSOA rows to one row per MSOA code/name pair
filtered_data_unique = filtered_data.drop_duplicates(subset=['MSOA21CD', 'MSOA21NM'])

# Export the per-MSOA table
filtered_data_unique.to_csv('Data/Revised/Health/Health_2011.csv', index=False)

In [16]:
# ---- Processing for 2021 ----
# Scale every LSOA's 2021 score by its 'Proportion', total the scaled scores
# per MSOA and export one row per MSOA.

# Load the 2021 LSOA scores
data_2021 = pd.read_csv('Data/Revised/Health/2021.csv')

# Attach Score_2021 to the proportion table. The right-hand key 'LSOA code'
# is dropped immediately; 'LSOA21CD' remains as the LSOA identifier.
merged_data_with_2021 = pd.merge(
    ml_data_final,
    data_2021,
    left_on='LSOA21CD',
    right_on='LSOA code',
    how='left',
).drop(columns=['LSOA code'])

# Scaled score per LSOA, before rounding
scaled_score_2021 = merged_data_with_2021['Proportion'] * merged_data_with_2021['Score_2021']

# The left joins leave NaN for rows without a match, and NaN cannot be cast to
# int. Fail with a message that names the affected rows instead of pandas'
# generic cast error.
rows_without_score = scaled_score_2021.isna()
if rows_without_score.any():
    example_codes = merged_data_with_2021.loc[rows_without_score, 'LSOA21CD'].head(5).tolist()
    raise ValueError(
        "{} row(s) have no 'Proportion' or 'Score_2021' after merging, so 'Score' "
        "cannot be computed; first LSOA21CD values: {}".format(
            int(rows_without_score.sum()), example_codes
        )
    )

# Round the scaled score to the nearest whole number
merged_data_with_2021['Score'] = scaled_score_2021.round(0).astype(int)

# Total 'Score' for each 'MSOA21CD'
msoa_score_sum = merged_data_with_2021.groupby('MSOA21CD')['Score'].sum().reset_index()

# Add the MSOA total to every LSOA row as column '2021'. The total is renamed
# before merging so it does not collide with the per-LSOA 'Score' column.
merged_data_with_msoa_sum = pd.merge(
    merged_data_with_2021,
    msoa_score_sum.rename(columns={'Score': '2021'}),
    on='MSOA21CD',
    how='left',
)

# Keep the MSOA identifiers and the total only
filtered_data = merged_data_with_msoa_sum[['MSOA21CD', 'MSOA21NM', '2021']]

# Collapse the repeated LSOA rows to one row per MSOA code/name pair
filtered_data_unique = filtered_data.drop_duplicates(subset=['MSOA21CD', 'MSOA21NM'])

# Export the per-MSOA table
filtered_data_unique.to_csv('Data/Revised/Health/Health_2021.csv', index=False)

In [20]:
# Read back the per-MSOA score tables for both years
file_path_1 = 'Data/Revised/Health/Health_2011.csv'
file_path_2 = 'Data/Revised/Health/Health_2021.csv'
data_1 = pd.read_csv(file_path_1)
data_2 = pd.read_csv(file_path_2)

# Split each year's score into ten quantile bins. With labels=False qcut
# numbers the bins from 0, so one is added to get deciles 1 to 10.
decile_index_2011 = pd.qcut(data_1['2011'], q=10, labels=False)
data_1['Decile_Before'] = decile_index_2011 + 1
decile_index_2021 = pd.qcut(data_2['2021'], q=10, labels=False)
data_2['Decile_After'] = decile_index_2021 + 1

# Write the tables, now with their decile column, back over the files they
# were read from
data_1.to_csv(file_path_1, index=False)
data_2.to_csv(file_path_2, index=False)