# Dataset Preprocessing

In [7]:
import pandas as pd

In [8]:
# Read the two raw CSV inputs used by the rest of the notebook.
iod_df = pd.read_csv("raw/indices_of_deprivation.csv")
population_df = pd.read_csv("raw/london_population.csv")

# Preview the first rows of each frame, one preview after the other.
print(iod_df.head(), population_df.head(), sep="\n")

  LSOA code (2011)     LSOA name (2011) Local Authority District code (2019)  \
0        E01000001  City of London 001A                            E09000001   
1        E01000002  City of London 001B                            E09000001   
2        E01000003  City of London 001C                            E09000001   
3        E01000005  City of London 001E                            E09000001   
4        E01032739  City of London 001F                            E09000001   

  Local Authority District name (2019)  \
0                       City of London   
1                       City of London   
2                       City of London   
3                       City of London   
4                       City of London   

   Index of Multiple Deprivation (IMD) Score  \
0                                        6.2   
1                                        5.1   
2                                       19.4   
3                                       28.7   
4                         

In [9]:
# List the column labels of both frames to find the key needed for the join.
print(f"iod_df columns: {iod_df.columns}")
print(f"population_df columns: {population_df.columns}")

iod_df columns: Index(['LSOA code (2011)', 'LSOA name (2011)',
       'Local Authority District code (2019)',
       'Local Authority District name (2019)',
       'Index of Multiple Deprivation (IMD) Score',
       'Index of Multiple Deprivation (IMD) Rank (where 1 is most deprived)',
       'Index of Multiple Deprivation (IMD) Decile (where 1 is most deprived 10% of LSOAs)',
       'Income Score (rate)', 'Income Rank (where 1 is most deprived)',
       'Income Decile (where 1 is most deprived 10% of LSOAs)',
       'Employment Score (rate)', 'Employment Rank (where 1 is most deprived)',
       'Employment Decile (where 1 is most deprived 10% of LSOAs)',
       'Education, Skills and Training Score',
       'Education, Skills and Training Rank (where 1 is most deprived)',
       'Education, Skills and Training Decile (where 1 is most deprived 10% of LSOAs)',
       'Health Deprivation and Disability Score',
       'Health Deprivation and Disability Rank (where 1 is most deprived)',
  

In [10]:
# Rename the deprivation frame's LSOA code column to "LSOA11CD" so both
# frames share the key name used for the join below.
iod_df = iod_df.rename(columns={"LSOA code (2011)": "LSOA11CD"})

# Inner join: keep only the LSOA codes that appear in both frames.
merged_df = pd.merge(iod_df, population_df, on="LSOA11CD", how="inner")

In [11]:
# Collapse the merged frame to one row per LSOA11CD with its total population
# and its IMD score.
#
# NOTE: this cell rebinds `merged_df`, so run it once after the merge cell.
# Re-running it on its own raises KeyError, because 'All Ages' has already
# been renamed to 'population' by the first run.
lsoa_columns = [
    'LSOA11CD',
    'All Ages',
    'Index of Multiple Deprivation (IMD) Score',
]

merged_df = (
    merged_df[lsoa_columns]
    # .assign builds a new frame instead of writing into the column subset,
    # which avoids pandas' SettingWithCopyWarning. The dict unpacking is needed
    # because the column name contains a space.
    # errors='coerce' turns values that cannot be parsed into NaN; the 'sum'
    # below then skips those NaN values.
    .assign(**{
        'All Ages': lambda frame: pd.to_numeric(frame['All Ages'], errors='coerce')
    })
    .groupby('LSOA11CD', as_index=False)
    .agg({
        'All Ages': 'sum',
        # Assuming IMD score remains the same for each LSOA11CD
        'Index of Multiple Deprivation (IMD) Score': 'first',
    })
    .rename(columns={
        'All Ages': 'population',
        'Index of Multiple Deprivation (IMD) Score': 'IMD score',
    })
)

merged_df

Unnamed: 0,LSOA11CD,population,IMD score
0,E01000001,1749.0,6.2
1,E01000002,1678.0,5.1
2,E01000003,1900.0,19.4
3,E01000005,2181.0,28.7
4,E01000006,2117.0,19.8
...,...,...,...
4830,E01033742,1705.0,13.1
4831,E01033743,2330.0,21.0
4832,E01033744,1810.0,29.4
4833,E01033745,1987.0,22.1


In [12]:
# Write the per-LSOA table to disk; index=False leaves the row index out of the file.
merged_df.to_csv(path_or_buf="socioeconomic_data.csv", index=False)