# 4.5.4 Final join of census data

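This notebook joins the cleaned Idealista property listings for Madrid, Barcelona and Valencia with the processed INE census data. Each postal code is mapped to its nearest census-section centroid, and the census attributes are then attached to the listings by postal code.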

### Import libraries

In [17]:
import pandas as pd

In [18]:
pd.set_option('display.max_columns', None)

### Read data

In [19]:
# Cleaned Idealista listings, one frame per city (unpacked in the same order as `dfs`).
df_barcelona, df_valencia, df_madrid = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../../../data/4_data_cleaned/barcelona_cleaned_base_features.csv',
        '../../../../data/4_data_cleaned/valencia_cleaned_base_features.csv',
        '../../../../data/4_data_cleaned/madrid_cleaned_base_features.csv',
    )
)
dfs = [df_barcelona, df_valencia, df_madrid]

# Postal code -> nearest census-section centroid mapping, and the INE census table it points into.
mapping_df = pd.read_csv('../../../../data/3_external_data/census/mapping_data/map_cs_pc.csv')
ine_data = pd.read_csv('../../../../data/3_external_data/census/mapping_data/INE_DATA_1.csv')

### Merge cleaned Idealista data with census data

In [20]:
# Number of distinct ZIP codes among the Madrid listings (a missing value counts as one).
df_madrid['ZIP_CODE'].nunique(dropna=False)

64

In [21]:
# 'Unnamed: 0' is presumably the row index saved by an earlier to_csv call.
# errors='ignore' keeps this cell re-runnable: the original in-place drop raised
# KeyError on a second execution because the column was already gone.
mapping_df = mapping_df.drop(columns='Unnamed: 0', errors='ignore')

In [22]:
# Normalise every join key to a fixed-width string so that leading zeros lost
# by the numeric CSV parse (e.g. Barcelona's 08xxx postal codes) are restored.
mapping_df['Postal_Code'] = mapping_df['Postal_Code'].astype(str).str.zfill(5)
mapping_df['Nearest_Centroid_ID'] = mapping_df['Nearest_Centroid_ID'].astype(str).str.zfill(10)
ine_data['Census section'] = ine_data['Census section'].astype(str).str.zfill(10)
df_barcelona['ZIP_CODE'] = df_barcelona['ZIP_CODE'].astype(str).str.zfill(5)

# Unparseable distances become NaN instead of raising.
mapping_df['Distance_to_Centroid'] = pd.to_numeric(mapping_df['Distance_to_Centroid'], errors='coerce')

# Keep exactly one row per postal code: the one with the smallest distance to
# its centroid. A stable sort puts NaN distances last and keeps the original
# row order among ties, so the chosen row matches what groupby().idxmin()
# picked, while a postal code whose distances are all NaN no longer breaks the
# cell (idxmin yields no valid label for such a group).
# Row order of final_df differs from the previous split/concat version, which
# is harmless because the following steps are key-based merges.
final_df = (
    mapping_df
    .sort_values('Distance_to_Centroid', kind='mergesort')
    .drop_duplicates('Postal_Code', keep='first')
    .sort_values('Postal_Code', kind='mergesort')
    .reset_index(drop=True)
)

# Invariant relied on by the later ZIP_CODE join.
assert final_df['Postal_Code'].is_unique, 'expected one mapping row per postal code'

In [23]:
# Expose the centroid id under the INE key name, then attach the census
# attributes. The merge uses the default inner join, so mapping rows without a
# matching census section are not carried forward.
final_df = final_df.rename(columns={'Nearest_Centroid_ID': 'Census section'})
merged_df = final_df.merge(ine_data, on='Census section')

In [24]:
# Remove the coordinate, admin-code and centroid-distance columns, then rename
# the postal-code column to the key name used by the listings (ZIP_CODE).
merged_df = merged_df.drop(columns=['Latitude', 'Longitude', 'Admin_Code3', 'Distance_to_Centroid'])
merged_df = merged_df.rename(columns={'Postal_Code': 'ZIP_CODE'})

In [25]:
# Cast the join key to str on both sides of the upcoming merge so that the
# census table and every city frame compare ZIP_CODE as the same dtype.
for zip_frame in (merged_df, df_barcelona, df_madrid, df_valencia):
    zip_frame['ZIP_CODE'] = zip_frame['ZIP_CODE'].astype(str)

In [26]:
def final_join(city_df, merged_df, how='inner'):
    """Attach the census columns to one city's listings, matching on ZIP_CODE.

    Parameters
    ----------
    city_df : pandas.DataFrame
        Listings for a single city; must contain a ``ZIP_CODE`` column.
    merged_df : pandas.DataFrame
        Census attributes; must contain a ``ZIP_CODE`` column.
    how : str, default 'inner'
        Join type forwarded to ``pandas.merge``. The default reproduces the
        previous behaviour, in which listings whose ZIP_CODE has no census
        match are silently dropped. Pass ``'left'`` to keep every listing
        (unmatched rows then get NaN census columns).

    Returns
    -------
    pandas.DataFrame
        Columns of ``city_df`` followed by the non-key columns of ``merged_df``.
    """
    return pd.merge(city_df, merged_df, on='ZIP_CODE', how=how)

In [27]:
# Run the same ZIP_CODE join for each city against the shared census table.
madrid_final, barcelona_final, valencia_final = (
    final_join(city_listings, merged_df)
    for city_listings in (df_madrid, df_barcelona, df_valencia)
)

### Write joined data to CSV

In [29]:
# index=False: the frames carry a fresh 0..n-1 RangeIndex after the merge, so
# writing it only adds a meaningless 'Unnamed: 0' column when the file is read
# back. NOTE(review): any downstream reader that explicitly drops
# 'Unnamed: 0' from these files must be adjusted accordingly.
madrid_final.to_csv('../../../../data/5_cleaned_and_feature_engineering/cleaned_with_ine/madrid_cleaned_with_ine.csv', index=False)
barcelona_final.to_csv('../../../../data/5_cleaned_and_feature_engineering/cleaned_with_ine/barcelona_cleaned_with_ine.csv', index=False)
valencia_final.to_csv('../../../../data/5_cleaned_and_feature_engineering/cleaned_with_ine/valencia_cleaned_with_ine.csv', index=False)