# Notebook 4.2 Feature Engineering (Income)

# Import libraries

In [187]:
import pandas as pd

# Choose the city

In [188]:
#Choose city ("Madrid", "Barcelona", or "Valencia")
city = "Madrid"

# Load cleaned data incl zip code

In [189]:
#Read corresponding files (cleaned data and income data)
if city == "Madrid":
    sale_data_cleaned = pd.read_csv('../../data/4_data_cleaned/madrid_cleaned_base_features.csv', dtype={'ZIP_CODE': str})
    income_data = pd.read_csv('../../data/3_external_data/income/madrid_income_data.csv')
elif city == "Barcelona":
    sale_data_cleaned = pd.read_csv('../../data/4_data_cleaned/barcelona_cleaned_base_features.csv', dtype={'ZIP_CODE': str})
    income_data = pd.read_csv('../../data/3_external_data/income/barcelona_income_data.csv')
elif city == "Valencia":
    sale_data_cleaned = pd.read_csv('../../data/4_data_cleaned/valencia_cleaned_base_features.csv', dtype={'ZIP_CODE': str})
    income_data = pd.read_csv('../../data/3_external_data/income/valencia_income_data.csv')
else:
    raise ValueError("City not recognized. Please choose either 'Madrid', 'Barcelona', or 'Valencia'.")

# Create income feature

__Rename columns (from spanish to english in income dataset)__

In [190]:
# Define a dictionary to map Spanish column names to English
column_rename_map = {
    'Area': 'AREA',
    'Numero de declaraciones': 'NUMER_OF_DECLARATIONS',
    'Renta bruta media': 'AVERAGE_GROSS_INCOME',
    'Renta disponible media': 'AVERAGE_DISPOSABLE_INCOME',
    'Rentas del trabajo': 'LABOR_INCOME',
    'Rentas exentas': 'EXEMPT_INCOME',
    'Renta bruta': 'GROSS_INCOME',
    'Cotizaciones sociales a la SS': 'SOCIAL_SECURITY_CONTRIBUTIONS',
    'Cuota resultante de autoliquidación': 'SELF_ASSESSMENT_QUOTA',
    'Renta disponible': 'DISPOSABLE_INCOME'
}

# Rename columns in rent data
income_data.rename(columns=column_rename_map, inplace=True)

#Check results of renaming
income_data.columns

Index(['AREA', 'NUMER_OF_DECLARATIONS', 'AVERAGE_GROSS_INCOME',
       'AVERAGE_DISPOSABLE_INCOME', 'LABOR_INCOME', 'EXEMPT_INCOME',
       'GROSS_INCOME', 'SOCIAL_SECURITY_CONTRIBUTIONS',
       'SELF_ASSESSMENT_QUOTA', 'DISPOSABLE_INCOME'],
      dtype='object')

__Extract the zipcode__ 


In [191]:
# Extract the ZIP code from the Area column, drop Area afterwards
income_data['ZIP_CODE'] = income_data['AREA'].str.extract(r'(\d{5})')

# When Area says "Resto", ZIP_CODE should say "Rest"
income_data.loc[income_data['AREA'] == 'Resto', 'ZIP_CODE'] = 'Rest'

#Drop Area column
income_data = income_data.drop(columns=['AREA'])

# Convert ZIP code columns to strings
sale_data_cleaned['ZIP_CODE'] = sale_data_cleaned['ZIP_CODE'].astype(str)
income_data['ZIP_CODE'] = income_data['ZIP_CODE'].astype(str)

# Verify data types
print(sale_data_cleaned['ZIP_CODE'].dtype)
print(income_data['ZIP_CODE'].dtype)

object
object


In [192]:
sale_data_cleaned.head()

Unnamed: 0,ASSETID,PRICE,CONSTRUCTEDAREA,ROOMNUMBER,BATHNUMBER,AMENITYID,HASPARKINGSPACE,PARKINGSPACEPRICE,HASTERRACE,HASLIFT,...,DISTANCE_TO_METRO,DISTANCE_TO_MAIN_STREET,LONGITUDE,LATITUDE,NEIGHBORHOOD,ZIP_CODE,PERIOD_201803,PERIOD_201806,PERIOD_201809,PERIOD_201812
0,A10000037964896093228,255000,97,3,2,3,0,1.0,0,1,...,0.254412,3.027988,-3.650253,40.473921,Pinar del Rey,28033,0,0,1,0
1,A10000072440601830803,82000,62,2,1,3,0,1.0,0,1,...,0.268472,4.693939,-3.640243,40.384968,Palomeras sureste,28018,0,1,0,0
2,A10000538600815177437,133000,67,3,1,3,0,1.0,1,0,...,1.061146,2.623258,-3.665263,40.384547,San Diego,28018,1,0,0,0
3,A10000654405436195291,204000,180,3,2,3,0,1.0,0,1,...,0.427977,3.131739,-3.65179,40.430336,Ventas,28017,1,0,0,0
4,A10000872160480475600,161000,54,2,1,3,0,1.0,0,0,...,0.377045,2.702218,-3.725637,40.384103,Buena Vista,28019,0,0,0,1


In [193]:
income_data.tail()

Unnamed: 0,NUMER_OF_DECLARATIONS,AVERAGE_GROSS_INCOME,AVERAGE_DISPOSABLE_INCOME,LABOR_INCOME,EXEMPT_INCOME,GROSS_INCOME,SOCIAL_SECURITY_CONTRIBUTIONS,SELF_ASSESSMENT_QUOTA,DISPOSABLE_INCOME,ZIP_CODE
52,4.77,32.327,25.763,131.171.315,7.675.070,154.198.537,7.925.100,23.383.895,122.889.543,28052
53,26.49,21.022,17.881,481.905.745,25.332.565,556.885.435,25.332.565,57.885.197,473.678.613,28053
54,16.334,35.015,27.842,498.211.190,20.338.276,571.943.019,28.583.667,88.587.835,454.771.806,28054
55,12.286,63.064,46.667,600.161.494,36.820.739,774.800.442,46.432.831,175.030.538,573.349.198,28055
56,41.769,34.4,26.979,1.115.520.078,52.161.118,1.436.873.713,60.496.019,258.362.359,1.126.891.821,Rest


__Merge the dataset__

In [194]:
# Merge the datasets on the ZIP code
data_incl_income = pd.merge(sale_data_cleaned, income_data, left_on='ZIP_CODE', right_on='ZIP_CODE', how='left')

# Display the first few rows of the merged dataset
print(data_incl_income.head())

                 ASSETID   PRICE  CONSTRUCTEDAREA  ROOMNUMBER  BATHNUMBER  \
0  A10000037964896093228  255000               97           3           2   
1  A10000072440601830803   82000               62           2           1   
2  A10000538600815177437  133000               67           3           1   
3  A10000654405436195291  204000              180           3           2   
4  A10000872160480475600  161000               54           2           1   

   AMENITYID  HASPARKINGSPACE  PARKINGSPACEPRICE  HASTERRACE  HASLIFT  ...  \
0          3                0                1.0           0        1  ...   
1          3                0                1.0           0        1  ...   
2          3                0                1.0           1        0  ...   
3          3                0                1.0           0        1  ...   
4          3                0                1.0           0        0  ...   

   PERIOD_201812  NUMER_OF_DECLARATIONS  AVERAGE_GROSS_INCOME  \
0  

__Check whether there are any houses in zip codes not available in income data__

In [195]:
num_unmatched_rows = data_incl_income["DISPOSABLE_INCOME"].isna().sum()
num_unmatched_rows

4431

__Assign income data of "rest of city" to houses in zip code not available__

In [196]:
# Identify rows with NaN values in the 'DISPOSABLE_INCOME' column after the merge
unmatched_rows = data_incl_income[data_incl_income['DISPOSABLE_INCOME'].isna()]
unmatched_rows.head()

Unnamed: 0,ASSETID,PRICE,CONSTRUCTEDAREA,ROOMNUMBER,BATHNUMBER,AMENITYID,HASPARKINGSPACE,PARKINGSPACEPRICE,HASTERRACE,HASLIFT,...,PERIOD_201812,NUMER_OF_DECLARATIONS,AVERAGE_GROSS_INCOME,AVERAGE_DISPOSABLE_INCOME,LABOR_INCOME,EXEMPT_INCOME,GROSS_INCOME,SOCIAL_SECURITY_CONTRIBUTIONS,SELF_ASSESSMENT_QUOTA,DISPOSABLE_INCOME
20,A10005090920840447561,81000,64,2,1,3,0,1.0,1,0,...,0,,,,,,,,,
29,A10007409135024974279,2901000,420,6,5,3,1,1.0,1,1,...,0,,,,,,,,,
55,A10011318022231015213,226333,89,2,2,3,1,1.0,1,1,...,1,,,,,,,,,
60,A10012221470240925424,174000,41,1,1,3,0,1.0,0,0,...,0,,,,,,,,,
73,A10015633877553382753,246000,60,2,1,3,0,1.0,0,1,...,1,,,,,,,,,


In [197]:
unmatched_rows.columns

Index(['ASSETID', 'PRICE', 'CONSTRUCTEDAREA', 'ROOMNUMBER', 'BATHNUMBER',
       'AMENITYID', 'HASPARKINGSPACE', 'PARKINGSPACEPRICE', 'HASTERRACE',
       'HASLIFT', 'HASAIRCONDITIONING', 'HASNORTHORIENTATION',
       'HASSOUTHORIENTATION', 'HASEASTORIENTATION', 'HASWESTORIENTATION',
       'HASBOXROOM', 'HASWARDROBE', 'HASSWIMMINGPOOL', 'HASDOORMAN',
       'HASGARDEN', 'ISDUPLEX', 'ISSTUDIO', 'HASEXTERNALVIEW', 'ISINTOPFLOOR',
       'FLOORCLEAN', 'CADMAXBUILDINGFLOOR', 'CADCONSTRUCTIONYEAR',
       'CADDWELLINGCOUNT', 'CADASTRALQUALITYID', 'BUILTTYPEID_1',
       'BUILTTYPEID_2', 'BUILTTYPEID_3', 'DISTANCE_TO_CITY_CENTER',
       'DISTANCE_TO_METRO', 'DISTANCE_TO_MAIN_STREET', 'LONGITUDE', 'LATITUDE',
       'NEIGHBORHOOD', 'ZIP_CODE', 'PERIOD_201803', 'PERIOD_201806',
       'PERIOD_201809', 'PERIOD_201812', 'NUMER_OF_DECLARATIONS',
       'AVERAGE_GROSS_INCOME', 'AVERAGE_DISPOSABLE_INCOME', 'LABOR_INCOME',
       'EXEMPT_INCOME', 'GROSS_INCOME', 'SOCIAL_SECURITY_CONTRIBUTIONS',
  

In [198]:
# Remove these rows from the merged DataFrame
data_incl_income = data_incl_income.dropna(subset=['DISPOSABLE_INCOME'])

# Ensure ZIP_CODE is of type string
unmatched_rows['ZIP_CODE'] = unmatched_rows['ZIP_CODE'].astype(str)

# Set ZIP_CODE to "Rest" for the rows with NaN values
unmatched_rows['ZIP_CODE'] = 'Rest'

#Drop all columns from the income data (to be added in next step again)
columns_to_drop = [
    'NUMER_OF_DECLARATIONS', 'AVERAGE_GROSS_INCOME', 'AVERAGE_DISPOSABLE_INCOME',
    'LABOR_INCOME', 'EXEMPT_INCOME', 'GROSS_INCOME', 
    'SOCIAL_SECURITY_CONTRIBUTIONS', 'SELF_ASSESSMENT_QUOTA', 'DISPOSABLE_INCOME'
]
unmatched_rows = unmatched_rows.drop(columns=columns_to_drop)

# Merge these rows with the income_data DataFrame using the ZIP_CODE value "Rest"
unmatched_rows = pd.merge(unmatched_rows, income_data, on='ZIP_CODE', how='left')

# Append the corrected rows back to the main DataFrame
data_incl_income = pd.concat([data_incl_income, unmatched_rows], ignore_index=True)

In [199]:
#Check that no missing values are left
num_unmatched_rows_after_cleaning = data_incl_income["DISPOSABLE_INCOME"].isna().sum()
num_unmatched_rows_after_cleaning

0

__Check that the number of houses is the same as before__

In [200]:
print("Number of rows before merging:", len(sale_data_cleaned))
print("Number of rows after merging:", len(data_incl_income))

Number of rows before merging: 75747
Number of rows after merging: 75747


__Ensure added columns are of the correct data type__

In [201]:
# List of columns to be converted to float
columns_to_convert = [
    'NUMER_OF_DECLARATIONS', 
    'AVERAGE_GROSS_INCOME', 
    'AVERAGE_DISPOSABLE_INCOME',
    'LABOR_INCOME', 
    'EXEMPT_INCOME', 
    'GROSS_INCOME', 
    'SOCIAL_SECURITY_CONTRIBUTIONS', 
    'SELF_ASSESSMENT_QUOTA', 
    'DISPOSABLE_INCOME'
]

# Removing commas and converting the specified columns to float data type
for column in columns_to_convert:
    # Convert to string first
    data_incl_income[column] = data_incl_income[column].astype(str)
    # Replace thousand separators and convert to float
    data_incl_income[column] = data_incl_income[column].str.replace('.', '', regex=False).str.replace(',', '.', regex=False).astype(float)

In [202]:
data_incl_income['ZIP_CODE'] = data_incl_income['ZIP_CODE'].astype(str)

__Check final merged dataframe__

In [203]:
data_incl_income.head()

Unnamed: 0,ASSETID,PRICE,CONSTRUCTEDAREA,ROOMNUMBER,BATHNUMBER,AMENITYID,HASPARKINGSPACE,PARKINGSPACEPRICE,HASTERRACE,HASLIFT,...,PERIOD_201812,NUMER_OF_DECLARATIONS,AVERAGE_GROSS_INCOME,AVERAGE_DISPOSABLE_INCOME,LABOR_INCOME,EXEMPT_INCOME,GROSS_INCOME,SOCIAL_SECURITY_CONTRIBUTIONS,SELF_ASSESSMENT_QUOTA,DISPOSABLE_INCOME
0,A10000037964896093228,255000,97,3,2,3,0,1.0,0,1,...,0,47929.0,42679.0,33099.0,1597692000.0,76274309.0,2045548000.0,62934177.0,396256316.0,1586413000.0
1,A10000072440601830803,82000,62,2,1,3,0,1.0,0,1,...,0,33225.0,24517.0,20458.0,696542400.0,36917201.0,814568400.0,35486524.0,99378073.0,679721800.0
2,A10000538600815177437,133000,67,3,1,3,0,1.0,1,0,...,0,33225.0,24517.0,20458.0,696542400.0,36917201.0,814568400.0,35486524.0,99378073.0,679721800.0
3,A10000654405436195291,204000,180,3,2,3,0,1.0,0,1,...,0,55687.0,27477.0,22618.0,1311609000.0,54700472.0,1530129000.0,61816758.0,208884677.0,1259502000.0
4,A10000872160480475600,161000,54,2,1,3,0,1.0,0,0,...,1,47809.0,27218.0,22399.0,1097802000.0,48959836.0,1301282000.0,51743635.0,178718832.0,1070854000.0


# Write data incl. new feature to csv

In [204]:
if city == "Madrid":
    data_incl_income.to_csv("../../data/5_cleaned_and_feature_engineering/feature_income/madrid_cleaned_incl_income.csv", index=False)
elif city == "Barcelona":
    data_incl_income.to_csv("../../data/5_cleaned_and_feature_engineering/feature_income/barcelona_cleaned_incl_income.csv", index=False)
elif city == "Valencia":
    data_incl_income.to_csv("../../data/5_cleaned_and_feature_engineering/feature_income/valencia_cleaned_incl_income.csv", index=False)