In [494]:
# Import the necessary libraries and load data
import pandas as pd

df = pd.read_csv('../immo_eliza_analysis/cleaned-data.csv')

In [495]:
# Find duplicates
#df[df.duplicated(['commune', 'price', 'living_area'])].sort_values(by= 'commune').head()

# After closer examination and cross validation for a couple of the duplicates,
# it has been discovered that some of the "duplicates" are properties that are part
# of real state projects, thus having similar information.
# However, it is important to notice that after the dataset is further processed and some features are removed,
# more duplicates might arise. Thus, duplicates will be evaluated again at a later stage.

In [496]:
# Since the data set was cleaned for the analysis part, it now contains no missing values.
df.isna().sum()

zip_code                      0
commune                       0
province                      0
type_of_property              0
subtype_of_property           0
price                         0
building_condition            0
facade_number                 0
living_area                   0
equipped_kitchen              0
bedroom_nr                    0
swimming_pool                 0
furnished                     0
open_fire                     0
terrace                       0
garden                        0
plot_surface                  0
sub_property_group_encoded    0
dtype: int64

In [497]:
df.head()

Unnamed: 0,zip_code,commune,province,type_of_property,subtype_of_property,price,building_condition,facade_number,living_area,equipped_kitchen,bedroom_nr,swimming_pool,furnished,open_fire,terrace,garden,plot_surface,sub_property_group_encoded
0,2600,Berchem,Antwerpen,0,apartment,149000.0,good,2.0,48,installed,1,0,0,0,9,0,0,apartments
1,2100,Deurne,Antwerpen,0,apartment,248000.0,good,2.0,91,installed,3,0,0,0,2,0,0,apartments
2,2660,Hoboken,Antwerpen,0,apartment,229000.0,good,2.0,100,not installed,3,0,0,0,26,0,0,apartments
3,1180,Uccle,Bruxelles,0,apartment,470000.0,good,2.0,179,equipped,3,0,0,0,10,0,0,apartments
4,2018,Antwerpen,Antwerpen,0,apartment,480000.0,good,2.0,116,installed,2,0,0,0,7,0,0,apartments


In [498]:
# Check there are only two types of properties: appartment: 0 and house: 1
df['type_of_property'].value_counts()

type_of_property
0    15195
1    10911
Name: count, dtype: int64

In [499]:
# Drop unecessary colums (swimming_pool, furnished, open_fire, sub_property_group_encoded) and columns which have high correlation with others: subtype_of property and bedroom_nr
df.drop(['subtype_of_property', 'bedroom_nr', 'swimming_pool', 'furnished', 'open_fire', 'sub_property_group_encoded'], axis=1, inplace=True)

In [500]:
# Building condition to numeric values / Removed observations with 'no info' about the building condition
# Although some of them correspond to new projects, there is no way to confirm if all 'no info' properties are 'new'

# Drop unknown building condition rows:
unknown_building_state = df[df['building_condition'] == 'no info'].index
df.drop(unknown_building_state, inplace=True)

# Make function to convert build condition to ordinal
def convert_build_condition(building_condition):
    if building_condition == 'to restore':
        return 0
    elif building_condition == 'to renovate':
        return 1
    else:
        return 2

# Apply the function
df['building_condition'] = df['building_condition'].apply(convert_build_condition)

In [501]:
# There are two appartments with 5 facades, which corresponds to a project (immoweb id 20147859) of apartments of 1,2 and 3 rooms, 
# not a specific one with 5 facades. Decided to remove that observation.
drop_by_facades = df[(df['facade_number'] > 4) & (df['type_of_property'] == 0)].index
df.drop(drop_by_facades, inplace=True)

In [502]:
# 'equipped_kitchen' still needs to be changed to a numerical value: {no: 0, semi: 1, yes: 2} 
print(df['equipped_kitchen'].value_counts())

# change it to numerical values
df['equipped_kitchen'] = df['equipped_kitchen'].apply(lambda x: 0 if x == 'not installed' else 1)

# check the change was made
df['equipped_kitchen'].value_counts()

equipped_kitchen
equipped         6859
not installed    6805
installed        5758
Name: count, dtype: int64


equipped_kitchen
1    12617
0     6805
Name: count, dtype: int64

In [503]:
# There are over 900 houses without plot size, which might affect the model's predictions:
zero_plot_surface = df[(df['type_of_property'] == 1) & (df['plot_surface'] == 0)]
print(f'Houses without plot surfacce: {zero_plot_surface.shape[0]}')

# Since it represents about 10% of the total observations for houses,
# decided to add the mean plot size per its commune to them.

mean_plot_surface = round(df.groupby(by='commune')['plot_surface'].mean())

df['plot_surface'] = df.apply(
    lambda x: mean_plot_surface[x['commune']] if int(x['plot_surface']) == 0 else x['plot_surface'],
    axis=1)

# For some communes, either there was only one observation or all observations had 0 for plot_surface:
zero_plot_surface_2 = df[(df['type_of_property'] == 1) & (df['plot_surface'] == 0)].index
print(f'New number of houses without plot surfacce: {zero_plot_surface_2.shape[0]}')

# Therefore, they were removed:
df.drop(zero_plot_surface_2, inplace=True)

Houses without plot surfacce: 916
New number of houses without plot surfacce: 16


In [504]:
df.head()

Unnamed: 0,zip_code,commune,province,type_of_property,price,building_condition,facade_number,living_area,equipped_kitchen,terrace,garden,plot_surface
0,2600,Berchem,Antwerpen,0,149000.0,2,2.0,48,1,9,0,92.0
1,2100,Deurne,Antwerpen,0,248000.0,2,2.0,91,1,2,0,55.0
2,2660,Hoboken,Antwerpen,0,229000.0,2,2.0,100,0,26,0,136.0
3,1180,Uccle,Bruxelles,0,470000.0,2,2.0,179,1,10,0,159.0
4,2018,Antwerpen,Antwerpen,0,480000.0,2,2.0,116,1,7,0,28.0
