# Cleaning the dataset

In [14]:
import pandas as pd

# Load the dataset from the CSV file (path is relative to the working directory)
# into `df`; the following cells add and overwrite columns on this frame.
df = pd.read_csv('final_dataset.csv')

## Null values

For columns Furnished, Garden, Fireplace, SwimmingPool and Terrace we can assume that no information means the property does not have them. (Same with ToiletCount, except this one refers to a toilet separate from the bathroom. ShowerCount can also be zero if the property only has a bathtub for example.)

In [15]:
# Columns in which a missing value is interpreted as "none" and stored as 0.
fill_with_zeros = ['Furnished','Garden','Fireplace','SwimmingPool','Terrace','ToiletCount','ShowerCount']

# Fill every listed column in a single vectorised call.
df[fill_with_zeros] = df[fill_with_zeros].fillna(0)

## Iterative Imputer for LivingArea

### Mapping

We'll impute LivingArea based on BedroomCount, TypeOfProperty and SubtypeOfProperty (numerical version).
But first, we need to associate categorical values to numeric ones. This will help us use corr() later to understand correlation with price.

In [16]:
# Integer code for every property subtype.
# Codes are positional and 1-based: the first name gets 1, the second 2, ...
# Append new subtypes at the end so that existing codes stay unchanged.
subtype_mapping = {
    subtype: code
    for code, subtype in enumerate(
        (
            'apartment',
            'house',
            'villa',
            'ground_floor',
            'duplex',
            'apartment_block',
            'flat_studio',
            'penthouse',
            'mixed_use_building',
            'service_flat',
            'kot',
            'mansion',
            'town_house',
            'bungalow',
            'loft',
            'exceptional_property',
            'country_cottage',
            'farmhouse',
            'triplex',
            'chalet',
            'other_property',
            'manor_house',
            'castle',
            'pavilion',
            'show_house',
        ),
        start=1,
    )
}

# Encode each subtype as its integer code from `subtype_mapping`.
df['SubtypeOfProperty_Numerical'] = df['SubtypeOfProperty'].map(subtype_mapping)

# Columns the LivingArea imputer will work from (target column included).
living_area_columns = ['BedroomCount', 'TypeOfProperty', 'SubtypeOfProperty_Numerical', 'LivingArea']
living_area_df = df[living_area_columns]

### Iterative Imputing

In [17]:
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Impute the missing LivingArea values from the other columns of `living_area_df`.
# random_state is fixed so that re-running the cell gives the same result.
living_area_imputer = IterativeImputer(random_state=0)
living_area_values = living_area_imputer.fit_transform(living_area_df)

# fit_transform returns a bare NumPy array. Re-attach the row labels of the
# input frame so the imputed frame does not silently get a fresh 0..n-1 index
# (which would misalign with `df` as soon as `df` has a non-default index).
living_area_df_imputed = pd.DataFrame(living_area_values, index=living_area_df.index)

# Find the LivingArea column by name instead of hard-coding its position, and
# write it back positionally: `living_area_df` has exactly the rows of `df`.
living_area_position = living_area_df.columns.get_loc('LivingArea')
df['LivingArea'] = living_area_df_imputed[living_area_position].to_numpy()

#print(df[['BedroomCount','TypeOfProperty','SubtypeOfProperty_Numerical','LivingArea','Price']].head()) # Debug

### Iterative Imputing for other data

#### BathroomCount

In [18]:
# Impute the missing BathroomCount values from the other selected columns.
bathroom_count_df = df[['BedroomCount','BathroomCount','TypeOfProperty','SubtypeOfProperty_Numerical']]

# random_state is fixed so that re-running the cell gives the same result.
bathroom_count_imputer = IterativeImputer(random_state=0)
bathroom_count_values = bathroom_count_imputer.fit_transform(bathroom_count_df)
bathroom_count_values = np.round(bathroom_count_values) # it's a count, we don't need decimals

# fit_transform returns a bare NumPy array. Re-attach the row labels of the
# input frame so the imputed frame does not silently get a fresh 0..n-1 index
# (which would misalign with `df` as soon as `df` has a non-default index).
bathroom_count_df_imputed = pd.DataFrame(bathroom_count_values, index=bathroom_count_df.index)

# Find the BathroomCount column by name instead of hard-coding its position,
# and write it back positionally: `bathroom_count_df` has exactly the rows of `df`.
bathroom_count_position = bathroom_count_df.columns.get_loc('BathroomCount')
df['BathroomCount'] = bathroom_count_df_imputed[bathroom_count_position].to_numpy()

print(df[['BedroomCount','BathroomCount','TypeOfProperty','SubtypeOfProperty_Numerical']].head(10)) # Debug

   BedroomCount  BathroomCount  TypeOfProperty  SubtypeOfProperty_Numerical
0           1.0            1.0               2                            7
1          13.0            6.0               1                            6
2           4.0            2.0               1                            2
3           4.0            1.0               1                            2
4           2.0            0.0               2                            1
5           1.0            1.0               2                            1
6           6.0            4.0               1                            3
7           2.0            0.0               1                            2
8           3.0            2.0               2                            1
9           2.0            1.0               2                            1


#### RoomCount

(I think we should drop this column: its values are not always coherent.)

In [19]:
# Impute the missing RoomCount values from the other selected columns.
room_count_df = df[['BedroomCount','TypeOfProperty','SubtypeOfProperty_Numerical','RoomCount']]

# random_state is fixed so that re-running the cell gives the same result.
room_count_imputer = IterativeImputer(random_state=0)
room_count_values = room_count_imputer.fit_transform(room_count_df)
room_count_values = np.round(room_count_values)  # a count has no decimals

# fit_transform returns a bare NumPy array. Re-attach the row labels of the
# input frame so the imputed frame does not silently get a fresh 0..n-1 index
# (which would misalign with `df` as soon as `df` has a non-default index).
room_count_df_imputed = pd.DataFrame(room_count_values, index=room_count_df.index)

# Find the RoomCount column by name instead of hard-coding its position, and
# write it back positionally: `room_count_df` has exactly the rows of `df`.
room_count_position = room_count_df.columns.get_loc('RoomCount')
df['RoomCount'] = room_count_df_imputed[room_count_position].to_numpy()

print(df[['BedroomCount','TypeOfProperty','SubtypeOfProperty_Numerical','RoomCount']].head()) # Debug

   BedroomCount  TypeOfProperty  SubtypeOfProperty_Numerical  RoomCount
0           1.0               2                            7        1.0
1          13.0               1                            6       31.0
2           4.0               1                            2        9.0
3           4.0               1                            2        9.0
4           2.0               2                            1        1.0


#### GardenArea

Properties without a garden get "0" as GardenArea.
We could use IterativeImputer, but every property with a garden already lists its area, so imputation is unnecessary here.

In [21]:
# Properties flagged as having no garden (Garden == 0) get a GardenArea of 0.
# The assignment is done directly on `df` with a single .loc call; assigning
# through a column subset taken from `df` is the pattern that triggers
# pandas' SettingWithCopyWarning.
df.loc[df['Garden'] == 0, 'GardenArea'] = 0

# Same column subset as before, kept under its original name for any cell
# that reads `GardenArea_df` afterwards.
GardenArea_df = df[['Garden','GardenArea','TypeOfProperty','SubtypeOfProperty_Numerical']]

#### SurfaceOfPlot

Issue here: we probably need to remove outliers first, so the imputed values are not written back to `df` yet.

In [23]:
# Columns used to impute SurfaceOfPlot (target column last).
surface_of_plot_columns = ['BedroomCount','Garden','GardenArea','TypeOfProperty','SubtypeOfProperty_Numerical','SurfaceOfPlot']
SurfaceOfPlot_df = df[surface_of_plot_columns]

# Fit the imputer, round the whole result to integers and wrap it in a DataFrame.
SurfaceOfPlot_imputer = IterativeImputer(random_state=0)
SurfaceOfPlot_df_imputed = pd.DataFrame(
    np.round(SurfaceOfPlot_imputer.fit_transform(SurfaceOfPlot_df))
)

# The write-back is deliberately left disabled, so `df` is printed unchanged below.
#df['SurfaceOfPlot'] = SurfaceOfPlot_df_imputed[5]
print(df[surface_of_plot_columns].head(15))

    BedroomCount  Garden  GardenArea  TypeOfProperty  \
0            1.0     0.0         0.0               2   
1           13.0     0.0         0.0               1   
2            4.0     0.0         0.0               1   
3            4.0     1.0         1.0               1   
4            2.0     0.0         0.0               2   
5            1.0     0.0         0.0               2   
6            6.0     1.0      2519.0               1   
7            2.0     0.0         0.0               1   
8            3.0     0.0         0.0               2   
9            2.0     0.0         0.0               2   
10           2.0     0.0         0.0               2   
11           3.0     0.0         0.0               2   
12           5.0     0.0         0.0               1   
13           1.0     0.0         0.0               2   
14           2.0     0.0         0.0               2   

    SubtypeOfProperty_Numerical  SurfaceOfPlot  
0                             7         3118.0  
1    

## Numerical values for other categorical values

#### Numerical values for kitchen

In [24]:
# Integer code for every kitchen label.
# Codes are positional and 0-based: 'NO_DATA' is 0, the next label 1, ...
# Append new labels at the end so that existing codes stay unchanged.
kitchen_mapping = {
    label: code
    for code, label in enumerate(
        (
            'NO_DATA',
            'NOT_INSTALLED',
            'USA_UNINSTALLED',
            'SEMI_EQUIPPED',
            'USA_SEMI_EQUIPPED',
            'INSTALLED',
            'USA_INSTALLED',
            'HYPER_EQUIPPED',
            'USA_HYPER_EQUIPPED',
        )
    )
}

# Missing kitchen information becomes the explicit 'NO_DATA' label, and the
# labels are then translated to their integer codes.
kitchen_filled = df['Kitchen'].fillna('NO_DATA')
df['Kitchen'] = kitchen_filled
df['Kitchen_Numerical'] = kitchen_filled.map(kitchen_mapping)

#print(df[['BedroomCount','LivingArea','Price','Kitchen','Kitchen_Numerical']].head()) # Debug


#### Numerical values for state of building

In [25]:
# Integer code for every building-state label.
# Codes are positional and 0-based: 'NO_DATA' is 0, the next label 1, ...
# Append new labels at the end so that existing codes stay unchanged.
stateOfBuilding_mapping = {
    label: code
    for code, label in enumerate(
        (
            'NO_DATA',
            'TO_RESTORE',
            'TO_RENOVATE',
            'TO_BE_DONE_UP',
            'GOOD',
            'AS_NEW',
            'JUST_RENOVATED',
        )
    )
}
              
# Missing building-state information becomes the explicit 'NO_DATA' label, and
# the labels are then translated to their integer codes.
state_of_building_filled = df['StateOfBuilding'].fillna('NO_DATA')
df['StateOfBuilding'] = state_of_building_filled
df['StateOfBuilding_Numerical'] = state_of_building_filled.map(stateOfBuilding_mapping)
        
#print(df[['BedroomCount','LivingArea','Price','StateOfBuilding','StateOfBuilding_Numerical']])    # Debug 