## Importing all the required packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso



In [2]:
# Importing the dataset
df = pd.read_csv('/Users/andre/Documents/GitHub/immo-analysis-project/data/dataset_immo_.csv')

## Dataset cleaning

In [3]:
# Checking up the columns names
df.columns

Index(['url', 'locality', 'Type_property', 'subtype_property', 'price',
       'type-transaction', 'n_rooms', 'living_area', 'equipped_kitchen',
       'furnished', 'fireplace', 'terrace', 'area_terrace', 'garden',
       'area-garden', 'land-surface', 'n-facades', 'swimming-pool',
       'state-building'],
      dtype='object')

In [4]:
# Renaming some columns names in order to avoid problems

df.rename(columns={'swimming-pool': 'swimming_pool'}, inplace=True)
df.rename(columns={'state-building': 'state_building'}, inplace=True)
df.rename(columns={'land-surface': 'land_surface'}, inplace=True)

In [5]:
# Checking up the shape of the dataset
print(df.shape)
# Printing the basic statistics info of the dataset
print(df.describe())


(10020, 19)
              price       n_rooms  living_area  equipped_kitchen  \
count  1.001300e+04  10013.000000  9843.000000      10013.000000   
mean   5.082320e+05      2.994407   173.863253          0.866374   
std    5.532948e+05      1.732388   132.057334          0.340267   
min    1.000000e+04      0.000000    13.000000          0.000000   
25%    2.390000e+05      2.000000    92.000000          1.000000   
50%    3.490000e+05      3.000000   137.000000          1.000000   
75%    5.550000e+05      4.000000   211.000000          1.000000   
max    8.100000e+06     46.000000  1700.000000          1.000000   

          furnished     fireplace       terrace  area_terrace        garden  \
count  10013.000000  10013.000000  10013.000000   7620.000000  10013.000000   
mean       0.043943      0.375911      0.721662     17.192520      0.358534   
std        0.204978      0.484381      0.448203     32.555167      0.479594   
min        0.000000      0.000000      0.000000      0.0000

In [6]:
# Checking up the first 5 rows
df.head

<bound method NDFrame.head of                                                      url       locality  \
0      https://www.immoweb.be/en/classified/apartment...     Borgerhout   
1      https://www.immoweb.be/en/classified/flat-stud...         Leuven   
2      https://www.immoweb.be/en/classified/apartment...           Lede   
3      https://www.immoweb.be/en/classified/apartment...           Gent   
4      https://www.immoweb.be/en/classified/penthouse...         Hotton   
...                                                  ...            ...   
10015  https://www.immoweb.be/en/classified/villa/for...          Aalst   
10016  https://www.immoweb.be/en/classified/house/for...        Waregem   
10017  https://www.immoweb.be/en/classified/house/for...            NaN   
10018  https://www.immoweb.be/en/classified/villa/for...  Blankenberge    
10019  https://www.immoweb.be/en/classified/villa/for...         Hulste   

      Type_property subtype_property     price type-transaction  n_ro

In [7]:
# Removing duplicated rows
df.duplicated()
# Replacing NaN with 0
df.fillna(0, inplace=True)

df

Unnamed: 0,url,locality,Type_property,subtype_property,price,type-transaction,n_rooms,living_area,equipped_kitchen,furnished,fireplace,terrace,area_terrace,garden,area-garden,land_surface,n-facades,swimming_pool,state_building
0,https://www.immoweb.be/en/classified/apartment...,Borgerhout,APARTMENT,APARTMENT,245000.0,FOR_SALE,2.0,106.0,1.0,0.0,1.0,1.0,6.0,0.0,0.0,0.0,3.0,0.0,JUST_RENOVATED
1,https://www.immoweb.be/en/classified/flat-stud...,Leuven,APARTMENT,FLAT_STUDIO,199000.0,FOR_SALE,0.0,31.0,1.0,1.0,1.0,1.0,3.0,0.0,0.0,0.0,4.0,0.0,GOOD
2,https://www.immoweb.be/en/classified/apartment...,Lede,APARTMENT,APARTMENT,195000.0,FOR_SALE,2.0,91.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3.0,0.0,AS_NEW
3,https://www.immoweb.be/en/classified/apartment...,Gent,APARTMENT,APARTMENT,185000.0,FOR_SALE,2.0,101.0,1.0,0.0,1.0,1.0,2.0,0.0,0.0,0.0,2.0,0.0,TO_RESTORE
4,https://www.immoweb.be/en/classified/penthouse...,Hotton,APARTMENT,PENTHOUSE,375000.0,FOR_SALE,3.0,214.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,TO_BE_DONE_UP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10015,https://www.immoweb.be/en/classified/villa/for...,Aalst,HOUSE,VILLA,495000.0,FOR_SALE,5.0,238.0,1.0,0.0,0.0,1.0,135.0,1.0,710.0,1200.0,0.0,0.0,AS_NEW
10016,https://www.immoweb.be/en/classified/house/for...,Waregem,HOUSE,HOUSE,500000.0,FOR_SALE,2.0,140.0,1.0,0.0,1.0,1.0,10.0,0.0,0.0,0.0,2.0,0.0,TO_BE_DONE_UP
10017,https://www.immoweb.be/en/classified/house/for...,0,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
10018,https://www.immoweb.be/en/classified/villa/for...,Blankenberge,HOUSE,VILLA,525000.0,FOR_SALE,5.0,290.0,1.0,0.0,0.0,1.0,20.0,1.0,600.0,737.0,0.0,0.0,GOOD


In [8]:
# Removing all the rows where there's no information on locality
df.drop(df[df['locality'] == 0].index, inplace=True)
# Removing the columns type-transaction and url as it is not relevant
df.drop(columns=['type-transaction'], inplace=True)
df.drop(columns=['url'], inplace=True)
# Removing the columns area garden and terrace
df.drop(columns=['area_terrace'], inplace=True)
df.drop(columns=['area-garden'], inplace=True)
# Removing the column number of facades
df.drop(columns=['n-facades'], inplace=True)

In [9]:
# Removing outliers 
def remove_outliers(df,columns,n_std):
    for col in columns:
        print('Working on column: {}'.format(col))
        
        mean = df[col].mean()
        sd = df[col].std()
        
        df = df[(df[col] <= mean+(n_std*sd))]
        
    return df

df = remove_outliers(df, ['price', 'living_area'], 3)

df

Working on column: price
Working on column: living_area


Unnamed: 0,locality,Type_property,subtype_property,price,n_rooms,living_area,equipped_kitchen,furnished,fireplace,terrace,garden,land_surface,swimming_pool,state_building
0,Borgerhout,APARTMENT,APARTMENT,245000.0,2.0,106.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,JUST_RENOVATED
1,Leuven,APARTMENT,FLAT_STUDIO,199000.0,0.0,31.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,GOOD
2,Lede,APARTMENT,APARTMENT,195000.0,2.0,91.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,AS_NEW
3,Gent,APARTMENT,APARTMENT,185000.0,2.0,101.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,TO_RESTORE
4,Hotton,APARTMENT,PENTHOUSE,375000.0,3.0,214.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,TO_BE_DONE_UP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10014,Brugge,HOUSE,HOUSE,499000.0,3.0,174.0,1.0,0.0,0.0,1.0,1.0,1290.0,0.0,GOOD
10015,Aalst,HOUSE,VILLA,495000.0,5.0,238.0,1.0,0.0,0.0,1.0,1.0,1200.0,0.0,AS_NEW
10016,Waregem,HOUSE,HOUSE,500000.0,2.0,140.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,TO_BE_DONE_UP
10018,Blankenberge,HOUSE,VILLA,525000.0,5.0,290.0,1.0,0.0,0.0,1.0,1.0,737.0,0.0,GOOD


## Preprocessing the dataset for the model building

In [10]:
# Standardizing the values of the columns: number of rooms, living area and land surface

from sklearn.preprocessing import scale

# Subset of columns to transform
cols = ['n_rooms','living_area', 'land_surface']

# Overwrite old columns with transformed columns
df[cols] = scale(df[cols])

df

Unnamed: 0,locality,Type_property,subtype_property,price,n_rooms,living_area,equipped_kitchen,furnished,fireplace,terrace,garden,land_surface,swimming_pool,state_building
0,Borgerhout,APARTMENT,APARTMENT,245000.0,-0.594528,-0.530233,1.0,0.0,1.0,1.0,0.0,-0.248029,0.0,JUST_RENOVATED
1,Leuven,APARTMENT,FLAT_STUDIO,199000.0,-1.931874,-1.332928,1.0,1.0,1.0,1.0,0.0,-0.248029,0.0,GOOD
2,Lede,APARTMENT,APARTMENT,195000.0,-0.594528,-0.690772,1.0,0.0,0.0,1.0,0.0,-0.248029,0.0,AS_NEW
3,Gent,APARTMENT,APARTMENT,185000.0,-0.594528,-0.583746,1.0,0.0,1.0,1.0,0.0,-0.248029,0.0,TO_RESTORE
4,Hotton,APARTMENT,PENTHOUSE,375000.0,0.074145,0.625648,1.0,0.0,0.0,0.0,0.0,-0.248029,0.0,TO_BE_DONE_UP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10014,Brugge,HOUSE,HOUSE,499000.0,0.074145,0.197544,1.0,0.0,0.0,1.0,1.0,0.439874,0.0,GOOD
10015,Aalst,HOUSE,VILLA,495000.0,1.411490,0.882510,1.0,0.0,0.0,1.0,1.0,0.391881,0.0,AS_NEW
10016,Waregem,HOUSE,HOUSE,500000.0,-0.594528,-0.166345,1.0,0.0,1.0,1.0,0.0,-0.248029,0.0,TO_BE_DONE_UP
10018,Blankenberge,HOUSE,VILLA,525000.0,1.411490,1.439046,1.0,0.0,0.0,1.0,1.0,0.144982,0.0,GOOD


In [11]:
# Using the function get dummies to transform categorical values

x = df[['locality', 'Type_property', 'subtype_property', 'price', 'n_rooms', 'living_area', 'equipped_kitchen', 
       'furnished', 'fireplace', 'terrace', 'garden', 'land_surface', 'swimming_pool', 'state_building']]

x = pd.get_dummies(data=x, drop_first=True)

In [12]:
x

Unnamed: 0,price,n_rooms,living_area,equipped_kitchen,furnished,fireplace,terrace,garden,land_surface,swimming_pool,...,subtype_property_TOWN_HOUSE,subtype_property_TRIPLEX,subtype_property_VILLA,state_building_0,state_building_AS_NEW,state_building_GOOD,state_building_JUST_RENOVATED,state_building_TO_BE_DONE_UP,state_building_TO_RENOVATE,state_building_TO_RESTORE
0,245000.0,-0.594528,-0.530233,1.0,0.0,1.0,1.0,0.0,-0.248029,0.0,...,False,False,False,False,False,False,True,False,False,False
1,199000.0,-1.931874,-1.332928,1.0,1.0,1.0,1.0,0.0,-0.248029,0.0,...,False,False,False,False,False,True,False,False,False,False
2,195000.0,-0.594528,-0.690772,1.0,0.0,0.0,1.0,0.0,-0.248029,0.0,...,False,False,False,False,True,False,False,False,False,False
3,185000.0,-0.594528,-0.583746,1.0,0.0,1.0,1.0,0.0,-0.248029,0.0,...,False,False,False,False,False,False,False,False,False,True
4,375000.0,0.074145,0.625648,1.0,0.0,0.0,0.0,0.0,-0.248029,0.0,...,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10014,499000.0,0.074145,0.197544,1.0,0.0,0.0,1.0,1.0,0.439874,0.0,...,False,False,False,False,False,True,False,False,False,False
10015,495000.0,1.411490,0.882510,1.0,0.0,0.0,1.0,1.0,0.391881,0.0,...,False,False,True,False,True,False,False,False,False,False
10016,500000.0,-0.594528,-0.166345,1.0,0.0,1.0,1.0,0.0,-0.248029,0.0,...,False,False,False,False,False,False,False,True,False,False
10018,525000.0,1.411490,1.439046,1.0,0.0,0.0,1.0,1.0,0.144982,0.0,...,False,False,True,False,False,True,False,False,False,False


## Model building - Linear model type Lasso

In [13]:
# Checking up all the information of the new preprocessed dataset
# It's possible to see that now it has only boolean and float values
# Due to the get dummies function, it has a total of 432 columns

print(x.info())

x

<class 'pandas.core.frame.DataFrame'>
Index: 9280 entries, 0 to 10019
Columns: 432 entries, price to state_building_TO_RESTORE
dtypes: bool(422), float64(10)
memory usage: 4.5 MB
None


Unnamed: 0,price,n_rooms,living_area,equipped_kitchen,furnished,fireplace,terrace,garden,land_surface,swimming_pool,...,subtype_property_TOWN_HOUSE,subtype_property_TRIPLEX,subtype_property_VILLA,state_building_0,state_building_AS_NEW,state_building_GOOD,state_building_JUST_RENOVATED,state_building_TO_BE_DONE_UP,state_building_TO_RENOVATE,state_building_TO_RESTORE
0,245000.0,-0.594528,-0.530233,1.0,0.0,1.0,1.0,0.0,-0.248029,0.0,...,False,False,False,False,False,False,True,False,False,False
1,199000.0,-1.931874,-1.332928,1.0,1.0,1.0,1.0,0.0,-0.248029,0.0,...,False,False,False,False,False,True,False,False,False,False
2,195000.0,-0.594528,-0.690772,1.0,0.0,0.0,1.0,0.0,-0.248029,0.0,...,False,False,False,False,True,False,False,False,False,False
3,185000.0,-0.594528,-0.583746,1.0,0.0,1.0,1.0,0.0,-0.248029,0.0,...,False,False,False,False,False,False,False,False,False,True
4,375000.0,0.074145,0.625648,1.0,0.0,0.0,0.0,0.0,-0.248029,0.0,...,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10014,499000.0,0.074145,0.197544,1.0,0.0,0.0,1.0,1.0,0.439874,0.0,...,False,False,False,False,False,True,False,False,False,False
10015,495000.0,1.411490,0.882510,1.0,0.0,0.0,1.0,1.0,0.391881,0.0,...,False,False,True,False,True,False,False,False,False,False
10016,500000.0,-0.594528,-0.166345,1.0,0.0,1.0,1.0,0.0,-0.248029,0.0,...,False,False,False,False,False,False,False,True,False,False
10018,525000.0,1.411490,1.439046,1.0,0.0,0.0,1.0,1.0,0.144982,0.0,...,False,False,True,False,False,True,False,False,False,False


In [14]:
# Assigning X and y

X = x.drop(['price'], axis=1).to_numpy()
y = x['price'].to_numpy().reshape(-1, 1)

In [15]:
# Splitting the dataset

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
# Instanting the model to an object and trainning it

regressor = Lasso(alpha=0.01)

regressor.fit(X_train, y_train)

  model = cd_fast.enet_coordinate_descent(


In [17]:
# Checking up the model score for the trainning set
print('Training set score: {:.2f}'.format(regressor.score(X_train, y_train)))

Training set score: 0.68


In [18]:
# Checking up the model score for the test set

print('Test set score: {:.2f}'.format(regressor.score(X_test, y_test)))

Test set score: 0.67


In [19]:
# Calculating the RMSE

from sklearn.metrics import mean_squared_error as MSE

pred = regressor.predict(X_test)

rmse = np.sqrt(MSE(y_test, pred))
print("RMSE : % f" %(rmse))

RMSE :  193647.748895


## Cross validation with k-fold and stratified k-fold

In [23]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold, cross_val_score

clf = DecisionTreeClassifier(random_state=42)

k_folds = KFold(n_splits = 5)

scores = cross_val_score(clf, X, y, cv = k_folds)

print("Cross Validation Scores: ", scores)
print("Average CV Score: ", scores.mean())
print("Number of CV Scores used in Average: ", len(scores))

Cross Validation Scores:  [0.03771552 0.04364224 0.03663793 0.02747845 0.02262931]
Average CV Score:  0.03362068965517241
Number of CV Scores used in Average:  5


In [21]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

clf = DecisionTreeClassifier(random_state=42)

sk_folds = StratifiedKFold(n_splits = 5)

scores = cross_val_score(clf, X, y, cv = sk_folds)

print("Cross Validation Scores: ", scores)
print("Average CV Score: ", scores.mean())
print("Number of CV Scores used in Average: ", len(scores))



Cross Validation Scores:  [0.06142241 0.07435345 0.05818966 0.06465517 0.046875  ]
Average CV Score:  0.06109913793103448
Number of CV Scores used in Average:  5
