## Importing all the required packages

In [21]:
import numpy as np
import pandas as pd
import xgboost as xg
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [22]:
# Importing the dataset
df = pd.read_csv('/Users/andre/Documents/GitHub/immo-analysis-project/data/dataset_immo_.csv')

## Dataset cleaning

In [23]:
# Renaming some columns names in order to avoid problems

df.rename(columns={'swimming-pool': 'swimming_pool'}, inplace=True)
df.rename(columns={'state-building': 'state_building'}, inplace=True)
df.rename(columns={'land-surface': 'land_surface'}, inplace=True)

In [24]:
# Removing the null values from the column state_building
df.drop(df[df['state_building'] == "0"].index, inplace= True)
# Transforming the type of the column to str
df['state_building'] = df['state_building'].astype(str)
# Checking if all the null values were removed
df['state_building'].value_counts()

# This step was later necessary due to the fact that null values in this column were missed
# It has caused problems with the trained and saved model in the prediction part of the project

state_building
GOOD              3917
AS_NEW            2485
TO_BE_DONE_UP     1058
nan                924
TO_RENOVATE        904
JUST_RENOVATED     665
TO_RESTORE          41
Name: count, dtype: int64

In [25]:
# Removing duplicated rows
df.duplicated()
# Replacing NaN with 0
df.fillna(0, inplace=True)

In [26]:
# Removing all the rows where there's no information on locality
df.drop(df[df['locality'] == 0].index, inplace=True)
# Removing the columns type-transaction and url as it is not relevant
df.drop(columns=['type-transaction'], inplace=True)
df.drop(columns=['url'], inplace=True)
# Removing the columns area garden and terrace
df.drop(columns=['area_terrace'], inplace=True)
df.drop(columns=['area-garden'], inplace=True)
# Removing the column number of facades
df.drop(columns=['n-facades'], inplace=True)

## Preprocessing the dataset for the model building

## Model building - Random Forest Regressor

In [27]:
# Dropping null values before constructing the model
df.dropna(inplace=True) 

In [28]:
# Checking columns names
df.columns

Index(['locality', 'Type_property', 'subtype_property', 'price', 'n_rooms',
       'living_area', 'equipped_kitchen', 'furnished', 'fireplace', 'terrace',
       'garden', 'land_surface', 'swimming_pool', 'state_building'],
      dtype='object')

In [29]:
# Checking all the information of the dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9630 entries, 0 to 10019
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   locality          9630 non-null   object 
 1   Type_property     9630 non-null   object 
 2   subtype_property  9630 non-null   object 
 3   price             9630 non-null   float64
 4   n_rooms           9630 non-null   float64
 5   living_area       9630 non-null   float64
 6   equipped_kitchen  9630 non-null   float64
 7   furnished         9630 non-null   float64
 8   fireplace         9630 non-null   float64
 9   terrace           9630 non-null   float64
 10  garden            9630 non-null   float64
 11  land_surface      9630 non-null   float64
 12  swimming_pool     9630 non-null   float64
 13  state_building    9630 non-null   object 
dtypes: float64(10), object(4)
memory usage: 1.1+ MB


In [30]:
df.head()

Unnamed: 0,locality,Type_property,subtype_property,price,n_rooms,living_area,equipped_kitchen,furnished,fireplace,terrace,garden,land_surface,swimming_pool,state_building
0,Borgerhout,APARTMENT,APARTMENT,245000.0,2.0,106.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,JUST_RENOVATED
1,Leuven,APARTMENT,FLAT_STUDIO,199000.0,0.0,31.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,GOOD
2,Lede,APARTMENT,APARTMENT,195000.0,2.0,91.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,AS_NEW
3,Gent,APARTMENT,APARTMENT,185000.0,2.0,101.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,TO_RESTORE
4,Hotton,APARTMENT,PENTHOUSE,375000.0,3.0,214.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,TO_BE_DONE_UP


In [31]:
# Assigning X and y
X = df.drop(['price'], axis=1)
y = df['price']

## Pipeline Building

In [32]:
# The first step of the pipeline is to transform the categorical values into numeric values
# It was later decided to use OneHotEncoder instead of get_dummies as the dimension os the dataframe is not changed
trans_1 = ColumnTransformer([('ohe_trans', 
                                  OneHotEncoder(sparse_output=False, 
                                                handle_unknown='ignore'), 
                                  [0, 1, 2, 12])], 
                                remainder='passthrough' )

In [33]:
# The second transformation consists of normalizing the data
trans_2 = ColumnTransformer([('scale', MinMaxScaler(),slice(0,len(X)+1))], remainder='passthrough')

In [34]:
# The third step taked the model
regressor = RandomForestRegressor(random_state=3)

In [35]:
# Pipeline building
model = Pipeline(steps=[('trans_1', trans_1), ('trans_2', trans_2), ('regressor', regressor)])

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Instanting the model to an object and trainning it
model.fit(X_train, y_train)

In [36]:
# Checking up the model score for the trainning set
print('Training set score: ',model.score(X_train, y_train))
print('Training set score: {:.2f}'.format(model.score(X_train, y_train)))

# Checking up the model score for the test set
print('Test set score: ',model.score(X_test, y_test))
print('Test set score: {:.2f}'.format(model.score(X_test, y_test)))

# Calculating the RMSE
pred = model.predict(X_test)
rmse = np.sqrt(MSE(y_test, pred))
print("RMSE : % f" %(rmse))

Training set score:  0.9633712011272862
Training set score: 0.96
Test set score:  0.8125463783478836
Test set score: 0.81
RMSE :  237833.906634


In [37]:
# Analysing feature importances values
feature_importances = regressor.feature_importances_
print(feature_importances)

[2.96197561e-07 1.43705227e-04 9.52540501e-06 7.66318702e-05
 4.63503761e-05 1.05999516e-04 9.54619163e-05 2.99754299e-05
 4.50249226e-03 1.86186791e-05 1.72571345e-06 9.82947368e-06
 1.50589873e-04 7.37429212e-05 1.53436783e-04 5.20933213e-04
 3.53369438e-05 2.56145677e-04 3.27029239e-05 7.32269031e-07
 2.61766844e-06 3.68384052e-07 1.97290242e-05 4.20542946e-05
 1.85912624e-05 9.49141655e-06 3.56800455e-05 3.00408385e-05
 1.99359676e-04 3.70808769e-06 4.93682184e-06 9.60289734e-06
 1.63478246e-05 1.03243614e-04 1.76555976e-05 1.94660920e-05
 2.03595301e-06 3.21362169e-05 5.51246547e-05 5.83144777e-05
 1.50752499e-06 4.16586165e-06 1.00487790e-04 7.33989591e-05
 8.69263072e-05 4.32290405e-05 8.33651090e-07 2.03479726e-05
 6.52968039e-06 4.60118802e-04 2.54362957e-05 7.91117204e-05
 1.42363099e-05 2.49226454e-06 7.34549919e-04 9.37710708e-05
 1.82315363e-05 6.84361681e-06 3.37646857e-03 2.91103980e-06
 4.73526686e-04 6.29375667e-05 3.56142112e-03 3.84662085e-04
 2.89376134e-04 1.644146

In [38]:

# Analysing the top 20 feature importances

X_train = x.drop('price', axis=1)
y_train = x['price']

# Initialize the Random Forest Regressor
model = RandomForestRegressor()

# Train the model
model.fit(X_train, y_train)
# Getting feature importances
feature_importances = regressor.feature_importances_

# Getting the names of the features
feature_names = X_train.columns

# Creating a DataFrame to store the feature importances along with their names
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})

# Sorting the DataFrame by importance values in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Creating a bar plot for feature importances
plt.figure(figsize=(10, 12))
sns.barplot(X='Importance', y='Feature', data=feature_importance_df.head(20), palette='viridis')
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.title('Random Forest Regressor - Top 20 Feature Importances')
plt.show()


NameError: name 'x' is not defined