#### Import packages

In [31]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns 

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error, r2_score

from xgboost import XGBRegressor


#### Import cleaned DataFrame

In [32]:
df = pd.read_csv("../data/cleaned_data_kangaroo.csv")
df.head()

Unnamed: 0,bedroomCount,bathroomCount,postCode,habitableSurface,roomCount,hasAttic,hasBasement,hasDiningRoom,buildingConstructionYear,floorCount,...,gardenOrientation_unknown,terraceOrientation_NORTH,terraceOrientation_NORTH_EAST,terraceOrientation_NORTH_WEST,terraceOrientation_SOUTH,terraceOrientation_SOUTH_EAST,terraceOrientation_SOUTH_WEST,terraceOrientation_WEST,terraceOrientation_unknown,roomCount_missing
0,2.0,1.0,1040,100.0,-1.0,0,1,0,2004.0,7.0,...,1,0,0,0,0,0,0,0,1,1
1,4.0,2.0,1040,270.0,12.0,1,1,1,1910.0,3.0,...,0,0,0,0,0,0,0,0,1,0
2,2.0,1.0,1040,87.0,-1.0,0,0,0,1970.0,7.0,...,1,0,0,0,0,0,0,0,1,1
3,2.0,2.0,1040,104.0,-1.0,0,0,0,2018.0,21.0,...,1,0,0,0,0,0,0,0,1,1
4,1.0,1.0,1040,71.0,-1.0,0,0,0,1906.0,3.0,...,1,0,0,0,0,0,0,0,1,1


#### Create X and Y 

In [33]:
# separate explicative variables and target variable 
X = df.drop(columns='price')
Y = df['price']

#### Divide Dataframe in training and testing parts

In [34]:
# 80% training and 20% testing data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.8, test_size=0.2, random_state=0)

#### Linear regression 

In [35]:
# Import
model_LR = LinearRegression()

# Train the model
model_LR.fit(X_train, Y_train)

#Prediction 
Y_pred = model_LR.predict(X_test)

# Evaluation 
print(f"Mean absolute error: {mean_absolute_error(Y_test, Y_pred)}")
print (f"Mean absolute percentage error: {mean_absolute_percentage_error(Y_test, Y_pred)}")
print (f"R2 score : {r2_score(Y_test, Y_pred)}")

Mean absolute error: 88099.5211918535
Mean absolute percentage error: 0.3033100378186818
R2 score : -3.0434016736534977


#### Random Forest Regressor 

In [36]:
# Import 
model_RF = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model 
model_RF.fit(X_train, Y_train)

# Prediction 
Y_pred_RF = model_RF.predict(X_test)

# Evaluation 
print(f"Mean absolute error: {mean_absolute_error(Y_test, Y_pred_RF)}")
print (f"Mean absolute percentage error: {mean_absolute_percentage_error(Y_test, Y_pred_RF)}")
print (f"R2 score : {r2_score(Y_test, Y_pred_RF)}")






Mean absolute error: 60551.93042535789
Mean absolute percentage error: 0.2004806228143579
R2 score : 0.7561244405041188


#### Gradient Boosting Regressor

In [37]:
# Import 
model_GBR = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)

# Train the model 
model_GBR.fit(X_train, Y_train)

# Prediction 
Y_pred_GBR = model_GBR.predict(X_test)

# Evaluation 
print(f"Mean absolute error: {mean_absolute_error(Y_test, Y_pred_GBR)}")
print (f"Mean absolute percentage error: {mean_absolute_percentage_error(Y_test, Y_pred_GBR)}")
print (f"R2 score : {r2_score(Y_test, Y_pred_GBR)}")



Mean absolute error: 75337.59747226216
Mean absolute percentage error: 0.25533312307597694
R2 score : 0.663099270957954


#### XGBoost

In [38]:
# Import 
model_XGB = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)

# Train the model 
model_XGB.fit(X_train, Y_train)

# Prediction 
Y_pred_XGB = model_XGB.predict(X_test)

# Evaluation 
print(f"Mean absolute error: {mean_absolute_error(Y_test, Y_pred_XGB)}")
print (f"Mean absolute percentage error: {mean_absolute_percentage_error(Y_test, Y_pred_XGB)}")
print (f"R2 score : {r2_score(Y_test, Y_pred_XGB)}")


Mean absolute error: 64715.09218973167
Mean absolute percentage error: 0.21447957277675886
R2 score : 0.7429726864692979


In [39]:
# Attention : ne pas faire la mean sur l'ensemble du df avant de le split sinon ça aide beaucoup trop le modele 
# D'abord split puis faire le mean 
# check pycaret library 
