In [9]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
import statsmodels.api as sm
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import cross_val_score

In [2]:
df = pd.read_csv('properties_cleaned_engineered.csv')
df

Unnamed: 0,bedrooms,bathrooms,area,prices,locations,property_types,pool,balcony,furnished,maid,gym,brand_new,burj_view,sea_view,beach
0,0,1,352.0,21500.0,Academic City,Apartment,1,0,0,0,1,0,0,0,0
1,0,1,400.0,21500.0,Academic City,Apartment,1,1,1,0,1,0,0,0,0
2,0,1,400.0,21500.0,Academic City,Apartment,0,0,1,0,0,0,0,0,0
3,0,1,410.0,21500.0,Academic City,Apartment,1,1,1,1,1,0,1,0,0
4,0,1,390.0,20500.0,Academic City,Apartment,1,1,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
157375,3,3,1806.0,152000.0,World Trade Centre,Hotel Apartment,1,1,0,0,1,0,0,0,0
157376,3,4,1806.0,152000.0,World Trade Centre,Apartment,0,1,1,0,0,0,0,0,0
157377,3,4,1806.0,152000.0,World Trade Centre,Apartment,1,1,1,0,1,0,0,0,0
157378,3,3,1806.0,152000.0,World Trade Centre,Hotel Apartment,0,1,1,0,0,0,0,0,0


#### We need to create dummy variables for categorical variables

In [3]:
df = pd.get_dummies(df)
print(f"Now we have {df.shape[-1]} features!")
df

Now we have 130 features!


Unnamed: 0,bedrooms,bathrooms,area,prices,pool,balcony,furnished,maid,gym,brand_new,...,locations_Umm Suqeim,locations_Wadi Al Safa 2,locations_Wadi Al Shabak,locations_Wasl Gate,locations_World Trade Centre,property_types_Apartment,property_types_Hotel Apartment,property_types_Penthouse,property_types_Townhouse,property_types_Villa
0,0,1,352.0,21500.0,1,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
1,0,1,400.0,21500.0,1,1,1,0,1,0,...,0,0,0,0,0,1,0,0,0,0
2,0,1,400.0,21500.0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,0,1,410.0,21500.0,1,1,1,1,1,0,...,0,0,0,0,0,1,0,0,0,0
4,0,1,390.0,20500.0,1,1,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
157375,3,3,1806.0,152000.0,1,1,0,0,1,0,...,0,0,0,0,1,0,1,0,0,0
157376,3,4,1806.0,152000.0,0,1,1,0,0,0,...,0,0,0,0,1,1,0,0,0,0
157377,3,4,1806.0,152000.0,1,1,1,0,1,0,...,0,0,0,0,1,1,0,0,0,0
157378,3,3,1806.0,152000.0,0,1,1,0,0,0,...,0,0,0,0,1,0,1,0,0,0


### Split into Train/Test splits

In [4]:
X = df.drop('prices',axis=1)
y = df.prices.values
print(f"X: {X.shape}, Y: {y.shape}")

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)
print(f" Size of train: {X_train.shape[0]} \n Size of test: {X_test.shape[0]}")

X: (157380, 129), Y: (157380,)
 Size of train: 125904 
 Size of test: 31476


#### Multiple Linear Regression (using statsmodels)

#### Multiple Linear Regression using sklearn

In [10]:
lr = LinearRegression()
score = cross_val_score(lr,X_train,y_train,scoring='r2',cv=3,n_jobs=3)
print(f"Mean of CV R2 score: {np.mean(score)}")

Mean of CV R2 score: 0.7727074609999142


In [11]:
lr.fit(X_train,y_train)
print(f"Test R2 score is: {lr.score(X_test,y_test)}") 

Test R2 score is: 0.7943860590045717


### Multiple Linear Regression using Lasso regularizer

AKA Linear Regression model with L1 regularizer.

Let's first begin by scaling our input variables. We have to be careful and only scale our continous features, which area: bedrooms, bathrooms and area. These represent the first three columns in our input matrix.

In [13]:
X_train_scaled = X_train.copy(deep=True).values
X_test_scaled = X_test.copy(deep=True).values

scaler = StandardScaler(with_mean=True)
scaler.fit(X_train_scaled[:,:3])

X_train_scaled[:,:3] = scaler.transform(X_train_scaled[:,:3])
X_test_scaled[:,:3] = scaler.transform(X_test_scaled[:,:3])

print("First example, first 5 features:")
print(f"{X_train.values[0,:5]}   ---scale--->   {X_train_scaled[0,:5]}")

First example, first 5 features:
[3.000e+00 5.000e+00 1.784e+03 1.000e+00 0.000e+00]   ---scale--->   [ 0.80270613  1.63426775 -0.00395507  1.          0.        ]


In [30]:
reg_lasso = Lasso(random_state=42)

parameters = {
    'alpha': np.arange(1, 20, 0.5)
}

gs =  GridSearchCV(reg_lasso, parameters ,cv=3,scoring='r2')
gs.fit(X_train_scaled,y_train)

best_lasso_model = gs.best_estimator_
best_lasso_score = gs.best_score_
print(f"Best parameters: {gs.best_params_}\nBest score: {best_lasso_score}")

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Best parameters: {'alpha': 1.0}
Best score: 0.7727032907018364


  model = cd_fast.enet_coordinate_descent(


In [31]:
print(f"Test R2 score is: {best_lasso_model.score(X_test_scaled,y_test)}") 

Test R2 score is: 0.7943886741938103


### Random Forest

Let's begin by fitting a simple Random Forest using default parameters, and evaluate the score

In [16]:
rf = RandomForestRegressor()
score = cross_val_score(rf,X_train,y_train,scoring='r2',cv=3,n_jobs=3)
print(f"Mean of CV R2 score: {np.mean(score)}")

Mean of CV R2 score: 0.9819410063337961


Not too bad! This is already much better than our linear regression models. Maybe we can improve the score? Let's perform a hyperparameter search.. 

In [18]:
parameters = {
    "n_estimators": [10,100],
    'criterion':['squared_error'], 
#     'max_features':[1.0,'sqrt','log']
}

gs = GridSearchCV(rf,parameters,scoring='r2',cv=3)
gs.fit(X_train,y_train)

best_rf_model = gs.best_estimator_
best_rf_score = gs.best_score_
print(f"Best parameters: {gs.best_params_}\nBest score: {gs.best_score_}")

Best parameters: {'criterion': 'squared_error', 'n_estimators': 100}
Best score: 0.9818594141941768


In [19]:
print(f"Test R2 score is: {best_rf_model.score(X_test,y_test)}") 

Test R2 score is: 0.9868367179351583


### Neural Network

Let's define a hyperparameter space for the hyperparameters search

In [23]:
parameters = {
                "learning_rate_init" : [0.001,0.001,0.01],
                "hidden_layer_sizes" : [(129,100,10,),(129,100,50,10,),(129,100,50,30,10,),(129,100,50,30,10,3,)], # why 129? poorly chosen "rule of thumb" where the first layer = n_features - 1 
                "alpha": [0.0001,0.001,0.01,0.1,1]
}

In [24]:
regr = MLPRegressor(random_state=42,
                    learning_rate='adaptive',
                    solver='adam',
                    max_iter=1000,
                    early_stopping= True
                   )

In [25]:
gs = GridSearchCV(regr,parameters,scoring='r2',cv=3,n_jobs=3)
gs.fit(X_train_scaled,y_train)
r2 = gs.best_score_
best_nn_model = gs.best_estimator_
print(f"Best Parameters: {gs.best_params_}\nBest Score: {r2}")

Best Parameters: {'alpha': 0.1, 'hidden_layer_sizes': (129, 100, 50, 10), 'learning_rate_init': 0.01}
Best Score: 0.9499625656411718


In [26]:
print(f"Test R2 score is: {best_nn_model.score(X_test_scaled,y_test)}") 

Test R2 score is: 0.9468102158376087


In [32]:
pd.DataFrame({
    "Linear Regression": [lr.score(X_test,y_test)],
    "Lasso Regression" : [best_lasso_model.score(X_test_scaled,y_test)],
     "Random Forest": [best_rf_model.score(X_test,y_test)],
     "Neural Network": [best_nn_model.score(X_test_scaled,y_test)]
},).style.set_caption("Summary table of our Test R2 results using different models.")


Unnamed: 0,Linear Regression,Lasso Regression,Random Forest,Neural Network
0,0.794386,0.794389,0.986837,0.94681


Clearly, our Random Forest regressor performed the best! Let's go ahead and pickle this model 

In [66]:
import pickle

filename = 'finalized_model.p'
pickle.dump(best_rf_model, open(filename, 'wb'))

And a sanity check..

In [67]:
# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, y_test)
print(result)

0.9868367179351583


Great! 😃