In [149]:
# IMPORTING PACKAGES

import pandas as pd # data processing
import numpy as np # working with arrays
import matplotlib.pyplot as plt # visualization
import seaborn as sb # visualization
import hvplot.pandas

from sklearn.model_selection import train_test_split # data split

from sklearn.linear_model import LinearRegression # OLS algorithm
from sklearn.linear_model import Ridge # Ridge algorithm
from sklearn.linear_model import Lasso # Lasso algorithm
from sklearn.linear_model import BayesianRidge # Bayesian algorithm
from sklearn.linear_model import ElasticNet # ElasticNet algorithm
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import explained_variance_score as evs # evaluation metric
from sklearn.metrics import r2_score as r2 # evaluation metric

from pathlib import Path
import statsmodels.api as sm

In [150]:
# Load the sales CSV (path is relative to the notebook's working directory)
# into a pandas DataFrame.
file_path = Path('Resources/Dubai_RE_sales.csv')
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,Transaction Number,Transaction Date,Transaction Type,Transaction sub type,Registration type,Is Free Hold?,Usage,Area,Property Type,Property Sub Type,...,Property Size (sq.m),Room(s),Parking,Nearest Metro,Nearest Mall,Nearest Landmark,No. of Buyer,No. of Seller,Master Project,Project
0,102-1-2023,2023-01-02 07:25:49,Sales,Sell - Pre registration,Off-Plan,Free Hold,Residential,BUSINESS BAY,Unit,Flat,...,105.75,2 B/R,1,Business Bay Metro Station,Dubai Mall,Downtown Dubai,1,1,,AYKON CITY 3
1,102-10-2023,2023-01-02 08:06:49,Sales,Sell - Pre registration,Off-Plan,Free Hold,Residential,BUSINESS BAY,Unit,Flat,...,52.29,Studio,1,Business Bay Metro Station,Dubai Mall,Downtown Dubai,1,1,,AYKON CITY 3
2,102-100-2023,2023-01-02 09:01:22,Sales,Sell - Pre registration,Off-Plan,Free Hold,Residential,Al Wasl,Unit,Flat,...,161.32,2 B/R,1,,,,1,1,,Fern
3,102-1000-2023,2023-01-05 13:26:06,Sales,Sell - Pre registration,Off-Plan,Free Hold,Residential,JUMEIRAH LAKES TOWERS,Unit,Flat,...,99.64,2 B/R,1,Damac Properties,Marina Mall,Burj Al Arab,1,1,,SEVEN CITY JLT
4,102-10000-2023,2023-03-01 16:02:19,Sales,Sell - Pre registration,Off-Plan,Free Hold,Residential,JUMEIRAH VILLAGE CIRCLE,Unit,Flat,...,63.95,1 B/R,1,Dubai Internet City,Marina Mall,Sports City Swimming Academy,1,1,,Binghatti Corner


In [151]:
# Keep every row whose 'Usage' is anything other than 'Commercial'.
is_not_commercial = df['Usage'].ne('Commercial')
df = df.loc[is_not_commercial]
df

Unnamed: 0,Transaction Number,Transaction Date,Transaction Type,Transaction sub type,Registration type,Is Free Hold?,Usage,Area,Property Type,Property Sub Type,...,Property Size (sq.m),Room(s),Parking,Nearest Metro,Nearest Mall,Nearest Landmark,No. of Buyer,No. of Seller,Master Project,Project
0,102-1-2023,2023-01-02 07:25:49,Sales,Sell - Pre registration,Off-Plan,Free Hold,Residential,BUSINESS BAY,Unit,Flat,...,105.75,2 B/R,1,Business Bay Metro Station,Dubai Mall,Downtown Dubai,1,1,,AYKON CITY 3
1,102-10-2023,2023-01-02 08:06:49,Sales,Sell - Pre registration,Off-Plan,Free Hold,Residential,BUSINESS BAY,Unit,Flat,...,52.29,Studio,1,Business Bay Metro Station,Dubai Mall,Downtown Dubai,1,1,,AYKON CITY 3
2,102-100-2023,2023-01-02 09:01:22,Sales,Sell - Pre registration,Off-Plan,Free Hold,Residential,Al Wasl,Unit,Flat,...,161.32,2 B/R,1,,,,1,1,,Fern
3,102-1000-2023,2023-01-05 13:26:06,Sales,Sell - Pre registration,Off-Plan,Free Hold,Residential,JUMEIRAH LAKES TOWERS,Unit,Flat,...,99.64,2 B/R,1,Damac Properties,Marina Mall,Burj Al Arab,1,1,,SEVEN CITY JLT
4,102-10000-2023,2023-03-01 16:02:19,Sales,Sell - Pre registration,Off-Plan,Free Hold,Residential,JUMEIRAH VILLAGE CIRCLE,Unit,Flat,...,63.95,1 B/R,1,Dubai Internet City,Marina Mall,Sports City Swimming Academy,1,1,,Binghatti Corner
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81596,95-1-2023,2023-01-04 09:55:02,Sales,Delayed Development,Ready,Non Free Hold,Residential,SILICON OASIS,Unit,Flat,...,106.65,2 B/R,B1-71,,,IMG World Adventures,0,0,,ARABIAN GATE
81597,95-2-2023,2023-01-09 09:48:42,Sales,Delayed Development,Ready,Non Free Hold,Residential,SILICON OASIS,Unit,Flat,...,127.92,2 B/R,B2-105,,,IMG World Adventures,0,0,,ARABIAN GATE
81598,95-3-2023,2023-03-08 10:50:43,Sales,Delayed Development,Ready,Non Free Hold,Residential,SILICON OASIS,Unit,Flat,...,77.22,1 B/R,B1-55,,,IMG World Adventures,0,0,,ARABIAN GATE
81599,95-4-2023,2023-05-01 11:40:33,Sales,Delayed Development,Ready,Non Free Hold,Residential,SILICON OASIS,Unit,Flat,...,85.84,1 B/R,G-61,,,IMG World Adventures,0,0,,TOPAZ RESIDENCES 3


In [152]:
# Columns that are not used as model features; they are dropped before encoding.
columns_to_remove = [
    'Transaction Date',
    'Transaction Number',
    'Transaction sub type',
    'Registration type',
    'Usage',
    'Transaction Size (sq.m)',
    'Parking',
    'Nearest Metro',
    'Nearest Mall',
    'Nearest Landmark',
    'No. of Buyer',
    'No. of Seller',
    'Master Project',
    'Project',
]
df = df.drop(labels=columns_to_remove, axis='columns')
df.head(n=5)


Unnamed: 0,Transaction Type,Is Free Hold?,Area,Property Type,Property Sub Type,Amount,Property Size (sq.m),Room(s)
0,Sales,Free Hold,BUSINESS BAY,Unit,Flat,2631000.0,105.75,2 B/R
1,Sales,Free Hold,BUSINESS BAY,Unit,Flat,1466000.0,52.29,Studio
2,Sales,Free Hold,Al Wasl,Unit,Flat,3309000.0,161.32,2 B/R
3,Sales,Free Hold,JUMEIRAH LAKES TOWERS,Unit,Flat,1170137.0,99.64,2 B/R
4,Sales,Free Hold,JUMEIRAH VILLAGE CIRCLE,Unit,Flat,590000.0,63.95,1 B/R


In [156]:
# Discard every row that has a missing value in at least one remaining column.
df = df.dropna(axis='index', how='any')
df.head(n=5)

Unnamed: 0,Transaction Type,Is Free Hold?,Area,Property Type,Property Sub Type,Amount,Property Size (sq.m),Room(s)
0,Sales,Free Hold,BUSINESS BAY,Unit,Flat,2631000.0,105.75,2 B/R
1,Sales,Free Hold,BUSINESS BAY,Unit,Flat,1466000.0,52.29,Studio
2,Sales,Free Hold,Al Wasl,Unit,Flat,3309000.0,161.32,2 B/R
3,Sales,Free Hold,JUMEIRAH LAKES TOWERS,Unit,Flat,1170137.0,99.64,2 B/R
4,Sales,Free Hold,JUMEIRAH VILLAGE CIRCLE,Unit,Flat,590000.0,63.95,1 B/R


In [157]:
# Show each column's dtype so the non-numeric columns that need encoding are visible.
df.dtypes

Transaction Type         object
Is Free Hold?            object
Area                     object
Property Type            object
Property Sub Type        object
Amount                  float64
Property Size (sq.m)    float64
Room(s)                  object
dtype: object

In [158]:
# Columns holding labels rather than numbers; these are one-hot encoded later.
category_columns = ['Transaction Type', 'Is Free Hold?', 'Area', 'Property Type', 'Property Sub Type', 'Room(s)']

# Cast each label column to pandas' categorical dtype in a single astype call.
df = df.astype({column: 'category' for column in category_columns})

df.dtypes

Transaction Type        category
Is Free Hold?           category
Area                    category
Property Type           category
Property Sub Type       category
Amount                   float64
Property Size (sq.m)     float64
Room(s)                 category
dtype: object

In [159]:
# One-hot encode the categorical columns; drop_first=True omits the first level
# of each column so the dummies for a column are not perfectly collinear.
df_encoded = pd.get_dummies(
    data=df,
    columns=category_columns,
    drop_first=True,
)

In [160]:
# Preview the encoded frame: the numeric columns plus one indicator column per category level.
df_encoded.head()

Unnamed: 0,Amount,Property Size (sq.m),Transaction Type_Mortgage,Transaction Type_Sales,Is Free Hold?_Non Free Hold,Area_AL FURJAN,Area_AL KHAIL HEIGHTS,Area_AL WAHA,Area_ARABIAN RANCHES I,Area_ARABIAN RANCHES II,...,Room(s)_6 B/R,Room(s)_7 B/R,Room(s)_9 B/R,Room(s)_GYM,Room(s)_Hotel,Room(s)_Office,Room(s)_PENTHOUSE,Room(s)_Shop,Room(s)_Single Room,Room(s)_Studio
0,2631000.0,105.75,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1466000.0,52.29,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,3309000.0,161.32,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1170137.0,99.64,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,590000.0,63.95,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [210]:
# Target (y) is the 'Amount' column; features (X) are all remaining encoded columns.
y = df_encoded['Amount']
X = df_encoded.drop(labels=['Amount'], axis='columns')

In [211]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [212]:
# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits so no test-set statistics leak into training.
# NOTE(review): the fit and predict cells visible in this notebook pass
# X_train / X_test, not these scaled arrays - confirm which was intended.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [237]:
# ElasticNet with l1_ratio=1 applies a pure L1 penalty (no L2 component);
# max_iter is raised to 10000 to give the solver more iterations to converge.
model = ElasticNet(
    alpha=1,
    l1_ratio=1,
    max_iter=10000,
)

In [238]:
# Train on the training split.
# NOTE(review): this uses the unscaled X_train even though X_train_scaled is
# computed earlier in the notebook - confirm this is intentional.
model.fit(X=X_train, y=y_train)

ElasticNet(alpha=1, l1_ratio=1, max_iter=10000)

In [239]:
# Predict 'Amount' for the held-out rows. Despite the variable's name, the
# input here is the unscaled X_test, matching what the model was fit on.
y_pred_scaled = model.predict(X=X_test)

In [240]:
from sklearn.metrics import mean_squared_error, r2_score

# Score the hold-out predictions. `y_pred_scaled` is the only prediction
# variable this notebook defines, so it is used here; an undefined name would
# raise NameError on Restart Kernel -> Run All.
mse = mean_squared_error(y_test, y_pred_scaled)
# NOTE: this rebinds `r2` (imported at the top as an alias of r2_score) to a
# float; the name is kept so later cells that read `r2` keep working.
r2 = r2_score(y_test, y_pred_scaled)
# RMSE is the square root of the MSE computed above (same units as 'Amount').
rmse = np.sqrt(mse)
# Spread of the full target column, printed as a reference scale for the RMSE.
std = np.std(y)

# Display the computed metrics
print("R-squared Score:", r2)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Standard Deviation (std) of the target variable:", std)

R-squared Score: 0.6449943541987422
Mean Squared Error (MSE): 3616184265775.202
Root Mean Squared Error (RMSE): 1901626.7419699382
Standard Deviation (std) of the target variable: 3877049.44525519
