In [1]:
# Importing necessary libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
import pickle
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")
import matplotlib.pyplot as plt

In [2]:
# Reading the cleaned dataset
cleaned_df = pd.read_csv('..\Cleaned_Data\Cleaned_Data_LogPrice.csv')
cleaned_df.head()

Unnamed: 0,Suburb,Rooms,Type,Price,Distance,Bathroom,Car,Region,Log_Price
0,Abbotsford,2,h,1480000.0,2.5,1.0,1.0,Northern Metropolitan,14.207553
1,Abbotsford,2,h,1035000.0,2.5,1.0,0.0,Northern Metropolitan,13.849913
2,Abbotsford,3,h,1465000.0,2.5,2.0,0.0,Northern Metropolitan,14.197366
3,Abbotsford,3,h,850000.0,2.5,2.0,1.0,Northern Metropolitan,13.652993
4,Abbotsford,4,h,1600000.0,2.5,1.0,2.0,Northern Metropolitan,14.285515


In [3]:
# Enumerate Type
encode = LabelEncoder().fit(cleaned_df['Type'])
carpet = {x: i for i, x in enumerate(encode.classes_)}
carpet

{'h': 0, 't': 1, 'u': 2}

In [4]:
# Enumerate Suburb
encoder = LabelEncoder().fit(cleaned_df['Suburb'])
carp = {x: i for i, x in enumerate(encoder.classes_)}
carp

{'Abbotsford': 0,
 'Aberfeldie': 1,
 'Airport West': 2,
 'Albanvale': 3,
 'Albert Park': 4,
 'Albion': 5,
 'Alphington': 6,
 'Altona': 7,
 'Altona Meadows': 8,
 'Altona North': 9,
 'Ardeer': 10,
 'Armadale': 11,
 'Ascot Vale': 12,
 'Ashburton': 13,
 'Ashwood': 14,
 'Aspendale': 15,
 'Aspendale Gardens': 16,
 'Attwood': 17,
 'Avondale Heights': 18,
 'Bacchus Marsh': 19,
 'Balaclava': 20,
 'Balwyn': 21,
 'Balwyn North': 22,
 'Bayswater': 23,
 'Bayswater North': 24,
 'Beaconsfield': 25,
 'Beaconsfield Upper': 26,
 'Beaumaris': 27,
 'Bellfield': 28,
 'Bentleigh': 29,
 'Bentleigh East': 30,
 'Berwick': 31,
 'Black Rock': 32,
 'Blackburn': 33,
 'Blackburn North': 34,
 'Blackburn South': 35,
 'Bonbeach': 36,
 'Boronia': 37,
 'Botanic Ridge': 38,
 'Box Hill': 39,
 'Braybrook': 40,
 'Briar Hill': 41,
 'Brighton': 42,
 'Brighton East': 43,
 'Broadmeadows': 44,
 'Brookfield': 45,
 'Brooklyn': 46,
 'Brunswick': 47,
 'Brunswick East': 48,
 'Brunswick West': 49,
 'Bulleen': 50,
 'Bullengarook': 51,


In [5]:
# Enumerate Region
encoder = LabelEncoder().fit(cleaned_df['Region'])
carp = {x: i for i, x in enumerate(encoder.classes_)}
carp

{'Eastern Metropolitan': 0,
 'Eastern Victoria': 1,
 'Northern Metropolitan': 2,
 'Northern Victoria': 3,
 'South-Eastern Metropolitan': 4,
 'Southern Metropolitan': 5,
 'Western Metropolitan': 6,
 'Western Victoria': 7}

In [6]:
# Convert to numerical variable 
cleaned_df['Suburb'] = LabelEncoder().fit_transform(cleaned_df['Suburb'])
cleaned_df['Suburb']

0          0
1          0
2          0
3          0
4          0
        ... 
17693    327
17694    332
17695    332
17696    332
17697    332
Name: Suburb, Length: 17698, dtype: int32

In [7]:
# Convert to numerical variable 
cleaned_df['Type'] = LabelEncoder().fit_transform(cleaned_df['Type'])
cleaned_df['Type']

0        0
1        0
2        0
3        0
4        0
        ..
17693    0
17694    0
17695    0
17696    1
17697    0
Name: Type, Length: 17698, dtype: int32

In [8]:
cleaned_df['Region'] = LabelEncoder().fit_transform(cleaned_df['Region'])
cleaned_df['Region']

0        2
1        2
2        2
3        2
4        2
        ..
17693    2
17694    6
17695    6
17696    6
17697    6
Name: Region, Length: 17698, dtype: int32

In [9]:
# Convert categorical data to numeric and separate target feature for training data
X = cleaned_df.drop(["Log_Price", 'Price'],  axis = 1)

y = cleaned_df['Price']

X

Unnamed: 0,Suburb,Rooms,Type,Distance,Bathroom,Car,Region
0,0,2,0,2.5,1.0,1.0,2
1,0,2,0,2.5,1.0,0.0,2
2,0,3,0,2.5,2.0,0.0,2
3,0,3,0,2.5,2.0,1.0,2
4,0,4,0,2.5,1.0,2.0,2
...,...,...,...,...,...,...,...
17693,327,3,0,25.5,2.0,2.0,2
17694,332,4,0,6.3,1.0,3.0,6
17695,332,2,0,6.3,2.0,1.0,6
17696,332,2,1,6.3,1.0,2.0,6


In [10]:
# split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# scale data
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [11]:
# Shape data
X_train_scaled.shape, X_test_scaled.shape

((14158, 7), (3540, 7))

# Data Modeling

## Linear Regression model

In [12]:
# Initialize the Linear Regression model
lr_model = LinearRegression()

# Train the model on the training data
lr_model.fit(X_train_scaled, y_train)

# Predict on the testing data
y_pred_lr = lr_model.predict(X_test_scaled)

# Calculate evaluation metrics
lr_mse = mean_squared_error(y_test, y_pred_lr)
lr_rmse = mean_squared_error(y_test, y_pred_lr, squared=False)
lr_mae = mean_absolute_error(y_test, y_pred_lr)  # Fix: Use mean_absolute_error
lr_r2 = r2_score(y_test, y_pred_lr)

# Printing each metric individually using f-strings
print(f"Mean Squared Error (MSE): {lr_mse}")
print(f"Root Mean Squared Error (RMSE): {lr_rmse}")
print(f"Mean Absolute Error (MAE): {lr_mae}")
print(f"R-squared (R2): {lr_r2}")

Mean Squared Error (MSE): 263305031644.15808
Root Mean Squared Error (RMSE): 513132.5673197503
Mean Absolute Error (MAE): 343080.5753475964
R-squared (R2): 0.401818669508635


In [13]:
# Calculate training and testing score 
training_score = lr_model.score(X_train_scaled, y_train)
testing_score = lr_model.score(X_test_scaled, y_test)

# Print the training and testing score
print(f"Training Score: {training_score}")
print(f"Testing Score: {testing_score}")

Training Score: 0.4458194691739725
Testing Score: 0.401818669508635


## Decision Tree Regression model

In [14]:
# Initialize the Decision Tree Regression model
dt_model = DecisionTreeRegressor(random_state=42)

# Train the model on the training data
dt_model.fit(X_train_scaled, y_train)

# Predict on the testing data
y_pred_dt = dt_model.predict(X_test_scaled)

# Calculate evaluation metrics
dt_mse = mean_squared_error(y_test, y_pred_dt)
dt_rmse = mean_squared_error(y_test, y_pred_dt, squared=False)
dt_mae = mean_absolute_error(y_test, y_pred_dt)
dt_r2 = r2_score(y_test, y_pred_dt)

# Print each metric using f-strings
print(f"Decision Tree Mean Squared Error (MSE): {dt_mse}")
print(f"Decision Tree Root Mean Squared Error (RMSE): {dt_rmse}")
print(f"Decision Tree Mean Absolute Error (MAE): {dt_mae}")
print(f"Decision Tree R-squared (R2): {dt_r2}")

Decision Tree Mean Squared Error (MSE): 183855904699.12082
Decision Tree Root Mean Squared Error (RMSE): 428784.21694264916
Decision Tree Mean Absolute Error (MAE): 231171.4046110783
Decision Tree R-squared (R2): 0.5823126926026831


In [15]:
# Calculate training and testing score 
training_score = dt_model.score(X_train_scaled, y_train)
testing_score = dt_model.score(X_test_scaled, y_test)

# Print the training and testing score
print(f"Training Score: {training_score}")
print(f"Testing Score: {testing_score}")

Training Score: 0.9059096553014386
Testing Score: 0.5823126926026831


## Gradient Boosting Regression model

In [16]:
# Initialize the Gradient Boosting Regression model
gb_model = GradientBoostingRegressor(random_state=42)

# Train the model on the training data
gb_model.fit(X_train_scaled, y_train)

# Predict on the testing data using Gradient Boosting Regressor
y_pred_gb = gb_model.predict(X_test_scaled)

# Calculate evaluation metrics
gb_mse = mean_squared_error(y_test, y_pred_gb)
gb_rmse = mean_squared_error(y_test, y_pred_gb, squared=False)
gb_mae = mean_absolute_error(y_test, y_pred_gb)
gb_r2 = r2_score(y_test, y_pred_gb)

# Print each metric using f-strings
print(f"Gradient Boosting Mean Squared Error (MSE): {gb_mse}")
print(f"Gradient Boosting Root Mean Squared Error (RMSE): {gb_rmse}")
print(f"Gradient Boosting Mean Absolute Error (MAE): {gb_mae}")
print(f"Gradient Boosting R-squared (R2): {gb_r2}")

Gradient Boosting Mean Squared Error (MSE): 146761963119.29492
Gradient Boosting Root Mean Squared Error (RMSE): 383095.24027230474
Gradient Boosting Mean Absolute Error (MAE): 226394.1630003356
Gradient Boosting R-squared (R2): 0.6665834077836079


In [17]:
# Calculate training and testing score 
training_score = gb_model.score(X_train_scaled, y_train)
testing_score = gb_model.score(X_test_scaled, y_test)

# Print the training and testing score
print(f"Training Score: {training_score}")
print(f"Testing Score: {testing_score}")

Training Score: 0.7249732415475423
Testing Score: 0.6665834077836079


## Random Forest Regression model

In [18]:
# Initialize the Random Forest Regression model
rf_model = RandomForestRegressor(random_state=42, n_jobs=-1)

# Train the model on the training data
rf_model.fit(X_train_scaled, y_train)

# Predict on the testing data
y_pred_rf = rf_model.predict(X_test_scaled)

# Calculate evaluation metrics
rf_mse = mean_squared_error(y_test, y_pred_rf)
rf_rmse = mean_squared_error(y_test, y_pred_rf, squared=False)
rf_mae = mean_absolute_error(y_test, y_pred_rf)
rf_r2 = r2_score(y_test, y_pred_rf)

# Print each metric using f-strings
print(f"Random Forest Mean Squared Error (MSE): {rf_mse}")
print(f"Random Forest Root Mean Squared Error (RMSE): {rf_rmse}")
print(f"Random Forest Mean Absolute Error (MAE): {rf_mae}")
print(f"Random Forest R-squared (R2): {rf_r2}")

Random Forest Mean Squared Error (MSE): 140981120268.27365
Random Forest Root Mean Squared Error (RMSE): 375474.5267901321
Random Forest Mean Absolute Error (MAE): 209533.6998751363
Random Forest R-squared (R2): 0.6797164354602634


In [19]:
# Calculate training and testing score 
training_score = rf_model.score(X_train_scaled, y_train)
testing_score = rf_model.score(X_test_scaled, y_test)

# Print the training and testing score
print(f"Training Score: {training_score}")
print(f"Testing Score: {testing_score}")

Training Score: 0.8866498631421715
Testing Score: 0.6797164354602634


## Tuned Random Forest Regression model using Randomized Search CV

In [20]:
# Define the parameter grid for the RandomForestRegressor
param_dist = {
    'n_estimators': [10, 50, 100, 200, 500],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': range(2, 11),
    'min_samples_leaf': range(1, 11)
}

# Initialize the RandomForestRegressor model
rf_model_t = RandomForestRegressor(random_state=42, n_jobs=-1)

# Set up Randomized Search CV
random_search = RandomizedSearchCV(rf_model_t, param_distributions=param_dist, n_iter=100, 
                                   scoring='neg_mean_squared_error', n_jobs=-1, cv=5, random_state=42)

# Train the model on the training data using Randomized Search
random_search.fit(X_train_scaled, y_train)

# Retrieve the best estimator
best_rf_model = random_search.best_estimator_

# Predict on the testing data using the best estimator
y_pred_rf = best_rf_model.predict(X_test_scaled)

# Calculate evaluation metrics
rf_mse_t = mean_squared_error(y_test, y_pred_rf)
rf_rmse_t = mean_squared_error(y_test, y_pred_rf, squared=False)
rf_mae_t = mean_absolute_error(y_test, y_pred_rf)
rf_r2_t = r2_score(y_test, y_pred_rf)

# Print each metric using f-strings
print(f"Tuning Random Forest Mean Squared Error (MSE): {rf_mse_t}")
print(f"Tuning Random Forest Root Mean Squared Error (RMSE): {rf_rmse_t}")
print(f"Tuning Random Forest Mean Absolute Error (MAE): {rf_mae_t}")
print(f"Tuning Random Forest R-squared (R2): {rf_r2_t}")

Tuning Random Forest Mean Squared Error (MSE): 131079776297.77235
Tuning Random Forest Root Mean Squared Error (RMSE): 362049.4114037093
Tuning Random Forest Mean Absolute Error (MAE): 204277.86363556335
Tuning Random Forest R-squared (R2): 0.7022104951937342


In [21]:
# Calculate training and testing score 
training_score = best_rf_model.score(X_train_scaled, y_train)
testing_score = best_rf_model.score(X_test_scaled, y_test)

# Print the training and testing score
print(f"Training Score: {training_score}")
print(f"Testing Score: {testing_score}")

Training Score: 0.8305324959706591
Testing Score: 0.7022104951937342


## Decision Tree model

In [22]:
# Initialize the Decision Tree Regression model
dt_model = DecisionTreeRegressor(random_state=42)

# Train the model on the training data
dt_model.fit(X_train_scaled, y_train)

# Predict on the testing data
y_pred_dt = dt_model.predict(X_test_scaled)

# Calculate evaluation metrics
dt_mse = mean_squared_error(y_test, y_pred_dt)
dt_rmse = mean_squared_error(y_test, y_pred_dt, squared=False)
dt_mae = mean_absolute_error(y_test, y_pred_dt)
dt_r2 = r2_score(y_test, y_pred_dt)

# Print each metric using f-strings
print(f"Decision Tree Mean Squared Error (MSE): {dt_mse}")
print(f"Decision Tree Root Mean Squared Error (RMSE): {dt_rmse}")
print(f"Decision Tree Mean Absolute Error (MAE): {dt_mae}")
print(f"Decision Tree R-squared (R2): {dt_r2}")

Decision Tree Mean Squared Error (MSE): 183855904699.12082
Decision Tree Root Mean Squared Error (RMSE): 428784.21694264916
Decision Tree Mean Absolute Error (MAE): 231171.4046110783
Decision Tree R-squared (R2): 0.5823126926026831


In [23]:
# Calculate training and testing score 
training_score = dt_model.score(X_train_scaled, y_train)
testing_score = dt_model.score(X_test_scaled, y_test)

# Print the training and testing score
print(f"Training Score: {training_score}")
print(f"Testing Score: {testing_score}")

Training Score: 0.9059096553014386
Testing Score: 0.5823126926026831


## Tuned Decision Tree Model using Randomized Search CV 

In [24]:
# Define the parameter grid for the Decision Tree
param_dist = {
    "max_depth": [None, 10, 20, 30, 40, 50],
    "min_samples_split": range(2, 11),
    "min_samples_leaf": range(1, 11),
    "criterion": ["squared_error", "friedman_mse", "absolute_error"]  # Updated the values
}

# Initialize the Decision Tree Regression model
dt_model = DecisionTreeRegressor(random_state=42)

# Set up Randomized Search CV
random_search = RandomizedSearchCV(dt_model, param_distributions=param_dist, n_iter=100, 
                                   scoring='neg_mean_squared_error', n_jobs=-1, cv=5, random_state=42)

# Train the model on the training data using Randomized Search
random_search.fit(X_train_scaled, y_train)

# Retrieve the best estimator
best_dt_model = random_search.best_estimator_

# Predict on the testing data using the best estimator
y_pred_dt = best_dt_model.predict(X_test_scaled)

# Calculate evaluation metrics
dt_mse_t = mean_squared_error(y_test, y_pred_dt)
dt_rmse_t = mean_squared_error(y_test, y_pred_dt, squared=False)
dt_mae_t = mean_absolute_error(y_test, y_pred_dt)
dt_r2_t = r2_score(y_test, y_pred_dt)

# Print each metric using f-strings
print(f"Tuning Decision Tree Mean Squared Error (MSE): {dt_mse_t}")
print(f"Tuning Decision Tree Root Mean Squared Error (RMSE): {dt_rmse_t}")
print(f"Tuning Decision Tree Mean Absolute Error (MAE): {dt_mae_t}")
print(f"Tuning Decision Tree R-squared (R2): {dt_r2_t}")

Tuning Decision Tree Mean Squared Error (MSE): 141516517346.90048
Tuning Decision Tree Root Mean Squared Error (RMSE): 376186.8117663091
Tuning Decision Tree Mean Absolute Error (MAE): 212122.12980225988
Tuning Decision Tree R-squared (R2): 0.6785001102923228


In [25]:
# Calculate training and testing score 
training_score = best_dt_model.score(X_train_scaled, y_train)
testing_score = best_dt_model.score(X_test_scaled, y_test)

# Print the training and testing score
print(f"Training Score: {training_score}")
print(f"Testing Score: {testing_score}")

Training Score: 0.7599996685189059
Testing Score: 0.6785001102923228


## Support Vector Regressor model

In [26]:
# Initialize the Support Vector Regressor model
svr_model = SVR()

# Train the model on the training data
svr_model.fit(X_train_scaled, y_train)

# Predict on the testing data
y_pred_svr = svr_model.predict(X_test_scaled)

# Calculate evaluation metrics
svr_mse = mean_squared_error(y_test, y_pred_svr)
svr_rmse = mean_squared_error(y_test, y_pred_svr, squared=False)
svr_mae = mean_absolute_error(y_test, y_pred_svr)
svr_r2 = r2_score(y_test, y_pred_svr)

# Print each metric using f-strings
print(f"Support Vector Regression Mean Squared Error (MSE): {svr_mse}")
print(f"Support Vector Regression Root Mean Squared Error (RMSE): {svr_rmse}")
print(f"Support Vector Regression Mean Absolute Error (MAE): {svr_mae}")
print(f"Support Vector Regression R-squared (R2): {svr_r2}")

Support Vector Regression Mean Squared Error (MSE): 471854221825.2494
Support Vector Regression Root Mean Squared Error (RMSE): 686916.45913113
Support Vector Regression Mean Absolute Error (MAE): 439321.42664546106
Support Vector Regression R-squared (R2): -0.07196730896827774


In [27]:
# Calculate training and testing score 
training_score = svr_model.score(X_train_scaled, y_train)
testing_score = svr_model.score(X_test_scaled, y_test)

# Print the training and testing score
print(f"Training Score: {training_score}")
print(f"Testing Score: {testing_score}")

Training Score: -0.07406786808272514
Testing Score: -0.07196730896827774


## Lasso model

In [28]:
# Initialize the Lasso regression model
lasso_model = Lasso(alpha=1.0)  # You can adjust the alpha (regularization strength) as needed

# Train the model on the training data
lasso_model.fit(X_train_scaled, y_train)

# Predict on the testing data
y_pred_lasso = lasso_model.predict(X_test_scaled)

# Calculate evaluation metrics
lasso_mse = mean_squared_error(y_test, y_pred_lasso)
lasso_rmse = mean_squared_error(y_test, y_pred_lasso, squared=False)
lasso_mae = mean_absolute_error(y_test, y_pred_lasso)
lasso_r2 = r2_score(y_test, y_pred_lasso)

# Printing each metric using f-strings
print(f"Lasso Regression Mean Squared Error (MSE): {lasso_mse}")
print(f"Lasso Regression Root Mean Squared Error (RMSE): {lasso_rmse}")
print(f"Lasso Regression Mean Absolute Error (MAE): {lasso_mae}")
print(f"Lasso Regression R-squared (R2): {lasso_r2}")

Lasso Regression Mean Squared Error (MSE): 263304956508.411
Lasso Regression Root Mean Squared Error (RMSE): 513132.49410694215
Lasso Regression Mean Absolute Error (MAE): 343080.2682140865
Lasso Regression R-squared (R2): 0.4018188402034405


In [29]:
# Calculate training and testing score 
training_score = lasso_model.score(X_train_scaled, y_train)
testing_score = lasso_model.score(X_test_scaled, y_test)

# Print the training and testing score
print(f"Training Score: {training_score}")
print(f"Testing Score: {testing_score}")

Training Score: 0.44581946916021264
Testing Score: 0.4018188402034405


## Ridge model

In [30]:
# Initialize the Ridge regression model
ridge_model = Ridge(alpha=1.0)  # You can adjust the alpha (regularization strength) as needed

# Train the model on the training data
ridge_model.fit(X_train_scaled, y_train)

# Predict on the testing data
y_pred_ridge = ridge_model.predict(X_test_scaled)

# Calculate evaluation metrics
ridge_mse = mean_squared_error(y_test, y_pred_ridge)
ridge_rmse = mean_squared_error(y_test, y_pred_ridge, squared=False)
ridge_mae = mean_absolute_error(y_test, y_pred_ridge)
ridge_r2 = r2_score(y_test, y_pred_ridge)

# Printing each metric using f-strings
print(f"Ridge Regression Mean Squared Error (MSE): {ridge_mse}")
print(f"Ridge Regression Root Mean Squared Error (RMSE): {ridge_rmse}")
print(f"Ridge Regression Mean Absolute Error (MAE): {ridge_mae}")
print(f"Ridge Regression R-squared (R2): {ridge_r2}")

Ridge Regression Mean Squared Error (MSE): 263304075160.43375
Ridge Regression Root Mean Squared Error (RMSE): 513131.6353144033
Ridge Regression Mean Absolute Error (MAE): 343076.4018060183
Ridge Regression R-squared (R2): 0.4018208424663763


In [31]:
# Calculate training and testing score 
training_score = ridge_model.score(X_train_scaled, y_train)
testing_score = ridge_model.score(X_test_scaled, y_test)

# Print the training and testing score
print(f"Training Score: {training_score}")
print(f"Testing Score: {testing_score}")

Training Score: 0.44581946723851984
Testing Score: 0.4018208424663763


When selecting a machine learning model, we typically want to choose one that minimizes error metrics (like MSE, RMSE, and MAE) and maximizes the coefficient of determination (R-squared). R-squared is a statistical measure that represents the proportion of the variance for the dependent variable that's explained by the independent variables in a regression model.

## Model Summary Ranked by R-squared Value

### Tuned Random Forest Regression model using Randomized Search CV
- **R-squared**: 0.7022
- Lowest MSE, RMSE, and MAE among all models
- Training Score: 0.8305324959706591
- Testing Score: 0.7022104951937344

### Random Forest Regression model
- **R-squared**: 0.6797
- Slightly higher error metrics compared to the tuned models
- Training Score: 0.8866498631421715
- Testing Score: 0.6797164354602634

### Tuned Decision Tree Model using Randomized Search CV
- **R-squared**: 0.6785
- Lower error metrics compared to the non-tuned Decision Tree model
- Training Score: 0.7599996685189059
- Testing Score: 0.6785001102923228

### Gradient Boosting Regression model
- **R-squared**: 0.6666
- Lower error metrics than the Decision Tree and Linear Regression models
- Training Score: 0.7249732415475423
- Testing Score: 0.6665834077836079

### Decision Tree Regression model
- **R-squared**: 0.5823
- Training Score: 0.9059096553014386
- Testing Score: 0.5823126926026831

### Ridge model
- **R-squared**: 0.401
- Training Score: 0.44581946723851984
- Testing Score: 0.4018208424663763

### Lasso model
- **R-squared**: 0.4018
- Training Score: 0.44581946916021264
- Testing Score: 0.4018188402034405

### Linear Regression model
- **R-squared**: 0.4018
- Training Score: 0.4458194691739725
- Testing Score: 0.401818669508635

### Support Vector Regressor model
- **R-squared**: -0.0719 (negative R-squared indicates a very poor fit)
- Training Score: -0.07406786808272514
- Testing Score: -0.07196730896827774

    "Based on the above metrics, the **Tuned Random Forest Regression model** using Randomized Search CV is the best performing model. It has the highest testing score (R2) of 0.7022, indicating that it can explain approximately 70.22% of the variability in the target variable. It also has the lowest MSE and RMSE, suggesting better accuracy and lower errors in prediction.\n",
    "\n",
    "Therefore, the **Tuned Random Forest Regression model** is selected as the best model for predicting house prices in this scenario."

## Save and test the model

In [32]:
# Saving model
pickle.dump(best_rf_model, open('../Trained_Model/model.pkl','wb'))

In [33]:
# Create a dataframe
test_df = pd.DataFrame({
    "Suburb": [2],
    "Rooms": [2],
    "Type": [1],
    "Distance": [5],
    "Bathroom": [2],
    "Car": [2],
    "Region": [1],
})

# Create the index
index_ = ['0']

# Set the index
test_df.index = index_

In [34]:
# Loading model to compare the results
model = pickle.load(open('../Trained_Model/model.pkl','rb'))
print(model.predict(test_df))

[1034948.35806295]
