In [7]:
# Importing necessary libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
import pickle
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")
import matplotlib.pyplot as plt

In [8]:
# Reading the cleaned dataset
cleaned_df = pd.read_csv('..\Cleaned_Data\Cleaned_Data_LogPrice.csv')
cleaned_df.head()

Unnamed: 0,Suburb,Rooms,Type,Price,Distance,Bathroom,Car,Region,Log_Price
0,0,2,0,1480000.0,2.5,1.0,1.0,2,14.207553
1,0,2,0,1035000.0,2.5,1.0,0.0,2,13.849913
2,0,3,0,1465000.0,2.5,2.0,0.0,2,14.197366
3,0,3,0,850000.0,2.5,2.0,1.0,2,13.652993
4,0,4,0,1600000.0,2.5,1.0,2.0,2,14.285515


In [9]:
# Separate the features and target variable
X = cleaned_df.drop(['Price', 'Log_Price'], axis=1)  # Drop the original Price and Log_Price to avoid data leakage
y = cleaned_df['Log_Price']  # Target variable

# Display the first few rows of features and target
(X.head(), y.head())

(   Suburb  Rooms  Type  Distance  Bathroom  Car  Region
 0       0      2     0       2.5       1.0  1.0       2
 1       0      2     0       2.5       1.0  0.0       2
 2       0      3     0       2.5       2.0  0.0       2
 3       0      3     0       2.5       2.0  1.0       2
 4       0      4     0       2.5       1.0  2.0       2,
 0    14.207553
 1    13.849913
 2    14.197366
 3    13.652993
 4    14.285515
 Name: Log_Price, dtype: float64)

In [10]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Scale data
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [11]:
# Shape data
X_train_scaled.shape, X_test_scaled.shape

((14158, 7), (3540, 7))

# Data Modeling

## Linear Regression model

In [12]:
# Initialize the Linear Regression model
lr_model = LinearRegression()

# Train the model on the training data
lr_model.fit(X_train_scaled, y_train)

# Predict on the testing data
y_pred_lr = lr_model.predict(X_test_scaled)

# Calculate evaluation metrics
lr_mse = mean_squared_error(y_test, y_pred_lr)
lr_rmse = mean_squared_error(y_test, y_pred_lr, squared=False)
lr_mae = mean_absolute_error(y_test, y_pred_lr)  # Fix: Use mean_absolute_error
lr_r2 = r2_score(y_test, y_pred_lr)

# Printing each metric individually using f-strings
print(f"Linear Regression Mean Squared Error (MSE): {lr_mse}")
print(f"Linear Regression Root Mean Squared Error (RMSE): {lr_rmse}")
print(f"Linear Regression Mean Absolute Error (MAE): {lr_mae}")
print(f"Linear Regression R-squared (R2): {lr_r2}")

Linear Regression Mean Squared Error (MSE): 0.12703516990195998
Linear Regression Root Mean Squared Error (RMSE): 0.35641993477071393
Linear Regression Mean Absolute Error (MAE): 0.2831437864815163
Linear Regression R-squared (R2): 0.5187003639443211


In [13]:
# Calculate training and testing score 
training_score = lr_model.score(X_train_scaled, y_train)
testing_score = lr_model.score(X_test_scaled, y_test)

# Print the training and testing score
print(f"Training Score: {training_score}")
print(f"Testing Score: {testing_score}")

Training Score: 0.5390607601560405
Testing Score: 0.5187003639443211


## Gradient Boosting Regression model

In [14]:
# Initialize the Gradient Boosting Regression model
gb_model = GradientBoostingRegressor(random_state=42)

# Train the model on the training data
gb_model.fit(X_train_scaled, y_train)

# Predict on the testing data using Gradient Boosting Regressor
y_pred_gb = gb_model.predict(X_test_scaled)

# Calculate evaluation metrics
gb_mse = mean_squared_error(y_test, y_pred_gb)
gb_rmse = mean_squared_error(y_test, y_pred_gb, squared=False)
gb_mae = mean_absolute_error(y_test, y_pred_gb)
gb_r2 = r2_score(y_test, y_pred_gb)

# Print each metric using f-strings
print(f"Gradient Boosting Mean Squared Error (MSE): {gb_mse}")
print(f"Gradient Boosting Root Mean Squared Error (RMSE): {gb_rmse}")
print(f"Gradient Boosting Mean Absolute Error (MAE): {gb_mae}")
print(f"Gradient Boosting R-squared (R2): {gb_r2}")

Gradient Boosting Mean Squared Error (MSE): 0.059622014723633314
Gradient Boosting Root Mean Squared Error (RMSE): 0.2441761960626656
Gradient Boosting Mean Absolute Error (MAE): 0.1856554837009077
Gradient Boosting R-squared (R2): 0.7741093745178019


In [15]:
# Calculate training and testing score 
training_score = gb_model.score(X_train_scaled, y_train)
testing_score = gb_model.score(X_test_scaled, y_test)

# Print the training and testing score
print(f"Training Score: {training_score}")
print(f"Testing Score: {testing_score}")

Training Score: 0.7960610138289723
Testing Score: 0.7741093745178019


## Random Forest Regression model

In [16]:
# Initialize the Random Forest Regression model
rf_model = RandomForestRegressor(random_state=42, n_jobs=-1)

# Train the model on the training data
rf_model.fit(X_train_scaled, y_train)

# Predict on the testing data
y_pred_rf = rf_model.predict(X_test_scaled)

# Calculate evaluation metrics
rf_mse = mean_squared_error(y_test, y_pred_rf)
rf_rmse = mean_squared_error(y_test, y_pred_rf, squared=False)
rf_mae = mean_absolute_error(y_test, y_pred_rf)
rf_r2 = r2_score(y_test, y_pred_rf)

# Print each metric using f-strings
print(f"Random Forest Mean Squared Error (MSE): {rf_mse}")
print(f"Random Forest Root Mean Squared Error (RMSE): {rf_rmse}")
print(f"Random Forest Mean Absolute Error (MAE): {rf_mae}")
print(f"Random Forest R-squared (R2): {rf_r2}")

Random Forest Mean Squared Error (MSE): 0.05561126649683585
Random Forest Root Mean Squared Error (RMSE): 0.23582041153563416
Random Forest Mean Absolute Error (MAE): 0.17514881786467343
Random Forest R-squared (R2): 0.7893049432989383


In [17]:
# Calculate training and testing score 
training_score = rf_model.score(X_train_scaled, y_train)
testing_score = rf_model.score(X_test_scaled, y_test)

# Print the training and testing score
print(f"Training Score: {training_score}")
print(f"Testing Score: {testing_score}")

Training Score: 0.9114944840327691
Testing Score: 0.7893049432989383


## Tuned Random Forest Regression model using Randomized Search CV

In [18]:
# Define the parameter grid for the RandomForestRegressor
param_dist = {
    'n_estimators': [10, 50, 100, 200, 500],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': range(2, 11),
    'min_samples_leaf': range(1, 11)
}

# Initialize the RandomForestRegressor model
rf_model_t = RandomForestRegressor(random_state=42, n_jobs=-1)

# Set up Randomized Search CV
random_search = RandomizedSearchCV(rf_model_t, param_distributions=param_dist, n_iter=100, 
                                   scoring='neg_mean_squared_error', n_jobs=-1, cv=5, random_state=42)

# Train the model on the training data using Randomized Search
random_search.fit(X_train_scaled, y_train)

# Retrieve the best estimator
best_rf_model = random_search.best_estimator_

# Predict on the testing data using the best estimator
y_pred_rf = best_rf_model.predict(X_test_scaled)

# Calculate evaluation metrics
rf_mse_t = mean_squared_error(y_test, y_pred_rf)
rf_rmse_t = mean_squared_error(y_test, y_pred_rf, squared=False)
rf_mae_t = mean_absolute_error(y_test, y_pred_rf)
rf_r2_t = r2_score(y_test, y_pred_rf)

# Print each metric using f-strings
print(f"Tuned Random Forest Mean Squared Error (MSE): {rf_mse_t}")
print(f"Tuned Random Forest Root Mean Squared Error (RMSE): {rf_rmse_t}")
print(f"Tuned Random Forest Mean Absolute Error (MAE): {rf_mae_t}")
print(f"Tuned Random Forest R-squared (R2): {rf_r2_t}")

Tuned Random Forest Mean Squared Error (MSE): 0.05231951273435163
Tuned Random Forest Root Mean Squared Error (RMSE): 0.22873459015713304
Tuned Random Forest Mean Absolute Error (MAE): 0.17046033421544568
Tuned Random Forest R-squared (R2): 0.8017764493321988


In [19]:
# Calculate training and testing score 
training_score = best_rf_model.score(X_train_scaled, y_train)
testing_score = best_rf_model.score(X_test_scaled, y_test)

# Print the training and testing score
print(f"Training Score: {training_score}")
print(f"Testing Score: {testing_score}")

Training Score: 0.8838822221415641
Testing Score: 0.8017764493321988


## Decision Tree model

In [20]:
# Initialize the Decision Tree Regression model
dt_model = DecisionTreeRegressor(random_state=42)

# Train the model on the training data
dt_model.fit(X_train_scaled, y_train)

# Predict on the testing data
y_pred_dt = dt_model.predict(X_test_scaled)

# Calculate evaluation metrics
dt_mse = mean_squared_error(y_test, y_pred_dt)
dt_rmse = mean_squared_error(y_test, y_pred_dt, squared=False)
dt_mae = mean_absolute_error(y_test, y_pred_dt)
dt_r2 = r2_score(y_test, y_pred_dt)

# Print each metric using f-strings
print(f"Decision Tree Mean Squared Error (MSE): {dt_mse}")
print(f"Decision Tree Root Mean Squared Error (RMSE): {dt_rmse}")
print(f"Decision Tree Mean Absolute Error (MAE): {dt_mae}")
print(f"Decision Tree R-squared (R2): {dt_r2}")

Decision Tree Mean Squared Error (MSE): 0.06934985358839214
Decision Tree Root Mean Squared Error (RMSE): 0.26334360365953857
Decision Tree Mean Absolute Error (MAE): 0.19150419924915607
Decision Tree R-squared (R2): 0.7372533974775062


In [21]:
# Calculate training and testing score 
training_score = dt_model.score(X_train_scaled, y_train)
testing_score = dt_model.score(X_test_scaled, y_test)

# Print the training and testing score
print(f"Training Score: {training_score}")
print(f"Testing Score: {testing_score}")

Training Score: 0.9223930916406637
Testing Score: 0.7372533974775062


## Tuned Decision Tree Model using Randomized Search CV 

In [22]:
# Define the parameter grid for the Decision Tree
param_dist = {
    "max_depth": [None, 10, 20, 30, 40, 50],
    "min_samples_split": range(2, 11),
    "min_samples_leaf": range(1, 11),
    "criterion": ["squared_error", "friedman_mse", "absolute_error"]  # Updated the values
}

# Initialize the Decision Tree Regression model
dt_model = DecisionTreeRegressor(random_state=42)

# Set up Randomized Search CV
random_search = RandomizedSearchCV(dt_model, param_distributions=param_dist, n_iter=100, 
                                   scoring='neg_mean_squared_error', n_jobs=-1, cv=5, random_state=42)

# Train the model on the training data using Randomized Search
random_search.fit(X_train_scaled, y_train)

# Retrieve the best estimator
best_dt_model = random_search.best_estimator_

# Predict on the testing data using the best estimator
y_pred_dt = best_dt_model.predict(X_test_scaled)

# Calculate evaluation metrics
dt_mse_t = mean_squared_error(y_test, y_pred_dt)
dt_rmse_t = mean_squared_error(y_test, y_pred_dt, squared=False)
dt_mae_t = mean_absolute_error(y_test, y_pred_dt)
dt_r2_t = r2_score(y_test, y_pred_dt)

# Print each metric using f-strings
print(f"Tuned Decision Tree Mean Squared Error (MSE): {dt_mse_t}")
print(f"Tuned Decision Tree Root Mean Squared Error (RMSE): {dt_rmse_t}")
print(f"Tuned Decision Tree Mean Absolute Error (MAE): {dt_mae_t}")
print(f"Tuned Decision Tree R-squared (R2): {dt_r2_t}")

Tuned Decision Tree Mean Squared Error (MSE): 0.05813894562082217
Tuned Decision Tree Root Mean Squared Error (RMSE): 0.24112018916055572
Tuned Decision Tree Mean Absolute Error (MAE): 0.17969697817548377
Tuned Decision Tree R-squared (R2): 0.7797282957974705


In [23]:
# Calculate training and testing score 
training_score = best_dt_model.score(X_train_scaled, y_train)
testing_score = best_dt_model.score(X_test_scaled, y_test)

# Print the training and testing score
print(f"Training Score: {training_score}")
print(f"Testing Score: {testing_score}")

Training Score: 0.8461008204542811
Testing Score: 0.7797282957974705


## Support Vector Regressor model

In [24]:
# Initialize the Support Vector Regressor model
svr_model = SVR()

# Train the model on the training data
svr_model.fit(X_train_scaled, y_train)

# Predict on the testing data
y_pred_svr = svr_model.predict(X_test_scaled)

# Calculate evaluation metrics
svr_mse = mean_squared_error(y_test, y_pred_svr)
svr_rmse = mean_squared_error(y_test, y_pred_svr, squared=False)
svr_mae = mean_absolute_error(y_test, y_pred_svr)
svr_r2 = r2_score(y_test, y_pred_svr)

# Print each metric using f-strings
print(f"Support Vector Regression Mean Squared Error (MSE): {svr_mse}")
print(f"Support Vector Regression Root Mean Squared Error (RMSE): {svr_rmse}")
print(f"Support Vector Regression Mean Absolute Error (MAE): {svr_mae}")
print(f"Support Vector Regression R-squared (R2): {svr_r2}")

Support Vector Regression Mean Squared Error (MSE): 0.06496672807932648
Support Vector Regression Root Mean Squared Error (RMSE): 0.2548857157224125
Support Vector Regression Mean Absolute Error (MAE): 0.19264943781122293
Support Vector Regression R-squared (R2): 0.7538597964292906


In [25]:
# Calculate training and testing score 
training_score = svr_model.score(X_train_scaled, y_train)
testing_score = svr_model.score(X_test_scaled, y_test)

# Print the training and testing score
print(f"Training Score: {training_score}")
print(f"Testing Score: {testing_score}")

Training Score: 0.7858616756356992
Testing Score: 0.7538597964292906


## Lasso model

In [26]:
# Initialize the Lasso regression model
lasso_model = Lasso(alpha=1.0)  # You can adjust the alpha (regularization strength) as needed

# Train the model on the training data
lasso_model.fit(X_train_scaled, y_train)

# Predict on the testing data
y_pred_lasso = lasso_model.predict(X_test_scaled)

# Calculate evaluation metrics
lasso_mse = mean_squared_error(y_test, y_pred_lasso)
lasso_rmse = mean_squared_error(y_test, y_pred_lasso, squared=False)
lasso_mae = mean_absolute_error(y_test, y_pred_lasso)
lasso_r2 = r2_score(y_test, y_pred_lasso)

# Printing each metric using f-strings
print(f"Lasso Regression Mean Squared Error (MSE): {lasso_mse}")
print(f"Lasso Regression Root Mean Squared Error (RMSE): {lasso_rmse}")
print(f"Lasso Regression Mean Absolute Error (MAE): {lasso_mae}")
print(f"Lasso Regression R-squared (R2): {lasso_r2}")

Lasso Regression Mean Squared Error (MSE): 0.2639477526711215
Lasso Regression Root Mean Squared Error (RMSE): 0.513758457517851
Lasso Regression Mean Absolute Error (MAE): 0.4102779011976301
Lasso Regression R-squared (R2): -2.1941926533486367e-05


In [27]:
# Calculate training and testing score 
training_score = lasso_model.score(X_train_scaled, y_train)
testing_score = lasso_model.score(X_test_scaled, y_test)

# Print the training and testing score
print(f"Training Score: {training_score}")
print(f"Testing Score: {testing_score}")

Training Score: 0.0
Testing Score: -2.1941926533486367e-05


## Ridge model

In [28]:
# Initialize the Ridge regression model
ridge_model = Ridge(alpha=1.0)  # You can adjust the alpha (regularization strength) as needed

# Train the model on the training data
ridge_model.fit(X_train_scaled, y_train)

# Predict on the testing data
y_pred_ridge = ridge_model.predict(X_test_scaled)

# Calculate evaluation metrics
ridge_mse = mean_squared_error(y_test, y_pred_ridge)
ridge_rmse = mean_squared_error(y_test, y_pred_ridge, squared=False)
ridge_mae = mean_absolute_error(y_test, y_pred_ridge)
ridge_r2 = r2_score(y_test, y_pred_ridge)

# Printing each metric using f-strings
print(f"Ridge Regression Mean Squared Error (MSE): {ridge_mse}")
print(f"Ridge Regression Root Mean Squared Error (RMSE): {ridge_rmse}")
print(f"Ridge Regression Mean Absolute Error (MAE): {ridge_mae}")
print(f"Ridge Regression R-squared (R2): {ridge_r2}")

Ridge Regression Mean Squared Error (MSE): 0.1270346652600287
Ridge Regression Root Mean Squared Error (RMSE): 0.35641922683832405
Ridge Regression Mean Absolute Error (MAE): 0.2831430491646807
Ridge Regression R-squared (R2): 0.5187022758871165


In [29]:
# Calculate training and testing score 
training_score = ridge_model.score(X_train_scaled, y_train)
testing_score = ridge_model.score(X_test_scaled, y_test)

# Print the training and testing score
print(f"Training Score: {training_score}")
print(f"Testing Score: {testing_score}")

Training Score: 0.5390607576578176
Testing Score: 0.5187022758871165


When selecting a machine learning model, we typically want to choose one that minimizes error metrics (like MSE, RMSE, and MAE) and maximizes the coefficient of determination (R-squared). R-squared is a statistical measure that represents the proportion of the variance for the dependent variable that's explained by the independent variables in a regression model.

## Model Summary Ranked by R-squared Value

1. **Tuned Random Forest Regression model using Randomized Search CV**
      - **R2**: 0.8017764493321988
      - **MSE**: 0.05231951273435163
      - **RMSE**: 0.22873459015713304
      - **MAE**: 0.17046033421544568
   
2. **Random Forest Regression model**
      - **R2**: 0.7893049432989383
      - **MSE**: 0.05561126649683585
      - **RMSE**: 0.23582041153563416
      - **MAE**: 0.17514881786467343
   
3. **Tuned Decision Tree Model using Randomized Search CV**
      - **R2**: 0.7797282957974705
      - **MSE**: 0.05813894562082217
      - **RMSE**: 0.24112018916055572
      - **MAE**: 0.17969697817548377
   
4. **Gradient Boosting Regression model**
      - **R2**: 0.7741093745178019
      - **MSE**: 0.059622014723633314
      - **RMSE**: 0.2441761960626656
      - **MAE**: 0.1856554837009077
   
5. **Decision Tree Regression model**
      - **R2**: 0.7372533974775062
      - **MSE**: 0.06934985358839214
      - **RMSE**: 0.26334360365953857
      - **MAE**: 0.19150419924915607
   
6. **Support Vector Regressor model**
      - **R2**: 0.7538597964292906
      - **MSE**: 0.06496672807932648
      - **RMSE**: 0.2548857157224125
      - **MAE**: 0.19264943781122293
   
7. **Ridge model**
      - **R2**: 0.5187022758871165
      - **MSE**: 0.1270346652600287
      - **RMSE**: 0.35641922683832405
      - **MAE**: 0.2831430491646807
   
8. **Linear Regression model**
      - **R2**: 0.5187003639443211
      - **MSE**: 0.12703516990195998
      - **RMSE**: 0.35641993477071393
      - **MAE**: 0.2831437864815163
   
9. **Lasso model**
      - **R2**: -2.1941926533486367e-05
      - **MSE**: 0.2639477526711215
      - **RMSE**: 0.513758457517851
      - **MAE**: 0.4102779011976301

    Based on the above metrics, the **Tuned Random Forest Regression model** using Randomized Search CV is the best performing model. It has the highest testing score (R2) of 0.8017764493321988, indicating that it can explain approximately 80.17% of the variability in the target variable. It also has the lowest MSE and RMSE, suggesting better accuracy and lower errors in prediction price. 
    
    Therefore, the **Tuned Random Forest Regression model** is selected as the best model for predicting house prices in this scenario.

## Save and test the model

In [30]:
# Saving model
pickle.dump(best_rf_model, open('../Trained_Model/model.pkl','wb'))

In [31]:
# Create a dataframe
test_df = pd.DataFrame({
    "Suburb": [2],
    "Rooms": [2],
    "Type": [1],
    "Distance": [5],
    "Bathroom": [2],
    "Car": [2],
    "Region": [1],
})

# Create the index
index_ = ['0']

# Set the index
test_df.index = index_

In [32]:
# Loading model to compare the results
model = pickle.load(open('../Trained_Model/model.pkl','rb'))
print(model.predict(test_df))

[13.5616448]


In [33]:
# Load the trained model from a pickle file
with open('../Trained_Model/model.pkl', 'rb') as file:
    model = pickle.load(file)

# Use the model to predict Log_Price
log_price_predictions = model.predict(test_df)

# Convert the log price predictions back to actual prices
price_predictions = np.exp(log_price_predictions)
print(price_predictions)

[775795.93596053]
