In [18]:
# Importing necessary libraries
import pickle

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

In [2]:
# Reading the cleaned dataset.
# Forward slashes keep this relative path portable across operating systems and
# avoid the invalid "\C" escape sequences of a backslash-separated string literal.
cleaned_df = pd.read_csv('../Cleaned_Data/Cleaned_Data_LogPrice.csv')
cleaned_df.head()

Unnamed: 0,Suburb,Rooms,Type,Price,Distance,Bathroom,Car,Region,Log_Price
0,Abbotsford,2,h,1480000.0,2.5,1.0,1.0,Northern Metropolitan,14.207553
1,Abbotsford,2,h,1035000.0,2.5,1.0,0.0,Northern Metropolitan,13.849913
2,Abbotsford,3,h,1465000.0,2.5,2.0,0.0,Northern Metropolitan,14.197366
3,Abbotsford,3,h,850000.0,2.5,2.0,1.0,Northern Metropolitan,13.652993
4,Abbotsford,4,h,1600000.0,2.5,1.0,2.0,Northern Metropolitan,14.285515


In [3]:
# Lookup table: property Type label -> integer code assigned by LabelEncoder
encode = LabelEncoder()
encode.fit(cleaned_df['Type'])
carpet = dict(zip(encode.classes_, range(len(encode.classes_))))
carpet

{'h': 0, 't': 1, 'u': 2}

In [4]:
# Lookup table: Suburb name -> integer code assigned by LabelEncoder
encoder = LabelEncoder()
encoder.fit(cleaned_df['Suburb'])
carp = dict(zip(encoder.classes_, range(len(encoder.classes_))))
carp

{'Abbotsford': 0,
 'Aberfeldie': 1,
 'Airport West': 2,
 'Albanvale': 3,
 'Albert Park': 4,
 'Albion': 5,
 'Alphington': 6,
 'Altona': 7,
 'Altona Meadows': 8,
 'Altona North': 9,
 'Ardeer': 10,
 'Armadale': 11,
 'Ascot Vale': 12,
 'Ashburton': 13,
 'Ashwood': 14,
 'Aspendale': 15,
 'Aspendale Gardens': 16,
 'Attwood': 17,
 'Avondale Heights': 18,
 'Bacchus Marsh': 19,
 'Balaclava': 20,
 'Balwyn': 21,
 'Balwyn North': 22,
 'Bayswater': 23,
 'Bayswater North': 24,
 'Beaconsfield': 25,
 'Beaconsfield Upper': 26,
 'Beaumaris': 27,
 'Bellfield': 28,
 'Bentleigh': 29,
 'Bentleigh East': 30,
 'Berwick': 31,
 'Black Rock': 32,
 'Blackburn': 33,
 'Blackburn North': 34,
 'Blackburn South': 35,
 'Bonbeach': 36,
 'Boronia': 37,
 'Botanic Ridge': 38,
 'Box Hill': 39,
 'Braybrook': 40,
 'Briar Hill': 41,
 'Brighton': 42,
 'Brighton East': 43,
 'Broadmeadows': 44,
 'Brookfield': 45,
 'Brooklyn': 46,
 'Brunswick': 47,
 'Brunswick East': 48,
 'Brunswick West': 49,
 'Bulleen': 50,
 'Bullengarook': 51,


In [5]:
# Lookup table: Region name -> integer code assigned by LabelEncoder.
# Note: this rebinds `encoder` and `carp`, replacing the Suburb versions.
encoder = LabelEncoder()
encoder.fit(cleaned_df['Region'])
carp = dict(zip(encoder.classes_, range(len(encoder.classes_))))
carp

{'Eastern Metropolitan': 0,
 'Eastern Victoria': 1,
 'Northern Metropolitan': 2,
 'Northern Victoria': 3,
 'South-Eastern Metropolitan': 4,
 'Southern Metropolitan': 5,
 'Western Metropolitan': 6,
 'Western Victoria': 7}

In [6]:
# Overwrite the Suburb column with its integer label codes
suburb_encoder = LabelEncoder()
cleaned_df['Suburb'] = suburb_encoder.fit_transform(cleaned_df['Suburb'])
cleaned_df['Suburb']

0          0
1          0
2          0
3          0
4          0
        ... 
17693    327
17694    332
17695    332
17696    332
17697    332
Name: Suburb, Length: 17698, dtype: int32

In [7]:
# Overwrite the Type column with its integer label codes
type_encoder = LabelEncoder()
cleaned_df['Type'] = type_encoder.fit_transform(cleaned_df['Type'])
cleaned_df['Type']

0        0
1        0
2        0
3        0
4        0
        ..
17693    0
17694    0
17695    0
17696    1
17697    0
Name: Type, Length: 17698, dtype: int32

In [8]:
# Overwrite the Region column with its integer label codes
region_encoder = LabelEncoder()
cleaned_df['Region'] = region_encoder.fit_transform(cleaned_df['Region'])
cleaned_df['Region']

0        2
1        2
2        2
3        2
4        2
        ..
17693    2
17694    6
17695    6
17696    6
17697    6
Name: Region, Length: 17698, dtype: int32

In [9]:
# Feature matrix: every column except the two price columns
X = cleaned_df.drop(columns=['Log_Price', 'Price'])

# Regression target: the Price column (not Log_Price)
y = cleaned_df['Price']

X

Unnamed: 0,Suburb,Rooms,Type,Distance,Bathroom,Car,Region
0,0,2,0,2.5,1.0,1.0,2
1,0,2,0,2.5,1.0,0.0,2
2,0,3,0,2.5,2.0,0.0,2
3,0,3,0,2.5,2.0,1.0,2
4,0,4,0,2.5,1.0,2.0,2
...,...,...,...,...,...,...,...
17693,327,3,0,25.5,2.0,2.0,2
17694,332,4,0,6.3,1.0,3.0,6
17695,332,2,0,6.3,2.0,1.0,6
17696,332,2,1,6.3,1.0,2.0,6


In [10]:
# Inspect the dtypes and non-null counts of the feature matrix
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17698 entries, 0 to 17697
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Suburb    17698 non-null  int32  
 1   Rooms     17698 non-null  int64  
 2   Type      17698 non-null  int32  
 3   Distance  17698 non-null  float64
 4   Bathroom  17698 non-null  float64
 5   Car       17698 non-null  float64
 6   Region    17698 non-null  int32  
dtypes: float64(3), int32(3), int64(1)
memory usage: 760.6 KB


In [11]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardise features with statistics learned from the training split only
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_test)
)

In [12]:
# Confirm the row/column counts of the scaled train and test arrays
X_train_scaled.shape, X_test_scaled.shape

((14158, 7), (3540, 7))

# Data Modeling

## Linear Regression model

In [None]:
# Train a Linear Regression model on the standardised training features
model_lr = LinearRegression().fit(X=X_train_scaled, y=y_train)

In [None]:
# score() on the training split and on the held-out split (scaled features)
training_score, testing_score = (
    model_lr.score(X_train_scaled, y_train),
    model_lr.score(X_test_scaled, y_test),
)

In [None]:
# Report the Linear Regression scores, one line each
print(
    "Model: Linear Regression",
    f"Training Score: {training_score}",
    f"Testing Score: {testing_score}",
    sep="\n",
)

In [19]:
# Fit a Linear Regression model and evaluate it on the held-out test split
lr_model = LinearRegression()

# Train the model on the standardised training data
lr_model.fit(X_train_scaled, y_train)

# Predict on the standardised testing data
y_pred_lr = lr_model.predict(X_test_scaled)

# Calculate evaluation metrics
lr_mse = mean_squared_error(y_test, y_pred_lr)
# RMSE is derived from the MSE directly: the `squared=False` argument of
# mean_squared_error is deprecated (scikit-learn 1.4) and removed in later releases.
lr_rmse = float(np.sqrt(lr_mse))
lr_mae = mean_absolute_error(y_test, y_pred_lr)
lr_r2 = r2_score(y_test, y_pred_lr)

lr_mse, lr_rmse, lr_mae, lr_r2

(263305031644.15808, 513132.5673197503, 343080.5753475964, 0.401818669508635)

## Random Forest model

In [None]:
# Train a random forest (100 trees, depth capped at 2) on the unscaled features
rf_settings = {
    'n_estimators': 100,
    'criterion': 'squared_error',
    'random_state': 42,
    'max_depth': 2,
}
model_rf = RandomForestRegressor(**rf_settings).fit(X_train, y_train)

In [None]:
# score() on the training split and on the held-out split (unscaled features)
training_score, testing_score = (
    model_rf.score(X_train, y_train),
    model_rf.score(X_test, y_test),
)

In [None]:
# Print the training and testing score.
# The label names the model actually scored in this section (the Random Forest),
# not the Randomized Search CV model that is fitted later.
print("Model: Random Forest Regressor")
print(f"Training Score: {training_score}")
print(f"Testing Score: {testing_score}")

## Decision Tree model

In [None]:
# Train an unconstrained Decision Tree on the unscaled features.
# Every constructor argument is spelled out explicitly, as in the original cell.
tree_settings = {
    'criterion': 'squared_error',
    'splitter': 'best',
    'max_depth': None,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'min_weight_fraction_leaf': 0.0,
    'max_features': None,
    'random_state': 42,
    'max_leaf_nodes': None,
    'min_impurity_decrease': 0.0,
    'ccp_alpha': 0.0,
}
model_tree = DecisionTreeRegressor(**tree_settings).fit(X_train, y_train)

In [None]:
# score() on the training split and on the held-out split (unscaled features)
training_score, testing_score = (
    model_tree.score(X_train, y_train),
    model_tree.score(X_test, y_test),
)

In [None]:
# Report the Decision Tree scores, one line each
print(
    "Model: Decision Tree Regressor",
    f"Training Score: {training_score}",
    f"Testing Score: {testing_score}",
    sep="\n",
)

## Randomized Search CV model

In [None]:
# Hyperparameter search space for the random forest
param_dists = {
    'criterion': ['squared_error', 'friedman_mse'],
    'max_depth': [3, 4, 7, None],
    # Fractions of the samples required to split a node: 0.1, 0.2, ..., 1.0.
    # linspace pins both endpoints exactly; a float-step arange can drift past
    # 1.0, which scikit-learn rejects as an invalid fraction.
    'min_samples_split': np.linspace(0.1, 1.0, 10),
    'min_samples_leaf': list(range(1, 21)),
    'max_features': ['sqrt', 'log2', None],
}

# Randomized search: 200 sampled candidates x 5 folds, ranked by negative MSE.
# n_jobs=-1 spreads the fits over all CPU cores (same setting as the grid search
# in the Decision Tree tuning section).
model_cv = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_distributions=param_dists,
    n_iter=200,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    random_state=42,
).fit(X_train_scaled, y_train)

In [None]:
# Calculate training and testing score.
# model_cv.score() would apply the search's own scorer (negative MSE); scoring the
# refit best estimator instead yields the regressor's default score(), the same
# metric reported for every other model in this notebook.
best_forest = model_cv.best_estimator_
training_score = best_forest.score(X_train_scaled, y_train)
testing_score = best_forest.score(X_test_scaled, y_test)

In [None]:
# Report the Randomized Search CV scores, one line each
print(
    "Model: Randomized Search CV",
    f"Training Score: {training_score}",
    f"Testing Score: {testing_score}",
    sep="\n",
)

## Support Vector Regressor model

In [None]:
# Train an RBF-kernel Support Vector Regressor on the standardised features
model_svr = SVR(kernel="rbf").fit(X=X_train_scaled, y=y_train)

In [None]:
# score() on the training split and on the held-out split (scaled features)
training_score, testing_score = (
    model_svr.score(X_train_scaled, y_train),
    model_svr.score(X_test_scaled, y_test),
)

In [None]:
# Report the Support Vector Regressor scores, one line each
print(
    "Model: Support Vector Regressor",
    f"Training Score: {training_score}",
    f"Testing Score: {testing_score}",
    sep="\n",
)

## Lasso model

In [None]:
# Train a Lasso model (alpha 1.0, at most 1000 iterations) on the standardised features
lasso_settings = {'alpha': 1.0, 'max_iter': 1000}
model_lasso = Lasso(**lasso_settings).fit(X_train_scaled, y_train)

In [None]:
# score() on the training split and on the held-out split (scaled features)
training_score, testing_score = (
    model_lasso.score(X_train_scaled, y_train),
    model_lasso.score(X_test_scaled, y_test),
)

In [None]:
# Report the Lasso scores, one line each
print(
    "Model: Lasso Regression",
    f"Training Score: {training_score}",
    f"Testing Score: {testing_score}",
    sep="\n",
)

## Ridge model

In [None]:
# Fit the Ridge model with data.
# The L2 penalty depends on feature scale, so the features are standardised inside a
# pipeline, as is done for the other linear models in this notebook. Because the
# scaler travels with the model, callers keep passing the unscaled X_train / X_test
# to score() and predict().
model_ridge = make_pipeline(StandardScaler(), Ridge(alpha=100)).fit(X_train, y_train)

In [None]:
# score() on the training split and on the held-out split (X_train / X_test as passed to fit)
training_score, testing_score = (
    model_ridge.score(X_train, y_train),
    model_ridge.score(X_test, y_test),
)

In [None]:
# Report the Ridge scores, one line each
print(
    "Model: Ridge Regression",
    f"Training Score: {training_score}",
    f"Testing Score: {testing_score}",
    sep="\n",
)

## Tuning Decision Tree Model

In [None]:
# Candidate values for each Decision Tree hyperparameter
param_grid = dict(
    max_depth=[None, 5, 10, 15, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 5],
    max_features=[None, 'sqrt', 'log2'],
    ccp_alpha=[0.0, 0.01, 0.1],
)

# Base estimator for the search (note: this rebinds `model_tree` to an unfitted tree)
model_tree = DecisionTreeRegressor(random_state=42)

# Exhaustive 5-fold search over the grid, ranked by R^2, using all CPU cores
grid_search = GridSearchCV(
    estimator=model_tree,
    param_grid=param_grid,
    scoring='r2',
    n_jobs=-1,
    cv=5,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Best hyperparameter combination found by the search
best_params = grid_search.best_params_
best_params

In [None]:
# Refit the Decision Tree with max_depth=15 and min_samples_leaf=5.
# NOTE(review): these values presumably come from the grid search output — confirm.
tuned_tree_settings = {
    'criterion': 'squared_error',
    'splitter': 'best',
    'max_depth': 15,
    'min_samples_split': 2,
    'min_samples_leaf': 5,
    'min_weight_fraction_leaf': 0.0,
    'max_features': None,
    'random_state': 42,
    'max_leaf_nodes': None,
    'min_impurity_decrease': 0.0,
    'ccp_alpha': 0.0,
}
model_tree = DecisionTreeRegressor(**tuned_tree_settings).fit(X_train, y_train)

In [None]:
# score() of the tuned tree on the training split and on the held-out split
training_score, testing_score_tuned = (
    model_tree.score(X_train, y_train),
    model_tree.score(X_test, y_test),
)

In [None]:
# Report the tuned Decision Tree scores, one line each
print(
    "Model: Decision Tree Regressor",
    f"Training Score: {training_score}",
    f"Testing Score: {testing_score_tuned}",
    sep="\n",
)

## Price Predictions

In [None]:
 # Predict the price with Linear Regression model
# model_lr was fitted on the standardised features, so it must predict on
# X_test_scaled; feeding it the unscaled X_test produces meaningless prices.
y_pred = model_lr.predict(X_test_scaled)
pd.DataFrame({"Prediction": y_pred, "Actual": y_test})

In [None]:
# Random Forest predictions next to the actual test prices (unscaled features, as fitted)
y_pred = model_rf.predict(X_test)
pd.DataFrame(
    data={
        "Prediction": y_pred,
        "Actual": y_test,
    }
)

In [None]:
 # Predict the price with Decision Tree model 
# Decision Tree predictions next to the actual test prices (unscaled features, as fitted)
y_pred = model_tree.predict(X_test)
pd.DataFrame(
    data={
        "Prediction": y_pred,
        "Actual": y_test,
    }
)

In [None]:
 # Predict the price with Random Search CV model 
# model_cv was fitted on the standardised features, so it must predict on
# X_test_scaled rather than on the unscaled X_test.
y_pred = model_cv.predict(X_test_scaled)
pd.DataFrame({"Prediction": y_pred, "Actual": y_test})

In [None]:
 # Predict the price with Support Vector Regressor model 
# model_svr was fitted on the standardised features, so it must predict on
# X_test_scaled rather than on the unscaled X_test.
y_pred = model_svr.predict(X_test_scaled)
pd.DataFrame({"Prediction": y_pred, "Actual": y_test})

In [None]:
 # Predict the price with Lasso model 
# model_lasso was fitted on the standardised features, so it must predict on
# X_test_scaled rather than on the unscaled X_test.
y_pred = model_lasso.predict(X_test_scaled)
pd.DataFrame({"Prediction": y_pred, "Actual": y_test})

In [None]:
 # Predict the price with Ridge model 
# Ridge predictions next to the actual test prices (same X_test form the model was fitted with)
y_pred = model_ridge.predict(X_test)
pd.DataFrame(
    data={
        "Prediction": y_pred,
        "Actual": y_test,
    }
)

## Save and test the model

In [None]:
# Saving model.
# The context manager guarantees the file is flushed and closed even if
# pickling fails, instead of leaking an open handle.
with open('../Trained_Model/model.pkl', 'wb') as model_file:
    pickle.dump(model_tree, model_file)

In [None]:
# Single-row smoke-test input for the saved model.
# The model is trained on seven columns (Suburb, Rooms, Type, Distance, Bathroom,
# Car, Region), so the frame must supply all of them in that same order; without
# Suburb, predict() rejects the input.
test_df = pd.DataFrame({
    "Suburb": [0],    # encoded suburb (0 is 'Abbotsford' in the mapping shown earlier)
    "Rooms": [2],
    "Type": [1],      # encoded property type
    "Distance": [5],
    "Bathroom": [2],
    "Car": [2],
    "Region": [1],    # encoded region
})

# Create the index
index_ = ['0']

# Set the index
test_df.index = index_

In [None]:
# Loading model to compare the results.
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
# The context manager closes the file handle once the model has been read.
with open('../Trained_Model/model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)
print(model.predict(test_df))