In [77]:
# Importing necessary libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error as mse
from sklearn.svm import SVR
import pickle

In [78]:
# Reading the cleaned dataset
cleaned_df = pd.read_csv('..\Cleaned_Data\Cleaned_Data_LogPrice.csv')
cleaned_df.head()

Unnamed: 0,Rooms,Type,Price,Distance,Bathroom,Car,Region,Log_Price
0,2,h,1480000.0,2.5,1.0,1.0,Northern Metropolitan,14.207553
1,2,h,1035000.0,2.5,1.0,0.0,Northern Metropolitan,13.849913
2,3,h,1465000.0,2.5,2.0,0.0,Northern Metropolitan,14.197366
3,3,h,850000.0,2.5,2.0,1.0,Northern Metropolitan,13.652993
4,4,h,1600000.0,2.5,1.0,2.0,Northern Metropolitan,14.285515


In [79]:
# Enumerate Type
encode = LabelEncoder().fit(cleaned_df['Type'])
carpet = {x: i for i, x in enumerate(encode.classes_)}
carpet

{'h': 0, 't': 1, 'u': 2}

In [80]:
# Enumerate Region
encoder = LabelEncoder().fit(cleaned_df['Region'])
carp = {x: i for i, x in enumerate(encoder.classes_)}
carp

{'Eastern Metropolitan': 0,
 'Eastern Victoria': 1,
 'Northern Metropolitan': 2,
 'Northern Victoria': 3,
 'South-Eastern Metropolitan': 4,
 'Southern Metropolitan': 5,
 'Western Metropolitan': 6,
 'Western Victoria': 7}

In [81]:
# Convert to numerical variable 
cleaned_df['Type'] = LabelEncoder().fit_transform(cleaned_df['Type'])
cleaned_df['Type']

0        0
1        0
2        0
3        0
4        0
        ..
17693    0
17694    0
17695    0
17696    1
17697    0
Name: Type, Length: 17698, dtype: int32

In [82]:
cleaned_df['Region'] = LabelEncoder().fit_transform(cleaned_df['Region'])
cleaned_df['Region']

0        2
1        2
2        2
3        2
4        2
        ..
17693    2
17694    6
17695    6
17696    6
17697    6
Name: Region, Length: 17698, dtype: int32

In [83]:
# Convert categorical data to numeric and separate target feature for training data
X = cleaned_df.drop(["Log_Price", 'Price'],  axis = 1)

y = cleaned_df['Price']

X

Unnamed: 0,Rooms,Type,Distance,Bathroom,Car,Region
0,2,0,2.5,1.0,1.0,2
1,2,0,2.5,1.0,0.0,2
2,3,0,2.5,2.0,0.0,2
3,3,0,2.5,2.0,1.0,2
4,4,0,2.5,1.0,2.0,2
...,...,...,...,...,...,...
17693,3,0,25.5,2.0,2.0,2
17694,4,0,6.3,1.0,3.0,6
17695,2,0,6.3,2.0,1.0,6
17696,2,1,6.3,1.0,2.0,6


In [84]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17698 entries, 0 to 17697
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Rooms     17698 non-null  int64  
 1   Type      17698 non-null  int32  
 2   Distance  17698 non-null  float64
 3   Bathroom  17698 non-null  float64
 4   Car       17698 non-null  float64
 5   Region    17698 non-null  int32  
dtypes: float64(3), int32(2), int64(1)
memory usage: 691.5 KB


In [85]:
# split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

#Scale data
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Data Modeling

## Linear Regression model

In [86]:
# Fit the Linear Regression model with data
model_lr = LinearRegression().fit(X_train_scaled, y_train)

In [87]:
# Calculate training and testing score 
training_score = model_lr.score(X_train_scaled, y_train)
testing_score = model_lr.score(X_test_scaled, y_test)

In [88]:
# Print the training and testing score
print(f"Model: Linear Regression")
print(f"Training Score: {training_score}")
print(f"Testing Score: {testing_score}")

Model: Linear Regression
Training Score: 0.4319944823019949
Testing Score: 0.3916199619836014


## Random Forest model

In [89]:
# Fit the Random Forest model with data
model_rf = RandomForestRegressor(n_estimators = 100, criterion = 'squared_error',
                              random_state = 42,
                             max_depth = 2).fit(X_train, y_train)

In [90]:
# Calculate the training and testing score
training_score = model_rf.score(X_train, y_train)
testing_score = model_rf.score(X_test, y_test)

In [91]:
# Print the training testing score
print(f"Model: Randomized Search CV")
print(f"Training Score: {training_score}")
print(f"Testing Score: {testing_score}")

Model: Randomized Search CV
Training Score: 0.344073849439102
Testing Score: 0.32116465134583405


## Decision Tree model

In [92]:
# Fit the Decision Tree model with data
model_tree = DecisionTreeRegressor(criterion='squared_error', 
                                  splitter='best', max_depth=None, 
                                  min_samples_split=2,min_samples_leaf=1, 
                                  min_weight_fraction_leaf=0.0,max_features=None, 
                                  random_state= 42, max_leaf_nodes=None, 
                                  min_impurity_decrease=0.0, ccp_alpha=0.0,).fit(X_train, y_train)

In [93]:
# Calculate training and testing score 
training_score = model_tree.score(X_train, y_train)
testing_score = model_tree.score(X_test, y_test)

In [94]:
# Print the training testing score
print(f"Model: Decision Tree Regressor")
print(f"Training Score: {training_score}")
print(f"Testing Score: {testing_score}")

Model: Decision Tree Regressor
Training Score: 0.8817107006845171
Testing Score: 0.570292985099475


## Randomized Search CV model

In [95]:
# Fit the Randomized Search model with data
param_dists = {'criterion' : ['squared_error', 'friedman_mse',], 
                       'max_depth': [3,4,7, None],
                        'min_samples_split':np.arange(0.1, 1.1, 0.1),
                        'min_samples_leaf' : list(range(1, 21)), 
                        'max_features' : ['sqrt', 'log2', None]}

model_cv = RandomizedSearchCV(estimator = RandomForestRegressor(random_state= 42), 
                              param_distributions = param_dists,  n_iter=200, 
                              scoring= 'neg_mean_squared_error',
                              cv=5, random_state= 42).fit(X_train_scaled, y_train)

In [96]:
# Calculate training and testing score 
training_score = model_cv.score(X_train_scaled, y_train)
testing_score = model_cv.score(X_test_scaled, y_test)

In [97]:
# Print the training testing score
print(f"Model: Randomized Search CV")
print(f"Training Score: {training_score}")
print(f"Testing Score: {testing_score}")

Model: Randomized Search CV
Training Score: -223743259738.23077
Testing Score: -244606256995.22253


## Support Vector Regressor model

In [98]:
# Fit the Support Vector Regressor model with data
model_svr = SVR(kernel = "rbf").fit(X_train_scaled, y_train)

In [99]:
# Calculate training and testing score 
training_score = model_svr.score(X_train_scaled, y_train)
testing_score = model_svr.score(X_test_scaled, y_test)

In [100]:
# Print the training testing score
print(f"Model: Support Vector Regressor")
print(f"Training Score: {training_score}")
print(f"Testing Score: {testing_score}")

Model: Support Vector Regressor
Training Score: -0.0739590262409262
Testing Score: -0.07186018130933003


## Lasso model

In [101]:
# Fit the Lasso model with data
model_lasso = Lasso(alpha =1.0 , max_iter = 1000).fit(X_train_scaled, y_train)

In [102]:
# Calculate training and testing score 
training_score = model_lasso.score(X_train_scaled, y_train)
testing_score = model_lasso.score(X_test_scaled, y_test)

In [103]:
# Print the training testing score
print(f"Model: Lasso Regression")
print(f"Training Score: {training_score}")
print(f"Testing Score: {testing_score}")

Model: Lasso Regression
Training Score: 0.43199448229025583
Testing Score: 0.39162012134173785


## Ridge model

In [104]:
# Fit the Ridge model with data
model_ridge = Ridge(alpha = 100).fit(X_train, y_train)

In [105]:
# Calculate training and testing score 
training_score = model_ridge.score(X_train, y_train)
testing_score = model_ridge.score(X_test, y_test)

In [106]:
# Print the training testing score
print(f"Model: Ridge Regression")
print(f"Training Score: {training_score}")
print(f"Testing Score: {testing_score}")

Model: Ridge Regression
Training Score: 0.43196974006724675
Testing Score: 0.3919629977245954


## Tuning Decision Tree Model

In [107]:
# Define the hyperparameters and their possible values
param_grid = {
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5],
    'max_features': [None, 'sqrt', 'log2'],
    'ccp_alpha': [0.0, 0.01, 0.1]
}

# Initialize the Decision Tree Regressor
model_tree = DecisionTreeRegressor(random_state=42)

# Use GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(model_tree, param_grid, cv=5, scoring='r2', n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# Get the best hyperparameters from the grid search
best_params = grid_search.best_params_
best_params

Fitting 5 folds for each of 405 candidates, totalling 2025 fits


{'ccp_alpha': 0.0,
 'max_depth': None,
 'max_features': None,
 'min_samples_leaf': 5,
 'min_samples_split': 2}

In [111]:
# Fit the Decision Tree model with data
model_tree = DecisionTreeRegressor(criterion='squared_error', 
                                  splitter='best', max_depth=11, 
                                  min_samples_split=2,min_samples_leaf=5, 
                                  min_weight_fraction_leaf=0.0,max_features=None, 
                                  random_state= 42, max_leaf_nodes=None, 
                                  min_impurity_decrease=0.0, ccp_alpha=0.0,).fit(X_train, y_train)

In [112]:
# Calculate training and testing score 
training_score = model_tree.score(X_train, y_train)
testing_score_tuned = model_tree.score(X_test, y_test)

In [113]:
# Print the training testing score
print(f"Model: Decision Tree Regressor")
print(f"Training Score: {training_score}")
print(f"Testing Score: {testing_score_tuned}")

Model: Decision Tree Regressor
Training Score: 0.7554414021233491
Testing Score: 0.6471961713517957


## Price Predictions

In [114]:
 # Predict the price with Linear Regression model
y_pred = model_lr.predict(X_test)
pd.DataFrame({"Prediction": y_pred, "Actual": y_test})



Unnamed: 0,Prediction,Actual
9681,-1.539581e+06,1060000.0
11658,-4.262983e+06,1005000.0
15500,-1.659321e+06,640000.0
1730,-1.319172e+06,1170000.0
10302,-4.375818e+05,501000.0
...,...,...
12717,-1.362070e+06,670000.0
9770,-1.911089e+06,830000.0
4983,-2.659126e+05,850000.0
9129,-7.308416e+05,2600000.0


In [115]:
# Predict the price with Random Forest model 
y_pred = model_rf.predict(X_test)
pd.DataFrame({"Prediction": y_pred, "Actual": y_test})

Unnamed: 0,Prediction,Actual
9681,1.201201e+06,1060000.0
11658,1.077926e+06,1005000.0
15500,1.077926e+06,640000.0
1730,6.090085e+05,1170000.0
10302,6.090085e+05,501000.0
...,...,...
12717,1.077926e+06,670000.0
9770,1.077926e+06,830000.0
4983,1.077926e+06,850000.0
9129,1.201201e+06,2600000.0


In [116]:
 # Predict the price with Decision Tree model 
y_pred = model_tree.predict(X_test)
pd.DataFrame({"Prediction": y_pred, "Actual": y_test})

Unnamed: 0,Prediction,Actual
9681,1.322240e+06,1060000.0
11658,7.740625e+05,1005000.0
15500,5.819167e+05,640000.0
1730,7.899286e+05,1170000.0
10302,6.141398e+05,501000.0
...,...,...
12717,7.606201e+05,670000.0
9770,6.696603e+05,830000.0
4983,9.883276e+05,850000.0
9129,1.699083e+06,2600000.0


In [117]:
 # Predict the price with Random Search CV model 
y_pred = model_cv.predict(X_test)
pd.DataFrame({"Prediction": y_pred, "Actual": y_test})



Unnamed: 0,Prediction,Actual
9681,1.564581e+06,1060000.0
11658,1.111903e+06,1005000.0
15500,1.625938e+06,640000.0
1730,1.076936e+06,1170000.0
10302,1.076936e+06,501000.0
...,...,...
12717,1.625938e+06,670000.0
9770,1.128146e+06,830000.0
4983,1.625938e+06,850000.0
9129,1.564581e+06,2600000.0


In [118]:
 # Predict the price with Support Vector Regressor model 
y_pred = model_svr.predict(X_test)
pd.DataFrame({"Prediction": y_pred, "Actual": y_test})



Unnamed: 0,Prediction,Actual
9681,915332.973372,1060000.0
11658,915332.973372,1005000.0
15500,915332.973372,640000.0
1730,915332.973337,1170000.0
10302,915332.818139,501000.0
...,...,...
12717,915332.973332,670000.0
9770,915332.973371,830000.0
4983,915332.509455,850000.0
9129,915332.973341,2600000.0


In [119]:
 # Predict the price with Lasso model 
y_pred = model_lasso.predict(X_test)
pd.DataFrame({"Prediction": y_pred, "Actual": y_test})



Unnamed: 0,Prediction,Actual
9681,-1.539562e+06,1060000.0
11658,-4.262950e+06,1005000.0
15500,-1.659308e+06,640000.0
1730,-1.319160e+06,1170000.0
10302,-4.375751e+05,501000.0
...,...,...
12717,-1.362055e+06,670000.0
9770,-1.911072e+06,830000.0
4983,-2.659038e+05,850000.0
9129,-7.308278e+05,2600000.0


In [120]:
 # Predict the price with Ridge model 
y_pred = model_ridge.predict(X_test)
pd.DataFrame({"Prediction": y_pred, "Actual": y_test})

Unnamed: 0,Prediction,Actual
9681,1.657678e+06,1060000.0
11658,4.705740e+05,1005000.0
15500,1.192144e+06,640000.0
1730,6.228607e+05,1170000.0
10302,5.730369e+05,501000.0
...,...,...
12717,1.185849e+06,670000.0
9770,8.790909e+05,830000.0
4983,1.317508e+06,850000.0
9129,1.853772e+06,2600000.0


## Save and test the model

In [121]:
# Saving model
pickle.dump(model_tree, open('../Trained_Model/model.pkl','wb'))

In [122]:
test_df = pd.DataFrame({
    "Rooms": [2],
    "Type": [1],
    "Distance": [5],
    "Bathroom": [2],
    "Car": [2],
    "Region": [1],

})

# Create the index
index_ = ['0']

# Set the index
test_df.index = index_

In [123]:
#Loading model to compare the results
model = pickle.load(open('../Trained_Model/model.pkl','rb'))
print(model.predict(test_df))

[722785.71428571]
