In [1]:
import numpy as np
import pandas as pd

In [2]:
#Reading the cleaned CSV data set file into a DataFrame
df = pd.read_csv('save_cleaned.csv')

In [3]:
#Summary of the columns, their non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72129 entries, 0 to 72128
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Type                  72129 non-null  object 
 1   MunicipalityCode      72129 non-null  int64  
 2   NearestStation        72129 non-null  object 
 3   TimeToNearestStation  72129 non-null  object 
 4   FloorPlan             72129 non-null  object 
 5   Area                  72129 non-null  int64  
 6   BuildingYear          72129 non-null  float64
 7   CityPlanning          72129 non-null  object 
 8   Purpose               72129 non-null  object 
 9   Structure             72129 non-null  object 
 10  Year                  72129 non-null  int64  
 11  Renovation            72129 non-null  object 
 12  TradePrice            72129 non-null  int64  
dtypes: float64(1), int64(4), object(8)
memory usage: 7.2+ MB


In [4]:
#Changing the type of the MunicipalityCode column to string
# NOTE(review): presumably so the code is treated as a category rather than a
# number when the dummy variables are built later - confirm.
df['MunicipalityCode'] = df['MunicipalityCode'].astype('string')

In [5]:
#Analysing the numerical columns in the data set
df.describe()

Unnamed: 0,Area,BuildingYear,Year,TradePrice
count,72129.0,72129.0,72129.0,72129.0
mean,47.752222,1998.262294,2016.475468,33981880.0
std,27.3707,12.25127,1.584777,45799690.0
min,10.0,1945.0,2013.0,240000.0
25%,25.0,1990.0,2015.0,19000000.0
50%,50.0,2001.0,2016.0,27000000.0
75%,65.0,2007.0,2018.0,41000000.0
max,2000.0,2019.0,2019.0,8600000000.0


In [6]:
#Analyzing the correlation of the numerical columns with TradePrice.
#Only the numeric columns are selected first: the frame also holds object and
#string columns, which DataFrame.corr() refuses to process on pandas >= 2.0.
df.select_dtypes(include='number').corr()['TradePrice']

Area            0.572021
BuildingYear    0.176611
Year            0.028501
TradePrice      1.000000
Name: TradePrice, dtype: float64

In [7]:
#Building the input set: every column except the target, with the
#categorical columns expanded into dummy (one-hot) columns
X = pd.get_dummies(df.drop(columns=['TradePrice']))

In [8]:
#Output set (target): the TradePrice column
y = df['TradePrice'] 

In [9]:
#Splitting the data set into training, test and hold-out test sets
from sklearn.model_selection import train_test_split

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [11]:
X_hold, X_test, y_hold, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [12]:
#Applying RandomForest
from sklearn.ensemble import RandomForestRegressor
# Fix the seed: the forest uses random bootstrap samples and feature choices,
# so without random_state the fitted model and its error change on every run.
model_rf = RandomForestRegressor(random_state=42)

In [13]:
#Training the random forest on the training split
model_rf.fit(X_train,y_train)

RandomForestRegressor()

In [14]:
#Calculating the random forest predictions for the test split
y_pred = model_rf.predict(X_test)

In [15]:
from sklearn.metrics import mean_absolute_percentage_error

In [16]:
#Mean absolute percentage error of the random forest on the test split
mean_absolute_percentage_error(y_test, y_pred)

0.18329595573439944

In [17]:
#Applying Decision Tree Regressor
from sklearn.tree import DecisionTreeRegressor
# Fix the seed so that the fitted tree and its error are reproducible.
model_tree = DecisionTreeRegressor(random_state=42)

In [18]:
#Training the decision tree on the training split
model_tree.fit(X_train,y_train)

DecisionTreeRegressor()

In [19]:
#Calculating the decision tree predictions for the test split
y_pred_tree = model_tree.predict(X_test)

In [20]:
#Mean absolute percentage error of the decision tree on the test split
mean_absolute_percentage_error(y_test,y_pred_tree)

0.2270699547451734

In [21]:
#Creating the scaler used to standardise the features before applying the
#Support Vector Regressor (the KNN model below is fitted on the same scaled data)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [22]:
# Learn the scaling parameters from the training split only, then apply them
# to both splits. fit_transform already fits the scaler, so a separate fit()
# call beforehand would only repeat the same work.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [23]:
#Applying Support Vector Regressor (constructed with default arguments)
from sklearn.svm import SVR
model_SVR = SVR()

In [24]:
#Training the SVR on the scaled training features
model_SVR.fit(X_train_scaled,y_train)

SVR()

In [25]:
#Calculating the SVR predictions for the scaled test features
y_pred_svr = model_SVR.predict(X_test_scaled)

In [26]:
#Mean absolute percentage error of the SVR on the test split
mean_absolute_percentage_error(y_test,y_pred_svr)

0.6320532685192569

In [27]:
#Applying KNN (K-nearest-neighbours regressor, constructed with default arguments)
from sklearn.neighbors import KNeighborsRegressor
model_knn = KNeighborsRegressor()

In [28]:
#Training the KNN regressor on the scaled training features
model_knn.fit(X_train_scaled,y_train)

KNeighborsRegressor()

In [29]:
#Calculating the KNN predictions for the scaled test features
y_pred_knn = model_knn.predict(X_test_scaled) 

In [30]:
#Mean absolute percentage error of the KNN regressor on the test split
mean_absolute_percentage_error(y_test,y_pred_knn)

0.40262584917627725

In [31]:
#Comparing the performance of these 4 algorithms
#(mean absolute percentage error on the test set)
#1. Random Forest          - 18% error
#2. Decision Tree          - 23% error
#3. Support Vector         - 63% error
#4. KNN                    - 40% error
#So the best performance is from Random Forest

In [32]:
#Calculating the performance of the random forest on the hold-out test set

In [33]:
#Random forest predictions for the hold-out split, which was not used for training
y_pred_hold = model_rf.predict(X_hold)

In [34]:
#Mean absolute percentage error of the random forest on the hold-out split
mean_absolute_percentage_error(y_hold,y_pred_hold)

0.18850152201457168

In [35]:
#Final performance metric ( Mean absolute percentage error ) = 18.8%

In [36]:
#Creating the final prediction model
# Seeded so that retraining reproduces the same model instead of a slightly
# different forest on every run.
model_final = RandomForestRegressor(random_state=42)

In [37]:
#Training the final model on the entire dataset (X, y), not only the training split
model_final.fit(X,y)

RandomForestRegressor()

In [38]:
# Save the final model as a pickle file ('Prediction_model.pkl') using joblib
import joblib
joblib.dump(model_final, 'Prediction_model.pkl')

['Prediction_model.pkl']