Importing Libraries

In [20]:
import datetime as dt
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn import metrics
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

Reading the Dataset from the CSV File

In [21]:
# Load the dataset from the CSV file and preview its first rows.
DATA_CSV = 'data/final_df.csv'
df = pd.read_csv(DATA_CSV)
df.head()

Unnamed: 0,date,new_construction,gdp,fed_rate,permit,mortgage_rate,personal_income,price
0,2000-01-01,107.3,101.491397,5.448387,1727,8.21,8382.6,100.552
1,2000-02-01,121.8,101.552445,5.734828,1692,8.325,8443.7,101.339
2,2000-03-01,153.7,101.626906,5.853548,1651,8.24,8503.7,102.127
3,2000-04-01,138.9,101.698161,6.019667,1597,8.1525,8542.9,102.922
4,2000-05-01,148.9,101.740194,6.268065,1543,8.515,8580.9,103.677


In [22]:
# Preview the last rows of the frame (DataFrame.tail() returns 5 rows by default).
df.tail()

Unnamed: 0,date,new_construction,gdp,fed_rate,permit,mortgage_rate,personal_income,price
236,2019-09-01,118.3,100.84363,2.043,1461,3.605,18522.1,210.911
237,2019-10-01,135.8,100.884768,1.829677,1520,3.688,18587.4,211.598
238,2019-11-01,111.0,100.921514,1.553333,1497,3.695,18683.0,212.446
239,2019-12-01,110.7,100.951586,1.550968,1439,3.72,18675.1,213.434
240,2020-01-01,116.4,100.966969,1.55,1493,3.624,18873.9,214.49


In [23]:
# Show the column labels, the (rows, columns) shape, and each column's dtype
# in a single cell; display() renders each argument in turn.
display(df.columns, df.shape, df.dtypes)

Index(['date', 'new_construction', 'gdp', 'fed_rate', 'permit',
       'mortgage_rate', 'personal_income', 'price'],
      dtype='object')

(241, 8)

date                 object
new_construction    float64
gdp                 float64
fed_rate            float64
permit                int64
mortgage_rate       float64
personal_income     float64
price               float64
dtype: object

Deeper Understanding of the Dataset

In [24]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,new_construction,gdp,fed_rate,permit,mortgage_rate,personal_income,price
count,241.0,241.0,241.0,241.0,241.0,241.0,241.0
mean,109.63029,100.080589,1.783023,1314.829876,5.185427,12965.737759,158.869245
std,41.988829,0.912263,1.913202,477.707852,1.281163,2951.488421,28.312375
min,36.3,97.762514,0.066429,513.0,3.345,8382.6,100.552
25%,77.5,99.645967,0.155806,980.0,4.036,10335.8,140.011
50%,107.3,99.999349,1.157,1301.0,4.952,12489.4,159.33
75%,138.9,100.579304,2.423667,1665.0,6.18,15444.5,180.848
max,211.9,101.829671,6.544516,2263.0,8.515,18873.9,214.49


In [25]:
# Count missing values per column.
df.isna().sum()

date                0
new_construction    0
gdp                 0
fed_rate            0
permit              0
mortgage_rate       0
personal_income     0
price               0
dtype: int64

In [26]:
# Convert the date strings into proleptic Gregorian ordinals (plain integers)
# so the column can be used as a numeric feature by the models below.
#
# The dtype guard keeps this cell idempotent: without it, re-running the cell
# would feed the integer ordinals back into pd.to_datetime, which interprets
# integers as nanoseconds since the Unix epoch and would silently collapse
# every row to the same 1970-01-01 ordinal.
if not pd.api.types.is_integer_dtype(df['date']):
    df['date'] = pd.to_datetime(df['date']).map(dt.datetime.toordinal)

In [27]:
# Inspect the frame after the date conversion: `date` now holds integer ordinals.
df

Unnamed: 0,date,new_construction,gdp,fed_rate,permit,mortgage_rate,personal_income,price
0,730120,107.3,101.491397,5.448387,1727,8.2100,8382.6,100.552
1,730151,121.8,101.552445,5.734828,1692,8.3250,8443.7,101.339
2,730180,153.7,101.626906,5.853548,1651,8.2400,8503.7,102.127
3,730211,138.9,101.698161,6.019667,1597,8.1525,8542.9,102.922
4,730241,148.9,101.740194,6.268065,1543,8.5150,8580.9,103.677
...,...,...,...,...,...,...,...,...
236,737303,118.3,100.843630,2.043000,1461,3.6050,18522.1,210.911
237,737333,135.8,100.884768,1.829677,1520,3.6880,18587.4,211.598
238,737364,111.0,100.921514,1.553333,1497,3.6950,18683.0,212.446
239,737394,110.7,100.951586,1.550968,1439,3.7200,18675.1,213.434


Creating the Linear Regression Model

In [28]:
# Target (dependent variable) is the price column; the features are every
# remaining column of the frame.
y = df['price']
X = df.drop(columns=['price'])

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable.
# NOTE(review): train_test_split shuffles rows by default and the rows are
# monthly observations, so test months are interleaved with training months --
# confirm a chronological split is not wanted here.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=30
)

# Fit a linear regression on the training rows (fit() returns the estimator
# itself), then predict the held-out rows.
reg = LinearRegression().fit(X_train, y_train)
y_pred = reg.predict(X_test)

In [29]:
# Report the score (R^2 for regressors) of the linear model on both splits.
reg_train_score = reg.score(X_train, y_train)
reg_test_score = reg.score(X_test, y_test)

print("For Linear Regression Model ")
print("Training Score : " + str(reg_train_score))
print("Testing Score : " + str(reg_test_score))

For Linear Regression Model 
Training Score : 0.8186294707680757
Testing Score : 0.815686971543842


In [30]:
# Print the fitted parameters: one coefficient per feature column, plus the intercept.
fitted_coefficients = reg.coef_
fitted_intercept = reg.intercept_

print("Co-efficients : " + str(fitted_coefficients))
print("Intercept : " + str(fitted_intercept))

Co-efficients : [ 0.04577183 -0.05736736  1.86394988  4.13114928  0.032166    8.7293178
 -0.0188647 ]
Intercept : -33458.01927387141


In [31]:
# Mean absolute error of the linear model's predictions on the test split.
metrics.mean_absolute_error(y_true=y_test, y_pred=y_pred)

8.265671327951962

In [32]:
# Save the fitted linear regression model to disk.
# The context manager guarantees the file handle is flushed and closed even if
# pickle.dump raises; the bare open() used previously left the handle open
# until garbage collection.
filename = 'reg.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(reg, model_file)

Creating the Random Forest Model

In [33]:
# Train a random forest regressor on the same training split as the linear model.
# NOTE(review): n_estimators=1 builds a single tree rather than an ensemble --
# confirm this is intentional (scikit-learn's default is 100). The value is
# left unchanged here so the recorded scores below stay valid.
rf = RandomForestRegressor(n_estimators=1, random_state=30)
rf.fit(X_train, y_train)


In [34]:
# Report the score (R^2 for regressors) of the random forest on both splits.
rf_train_score = rf.score(X_train, y_train)
rf_test_score = rf.score(X_test, y_test)

print("For Random Forest Regressor Model ")
print("Training Score : " + str(rf_train_score))
print("Testing Score : " + str(rf_test_score))

For Random Forest Regressor Model 
Training Score : 0.9981404105571242
Testing Score : 0.9893228179140017


In [35]:
# Predict the held-out rows with the random forest and compute the mean
# absolute error against the true prices.
rf_pred = rf.predict(X_test)
metrics.mean_absolute_error(y_true=y_test, y_pred=rf_pred)

1.9782653061224493