In [1]:
# importing all the important libraries
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor       # for decision tree regressor problem
import sklearn.ensemble as ensemble

## Comparison between Linear Regression, Decision Tree and Random Forest

In [2]:
# Reading the dataset
train_data = pd.read_csv("datasets/kc_house_train_data.csv")

In [19]:
test_data = pd.read_csv("datasets/kc_house_test_data.csv")

In [3]:
X_train = train_data[['sqft_living', 'bedrooms', 'bathrooms', 'grade', 'view', 'waterfront', 'lat', 'yr_built']]
y_train = train_data["price"]

In [20]:
X_test = test_data[['sqft_living', 'bedrooms', 'bathrooms', 'grade', 'view', 'waterfront', 'lat', 'yr_built']]
y_test = test_data["price"]

In [24]:
train_data.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
1,7237550310,20140512T000000,1225000.0,4,4.5,5420,101930,1.0,0,0,...,11,3890,1530,2001,0,98053,47.6561,-122.005,4760,101930
2,9212900260,20140527T000000,468000.0,2,1.0,1160,6000,1.0,0,0,...,7,860,300,1942,0,98115,47.69,-122.292,1330,6000
3,114101516,20140528T000000,310000.0,3,1.0,1430,19901,1.5,0,0,...,7,1430,0,1927,0,98028,47.7558,-122.229,1780,12697
4,6054650070,20141007T000000,400000.0,3,1.75,1370,9680,1.0,0,0,...,7,1370,0,1977,0,98074,47.6127,-122.045,1370,10208


## Linear Regression Model

In [29]:
model_LR = LinearRegression()
score2 = cross_val_score(model_LR, x_train, y_train, cv=10, scoring='neg_root_mean_squared_error')
print('The rmse value is',np.abs(score2.mean()))

The rmse value is 211254.4212362953


## Decision Tree Model

In [26]:
rmsescore=[]
r2score=[]
index=[]
for i in range(1,7):             # taking range between 1 to 7 for min_samples leaf        
    for j in range(1,7):         # taking range between 1 to 7 for max depth
        model_DT=DecisionTreeRegressor(min_samples_leaf=i,max_depth=j)
        score1=cross_val_score(model_DT,X_train,y_train,cv=10,scoring='neg_root_mean_squared_error')
        index.append([i,j])           # appending the pair values of min samplesleaf and max depth
        rmsescore.append((np.abs(score1)).mean())
rmsevalue=rmsescore.index(min(rmsescore))
index=index[rmsevalue]                  # indexing the pair which produced the minimum error
        
print("The rmse score is",min(rmsescore))
print("The perfect min_samples leaf value is",index[0])
print("The perfect max_depth value is",index[1])

The rmse score is 207481.6123307354
The perfect min_samples leaf value is 6
The perfect max_depth value is 6


## Bagging

In [6]:
#Create a Bagged Decision Tree model object
baggedModel = ensemble.BaggingRegressor(base_estimator=None, n_estimators=50, oob_score=True)

In [7]:
#Train the model using the training set
baggedModel.fit(X_train,y_train)

BaggingRegressor(n_estimators=50, oob_score=True)

In [8]:
#OOB score
baggedModel.oob_score_

0.8104301504218714

In [9]:
#The Predicted values (Use test data if you want OOS predictions)
pred=baggedModel.predict(X_train)
pred[1:5]

array([1310339. ,  478664.9,  349103. ,  460450. ])

In [10]:
rmsescore = []
score1=cross_val_score(baggedModel,X_train,y_train,cv=10,scoring='neg_root_mean_squared_error')
rmsescore.append((np.abs(score1)).mean())
min(rmsescore)

162504.20082575287

## Random Forest

In [11]:
# Creating a random forest regression object
randomForest = ensemble.RandomForestRegressor(n_estimators=50, max_features=0.5, oob_score=True)

In [12]:
randomForest.fit(X_train,y_train)

RandomForestRegressor(max_features=0.5, n_estimators=50, oob_score=True)

In [13]:
randomForest.base_estimator_

DecisionTreeRegressor()

In [14]:
pred = randomForest.predict(X_train)

In [15]:
pred[1:10]

array([1416740.  ,  475737.1 ,  340451.88,  442315.  ,  726922.  ,
        529252.  ,  434534.  ,  303394.  ,  229071.  ])

In [16]:
randomForest.feature_importances_       # importance of features

array([0.27846522, 0.01355129, 0.09763024, 0.28545871, 0.04555776,
       0.02554929, 0.17565595, 0.07813153])

In [18]:
rmsescore = []
score1=cross_val_score(randomForest,X_train,y_train,cv=10,scoring='neg_root_mean_squared_error')
rmsescore.append((np.abs(score1)).mean())
min(rmsescore)

160580.92782227992

## Testing

In [23]:
test_data.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,3793500160,20150312T000000,323000.0,3,2.5,1890,6560,2.0,0,0,...,7,1890,0,2003,0,98038,47.3684,-122.031,2390,7570
1,1175000570,20150312T000000,530000.0,5,2.0,1810,4850,1.5,0,0,...,7,1810,0,1900,0,98107,47.67,-122.394,1360,4850
2,16000397,20141205T000000,189000.0,2,1.0,1200,9850,1.0,0,0,...,7,1200,0,1921,0,98002,47.3089,-122.21,1060,5095
3,461000390,20140624T000000,687500.0,4,1.75,2330,5000,1.5,0,0,...,7,1510,820,1929,0,98117,47.6823,-122.368,1460,5000
4,7895500070,20150213T000000,240000.0,4,1.0,1220,8075,1.0,0,0,...,7,890,330,1969,0,98001,47.3341,-122.282,1290,7800


### Comparison on the basis of Mean Absolute Error

## Linear Regression

In [31]:
model_LR.fit(X_train,y_train)
pred_LR = model_LR.predict(X_test)
pred_LR

array([246274.05044745, 624916.94562621, 300613.5991774 , ...,
       595050.57957964, 255342.32141709, 368935.42776906])

In [32]:
error = pred_LR - y_test
mae = np.sum(np.abs(error))/len(y_test)
mae

130381.33251795701

## Decision Tree

In [34]:
model_DT = DecisionTreeRegressor(min_samples_leaf=6,max_depth=6)
model_DT.fit(X_train,y_train)
pred_DT = model_DT.predict(X_test)
pred_DT

array([265179.45777027, 592638.69558824, 226836.52083333, ...,
       500227.32885906, 385827.66004963, 490869.71864952])

In [35]:
error = pred_DT - y_test
mae_DT = np.sum(np.abs(error))/len(y_test)
mae_DT

112829.29196204513

## Bagging

In [36]:
pred_Bag=baggedModel.predict(X_test)
pred_Bag

array([302122.  , 590692.04, 207301.  , ..., 588174.26, 292647.  ,
       449163.6 ])

In [37]:
error = pred_Bag - y_test
mae_Bag = np.sum(np.abs(error))/len(y_test)
mae_Bag

89560.9563455549

## Random Forest

In [38]:
pred_RF = randomForest.predict(X_test)
pred_RF

array([304779.  , 584387.  , 216179.88, ..., 571728.56, 283062.  ,
       443165.8 ])

In [39]:
error = pred_RF - y_test
mae_RF = np.sum(np.abs(error))/len(y_test)
mae_RF

88112.8274725462

### Random Forest is performing better than other models.