**Importing Necessary Libraries**

In [11]:
import numpy as np
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
import plotly.graph_objs as go
import plotly.figure_factory as ff

**Importing dataset and examining it**

In [12]:
# Load the house-price data from a CSV located in the working directory.
dataset = pd.read_csv("PricePrediction.csv")
pd.set_option('display.max_columns', None) # to make sure you can see all the columns in output window
print(dataset.head())
print(dataset.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
dataset.info()
print(dataset.describe())

   LotArea  YearBuilt  YearRemodelled  TotalBsmtSF CentralAir  1stFlrSF  \
0     8450       2003            2003          856          Y       856   
1     9600       1976            1976         1262          Y      1262   
2    11250       2001            2002          920          Y       920   
3     9550       1915            1970          756          Y       961   
4    14260       2000            2000         1145          Y      1145   

   2ndFlrSF  GrLivArea  BsmtFullBath  BsmtHalfBath  FullBath  HalfBath  \
0       854       1710             1             0         2         1   
1         0       1262             0             1         2         0   
2       866       1786             1             0         2         1   
3       756       1717             1             0         1         0   
4      1053       2198             1             0         2         1   

   BedroomAbvGr  KitchenAbvGr  TotRmsAbvGrd  Fireplaces  GarageCars  \
0             3             1    

**Converting Categorical features into Numerical features**

In [13]:
def converter(column):
    """Map a single cell value to a binary flag.

    Returns 1 when the value equals 'Y' and 0 for every other value.
    Despite the parameter name, this receives one value at a time
    (it is passed to Series.apply), not a whole column.
    """
    return 1 if column == 'Y' else 0

# Encode the flag columns as 1/0 via converter (any value other than 'Y' becomes 0).
# The dtype guard keeps this cell idempotent: once a column is numeric, converter
# would see 1 != 'Y' and silently turn every value into 0 if the cell were re-run.
for flag_column in ('CentralAir', 'PavedDrive'):
    if not pd.api.types.is_numeric_dtype(dataset[flag_column]):
        dataset[flag_column] = dataset[flag_column].apply(converter)
# info() prints its own report and returns None; print() around it only adds "None".
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 22 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   LotArea         1460 non-null   int64
 1   YearBuilt       1460 non-null   int64
 2   YearRemodelled  1460 non-null   int64
 3   TotalBsmtSF     1460 non-null   int64
 4   CentralAir      1460 non-null   int64
 5   1stFlrSF        1460 non-null   int64
 6   2ndFlrSF        1460 non-null   int64
 7   GrLivArea       1460 non-null   int64
 8   BsmtFullBath    1460 non-null   int64
 9   BsmtHalfBath    1460 non-null   int64
 10  FullBath        1460 non-null   int64
 11  HalfBath        1460 non-null   int64
 12  BedroomAbvGr    1460 non-null   int64
 13  KitchenAbvGr    1460 non-null   int64
 14  TotRmsAbvGrd    1460 non-null   int64
 15  Fireplaces      1460 non-null   int64
 16  GarageCars      1460 non-null   int64
 17  GarageArea      1460 non-null   int64
 18  PavedDrive      1460 non-nul

**Plotting Correlation Heatmap**

In [14]:
# Pairwise correlations between all columns, rendered as an annotated heatmap
# with each cell labelled by the correlation rounded to two decimals.
corrs = dataset.corr()
heatmap_options = {
    'z': corrs.values,
    'x': list(corrs.columns),
    'y': list(corrs.index),
    'annotation_text': corrs.round(2).values,
    'showscale': True,
}
figure = ff.create_annotated_heatmap(**heatmap_options)
figure.show()

**Dividing dataset into label and feature sets**

In [15]:
# Split into features and target. TotRmsAbvGrd and GarageCars are left out of the
# feature set together with the SalePrice target itself
# (presumably chosen from the correlation heatmap -- confirm the rationale).
excluded_columns = ['TotRmsAbvGrd', 'GarageCars', 'SalePrice']
X = dataset.drop(columns=excluded_columns) # Features
Y = dataset['SalePrice'] # Labels
print(type(X), type(Y), sep="\n")
print(X.shape, Y.shape, sep="\n")

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
(1460, 19)
(1460,)


**Normalizing numerical features so that each feature has mean 0 and variance 1**

In [16]:
# Standardize the features: fit the scaler on X, then transform X with it
# (the two-step form of a single fit_transform call).
feature_scaler = StandardScaler()
feature_scaler.fit(X)
X_scaled = feature_scaler.transform(X)

**Implementing Random Forest Regression**

In [17]:
# Tuning the random forest parameter 'n_estimators' and implementing cross-validation using Grid Search
# criterion is left at its default (mean squared error): the explicit 'mse' spelling was
# deprecated in scikit-learn 1.0 and removed in 1.2, whereas the default works on every version.
rfr = RandomForestRegressor(max_features='sqrt', random_state=1)
grid_param = {'n_estimators': [50,100,150,200,250]}

# 5-fold cross-validation, scored by R^2, over the candidate forest sizes
gd_sr = GridSearchCV(estimator=rfr, param_grid=grid_param, scoring='r2', cv=5)

gd_sr.fit(X_scaled, Y)

best_parameters = gd_sr.best_params_
print("Optimal parameters:\n", best_parameters)

best_result = gd_sr.best_score_ # Mean cross-validated score of the best_estimator
print("Best mean cross-validated score:\n", best_result)

Optimal parameters:
 {'n_estimators': 150}
Best mean cross-validated score:
 0.8434592861185891


In [18]:
# Building random forest using the tuned parameter
# criterion is left at its default (mean squared error): the explicit 'mse' spelling was
# deprecated in scikit-learn 1.0 and removed in 1.2, whereas the default works on every version.
rfr = RandomForestRegressor(n_estimators=150, max_features='sqrt', random_state=1)
rfr.fit(X_scaled,Y)
# Label each importance with its column name from X and list the most important first
featimp = pd.Series(rfr.feature_importances_, index=list(X)).sort_values(ascending=False)
print(featimp)

GrLivArea         0.175216
YearBuilt         0.144536
TotalBsmtSF       0.134564
GarageArea        0.118773
1stFlrSF          0.097450
2ndFlrSF          0.057710
FullBath          0.056618
LotArea           0.053266
YearRemodelled    0.053164
Fireplaces        0.038010
WoodDeckSF        0.015221
BedroomAbvGr      0.013863
HalfBath          0.013277
BsmtFullBath      0.010463
CentralAir        0.005446
KitchenAbvGr      0.004656
PoolArea          0.003114
BsmtHalfBath      0.002369
PavedDrive        0.002284
dtype: float64


In [19]:
# Selecting features with higher significance and redefining feature set
selected_features = ['GrLivArea', 'YearBuilt', 'TotalBsmtSF', 'GarageArea']
X_ = dataset[selected_features]

# A fresh scaler replaces the earlier feature_scaler and is fitted on the reduced feature set only
feature_scaler = StandardScaler()
X_scaled_ = feature_scaler.fit(X_).transform(X_)

In [20]:
# Tuning the random forest parameter 'n_estimators' and implementing cross-validation using Grid Search
# (this time on the reduced, re-scaled feature set X_scaled_)
# criterion is left at its default (mean squared error): the explicit 'mse' spelling was
# deprecated in scikit-learn 1.0 and removed in 1.2, whereas the default works on every version.
rfr = RandomForestRegressor(max_features='sqrt', random_state=1)
grid_param = {'n_estimators': [50,100,150,200,250]}

# 5-fold cross-validation, scored by R^2, over the candidate forest sizes
gd_sr = GridSearchCV(estimator=rfr, param_grid=grid_param, scoring='r2', cv=5)

gd_sr.fit(X_scaled_, Y)

best_parameters = gd_sr.best_params_
print("Optimal parameters:\n", best_parameters)

best_result = gd_sr.best_score_ # Mean cross-validated score of the best_estimator
print("Best mean cross-validated score:\n", best_result)

Optimal parameters:
 {'n_estimators': 50}
Best mean cross-validated score:
 0.8033566044717354
