# House Price Regression Project - Yash Pawar

In [1]:
from warnings import filterwarnings
filterwarnings('ignore')

## Step 1: Read the Dataset

In [2]:
import pandas as pd
df = pd.read_csv('training_set.csv')
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


## Step 2: Perform Basic Data Quality Checks


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [4]:
# create variable 'm' for finding missing values in the training Dataset
m = df.isna().sum()

In [5]:
m[m>0]

LotFrontage      259
Alley           1369
MasVnrType       872
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

In [6]:
df.duplicated().sum()

0

## Seperate X and Y(SalePrice) Variables
X = Independent Variable
 
Y = Dependent Variable

In [7]:
X = df.drop(columns=['Id', 'SalePrice'])
Y = df[['SalePrice']]

In [8]:
X.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


In [9]:
Y.head()

Unnamed: 0,SalePrice
0,208500
1,181500
2,223500
3,140000
4,250000


## Step 4: Perform Feature Selection
i.e. Cat and Con Seperation

In [10]:
# Seperate the cat and con features for X
cat = list(X.columns[X.dtypes=='object'])
con = list(X.columns[X.dtypes!='object'])

In [11]:
cat

['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

In [12]:
con

['MSSubClass',
 'LotFrontage',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold']

In [13]:
# Creating a Pipeline
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer

In [14]:

num_pipe1 = Pipeline(steps=[('impute', SimpleImputer(strategy='median')),
                            ('scalar', StandardScaler())])

In [15]:
cat_pipe1 = Pipeline(steps=[('impute', SimpleImputer(strategy='constant', fill_value='Not_Avail')),
                            ('ordinal', OrdinalEncoder())])

In [16]:
pre1 = ColumnTransformer([('num', num_pipe1, con),
                          ('cat', cat_pipe1, cat)]).set_output(transform='pandas')

In [17]:
pre1

In [18]:
X_pre = pre1.fit_transform(X)
X_pre.head()

Unnamed: 0,num__MSSubClass,num__LotFrontage,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__YearRemodAdd,num__MasVnrArea,num__BsmtFinSF1,num__BsmtFinSF2,...,cat__GarageType,cat__GarageFinish,cat__GarageQual,cat__GarageCond,cat__PavedDrive,cat__PoolQC,cat__Fence,cat__MiscFeature,cat__SaleType,cat__SaleCondition
0,0.073375,-0.220875,-0.207142,0.651479,-0.5172,1.050994,0.878668,0.514104,0.575425,-0.288653,...,1.0,2.0,5.0,5.0,2.0,3.0,4.0,1.0,8.0,4.0
1,-0.872563,0.46032,-0.091886,-0.071836,2.179628,0.156734,-0.429577,-0.57075,1.171992,-0.288653,...,1.0,2.0,5.0,5.0,2.0,3.0,4.0,1.0,8.0,4.0
2,0.073375,-0.084636,0.07348,0.651479,-0.5172,0.984752,0.830215,0.325915,0.092907,-0.288653,...,1.0,2.0,5.0,5.0,2.0,3.0,4.0,1.0,8.0,4.0
3,0.309859,-0.44794,-0.096897,0.651479,-0.5172,-1.863632,-0.720298,-0.57075,-0.499274,-0.288653,...,5.0,3.0,5.0,5.0,2.0,3.0,4.0,1.0,8.0,0.0
4,0.073375,0.641972,0.375148,1.374795,-0.5172,0.951632,0.733308,1.366489,0.463568,-0.288653,...,1.0,2.0,5.0,5.0,2.0,3.0,4.0,1.0,8.0,4.0


In [19]:
# Feature Selection 
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SequentialFeatureSelector
lr = LinearRegression()
sel = SequentialFeatureSelector(lr, n_features_to_select='auto', direction='backward')
sel.fit_transform(X_pre,Y)
sel_cols = sel.get_feature_names_out()
sel_cols

array(['num__MSSubClass', 'num__LotArea', 'num__OverallQual',
       'num__OverallCond', 'num__YearBuilt', 'num__MasVnrArea',
       'num__BsmtFinSF1', 'num__BsmtFinSF2', 'num__BsmtUnfSF',
       'num__TotalBsmtSF', 'num__1stFlrSF', 'num__2ndFlrSF',
       'num__BsmtFullBath', 'num__KitchenAbvGr', 'num__TotRmsAbvGrd',
       'num__Fireplaces', 'num__GarageCars', 'num__WoodDeckSF',
       'num__EnclosedPorch', 'num__ScreenPorch', 'num__PoolArea',
       'num__YrSold', 'cat__LandContour', 'cat__Neighborhood',
       'cat__BldgType', 'cat__HouseStyle', 'cat__RoofMatl',
       'cat__Exterior1st', 'cat__MasVnrType', 'cat__ExterQual',
       'cat__BsmtQual', 'cat__BsmtCond', 'cat__BsmtExposure',
       'cat__HeatingQC', 'cat__KitchenQual', 'cat__Functional',
       'cat__FireplaceQu', 'cat__GarageCond', 'cat__Fence',
       'cat__MiscFeature'], dtype=object)

In [20]:
len(sel_cols)


40

In [21]:
sel_cols[0]

'num__MSSubClass'

In [22]:
sel_cols[0].split('__')

['num', 'MSSubClass']

In [23]:
sel_cols[0].split('__')[1]

'MSSubClass'

In [24]:
imp_cols = []
for i in sel_cols:
    s = i.split('__')[1]
    imp_cols.append(s)

In [25]:
imp_cols

['MSSubClass',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'BsmtFullBath',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageCars',
 'WoodDeckSF',
 'EnclosedPorch',
 'ScreenPorch',
 'PoolArea',
 'YrSold',
 'LandContour',
 'Neighborhood',
 'BldgType',
 'HouseStyle',
 'RoofMatl',
 'Exterior1st',
 'MasVnrType',
 'ExterQual',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'HeatingQC',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageCond',
 'Fence',
 'MiscFeature']

In [26]:
X_sel = X[imp_cols]
X_sel

Unnamed: 0,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,...,BsmtQual,BsmtCond,BsmtExposure,HeatingQC,KitchenQual,Functional,FireplaceQu,GarageCond,Fence,MiscFeature
0,60,8450,7,5,2003,196.0,706,0,150,856,...,Gd,TA,No,Ex,Gd,Typ,,TA,,
1,20,9600,6,8,1976,0.0,978,0,284,1262,...,Gd,TA,Gd,Ex,TA,Typ,TA,TA,,
2,60,11250,7,5,2001,162.0,486,0,434,920,...,Gd,TA,Mn,Ex,Gd,Typ,TA,TA,,
3,70,9550,7,5,1915,0.0,216,0,540,756,...,TA,Gd,No,Gd,Gd,Typ,Gd,TA,,
4,60,14260,8,5,2000,350.0,655,0,490,1145,...,Gd,TA,Av,Ex,Gd,Typ,TA,TA,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,7917,6,5,1999,0.0,0,0,953,953,...,Gd,TA,No,Ex,TA,Typ,TA,TA,,
1456,20,13175,6,6,1978,119.0,790,163,589,1542,...,Gd,TA,No,TA,TA,Min1,TA,TA,MnPrv,
1457,70,9042,7,9,1941,0.0,275,0,877,1152,...,TA,Gd,No,Ex,Gd,Typ,Gd,TA,GdPrv,Shed
1458,20,9717,5,6,1950,0.0,49,1029,0,1078,...,TA,TA,Mn,Gd,Gd,Typ,,TA,,


## Step 5: Create a final Pipeline
categorical = OneHotEncoder

In [27]:
cat_sel = list(X_sel.columns[X_sel.dtypes=='object'])
con_sel = list(X_sel.columns[X_sel.dtypes!='object'])

In [28]:
cat_sel

['LandContour',
 'Neighborhood',
 'BldgType',
 'HouseStyle',
 'RoofMatl',
 'Exterior1st',
 'MasVnrType',
 'ExterQual',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'HeatingQC',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageCond',
 'Fence',
 'MiscFeature']

In [29]:
con_sel

['MSSubClass',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'BsmtFullBath',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageCars',
 'WoodDeckSF',
 'EnclosedPorch',
 'ScreenPorch',
 'PoolArea',
 'YrSold']

In [30]:
from sklearn.preprocessing import OneHotEncoder

In [31]:
num_pipe2 = Pipeline(steps=[('impute', SimpleImputer(strategy='median')),
                            ('scaler', StandardScaler())])

In [32]:
cat_pipe2 = Pipeline(steps=[('impute', SimpleImputer(strategy='constant', fill_value='Not_Avail')),
                            ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False))])

In [33]:
pre2 = ColumnTransformer([('num', num_pipe2, con_sel),
                          ('cat', cat_pipe2, cat_sel)]).set_output(transform='pandas')

In [34]:
X_sel_pre = pre2.fit_transform(X_sel)
X_sel_pre.head()

Unnamed: 0,num__MSSubClass,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__MasVnrArea,num__BsmtFinSF1,num__BsmtFinSF2,num__BsmtUnfSF,num__TotalBsmtSF,...,cat__Fence_GdPrv,cat__Fence_GdWo,cat__Fence_MnPrv,cat__Fence_MnWw,cat__Fence_Not_Avail,cat__MiscFeature_Gar2,cat__MiscFeature_Not_Avail,cat__MiscFeature_Othr,cat__MiscFeature_Shed,cat__MiscFeature_TenC
0,0.073375,-0.207142,0.651479,-0.5172,1.050994,0.514104,0.575425,-0.288653,-0.944591,-0.459303,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
1,-0.872563,-0.091886,-0.071836,2.179628,0.156734,-0.57075,1.171992,-0.288653,-0.641228,0.466465,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,0.073375,0.07348,0.651479,-0.5172,0.984752,0.325915,0.092907,-0.288653,-0.301643,-0.313369,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
3,0.309859,-0.096897,0.651479,-0.5172,-1.863632,-0.57075,-0.499274,-0.288653,-0.06167,-0.687324,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,0.073375,0.375148,1.374795,-0.5172,0.951632,1.366489,0.463568,-0.288653,-0.174865,0.19968,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


## Step 6: Train Test Split
20% Data Unseen to model

In [35]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(X_sel_pre, Y, test_size=0.2, random_state=21)

In [36]:
xtrain.head()

Unnamed: 0,num__MSSubClass,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__MasVnrArea,num__BsmtFinSF1,num__BsmtFinSF2,num__BsmtUnfSF,num__TotalBsmtSF,...,cat__Fence_GdPrv,cat__Fence_GdWo,cat__Fence_MnPrv,cat__Fence_MnWw,cat__Fence_Not_Avail,cat__MiscFeature_Gar2,cat__MiscFeature_Not_Avail,cat__MiscFeature_Othr,cat__MiscFeature_Shed,cat__MiscFeature_TenC
710,-0.636078,-0.640101,-2.241782,0.381743,-1.201217,-0.57075,-0.973018,-0.288653,-0.672923,-1.795509,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
1098,-0.163109,-0.452686,-1.518467,0.381743,-1.168096,-0.57075,0.500854,-0.288653,-1.284176,-0.878862,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
1286,-0.872563,-0.072844,-0.071836,-0.5172,-0.273836,1.925521,0.274948,0.213629,0.250749,0.616959,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
992,0.073375,-0.075851,-0.071836,2.179628,-0.240715,1.31114,0.20257,0.436865,-0.901577,-0.53683,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
631,1.492282,-0.593999,1.374795,-0.5172,1.150356,0.027027,-0.92038,-0.288653,2.179592,1.132288,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


In [37]:
ytrain.head()

Unnamed: 0,SalePrice
710,52000
1098,128000
1286,143000
992,187000
631,209500


In [38]:
xtest.head()

Unnamed: 0,num__MSSubClass,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__MasVnrArea,num__BsmtFinSF1,num__BsmtFinSF2,num__BsmtUnfSF,num__TotalBsmtSF,...,cat__Fence_GdPrv,cat__Fence_GdWo,cat__Fence_MnPrv,cat__Fence_MnWw,cat__Fence_Not_Avail,cat__MiscFeature_Gar2,cat__MiscFeature_Not_Avail,cat__MiscFeature_Othr,cat__MiscFeature_Shed,cat__MiscFeature_TenC
880,-0.872563,-0.350058,-0.795151,-0.5172,1.117235,-0.57075,1.176379,-0.288653,-1.035147,0.074268,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
605,0.073375,0.309002,0.651479,0.381743,-0.207594,0.403405,0.022723,-0.288653,-0.573311,-0.659961,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
1166,-0.872563,-0.004192,1.374795,-0.5172,1.216598,-0.172232,-0.973018,-0.288653,2.550871,1.451518,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
216,-0.872563,-0.207142,0.651479,-0.5172,1.084115,0.901552,1.101808,-0.288653,-0.174865,0.863222,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
970,-0.163109,0.02838,-1.518467,-1.416142,-0.737526,-0.57075,-0.973018,-0.288653,0.345832,-0.769412,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


In [39]:
ytest.head()

Unnamed: 0,SalePrice
880,157000
605,205000
1166,245350
216,210000
970,135000


In [40]:
xtrain.shape

(1168, 148)

In [41]:
xtest.shape

(292, 148)

## Step 7: Model Building

In [42]:
model1 = LinearRegression()
model1.fit(xtrain, ytrain)

In [43]:
# Checking the Score in the Train
model1.score(xtrain, ytrain)

0.9257737216394357

In [44]:
# Checking the sccore in the Test
model1.score(xtest, ytest)

0.8201535111434464

# Creating a Ridge Model

In [45]:
import numpy as np
params  = {'alpha': np.arange(start=1, stop=1000, step=1)}
params

{'alpha': array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
         14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
         27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
         40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
         53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
         66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
         79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
         92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104,
        105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
        118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
        131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
        144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156,
        157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
        170, 171, 172, 173, 174, 175, 176,

In [46]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
rr = Ridge()
gscv_ridge = GridSearchCV(rr, param_grid=params, cv=5, scoring='neg_mean_squared_error')
gscv_ridge.fit(xtrain, ytrain)

In [47]:
gscv_ridge.best_score_

-1056084227.2862409

In [48]:
gscv_ridge.best_params_

{'alpha': 23}

In [49]:
best_ridge = gscv_ridge.best_estimator_
best_ridge

In [50]:
best_ridge.score(xtrain, ytrain)

0.8830834896509135

In [51]:
best_ridge.score(xtest, ytest)

0.8346181527129088

# Creating Lasso Model

In [52]:
from sklearn.linear_model import Lasso
ls = Lasso()
gscv_lasso = GridSearchCV(ls, param_grid=params, cv=5, scoring='neg_mean_squared_error')
gscv_lasso.fit(xtrain, ytrain)

In [53]:
gscv_lasso.best_score_

-976504935.7310283

In [54]:
gscv_lasso.best_params_

{'alpha': 109}

In [55]:
best_lasso = gscv_lasso.best_estimator_
best_lasso

In [56]:
best_lasso.score(xtrain, ytrain)

0.9156928424157719

In [57]:
best_lasso.score(xtest, ytest)

0.8284585823037399

## Step 8: Model Evaluation

In [58]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def evaluate_model(model, x, y):
    ypred = model.predict(x)
    mse = mean_squared_error(y, ypred)
    rmse = mse**(1/2)
    mae = mean_absolute_error(y, ypred)
    r2 = r2_score(y, ypred)
    print(f'Mean Squared Error: {mse:.2f}')
    print(f'Room Mean Squared Error: {rmse:.2f}')
    print(f'Mean Absolute Error: {mae:.2f}')
    print(f'R2 Score: {r2:.4f}')

In [59]:
# Evaluation of Ridge Model
print('Ridge Model Train Results: \n')
evaluate_model(best_ridge, xtrain, ytrain)

Ridge Model Train Results: 

Mean Squared Error: 723587574.77
Room Mean Squared Error: 26899.58
Mean Absolute Error: 16239.82
R2 Score: 0.8831


In [60]:
print('Ridge Model Test Results: \n')
evaluate_model(best_ridge, xtest, ytest)

Ridge Model Test Results: 

Mean Squared Error: 1120844375.40
Room Mean Squared Error: 33479.01
Mean Absolute Error: 17776.78
R2 Score: 0.8346


In [61]:
# Evaluation of Lasso Model
print('Lasso Model Train Results: \n')
evaluate_model(best_lasso, xtrain, ytrain)

Lasso Model Train Results: 

Mean Squared Error: 521770719.21
Room Mean Squared Error: 22842.30
Mean Absolute Error: 15310.43
R2 Score: 0.9157


In [62]:
print('Lasso Model Train Results: \n')
evaluate_model(best_lasso, xtest, ytest)

Lasso Model Train Results: 

Mean Squared Error: 1162589705.75
Room Mean Squared Error: 34096.77
Mean Absolute Error: 16906.22
R2 Score: 0.8285


# Cross Validation of R2 for Ridge & lasso

In [63]:
from sklearn.model_selection import cross_val_score
# Cross Validation For Ridge Model
ridge_scores = cross_val_score(best_ridge, xtrain, ytrain, cv=5, scoring='r2')
ridge_scores

array([0.61210718, 0.84934286, 0.89903411, 0.87585115, 0.92391518])

In [64]:
ridge_scores.mean()

0.8320500968307007

In [65]:
# Cross Validation for Lasso Model
lasso_scores = cross_val_score(best_lasso, xtrain, ytrain, cv=5, scoring='r2')
lasso_scores

array([0.62740301, 0.86739738, 0.90938634, 0.88627643, 0.93432028])

In [66]:
lasso_scores.mean()

0.8449566876787815

#### After Evaluating models we Conclude that Lasso Model is better than Ridge Model
### Lasso Model gives HIGHER Results in R2 Test = 0.8449

## Step 9: Out of Sample Data Prediction

In [67]:
xnew = pd.read_csv('sample_set.csv')
xnew.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [68]:
pre2

In [69]:
xnew_pre = pre2.transform(xnew)
xnew_pre.head()

Unnamed: 0,num__MSSubClass,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__MasVnrArea,num__BsmtFinSF1,num__BsmtFinSF2,num__BsmtUnfSF,num__TotalBsmtSF,...,cat__Fence_GdPrv,cat__Fence_GdWo,cat__Fence_MnPrv,cat__Fence_MnWw,cat__Fence_Not_Avail,cat__MiscFeature_Gar2,cat__MiscFeature_Not_Avail,cat__MiscFeature_Othr,cat__MiscFeature_Shed,cat__MiscFeature_TenC
0,-0.872563,0.110763,-0.795151,0.381743,-0.340077,-0.57075,0.053428,0.604293,-0.672923,-0.400017,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,-0.872563,0.37585,-0.071836,0.381743,-0.43944,0.027027,1.051363,-0.288653,-0.365032,0.619239,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
2,0.073375,0.332053,-0.795151,-0.5172,0.852269,-0.57075,0.761852,-0.288653,-0.974021,-0.295127,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.073375,-0.054002,-0.071836,0.381743,0.88539,-0.460051,0.347326,-0.288653,-0.550672,-0.299687,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,1.492282,-0.552407,1.374795,-0.5172,0.686666,-0.57075,-0.39619,-0.288653,1.018211,0.507509,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


In [70]:
preds = best_lasso.predict(xnew_pre)
preds

array([118607.79723585, 155269.7566509 , 179572.32134798, ...,
       175807.52471831, 115189.34968409, 221714.07535825])

## Save above into DataFrame

In [71]:
df_results = xnew[['Id']]
df_results

Unnamed: 0,Id
0,1461
1,1462
2,1463
3,1464
4,1465
...,...
1454,2915
1455,2916
1456,2917
1457,2918


In [72]:
df_results['SalePrice_pred'] = preds
df_results

Unnamed: 0,Id,SalePrice_pred
0,1461,118607.797236
1,1462,155269.756651
2,1463,179572.321348
3,1464,186081.888826
4,1465,200568.369426
...,...,...
1454,2915,85248.666902
1455,2916,82761.726436
1456,2917,175807.524718
1457,2918,115189.349684


In [73]:
df_results.to_csv('Results.csv', index = False)