## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_error,mean_absolute_error, r2_score
%matplotlib inline

## Load Testing Data

In [2]:
# Read the raw test split; the path is relative to the notebook's working directory.
test = pd.read_csv(filepath_or_buffer='./datasets/test.csv')

In [3]:
# Preview the first five rows to sanity-check that the columns parsed as expected.
test.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,0,0,0,,,,0,4,2006,WD
1,2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,...,0,0,0,,,,0,8,2006,WD
2,2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,...,0,0,0,,,,0,9,2006,New
3,1989,902207150,30,RM,60.0,8520,Pave,,Reg,Lvl,...,0,0,0,,,,0,7,2007,WD
4,625,535105100,20,RL,,9500,Pave,,IR1,Lvl,...,0,185,0,,,,0,7,2009,WD


In [4]:
# Show (rows, columns) as the cell's value, matching the other shape checks in
# this notebook instead of routing the tuple through print().
test.shape

(879, 80)


In [5]:
# List every column with its non-null count and dtype to locate missing data.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 879 entries, 0 to 878
Data columns (total 80 columns):
Id                 879 non-null int64
PID                879 non-null int64
MS SubClass        879 non-null int64
MS Zoning          879 non-null object
Lot Frontage       719 non-null float64
Lot Area           879 non-null int64
Street             879 non-null object
Alley              58 non-null object
Lot Shape          879 non-null object
Land Contour       879 non-null object
Utilities          879 non-null object
Lot Config         879 non-null object
Land Slope         879 non-null object
Neighborhood       879 non-null object
Condition 1        879 non-null object
Condition 2        879 non-null object
Bldg Type          879 non-null object
House Style        879 non-null object
Overall Qual       879 non-null int64
Overall Cond       879 non-null int64
Year Built         879 non-null int64
Year Remod/Add     879 non-null int64
Roof Style         879 non-null object
Roof M

## Fill Missing Values

In [6]:
# Sentinel written into each column wherever the value is missing:
# 0 for numeric measurements, the string 'None' for categorical labels.
# NOTE(review): this presumes a missing value means "feature absent"
# (no alley, no garage, no basement, ...) — confirm against the data dictionary,
# in particular for 'Lot Frontage', where 0 may not be a realistic frontage.
fill_values = {
    'Lot Frontage': 0,
    'Alley': 'None',
    'Mas Vnr Type': 'None',
    'Mas Vnr Area': 0,
    'Garage Type': 'None',
    'Garage Yr Blt': 0,
    'Garage Finish': 'None',
    'Garage Cars': 0,
    'Garage Area': 0,
    'Garage Qual': 'None',
    'Garage Cond': 'None',
    'Bsmt Qual': 'None',
    'Bsmt Cond': 'None',
    'Bsmt Exposure': 'None',
    'BsmtFin Type 1': 'None',
    'BsmtFin SF 1': 0,
    'BsmtFin Type 2': 'None',
    'BsmtFin SF 2': 0,
    'Bsmt Unf SF': 0,
    'Total Bsmt SF': 0,
    'Bsmt Full Bath': 0,
    'Bsmt Half Bath': 0,
    'Fireplace Qu': 'None',
    'Pool QC': 'None',
    'Fence': 'None',
    'Misc Feature': 'None',
}

# Assign the filled Series back to the frame. Calling
# `test[col].fillna(..., inplace=True)` fills a temporary Series selected from
# the frame; under pandas Copy-on-Write that never reaches `test` itself.
# Indexing with `test[column]` still raises KeyError if a column is absent.
for column, sentinel in fill_values.items():
    test[column] = test[column].fillna(sentinel)

## Convert categorical features to numerical features

In [32]:
# Collect the object-dtype (string) columns into their own frame for inspection.
test_categorical = test.select_dtypes(include='object')

In [33]:
# (rows, columns) of the object-dtype subset.
test_categorical.shape

(879, 42)

In [34]:
# Transpose the first ten rows so each categorical column reads as one line.
test_categorical.head(10).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
MS Zoning,RM,RL,RL,RM,RL,RM,RM,RL,FV,RL
Street,Pave,Pave,Pave,Pave,Pave,Pave,Pave,Pave,Pave,Pave
Alley,Grvl,,,,,,,,Pave,
Lot Shape,Reg,IR1,IR1,Reg,IR1,Reg,Reg,IR1,Reg,Reg
Land Contour,Lvl,Lvl,Lvl,Lvl,Lvl,Lvl,Lvl,Lvl,Lvl,Lvl
Utilities,AllPub,AllPub,AllPub,AllPub,AllPub,AllPub,AllPub,AllPub,AllPub,AllPub
Lot Config,Inside,Inside,Inside,Inside,Inside,Inside,Inside,CulDSac,Inside,Inside
Land Slope,Gtl,Gtl,Gtl,Gtl,Gtl,Gtl,Gtl,Mod,Gtl,Gtl
Neighborhood,OldTown,Sawyer,Gilbert,OldTown,NAmes,MeadowV,OldTown,CollgCr,Somerst,Mitchel
Condition 1,Norm,Norm,Norm,Norm,Norm,Norm,Norm,Norm,Norm,Norm


In [35]:
# Ordinal scales for the graded columns: a larger code means a better grade,
# and 0 is reserved for the 'None' label.
# 'Po' is part of the base scale for every quality/condition column; leaving it
# out for some columns would let a 'Po' label survive as a string and keep that
# column in the object-dtype set.
quality_scale = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1}
quality_scale_or_none = {**quality_scale, 'None': 0}
exposure_scale = {'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1, 'None': 0}
bsmt_finish_scale = {'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 'None': 0}
garage_finish_scale = {'Fin': 3, 'RFn': 2, 'Unf': 1, 'None': 0}

# Which scale applies to which column.
ordinal_scales = {
    'Exter Qual': quality_scale,
    'Exter Cond': quality_scale,
    'Bsmt Qual': quality_scale_or_none,
    'Bsmt Cond': quality_scale_or_none,
    'Bsmt Exposure': exposure_scale,
    'BsmtFin Type 1': bsmt_finish_scale,
    'BsmtFin Type 2': bsmt_finish_scale,
    'Heating QC': quality_scale,
    'Kitchen Qual': quality_scale,
    'Garage Finish': garage_finish_scale,
    'Garage Qual': quality_scale_or_none,
    'Garage Cond': quality_scale_or_none,
    'Fireplace Qu': quality_scale_or_none,
}

# Labels missing from a scale are left untouched by replace().
# infer_objects() makes the switch to a numeric dtype explicit: pandas 2.2
# deprecated the silent object -> int downcast that replace() used to perform,
# and the later select_dtypes(include=['object']) call relies on these columns
# no longer being object dtype.
for column, scale in ordinal_scales.items():
    test[column] = test[column].replace(scale).infer_objects()

## Remove columns and one-hot encode the remaining categorical features

In [48]:
# Discard the PID column; rebind the result rather than mutating in place.
test = test.drop(columns=['PID'])

In [49]:
# Tally each Pool QC label to see how many rows carry an actual rating.
test['Pool QC'].value_counts()

None    875
Ex        3
TA        1
Name: Pool QC, dtype: int64

In [50]:
# Pool QC holds a real rating for only 4 of the 879 rows (the rest are 'None'),
# so the column is removed.
test = test.drop(columns=['Pool QC'])

In [51]:
# Re-derive the object-dtype subset now that some columns were encoded or dropped.
test_categorical = test.select_dtypes(include='object')

In [52]:
# (rows, columns) of the object-dtype columns that are still left to encode.
test_categorical.shape

(879, 28)

In [53]:
# Transpose the first ten rows so each remaining categorical column reads as one line.
test_categorical.head(10).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
MS Zoning,RM,RL,RL,RM,RL,RM,RM,RL,FV,RL
Street,Pave,Pave,Pave,Pave,Pave,Pave,Pave,Pave,Pave,Pave
Alley,Grvl,,,,,,,,Pave,
Lot Shape,Reg,IR1,IR1,Reg,IR1,Reg,Reg,IR1,Reg,Reg
Land Contour,Lvl,Lvl,Lvl,Lvl,Lvl,Lvl,Lvl,Lvl,Lvl,Lvl
Utilities,AllPub,AllPub,AllPub,AllPub,AllPub,AllPub,AllPub,AllPub,AllPub,AllPub
Lot Config,Inside,Inside,Inside,Inside,Inside,Inside,Inside,CulDSac,Inside,Inside
Land Slope,Gtl,Gtl,Gtl,Gtl,Gtl,Gtl,Gtl,Mod,Gtl,Gtl
Neighborhood,OldTown,Sawyer,Gilbert,OldTown,NAmes,MeadowV,OldTown,CollgCr,Somerst,Mitchel
Condition 1,Norm,Norm,Norm,Norm,Norm,Norm,Norm,Norm,Norm,Norm


In [54]:
# Everything that is not object dtype is kept as-is for the final feature table.
test_numerical = test.select_dtypes(exclude='object')

In [55]:
# One-hot encode the categorical subset, dropping the first level of each
# feature, then line the indicator columns up with the numeric ones by row index.
# NOTE(review): the indicator columns are derived from this test file alone, so
# they depend on which categories occur here — confirm they match the feature
# columns the model was trained on before predicting.
test_dummies = pd.get_dummies(test_categorical, drop_first=True)
test = pd.merge(test_numerical, test_dummies, left_index=True, right_index=True)

In [56]:
# Persist the cleaned feature table without writing the row index as a column.
test.to_csv(path_or_buf='./datasets/test_cleaned.csv', index=False)

In [57]:
# (rows, columns) of the final table after one-hot encoding.
test.shape

(879, 200)