## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_error,mean_absolute_error, r2_score
%matplotlib inline

## Load Training Data

In [2]:
# Read the raw training set from the local datasets folder.
train_path = './datasets/train.csv'
train = pd.read_csv(train_path)

In [3]:
# Preview the first five rows of the raw frame.
train.head(n=5)

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,...,0,0,,,,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,...,0,0,,,,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,138500


In [4]:
# Report the (rows, columns) dimensions of the raw frame.
n_rows, n_cols = train.shape
print((n_rows, n_cols))

(2051, 81)


In [5]:
# Per-column non-null counts and dtypes; used to spot the columns that need filling.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051 entries, 0 to 2050
Data columns (total 81 columns):
Id                 2051 non-null int64
PID                2051 non-null int64
MS SubClass        2051 non-null int64
MS Zoning          2051 non-null object
Lot Frontage       1721 non-null float64
Lot Area           2051 non-null int64
Street             2051 non-null object
Alley              140 non-null object
Lot Shape          2051 non-null object
Land Contour       2051 non-null object
Utilities          2051 non-null object
Lot Config         2051 non-null object
Land Slope         2051 non-null object
Neighborhood       2051 non-null object
Condition 1        2051 non-null object
Condition 2        2051 non-null object
Bldg Type          2051 non-null object
House Style        2051 non-null object
Overall Qual       2051 non-null int64
Overall Cond       2051 non-null int64
Year Built         2051 non-null int64
Year Remod/Add     2051 non-null int64
Roof Style         20

## Filling up null values

In [6]:
# Missing values are filled column by column with a fixed placeholder:
#   * the text columns below receive the literal level 'None'
#   * the numeric columns below receive 0
#
# The fill is applied with a single DataFrame-level ``fillna`` whose result is
# assigned back to ``train``.  Calling ``train[col].fillna(..., inplace=True)``
# operates on a temporary Series selected from the frame; pandas warns about
# that pattern (chained assignment through an inplace method) and, with
# Copy-on-Write enabled, it no longer updates ``train`` at all.
NONE_FILL_COLS = [
    'Alley',
    'Mas Vnr Type',
    'Garage Type',
    'Garage Finish',
    'Garage Qual',
    'Garage Cond',
    'Bsmt Qual',
    'Bsmt Cond',
    'Bsmt Exposure',
    'BsmtFin Type 1',
    'BsmtFin Type 2',
    'Fireplace Qu',
    'Pool QC',
    'Fence',
    'Misc Feature',
]

ZERO_FILL_COLS = [
    'Lot Frontage',
    'Mas Vnr Area',
    'Garage Yr Blt',
    'Garage Cars',
    'Garage Area',
    'BsmtFin SF 1',
    'BsmtFin SF 2',
    'Bsmt Unf SF',
    'Total Bsmt SF',
    'Bsmt Full Bath',
    'Bsmt Half Bath',
]

# Map every listed column to its placeholder; columns not listed are untouched.
fill_values = {col: 'None' for col in NONE_FILL_COLS}
fill_values.update({col: 0 for col in ZERO_FILL_COLS})

train = train.fillna(value=fill_values)

In [32]:
# Total number of missing cells left anywhere in the frame.
train.isnull().sum().sum()

0

## Convert categorical features to numerical features

In [33]:
# Keep only the object-dtype (text) columns for inspection.
train_categorical = train.select_dtypes(include='object')

In [34]:
# (rows, columns) of the object-dtype subset selected above.
train_categorical.shape

(2051, 42)

In [35]:
# First ten rows, transposed so each text column is shown as a row.
train_categorical.iloc[:10].transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
MS Zoning,RL,RL,RL,RL,RL,RL,RM,RL,RL,RL
Street,Pave,Pave,Pave,Pave,Pave,Pave,Pave,Pave,Pave,Pave
Alley,,,,,,,,,,
Lot Shape,IR1,IR1,Reg,Reg,IR1,IR1,Reg,IR1,Reg,IR1
Land Contour,Lvl,Lvl,Lvl,Lvl,Lvl,Lvl,Lvl,Lvl,Lvl,HLS
Utilities,AllPub,AllPub,AllPub,AllPub,AllPub,AllPub,AllPub,AllPub,AllPub,AllPub
Lot Config,CulDSac,CulDSac,Inside,Inside,Inside,Corner,Inside,Inside,Inside,Inside
Land Slope,Gtl,Gtl,Gtl,Gtl,Gtl,Gtl,Gtl,Gtl,Gtl,Sev
Neighborhood,Sawyer,SawyerW,NAmes,Timber,SawyerW,NAmes,Edwards,NAmes,OldTown,NAmes
Condition 1,RRAe,Norm,Norm,Norm,Norm,PosA,Norm,Norm,Artery,Norm


In [36]:
# Ordinal scales shared by the graded text columns (higher number = better grade).
# Every quality/condition column uses the same five-level scale, including 'Po',
# so a level that happens to be absent from this file is still encoded the same way.
QUALITY_SCALE = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1}
# Same scale plus the 'None' placeholder for columns that can hold it.
QUALITY_OR_NONE_SCALE = {**QUALITY_SCALE, 'None': 0}
EXPOSURE_SCALE = {'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1, 'None': 0}
BSMT_FINISH_SCALE = {'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 'None': 0}
GARAGE_FINISH_SCALE = {'Fin': 3, 'RFn': 2, 'Unf': 1, 'None': 0}

# Column -> scale used to turn its text levels into integers.
ORDINAL_SCALES = {
    'Exter Qual': QUALITY_SCALE,
    'Exter Cond': QUALITY_SCALE,
    'Bsmt Qual': QUALITY_OR_NONE_SCALE,
    'Bsmt Cond': QUALITY_OR_NONE_SCALE,
    'Bsmt Exposure': EXPOSURE_SCALE,
    'BsmtFin Type 1': BSMT_FINISH_SCALE,
    'BsmtFin Type 2': BSMT_FINISH_SCALE,
    'Heating QC': QUALITY_SCALE,
    'Kitchen Qual': QUALITY_SCALE,
    'Garage Finish': GARAGE_FINISH_SCALE,
    'Garage Qual': QUALITY_OR_NONE_SCALE,
    'Garage Cond': QUALITY_OR_NONE_SCALE,
    'Fireplace Qu': QUALITY_OR_NONE_SCALE,
}

for col, scale in ORDINAL_SCALES.items():
    # Already numeric (e.g. the cell was run before): nothing left to encode.
    if pd.api.types.is_numeric_dtype(train[col]):
        continue

    # Fail loudly on a level the scale does not know (including NaN) instead of
    # leaving a half-encoded text column that would later be one-hot encoded.
    unmapped = set(train[col].unique()) - set(scale)
    if unmapped:
        raise ValueError(
            f"Column {col!r} has values with no ordinal code: "
            f"{sorted(map(str, unmapped))}"
        )

    # ``map`` + explicit cast yields an integer column without relying on
    # ``replace`` silently downcasting an object column.
    train[col] = train[col].map(scale).astype('int64')

## Removing columns

In [49]:
# Remove the PID column from the frame.
train = train.drop(columns=['PID'])

In [50]:
# Frequency of each Pool QC level.
train.loc[:, 'Pool QC'].value_counts()

None    2042
Gd         4
TA         2
Fa         2
Ex         1
Name: Pool QC, dtype: int64

In [51]:
# Pool QC is 'None' for all but a handful of rows, so the column is removed.
train = train.drop(columns=['Pool QC'])

In [52]:
# Recompute the text-column subset now that some columns were encoded or dropped.
train_categorical = train.select_dtypes(include='object')

In [53]:
# (rows, columns) of the remaining object-dtype columns.
train_categorical.shape

(2051, 28)

In [54]:
# First ten rows of the remaining text columns, one column per displayed row.
train_categorical.iloc[:10].transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
MS Zoning,RL,RL,RL,RL,RL,RL,RM,RL,RL,RL
Street,Pave,Pave,Pave,Pave,Pave,Pave,Pave,Pave,Pave,Pave
Alley,,,,,,,,,,
Lot Shape,IR1,IR1,Reg,Reg,IR1,IR1,Reg,IR1,Reg,IR1
Land Contour,Lvl,Lvl,Lvl,Lvl,Lvl,Lvl,Lvl,Lvl,Lvl,HLS
Utilities,AllPub,AllPub,AllPub,AllPub,AllPub,AllPub,AllPub,AllPub,AllPub,AllPub
Lot Config,CulDSac,CulDSac,Inside,Inside,Inside,Corner,Inside,Inside,Inside,Inside
Land Slope,Gtl,Gtl,Gtl,Gtl,Gtl,Gtl,Gtl,Gtl,Gtl,Sev
Neighborhood,Sawyer,SawyerW,NAmes,Timber,SawyerW,NAmes,Edwards,NAmes,OldTown,NAmes
Condition 1,RRAe,Norm,Norm,Norm,Norm,PosA,Norm,Norm,Artery,Norm


In [55]:
# Everything that is not object dtype is kept as-is for modelling.
train_numerical = train.select_dtypes(exclude='object')

In [56]:
# One-hot encode the text columns (first level of each dropped) and attach the
# indicator columns to the numeric ones by row index.
dummy_cols = pd.get_dummies(train_categorical, drop_first=True)
train = train_numerical.join(dummy_cols, how='inner')

In [57]:
# Write the cleaned frame to disk; index=False leaves the row index out of the file.
cleaned_path = './datasets/train_cleaned.csv'
train.to_csv(cleaned_path, index=False)

In [58]:
# (rows, columns) of the frame after one-hot encoding.
train.shape

(2051, 214)