In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


In [3]:
import numpy as np
import pandas as pd

## Data Insights

In [4]:
train = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

In [5]:
len(train), len(test)

(1460, 1459)

In [6]:
y_train = train['SalePrice']
train.drop(['SalePrice', 'Id'], axis=1, inplace=True)
test_ids = test['Id'] 
test.drop(['Id'], axis=1, inplace=True)

In [7]:
train_columns = list(train.columns) 
test_columns = list(test.columns)
print(set(train_columns)-set(test_columns))
print(set(test_columns)-set(train_columns))

set()
set()


We can see that train_columns is superset of test_columns andthe only additional entry is SalePrice which is to be predicted

In [8]:
train_cols_with_null = []
for col in train_columns:
    null_num = train[col].isna().sum()
    if null_num>0:
        print(f'Column: {col} -- Null values: {null_num}')
        train_cols_with_null.append(col)

Column: LotFrontage -- Null values: 259
Column: Alley -- Null values: 1369
Column: MasVnrType -- Null values: 872
Column: MasVnrArea -- Null values: 8
Column: BsmtQual -- Null values: 37
Column: BsmtCond -- Null values: 37
Column: BsmtExposure -- Null values: 38
Column: BsmtFinType1 -- Null values: 37
Column: BsmtFinType2 -- Null values: 38
Column: Electrical -- Null values: 1
Column: FireplaceQu -- Null values: 690
Column: GarageType -- Null values: 81
Column: GarageYrBlt -- Null values: 81
Column: GarageFinish -- Null values: 81
Column: GarageQual -- Null values: 81
Column: GarageCond -- Null values: 81
Column: PoolQC -- Null values: 1453
Column: Fence -- Null values: 1179
Column: MiscFeature -- Null values: 1406


In [9]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder

In [10]:
np.sum(np.array(train[train['BsmtQual'].isna()].index) != np.array(train[train['BsmtCond'].isna()].index))

0

### Addressing nan on column by column basis

In [11]:
encoder_dict = {} #All encoders not in the dict will be a simple pass

Dropping PoolQc and MiscFeature from both train and test

In [12]:
print('PoolQC' in train.columns, 'MiscFeature' in train.columns)
train.drop(['PoolQC', 'MiscFeature'], axis=1, inplace=True)
test.drop(['PoolQC', 'MiscFeature'], axis=1, inplace=True)
print('PoolQC' in train.columns, 'MiscFeature' in train.columns)

True True
False False


In [13]:
train_columns = train.columns
test_columns = test.columns

Alley

In [14]:
train.fillna({'Alley':"NA"}, inplace=True)
ord = OrdinalEncoder(categories=[['NA', 'Grvl', 'Pave']])
arr = np.expand_dims(train['Alley'].unique(), -1)
print(arr, ord.fit_transform(arr))
encoder_dict['Alley'] = ('ordinal',ord)

[['NA']
 ['Grvl']
 ['Pave']] [[0.]
 [1.]
 [2.]]


In [15]:
test.fillna({'Alley':"NA"}, inplace=True)
print(train['Alley'].unique(), test['Alley'].unique())

['NA' 'Grvl' 'Pave'] ['NA' 'Pave' 'Grvl']


MasVnrType

In [16]:
train['MasVnrType'].unique()

array(['BrkFace', nan, 'Stone', 'BrkCmn'], dtype=object)

In [17]:
train.fillna({'MasVnrType':'NA'}, inplace=True)
test.fillna({'MasVnrType':'NA'}, inplace=True)
oh = OneHotEncoder(sparse_output = False)
print(oh.fit_transform(np.expand_dims(np.array(list(train['MasVnrType'].unique())+ list(test['MasVnrType'].unique())),-1)))
print(train['MasVnrType'].unique())
encoder_dict['MasVnrType'] = ('onehot',oh)

[[0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]]
['BrkFace' 'NA' 'Stone' 'BrkCmn']


BsmtQual

In [18]:
train.fillna({'BsmtQual':'NA'}, inplace=True)
test.fillna({'BsmtQual':'NA'}, inplace=True)
ord = OrdinalEncoder(categories=[['NA', 'Po', 'Fa', 'TA',
                                 'Gd', 'Ex']])
print(ord.fit_transform((np.expand_dims(train['BsmtQual'].unique(),-1))))
print(train['BsmtQual'].unique())
encoder_dict['BsmtQual'] = ('ordinal',ord)

[[4.]
 [3.]
 [5.]
 [0.]
 [2.]]
['Gd' 'TA' 'Ex' 'NA' 'Fa']


BsmtCond

In [19]:
train.fillna({'BsmtCond':'NA'}, inplace=True)
test.fillna({'BsmtCond':'NA'}, inplace=True)
ord = OrdinalEncoder(categories=[['NA', 'Po', 'Fa', 'TA',
                                 'Gd', 'Ex']])
print(ord.fit_transform((np.expand_dims(train['BsmtCond'].unique(),-1))))
print(train['BsmtCond'].unique())
encoder_dict['BsmtCond'] = ('ordinal',ord)

[[3.]
 [4.]
 [0.]
 [2.]
 [1.]]
['TA' 'Gd' 'NA' 'Fa' 'Po']


BsmtExposure

In [20]:
print(set(np.array(train[train['BsmtExposure'].isna()].index))-set(np.array(train[train['BsmtCond']=='NA'].index)))
print(set(np.array(train[train['BsmtCond']=='NA'].index))-set(np.array(train[train['BsmtExposure'].isna()].index)))

{948}
set()


In [21]:
train.iloc[[948]][['BsmtExposure', 'BsmtQual', 'BsmtCond', ]]

Unnamed: 0,BsmtExposure,BsmtQual,BsmtCond
948,,Gd,TA


In [22]:
train.loc[train['BsmtQual'] == 'NA']['BsmtExposure']

17      NaN
39      NaN
90      NaN
102     NaN
156     NaN
182     NaN
259     NaN
342     NaN
362     NaN
371     NaN
392     NaN
520     NaN
532     NaN
533     NaN
553     NaN
646     NaN
705     NaN
736     NaN
749     NaN
778     NaN
868     NaN
894     NaN
897     NaN
984     NaN
1000    NaN
1011    NaN
1035    NaN
1045    NaN
1048    NaN
1049    NaN
1090    NaN
1179    NaN
1216    NaN
1218    NaN
1232    NaN
1321    NaN
1412    NaN
Name: BsmtExposure, dtype: object

In [23]:
train.loc[train['BsmtQual'] == 'NA', 'BsmtExposure'] = 'NA'
test.loc[test['BsmtQual'] == 'NA', 'BsmtExposure'] = 'NA'
train.fillna({'BsmtExposure':'NA'}, inplace=True)
test.fillna({'BsmtExposure':'NA'}, inplace=True)

ord = OrdinalEncoder(categories=[['NA', 'No', 'Mn', 'Av', 'Gd']])
print(ord.fit_transform((np.expand_dims(train['BsmtExposure'].unique(),-1))))
print(train['BsmtExposure'].unique())
encoder_dict['BsmtExposure'] = ('ordinal',ord)

train['BsmtExposure'].isna().sum(), test['BsmtExposure'].isna().sum()


[[1.]
 [4.]
 [2.]
 [3.]
 [0.]]
['No' 'Gd' 'Mn' 'Av' 'NA']


(0, 0)

BsmtFinType1

In [24]:
train.loc[train['BsmtQual'] == 'NA', 'BsmtFinType1'] = 'NA'
test.loc[test['BsmtQual'] == 'NA', 'BsmtFinType1'] = 'NA'
train.fillna({'BsmtFinType1':'Unf'}, inplace=True)
test.fillna({'BsmtFinType1':'Unf'}, inplace=True)

ord = OrdinalEncoder(categories=[['NA', 'Unf', 'LwQ', 'Rec',
                                 'BLQ', 'ALQ', 'GLQ']])
print(ord.fit_transform((np.expand_dims(train['BsmtFinType1'].unique(),-1))))
print(train['BsmtFinType1'].unique())
encoder_dict['BsmtFinType1'] = ('ordinal',ord)

train['BsmtFinType1'].isna().sum(), test['BsmtFinType1'].isna().sum()

[[6.]
 [5.]
 [1.]
 [3.]
 [4.]
 [0.]
 [2.]]
['GLQ' 'ALQ' 'Unf' 'Rec' 'BLQ' 'NA' 'LwQ']


(0, 0)

BsmtFinType2

In [25]:
train.loc[train['BsmtQual'] == 'NA', 'BsmtFinType2'] = 'NA'
test.loc[test['BsmtQual'] == 'NA', 'BsmtFinType2'] = 'NA'
train.fillna({'BsmtFinType2':'Unf'}, inplace=True)
test.fillna({'BsmtFinType2':'Unf'}, inplace=True)

ord = OrdinalEncoder(categories=[['NA', 'Unf', 'LwQ', 'Rec',
                                 'BLQ', 'ALQ', 'GLQ']])
print(ord.fit_transform((np.expand_dims(train['BsmtFinType2'].unique(),-1))))
print(train['BsmtFinType2'].unique())
encoder_dict['BsmtFinType2'] = ('ordinal',ord)

train['BsmtFinType2'].isna().sum(), test['BsmtFinType2'].isna().sum()

[[1.]
 [4.]
 [0.]
 [5.]
 [3.]
 [2.]
 [6.]]
['Unf' 'BLQ' 'NA' 'ALQ' 'Rec' 'LwQ' 'GLQ']


(0, 0)

Electrical

In [26]:
train.fillna({'Electrical':'Mix'}, inplace=True)
test.fillna({'Electrical':'Mix'}, inplace=True)
ord = OrdinalEncoder(categories=[['Mix', 'FuseP', 'FuseF', 'FuseA',
                                 'SBrkr']])
print(ord.fit_transform(np.expand_dims(np.array(train['Electrical'].unique()), axis=-1)))
print(train['Electrical'].unique())
encoder_dict['Electrical'] = ('ordinal',ord)
train['Electrical'].isna().sum(), test['Electrical'].isna().sum()

[[4.]
 [2.]
 [3.]
 [1.]
 [0.]]
['SBrkr' 'FuseF' 'FuseA' 'FuseP' 'Mix']


(0, 0)

FireplaceQu

In [27]:
ind1 = set(train[train['Fireplaces']==0].index)
ind2 = set(train[train['FireplaceQu'].isna()].index)
print(ind1-ind2)
print(ind2-ind1)

set()
set()


In [28]:
ind1 = set(test[test['Fireplaces']==0].index)
ind2 = set(test[test['FireplaceQu'].isna()].index)
print(ind1-ind2)
print(ind2-ind1)

set()
set()


In [29]:
train.fillna({'FireplaceQu': 'NA'}, inplace=True)
test.fillna({'FireplaceQu': 'NA'}, inplace=True)
ord = OrdinalEncoder(categories=[['NA', 'Po', 'Fa', 
                                 'TA', 'Gd', 'Ex']])
print(ord.fit_transform(np.expand_dims(np.array(train['FireplaceQu'].unique()), -1)))
print(train['FireplaceQu'].unique())
encoder_dict['FireplaceQu'] = ('ordinal',ord)
train['FireplaceQu'].isna().sum(), test['FireplaceQu'].isna().sum()

[[0.]
 [3.]
 [4.]
 [2.]
 [5.]
 [1.]]
['NA' 'TA' 'Gd' 'Fa' 'Ex' 'Po']


(0, 0)

Address the nan values in
GarageType, GarageYrBlt, GarageFinish, GarageQual, GarageCond

In [30]:
((train['GarageCars'] != 0) & (train['GarageType'].isna()|
                              train['GarageYrBlt'].isna()|
                              train['GarageFinish'].isna()|
                              train['GarageQual'].isna()|
                              train['GarageCond'].isna())
).sum()

0

In [31]:
(train['GarageType'].isna()|train['GarageYrBlt'].isna()|
 train['GarageFinish'].isna()|train['GarageQual'].isna()|
 train['GarageCond'].isna()).sum()

81

In [32]:
((train['GarageCars'] != 0) & (train['GarageArea'] == 0)).sum()

0

In [33]:
((test['GarageCars'] != 0) & (test['GarageType'].isna()|
                              test['GarageYrBlt'].isna()|
                              test['GarageFinish'].isna()|
                              test['GarageQual'].isna()|
                              test['GarageCond'].isna())
).sum()

2

In [34]:
((test['GarageCars'] != 0) & (test['GarageArea'] == 0)).sum()

0

In [35]:
train.fillna({'GarageType':'NA', 'GarageYrBlt':2100,
             'GarageFinish': 'NA', 'GarageQual': 'NA',
             'GarageCond': 'NA'}, inplace=True)

test.fillna({'GarageType':'NA', 'GarageYrBlt':2100,
             'GarageFinish': 'NA', 'GarageQual': 'NA',
             'GarageCond': 'NA'}, inplace=True)

print('For GarageType')
oh = OneHotEncoder(sparse_output=False)
print(oh.fit_transform(np.expand_dims(list(train['GarageType'].unique())+list(test['GarageType'].unique()),-1)))
print(train['GarageType'].unique())
encoder_dict['GarageType'] = ('onehot',oh)

print('For GarageFinish')
ord = OrdinalEncoder(categories=[['NA', 'Unf', 'RFn', 'Fin']])
print(ord.fit_transform(np.expand_dims(train['GarageFinish'].unique(),-1)))
print(train['GarageFinish'].unique())
encoder_dict['GarageFinish'] = ('ordinal',ord)

print('For GarageQual')
ord = OrdinalEncoder(categories=[['NA', 'Po', 'Fa', 'TA', 
                                 'Gd', 'Ex']])
print(ord.fit_transform(np.expand_dims(train['GarageQual'].unique(),-1)))
print(train['GarageQual'].unique())
encoder_dict['GarageQual'] = ('ordinal',ord)

print('For GarageCond')
ord = OrdinalEncoder(categories=[['NA', 'Po', 'Fa', 'TA', 
                                 'Gd', 'Ex']])
print(ord.fit_transform(np.expand_dims(train['GarageCond'].unique(),-1)))
print(train['GarageCond'].unique())
encoder_dict['GarageCond'] = ('ordinal',ord)


print(f"Number of cumu zeros: {(train['GarageType'].isna()|train['GarageYrBlt'].isna()|train['GarageFinish'].isna()|train['GarageQual'].isna()|train['GarageCond'].isna()).sum()}")

For GarageType
[[0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0.]]
['Attchd' 'Detchd' 'BuiltIn' 'CarPort' 'NA' 'Basment' '2Types']
For GarageFinish
[[2.]
 [1.]
 [3.]
 [0.]]
['RFn' 'Unf' 'Fin' 'NA']
For GarageQual
[[3.]
 [2.]
 [4.]
 [0.]
 [5.]
 [1.]]
['TA' 'Fa' 'Gd' 'NA' 'Ex' 'Po']
For GarageCond
[[3.]
 [2.]
 [0.]
 [4.]
 [1.]
 [5.]]
['TA' 'Fa' 'NA' 'Gd' 'Po' 'Ex']
Number of cumu zeros: 0


Fence

In [36]:
train.fillna({'Fence':'NA'}, inplace=True)
test.fillna({'Fence':'NA'}, inplace=True)
ord = OrdinalEncoder(categories=[['NA', 'MnWw', 'GdWo', 
                                 'MnPrv', 'GdPrv']])
print(ord.fit_transform(np.expand_dims(np.array(train['Fence'].unique()), -1)))
print(train['Fence'].unique())

encoder_dict['Fence'] = ('ordinal', ord)
train['Fence'].isna().sum(), test['Fence'].isna().sum()

[[0.]
 [3.]
 [2.]
 [4.]
 [1.]]
['NA' 'MnPrv' 'GdWo' 'GdPrv' 'MnWw']


(0, 0)

In [37]:
test_cols_with_null = []

for col in test_columns:
    null_num = test[col].isna().sum()
    if null_num>0:
        test_cols_with_null.append(null_num)
        print(f'Column {col} -- Null values {null_num}')

Column MSZoning -- Null values 4
Column LotFrontage -- Null values 227
Column Utilities -- Null values 2
Column Exterior1st -- Null values 1
Column Exterior2nd -- Null values 1
Column MasVnrArea -- Null values 15
Column BsmtFinSF1 -- Null values 1
Column BsmtFinSF2 -- Null values 1
Column BsmtUnfSF -- Null values 1
Column TotalBsmtSF -- Null values 1
Column BsmtFullBath -- Null values 2
Column BsmtHalfBath -- Null values 2
Column KitchenQual -- Null values 1
Column Functional -- Null values 2
Column GarageCars -- Null values 1
Column GarageArea -- Null values 1
Column SaleType -- Null values 1


In [38]:
train['LotFrontage'][0]

65.0

Setting column to data type dictionary for all columns

In [39]:
col_to_dtype = {}
# All non_category columns will find an entry in the above dictionary
col_to_dtype['LotFrontage'] = 'float'
col_to_dtype['LotArea'] = 'float'

col_to_dtype['MasVnrArea'] = 'float'
col_to_dtype['BsmtFinSF1'] = 'float'
col_to_dtype['BsmtFinSF2'] = 'float'
col_to_dtype['BsmtUnfSF'] = 'float'
col_to_dtype['TotalBsmtSF'] = 'float'

col_to_dtype['1stFlrSF'] = 'float'
col_to_dtype['2ndFlrSF'] = 'float'
col_to_dtype['LowQualFinSF'] = 'float'
col_to_dtype['GrLivArea'] = 'float'
col_to_dtype['BsmtFullBath'] = 'int'
col_to_dtype['BsmtHalfBath'] = 'int'
col_to_dtype['FullBath'] = 'int'
col_to_dtype['HalfBath'] = 'int'
col_to_dtype['Bedroom'] = 'int'
col_to_dtype['BedroomAbvGr'] = 'int'
col_to_dtype['Kitchen'] = 'int'
col_to_dtype['KitchenAbvGr'] = 'int'

col_to_dtype['TotRmsAbvGrd'] = 'int'
col_to_dtype['Fireplaces'] = 'int'
col_to_dtype['GarageYrBlt'] = 'int'
col_to_dtype['GarageCars'] = 'int'
col_to_dtype['GarageArea'] = 'float'

col_to_dtype['WoodDeckSF'] = 'float'
col_to_dtype['OpenPorchSF'] = 'float'
col_to_dtype['EnclosedPorch'] = 'float'
col_to_dtype['3SsnPorch'] = 'float'
col_to_dtype['ScreenPorch'] = 'float'
col_to_dtype['PoolArea'] = 'float'

col_to_dtype['MiscVal'] = 'int'
col_to_dtype['MoSold'] = 'int'
col_to_dtype['YrSold'] = 'int'
col_to_dtype['YearBuilt'] = 'int'
col_to_dtype['YearRemodAdd'] = 'int'

col_to_dtype['OverallQual'] = 'int'
col_to_dtype['OverallCond'] = 'int'

In [40]:
year_or_month_features = ['GarageYrBlt', 'MoSold', 'YrSold', 
                         'YearBuilt', 'YearRemodAdd']

Column LotFrontage -- Null values 259
Column Alley -- Null values 1369
Column MasVnrType -- Null values 872
Column MasVnrArea -- Null values 8
Column BsmtQual -- Null values 37
Column BsmtCond -- Null values 37
Column BsmtExposure -- Null values 38
Column BsmtFinType1 -- Null values 37
Column BsmtFinType2 -- Null values 38
Column Electrical -- Null values 1
Column FireplaceQu -- Null values 690
Column GarageType -- Null values 81
Column GarageYrBlt -- Null values 81
Column GarageFinish -- Null values 81
Column GarageQual -- Null values 81
Column GarageCond -- Null values 81
Column PoolQC -- Null values 1453
Column Fence -- Null values 1179
Column MiscFeature -- Null values 1406

Column MSZoning -- Null values 4
Column LotFrontage -- Null values 227
Column Alley -- Null values 1352
Column Utilities -- Null values 2
Column Exterior1st -- Null values 1
Column Exterior2nd -- Null values 1
Column MasVnrType -- Null values 894
Column MasVnrArea -- Null values 15
Column BsmtQual -- Null values 44
Column BsmtCond -- Null values 45
Column BsmtExposure -- Null values 44
Column BsmtFinType1 -- Null values 42
Column BsmtFinSF1 -- Null values 1
Column BsmtFinType2 -- Null values 42
Column BsmtFinSF2 -- Null values 1
Column BsmtUnfSF -- Null values 1
Column TotalBsmtSF -- Null values 1
Column BsmtFullBath -- Null values 2
Column BsmtHalfBath -- Null values 2
Column KitchenQual -- Null values 1
Column Functional -- Null values 2
Column FireplaceQu -- Null values 730
Column GarageType -- Null values 76
Column GarageYrBlt -- Null values 78
Column GarageFinish -- Null values 78
Column GarageCars -- Null values 1
Column GarageArea -- Null values 1
Column GarageQual -- Null values 78
Column GarageCond -- Null values 78
Column PoolQC -- Null values 1456
Column Fence -- Null values 1169
Column MiscFeature -- Null values 1408
Column SaleType -- Null values 1

In [41]:
for col in train.columns:
    if(train[col].isna().sum()>0):
        print(f"{col} --{train[col].isna().sum()}")

LotFrontage --259
MasVnrArea --8


In [42]:
for col in test.columns:
    if(test[col].isna().sum()>0):
        print(f"{col}--{test[col].isna().sum()}")

MSZoning--4
LotFrontage--227
Utilities--2
Exterior1st--1
Exterior2nd--1
MasVnrArea--15
BsmtFinSF1--1
BsmtFinSF2--1
BsmtUnfSF--1
TotalBsmtSF--1
BsmtFullBath--2
BsmtHalfBath--2
KitchenQual--1
Functional--2
GarageCars--1
GarageArea--1
SaleType--1


Filling the missing values in columns with few missing entries to some values

In [43]:
test.fillna({'MSZoning':'A', 'Utilities': 'ELO', 'Exterior1st': 'WdShing',
            'Exterior2nd': 'WdShing', 'BsmtFinSF1':0, 'BsmtFinSF2': 0,
            'BsmtUnfSF':0, 'TotalBsmtSF':0, 'BsmtFullBath': 0, 'BsmtHalfBath': 0,
            'KitchenQual': 'TA', 'Functional': 'Mod', 'GarageCars': 0,
            'GarageArea': 0, 'SaleType': 'Con'}, inplace=True)

In [44]:
for col in test.columns:
    if(test[col].isna().sum()>0):
        print(f"{col}--{test[col].isna().sum()}")

LotFrontage--227
MasVnrArea--15


MSZoning

In [45]:
ord = OrdinalEncoder(categories=[['A','C (all)','FV','I','RH','RL','RP','RM']])
print(ord.fit_transform(np.expand_dims(np.array(train['MSZoning'].unique()), -1)))
print(train['MSZoning'].unique())

encoder_dict['MSZoning'] = ('ordinal', ord)

[[5.]
 [7.]
 [1.]
 [2.]
 [4.]]
['RL' 'RM' 'C (all)' 'FV' 'RH']


Street

In [46]:
ord = OrdinalEncoder(categories=[['Grvl', 'Pave']])
print(ord.fit_transform(np.expand_dims(np.array(train['Street'].unique()), -1)))
print(train['Street'].unique())

encoder_dict['Street'] = ('ordinal', ord)

[[1.]
 [0.]]
['Pave' 'Grvl']


LandContour

In [47]:
ord = OrdinalEncoder(categories=[['Low', 'HLS', 'Bnk', 'Lvl']])
print(ord.fit_transform(np.expand_dims(np.array(train['LandContour'].unique()), -1)))
print(train['LandContour'].unique())

encoder_dict['LandContour'] = ('ordinal', ord)

[[3.]
 [2.]
 [0.]
 [1.]]
['Lvl' 'Bnk' 'Low' 'HLS']


Utilities

In [48]:
ord = OrdinalEncoder(categories=[['ELO', 'NoSeWa', 'NoSewr', 'AllPub']])
print(ord.fit_transform(np.expand_dims(np.array(train['Utilities'].unique()), -1)))
print(train['Utilities'].unique())

encoder_dict['Utilities'] = ('ordinal', ord)

[[3.]
 [1.]]
['AllPub' 'NoSeWa']


LotConfig

In [49]:
oh = OneHotEncoder(sparse_output=False)
print(oh.fit_transform(np.expand_dims(np.array(train['LotConfig'].unique()), -1)))
print(train['LotConfig'].unique())

encoder_dict['LotConfig'] = ('onehot', oh)

[[0. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0.]
 [1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0.]]
['Inside' 'FR2' 'Corner' 'CulDSac' 'FR3']


LandSlope

In [50]:
ord = OrdinalEncoder(categories=[['Gtl', 'Mod', 'Sev']])
print(ord.fit_transform(np.expand_dims(np.array(train['LandSlope'].unique()), -1)))
print(train['LandSlope'].unique())

encoder_dict['LandSlope'] = ('ordinal', ord)

[[0.]
 [1.]
 [2.]]
['Gtl' 'Mod' 'Sev']


Neighborhood

In [51]:
oh = OneHotEncoder(sparse_output=False)
print(oh.fit_transform(np.expand_dims(np.array(list(train['Neighborhood'].unique())+list(test['Neighborhood'].unique())), -1)))
print(train['Neighborhood'].unique())

encoder_dict['Neighborhood'] = ('onehot', oh)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]]
['CollgCr' 'Veenker' 'Crawfor' 'NoRidge' 'Mitchel' 'Somerst' 'NWAmes'
 'OldTown' 'BrkSide' 'Sawyer' 'NridgHt' 'NAmes' 'SawyerW' 'IDOTRR'
 'MeadowV' 'Edwards' 'Timber' 'Gilbert' 'StoneBr' 'ClearCr' 'NPkVill'
 'Blmngtn' 'BrDale' 'SWISU' 'Blueste']


Condition1

In [52]:
oh = OneHotEncoder(sparse_output=False)
print(oh.fit_transform(np.expand_dims(np.array(list(train['Condition1'].unique())+list(test['Condition1'].unique())), -1)))
print(train['Condition1'].unique())

encoder_dict['Condition1'] = ('onehot', oh)

[[0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0.]]
['Norm' 'Feedr' 'PosN' 'Artery' 'RRAe' 'RRNn' 'RRAn' 'PosA' 'RRNe']


Condition2

In [53]:
oh = OneHotEncoder(sparse_output=False)
print(oh.fit_transform(np.expand_dims(np.array(list(train['Condition2'].unique())+list(test['Condition2'].unique())), -1)))
print(train['Condition2'].unique())

encoder_dict['Condition2'] = ('onehot', oh)

[[0. 0. 1. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0.]]
['Norm' 'Artery' 'RRNn' 'Feedr' 'PosN' 'PosA' 'RRAn' 'RRAe']


BldgType

In [54]:
ord = OrdinalEncoder(categories=[['1Fam', '2fmCon', 'Duplex', 'TwnhsE', 'Twnhs']])
print(ord.fit_transform(np.expand_dims(np.array(train['BldgType'].unique()), -1)))
print(train['BldgType'].unique())

encoder_dict['BldgType'] = ('ordinal', ord)

[[0.]
 [1.]
 [2.]
 [3.]
 [4.]]
['1Fam' '2fmCon' 'Duplex' 'TwnhsE' 'Twnhs']


HouseStyle

In [55]:
ord = OrdinalEncoder(categories=[['1Story', '1.5Unf', '1.5Fin', '2Story', 'SFoyer', 
                                 '2.5Unf', '2.5Fin', 'SLvl']])
print(ord.fit_transform(np.expand_dims(np.array(train['HouseStyle'].unique()), -1)))
print(train['HouseStyle'].unique())

encoder_dict['HouseStyle'] = ('ordinal', ord)

[[3.]
 [0.]
 [2.]
 [1.]
 [4.]
 [7.]
 [5.]
 [6.]]
['2Story' '1Story' '1.5Fin' '1.5Unf' 'SFoyer' 'SLvl' '2.5Unf' '2.5Fin']


RoofMatl

In [56]:
oh = OneHotEncoder(sparse_output=False)
print(oh.fit_transform(np.expand_dims(np.array(list(train['RoofMatl'].unique())+list(test['RoofMatl'].unique())), -1)))
print(train['RoofMatl'].unique())

encoder_dict['RoofMatl'] = ('onehot', oh)

[[0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1.]]
['CompShg' 'WdShngl' 'Metal' 'WdShake' 'Membran' 'Tar&Grv' 'Roll'
 'ClyTile']


Exterior1st

In [57]:
oh = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
print(oh.fit_transform(np.expand_dims(np.array(list(train['Exterior1st'].unique())+list(test['Exterior1st'].unique())), -1)))
print(train['Exterior1st'].unique())

encoder_dict['Exterior1st'] = ('onehot', oh)

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.

Exterior2nd

In [58]:
oh = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
print(oh.fit_transform(np.expand_dims(np.array(list(train['Exterior2nd'].unique())+list(test['Exterior2nd'].unique())), -1)))
print(train['Exterior2nd'].unique())

encoder_dict['Exterior2nd'] = ('onehot', oh)

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0.

In [59]:
n = encoder_dict['Exterior2nd'][1].transform(np.expand_dims(np.array(test['Exterior2nd'].unique()),-1))
print(n.shape)
ns = np.sum(n, axis=1)
print(ns, ns.shape)

(16, 17)
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.] (16,)


In [60]:
n = encoder_dict['Exterior1st'][1].transform(np.expand_dims(np.array(test['Exterior1st'].unique()),-1))
print(n.shape)
ns = np.sum(n, axis=1)
print(ns, ns.shape)

(13, 15)
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.] (13,)


ExterQual

In [61]:
ord = OrdinalEncoder(categories=[['Po', 'Fa', 'TA', 'Gd', 'Ex']])
print(ord.fit_transform(np.expand_dims(np.array(train['ExterQual'].unique()), -1)))
print(train['ExterQual'].unique())

encoder_dict['ExterQual'] = ('ordinal', ord)

[[3.]
 [2.]
 [4.]
 [1.]]
['Gd' 'TA' 'Ex' 'Fa']


ExterCond

In [62]:
ord = OrdinalEncoder(categories=[['Po', 'Fa', 'TA', 'Gd', 'Ex']])
print(ord.fit_transform(np.expand_dims(np.array(train['ExterCond'].unique()), -1)))
print(train['ExterCond'].unique())

encoder_dict['ExterCond'] = ('ordinal', ord)

[[2.]
 [3.]
 [1.]
 [0.]
 [4.]]
['TA' 'Gd' 'Fa' 'Po' 'Ex']


Foundation

In [63]:
oh = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
print(oh.fit_transform(np.expand_dims(np.array(train['Foundation'].unique()), -1)))
print(train['Foundation'].unique())

encoder_dict['Foundation'] = ('onehot', oh)

[[0. 0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1. 0.]]
['PConc' 'CBlock' 'BrkTil' 'Wood' 'Slab' 'Stone']


HeatingQC

In [64]:
ord = OrdinalEncoder(categories=[['Po', 'Fa', 'TA', 'Gd', 'Ex']])
print(ord.fit_transform(np.expand_dims(np.array(train['HeatingQC'].unique()), -1)))
print(train['HeatingQC'].unique())

encoder_dict['HeatingQC'] = ('ordinal', ord)

[[4.]
 [3.]
 [2.]
 [1.]
 [0.]]
['Ex' 'Gd' 'TA' 'Fa' 'Po']


LotShape

In [65]:
ord = OrdinalEncoder(categories=[['IR3', 'IR2', 'IR1', 'Reg']])
print(ord.fit_transform(np.expand_dims(np.array(train['LotShape'].unique()), -1)))
print(train['LotShape'].unique())

encoder_dict['LotShape'] = ('ordinal', ord)

[[3.]
 [2.]
 [1.]
 [0.]]
['Reg' 'IR1' 'IR2' 'IR3']


SaleType

In [66]:
oh = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
print(oh.fit_transform(np.expand_dims(np.array(list(train['SaleType'].unique())+list(test['SaleType'].unique())), -1)))
print(train['SaleType'].unique())

encoder_dict['SaleType'] = ('onehot', oh)

[[0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0.]]
['WD' 'New' 'COD' 'ConLD' 'ConLI' 'CWD' 'ConLw' 'Con' 'Oth']


RoofStyle

In [67]:
ord = OrdinalEncoder(categories=[['Shed', 'Mansard', 'Hip', 'Gambrel', 
                                 'Gable', 'Flat']])
print(ord.fit_transform(np.expand_dims(np.array(train['RoofStyle'].unique()), -1)))
print(train['RoofStyle'].unique())

encoder_dict['RoofStyle'] = ('ordinal', ord)

[[4.]
 [2.]
 [3.]
 [1.]
 [5.]
 [0.]]
['Gable' 'Hip' 'Gambrel' 'Mansard' 'Flat' 'Shed']


PavedDrive

In [68]:
ord = OrdinalEncoder(categories=[['N','P','Y']])
print(ord.fit_transform(np.expand_dims(np.array(train['PavedDrive'].unique()), -1)))
print(train['PavedDrive'].unique())

encoder_dict['PavedDrive'] = ('ordinal', ord)

[[2.]
 [0.]
 [1.]]
['Y' 'N' 'P']


KitchenQual

In [69]:
ord = OrdinalEncoder(categories=[['Po', 'Fa', 'TA', 'Gd', 'Ex']])
print(ord.fit_transform(np.expand_dims(np.array(train['KitchenQual'].unique()), -1)))
print(train['KitchenQual'].unique())

encoder_dict['KitchenQual'] = ('ordinal', ord)

[[3.]
 [2.]
 [4.]
 [1.]]
['Gd' 'TA' 'Ex' 'Fa']


SaleCondition

In [70]:
oh = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
print(oh.fit_transform(np.expand_dims(np.array(train['SaleCondition'].unique()), -1)))
print(train['SaleCondition'].unique())

encoder_dict['SaleCondition'] = ('onehot', oh)

[[0. 0. 0. 0. 1. 0.]
 [1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0.]]
['Normal' 'Abnorml' 'Partial' 'AdjLand' 'Alloca' 'Family']


Heating

In [71]:
heat_price_order = ['Grav', 'Wall', 'Floor', 
                    'GasA', 'OthW', 'GasW'] # Lowest to highest pricing

ord = OrdinalEncoder(categories=[heat_price_order])
print(ord.fit_transform(np.expand_dims(np.array(train['Heating'].unique()), -1)))
print(train['Heating'].unique())

encoder_dict['Heating'] = ('ordinal', ord)

[[3.]
 [5.]
 [0.]
 [1.]
 [4.]
 [2.]]
['GasA' 'GasW' 'Grav' 'Wall' 'OthW' 'Floor']


CentralAir

In [72]:
ord = OrdinalEncoder(categories=[['N','Y']])
print(ord.fit_transform(np.expand_dims(np.array(train['CentralAir'].unique()), -1)))
print(train['CentralAir'].unique())

encoder_dict['CentralAir'] = ('ordinal', ord)

[[1.]
 [0.]]
['Y' 'N']


Functional

In [73]:
ord = OrdinalEncoder(categories=[['Typ','Min1', 'Min2', 'Mod', 
                                 'Maj1', 'Maj2', 'Sev', 'Sal']])
print(ord.fit_transform(np.expand_dims(np.array(train['Functional'].unique()), -1)))
print(train['Functional'].unique())

encoder_dict['Functional'] = ('ordinal', ord)

[[0.]
 [1.]
 [4.]
 [2.]
 [3.]
 [5.]
 [6.]]
['Typ' 'Min1' 'Maj1' 'Min2' 'Mod' 'Maj2' 'Sev']


In [74]:
l2 = list(set(train.columns)-(set(encoder_dict.keys()).union(set(col_to_dtype.keys()))))
len(l2)

1

In [75]:
l2

['MSSubClass']

### Adding Column Transformer for all the columns Handled above

In [76]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

transformers = []
for col in train.columns:
    steps = []
    val = encoder_dict.get(col, None)
    if val:
        steps.append(val)
    val = col_to_dtype.get(col, None)
    if val and val=='float':
        steps.append(('standardScale', StandardScaler()))
    if steps:   
        transformers.append((col, Pipeline(steps=steps), [col]))
        
preprocessor = ColumnTransformer(transformers=transformers, remainder='passthrough') # This will keep the remaining columns as they are

In [77]:
preprocessor

In [78]:
len(train.columns), len(test.columns)

(77, 77)

In [79]:
X_train = preprocessor.fit_transform(train)
X_transformer = preprocessor.transform(train)
X_test = preprocessor.transform(test)
X_train.shape, X_test.shape

((1460, 183), (1459, 183))

### Imputing the missing values

In [80]:
train_na_cols = []
for col in train.columns:
    if train[col].isna().sum()>0:
        train_na_cols.append(col)

test_na_cols = []
for col in test.columns:
    if test[col].isna().sum()>0:
        test_na_cols.append(col)

print(train_na_cols, test_na_cols) # Printing the column names containing na

['LotFrontage', 'MasVnrArea'] ['LotFrontage', 'MasVnrArea']


In [81]:
# Verifying if only 2 columns have na. The 2 columns above are floating point columns. So,
# no encoding. Only Standard Scaling
nan_Xtrain = np.isnan(X_train).sum(axis=0)
nan_Xtest = np.isnan(X_test).sum(axis=0)
print(nan_Xtrain.shape, nan_Xtest.shape)

(183,) (183,)


In [82]:
nan_columns_train = np.nonzero(nan_Xtrain)
nan_columns_test = np.nonzero(nan_Xtest)
print(nan_columns_train, nan_columns_test) #Columns with nan entries

(array([  1, 102]),) (array([  1, 102]),)


The above missing columns are to be imputed

In [83]:
from sklearn.experimental import enable_iterative_imputer  # To enable IterativeImputer
from sklearn.impute import IterativeImputer

imputer = IterativeImputer(max_iter=10, random_state=0)
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)


Ensuring that there is no column with nan values

In [84]:
nan_Xtrain_imputed = np.nonzero(np.isnan(X_train_imputed).sum(axis=0))
nan_Xtest_imputed = np.nonzero(np.isnan(X_test_imputed).sum(axis=0))
print(nan_Xtrain_imputed,nan_Xtest_imputed)

(array([], dtype=int64),) (array([], dtype=int64),)


In [85]:
y_train = np.log(y_train)

# Fitting train data using different models

## LASSO regression (without any further feature selection)

In [86]:
from sklearn.linear_model import Lasso, LassoCV

In [87]:
lasso_cv = LassoCV(cv=5)
# Fit the model
lasso_cv.fit(X_train_imputed, y_train)

# The best alpha found by cross-validation
print("Best alpha selected by cross-validation:", lasso_cv.alpha_)

Best alpha selected by cross-validation: 0.007071892022003756


In [88]:
model = Lasso(alpha=0.01)
model.fit(X_train_imputed, y_train)
y_test = model.predict(X_test_imputed)
y_test = np.exp(y_test)

In [89]:
out_dframe = pd.DataFrame({'Id': test_ids, 'SalePrice':y_test})
out_dframe.set_index('Id', inplace=True)
print(f"Number of rows -- {len(out_dframe)}")
print(f"Columns are -- {list(out_dframe.columns)}")

Number of rows -- 1459
Columns are -- ['SalePrice']


In [90]:
out_dframe.to_csv('/kaggle/working/Lasso_predict.csv')

Lasso Regression produces a Root-Mean-Squared-Error (RMSE) on log vales of 0.14882

In [91]:
from sklearn.model_selection import GridSearchCV

## Regression Trees

In [92]:
from sklearn.tree import DecisionTreeRegressor
# Initialize the regressor
regressor = DecisionTreeRegressor(random_state=42)

# Define the parameter grid
param_grid = {
    'max_depth': [3, 5, 10, 20, 40, None],
    'min_samples_split': [2, 5, 10, 20, 40],
    'min_samples_leaf': [1, 2, 4, 8, 16]
}

# Set up the grid search
grid_search = GridSearchCV(estimator=regressor, param_grid=param_grid, 
                           cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

In [93]:
# Fit the grid search
grid_search.fit(X_train_imputed, y_train)

# Print the best parameters and score
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-validated MSE: {-grid_search.best_score_}")

Best Parameters: {'max_depth': 10, 'min_samples_leaf': 8, 'min_samples_split': 2}
Best Cross-validated MSE: 0.034592451393137824


In [94]:
model = DecisionTreeRegressor(max_depth=10, min_samples_leaf=8, min_samples_split=2, random_state=42)
model.fit(X_train_imputed, y_train)
y_test = model.predict(X_test_imputed)
y_test = np.exp(y_test)

In [95]:
out_dframe = pd.DataFrame({'Id': test_ids, 'SalePrice':y_test})
out_dframe.set_index('Id', inplace=True)
print(f"Number of rows -- {len(out_dframe)}")
print(f"Columns are -- {list(out_dframe.columns)}")

Number of rows -- 1459
Columns are -- ['SalePrice']


In [96]:
out_dframe.to_csv('/kaggle/working/DescisionTreeRegressor_predict.csv')

## RandomForest Regressor

In [102]:
from sklearn.ensemble import RandomForestRegressor

# Initialize the regressor
regressor = RandomForestRegressor(random_state=42)

# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 10, 20, 40, None],
    'min_samples_split': [2, 5, 10, 20, 40],
    'min_samples_leaf': [1, 2, 4, 8, 16],
}

# Set up the grid search
grid_search = GridSearchCV(estimator=regressor, param_grid=param_grid, 
                           cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

In [103]:
# Fit the grid search
grid_search.fit(X_train_imputed, y_train)

# Print the best parameters and score
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-validated MSE: {-grid_search.best_score_}")

Best Parameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best Cross-validated MSE: 0.02012876046349472


In [104]:
model = RandomForestRegressor(random_state=42, **(grid_search.best_params_))
model.fit(X_train_imputed, y_train)
y_test = model.predict(X_test_imputed)
y_test = np.exp(y_test)

In [105]:
out_dframe = pd.DataFrame({'Id': test_ids, 'SalePrice':y_test})
out_dframe.set_index('Id', inplace=True)
print(f"Number of rows -- {len(out_dframe)}")
print(f"Columns are -- {list(out_dframe.columns)}")

Number of rows -- 1459
Columns are -- ['SalePrice']


In [106]:
out_dframe.to_csv('/kaggle/working/RandomForestRegressor_predict.csv')

## Gradient Boosting Algorithm