# Data Cleaning for House Price Prediction

Kaggle contest link: https://www.kaggle.com/c/house-prices-advanced-regression-techniques

## 1) Importing the libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

  import pandas.util.testing as tm


## 2) Loading train and test set

In [0]:
# Directory holding the Kaggle CSV files. It is kept in one place so the
# location only has to be changed once when the notebook runs elsewhere.
# NOTE(review): the path looks like a mounted Google Drive folder — confirm
# the mount exists before running this cell.
DATA_DIR = 'drive/My Drive/Pytorch_DataSet/house-prices'

# Read the training and the test split into separate frames.
df_train = pd.read_csv(f'{DATA_DIR}/train.csv')
df_test = pd.read_csv(f'{DATA_DIR}/test.csv')

In [3]:
# Number of rows in the train and in the test frame.
len(df_train), len(df_test)

(1460, 1459)

## 3) Data Cleaning

In [4]:
"""
Keep only the columns whose values actually vary: a column is skipped when a single value
accounts for more than 85% of the rows in both the train and the test set.
"""
# Share of rows above which one single value is considered to dominate a column.
DOMINANT_VALUE_SHARE = 0.85

# Names of the feature columns that survive the check below.
cols = []

for col in df_train.columns:

  # The target only exists in the train frame, so it is never a candidate.
  if col == 'SalePrice':
    continue

  train_col = df_train[col].value_counts()
  test_col  = df_test[col].value_counts()

  # Series.max() instead of the builtin max(): value_counts() leaves out NaN,
  # so a column holding only NaN gives an empty Series and max() would raise.
  # Series.max() is expected to give NaN there, which fails both comparisons
  # below, so such a column is kept at this stage.
  max_train_col = train_col.max()
  max_test_col = test_col.max()

  print(train_col)
  print("Max value count = ",max_train_col)
  print('\n<------------------x----------------->\n')

  dominated_in_train = max_train_col > DOMINANT_VALUE_SHARE*len(df_train)
  dominated_in_test = max_test_col > DOMINANT_VALUE_SHARE*len(df_test)

  # A column is skipped only when one value dominates it in BOTH sets.
  if not (dominated_in_train and dominated_in_test):
    cols.append(col)

1460    1
479     1
481     1
482     1
483     1
       ..
976     1
977     1
978     1
979     1
1       1
Name: Id, Length: 1460, dtype: int64
Max value count =  1

<------------------x----------------->

20     536
60     299
50     144
120     87
30      69
160     63
70      60
80      58
90      52
190     30
85      20
75      16
45      12
180     10
40       4
Name: MSSubClass, dtype: int64
Max value count =  536

<------------------x----------------->

RL         1151
RM          218
FV           65
RH           16
C (all)      10
Name: MSZoning, dtype: int64
Max value count =  1151

<------------------x----------------->

60.0     143
70.0      70
80.0      69
50.0      57
75.0      53
        ... 
106.0      1
38.0       1
138.0      1
140.0      1
137.0      1
Name: LotFrontage, Length: 110, dtype: int64
Max value count =  143

<------------------x----------------->

7200     25
9600     24
6000     17
10800    14
9000     14
         ..
7094      1
6130      1
9337     

In [5]:
# Number of columns collected in `cols` so far.
len(cols)

55

In [0]:
# Take a column out of `cols` when more than 80% of its values are missing
# in the train set and also in the test set.

# Null counts per column, computed once for each frame.
train_null_counts = df_train.isnull().sum()
test_null_counts = df_test.isnull().sum()

train_limit = 0.8*len(df_train)
test_limit = 0.8*len(df_test)

# Walk over a copy so that entries can be removed from `cols` on the way.
for name in list(cols):
  if train_null_counts[name] > train_limit and test_null_counts[name] > test_limit:
    cols.remove(name)

In [7]:
# Number of columns still in `cols` after the mostly-null ones were removed.
len(cols)

51

In [8]:
# Show the names of the columns currently selected in `cols`.
print(cols)

['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'LotShape', 'LotConfig', 'Neighborhood', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual', 'Foundation', 'BsmtQual', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtUnfSF', 'TotalBsmtSF', 'HeatingQC', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'BsmtFullBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', 'MoSold', 'YrSold', 'SaleCondition']


In [10]:
# Preview the first rows of the training frame.
df_train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,...,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,...,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,...,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,...,Y,SBrkr,920,866,0,1786,1,0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216,Unf,0,540,756,GasA,...,Y,SBrkr,961,756,0,1717,1,0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,Unf,3,642,TA,TA,Y,0,35,272,0,0,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655,Unf,0,490,1145,GasA,...,Y,SBrkr,1145,1053,0,2198,1,0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,RFn,3,836,TA,TA,Y,192,84,0,0,0,0,,,,0,12,2008,WD,Normal,250000


In [9]:
# Null report: for the train and for the test set, list every selected column
# that has missing values together with its null count and its dtype.

report_targets = [(" For Training Set \n", df_train), (" For Test Set \n", df_test)]

for position, (title, frame) in enumerate(report_targets):

  # Separator between the two reports.
  if position > 0:
    print("\n<-------------------x---------------->\n")

  # Count the nulls once per frame and keep only the affected columns.
  null_counts = frame[cols].isnull().sum()
  null_counts = null_counts[null_counts > 0]

  # Pad the name column to the longest name. A fixed width of 10 is shorter
  # than several column names and pushed their rows out of alignment.
  name_width = max([len("Column Name")] + [len(name) for name in null_counts.index])

  print(title)
  print(f"{'Column Name':<{name_width}}  {'Null Count':>10}  Col Type\n")
  for col, null_count in null_counts.items():
    print(f"{col:<{name_width}}  {null_count:>10}  {frame[col].dtype}")


 For Training Set 

Column Name     Null Count    Col Type

LotFrontage         259          float64
MasVnrType           8          object
MasVnrArea           8          float64
  BsmtQual          37          object
BsmtExposure          38          object
BsmtFinType1          37          object
BsmtFinType2          38          object
FireplaceQu         690          object
GarageType          81          object
GarageYrBlt          81          float64
GarageFinish          81          object

<-------------------x---------------->

 For Test Set 

Column Name     Null Count    Col Type

  MSZoning           4          object
LotFrontage         227          float64
Exterior1st           1          object
Exterior2nd           1          object
MasVnrType          16          object
MasVnrArea          15          float64
  BsmtQual          44          object
BsmtExposure          44          object
BsmtFinType1          42          object
BsmtFinSF1           1          float64


In [11]:
# Print the value distribution of every selected training column that has nulls.
train_cols_with_nulls = [name for name in cols if df_train[name].isnull().any()]

for name in train_cols_with_nulls:
  print(df_train[name].value_counts())
  print('\n<------------------x----------------->\n')


60.0     143
70.0      70
80.0      69
50.0      57
75.0      53
        ... 
106.0      1
38.0       1
138.0      1
140.0      1
137.0      1
Name: LotFrontage, Length: 110, dtype: int64

<------------------x----------------->

None       864
BrkFace    445
Stone      128
BrkCmn      15
Name: MasVnrType, dtype: int64

<------------------x----------------->

0.0      861
72.0       8
180.0      8
108.0      8
120.0      7
        ... 
651.0      1
337.0      1
415.0      1
293.0      1
621.0      1
Name: MasVnrArea, Length: 327, dtype: int64

<------------------x----------------->

TA    649
Gd    618
Ex    121
Fa     35
Name: BsmtQual, dtype: int64

<------------------x----------------->

No    953
Av    221
Gd    134
Mn    114
Name: BsmtExposure, dtype: int64

<------------------x----------------->

Unf    430
GLQ    418
ALQ    220
BLQ    148
Rec    133
LwQ     74
Name: BsmtFinType1, dtype: int64

<------------------x----------------->

Unf    1256
Rec      54
LwQ      46
BLQ      33

In [12]:
# Mean and median of LotFrontage in the train set, as a reference for filling its nulls.
df_train['LotFrontage'].mean(), df_train['LotFrontage'].median()

(70.04995836802665, 69.0)

In [13]:
# Mean and median of MasVnrArea in the train set, as a reference for filling its nulls.
df_train['MasVnrArea'].mean(), df_train['MasVnrArea'].median()

(103.68526170798899, 0.0)

In [18]:
# Mean and median of GarageYrBlt in the train set, as a reference for filling its nulls.
df_train['GarageYrBlt'].mean(), df_train['GarageYrBlt'].median()

(1978.5061638868744, 1980.0)

In [0]:
# Filling up of null values

# For train set
#
# Each fill is assigned back to the frame. `df[col].fillna(..., inplace=True)`
# works on an intermediate Series: newer pandas versions warn about it, and
# with Copy-on-Write enabled it no longer updates the frame at all.
#
# Numeric columns get their own median. The categorical values below are
# constants; 'Missing' adds a separate category for FireplaceQu.
# NOTE(review): for the basement, garage and fireplace columns a null may mean
# that the house has no such feature — confirm against the data description
# before filling with an existing category such as 'TA' or 'Attchd'.
train_fill_values = {
  'LotFrontage': df_train['LotFrontage'].median(),
  'MasVnrType': 'None',
  'MasVnrArea': df_train['MasVnrArea'].median(),
  'BsmtQual': 'TA',
  'BsmtExposure': 'No',
  'BsmtFinType1': 'Unf',
  'BsmtFinType2': 'Unf',
  'FireplaceQu': 'Missing',
  'GarageType': 'Attchd',
  'GarageYrBlt': df_train['GarageYrBlt'].median(),
  'GarageFinish': 'Unf',
}

for col, fill_value in train_fill_values.items():
  df_train[col] = df_train[col].fillna(fill_value)

In [33]:
# Count the missing values in the target column SalePrice
# (0 means every training row has a price).

df_train['SalePrice'].isnull().sum()

0

In [21]:
# Re-check the selected training columns: print the value counts of any
# column that still has nulls, otherwise report that none are left.
train_null_cols = [name for name in cols if df_train[name].isnull().any()]
has_null_values = bool(train_null_cols)

for name in train_null_cols:
  print(df_train[name].value_counts())
  print('\n<------------------x----------------->\n')

if not has_null_values:
  print("No null values exist.")

No null values exist.


In [22]:
# For test file
# List every selected column of the test set that has missing values,
# together with its null count and its dtype.

# Count the nulls once and keep only the affected columns.
test_null_counts = df_test[cols].isnull().sum()
test_null_counts = test_null_counts[test_null_counts > 0]

# Pad the name column to the longest name. A fixed width of 10 is shorter
# than several column names and pushed their rows out of alignment.
name_width = max([len("Column Name")] + [len(name) for name in test_null_counts.index])

print(" For Test Set \n")

print(f"{'Column Name':<{name_width}}  {'Null Count':>10}  Col Type\n")
for col, null_count in test_null_counts.items():
  print(f"{col:<{name_width}}  {null_count:>10}  {df_test[col].dtype}")

 For Test Set 

Column Name     Null Count    Col Type

  MSZoning           4          object
LotFrontage         227          float64
Exterior1st           1          object
Exterior2nd           1          object
MasVnrType          16          object
MasVnrArea          15          float64
  BsmtQual          44          object
BsmtExposure          44          object
BsmtFinType1          42          object
BsmtFinSF1           1          float64
BsmtFinType2          42          object
 BsmtUnfSF           1          float64
TotalBsmtSF           1          float64
BsmtFullBath           2          float64
KitchenQual           1          object
FireplaceQu         730          object
GarageType          76          object
GarageYrBlt          78          float64
GarageFinish          78          object
GarageCars           1          float64
GarageArea           1          float64


In [23]:
# Print the value distribution of every selected test column that has nulls.
test_cols_with_nulls = [name for name in cols if df_test[name].isnull().any()]

for name in test_cols_with_nulls:
  print(df_test[name].value_counts())
  print('\n<------------------x----------------->\n')


RL         1114
RM          242
FV           74
C (all)      15
RH           10
Name: MSZoning, dtype: int64

<------------------x----------------->

60.0     133
80.0      68
70.0      63
50.0      60
75.0      52
        ... 
22.0       1
136.0      1
149.0      1
31.0       1
131.0      1
Name: LotFrontage, Length: 115, dtype: int64

<------------------x----------------->

VinylSd    510
MetalSd    230
HdBoard    220
Wd Sdng    205
Plywood    113
CemntBd     65
BrkFace     37
WdShing     30
AsbShng     24
Stucco      18
BrkComm      4
AsphShn      1
CBlock       1
Name: Exterior1st, dtype: int64

<------------------x----------------->

VinylSd    510
MetalSd    233
HdBoard    199
Wd Sdng    194
Plywood    128
CmentBd     66
Wd Shng     43
BrkFace     22
Stucco      21
AsbShng     18
Brk Cmn     15
ImStucc      5
CBlock       2
AsphShn      1
Stone        1
Name: Exterior2nd, dtype: int64

<------------------x----------------->

None       878
BrkFace    434
Stone      121
BrkCmn    

In [24]:
# Mean and median of BsmtFinSF1 in the test set, as a reference for filling its nulls.
df_test['BsmtFinSF1'].mean() , df_test['BsmtFinSF1'].median()

(439.2037037037037, 350.5)

In [25]:
# Mean and median of BsmtUnfSF in the test set, as a reference for filling its nulls.
df_test['BsmtUnfSF'].mean() , df_test['BsmtUnfSF'].median()

(554.2949245541838, 460.0)

In [26]:
# Mean and median of TotalBsmtSF in the test set, as a reference for filling its nulls.
df_test['TotalBsmtSF'].mean() , df_test['TotalBsmtSF'].median()

(1046.1179698216736, 988.0)

In [27]:
# Mean and median of BsmtFullBath in the test set, as a reference for filling its nulls.
df_test['BsmtFullBath'].mean() , df_test['BsmtFullBath'].median()

(0.4344543582704187, 0.0)

In [28]:
# Mean and median of GarageCars in the test set, as a reference for filling its nulls.
df_test['GarageCars'].mean() , df_test['GarageCars'].median()

(1.7661179698216736, 2.0)

In [29]:
# Mean and median of GarageArea in the test set, as a reference for filling its nulls.
df_test['GarageArea'].mean() , df_test['GarageArea'].median()

(472.76886145404666, 480.0)

In [0]:
# Filling up of null values for the test set.
#
# Each fill is assigned back to the frame. `df[col].fillna(..., inplace=True)`
# works on an intermediate Series: newer pandas versions warn about it, and
# with Copy-on-Write enabled it no longer updates the frame at all.
#
# NOTE(review): the numeric fills use medians of the test set itself, and
# BsmtFinSF1/BsmtUnfSF are set to 0 while TotalBsmtSF gets its median —
# confirm that this mix is intended.
# NOTE(review): for the basement, garage and fireplace columns a null may mean
# that the house has no such feature — confirm against the data description
# before filling with an existing category such as 'TA' or 'Attchd'.
test_fill_values = {
  'MSZoning': 'RL',
  'LotFrontage': df_test['LotFrontage'].median(),
  'Exterior1st': 'VinylSd',
  'Exterior2nd': 'VinylSd',
  'MasVnrType': 'None',
  'MasVnrArea': df_test['MasVnrArea'].median(),
  'BsmtQual': 'TA',
  'BsmtExposure': 'No',
  'BsmtFinType1': 'Unf',
  'BsmtFinSF1': 0,
  'BsmtFinType2': 'Unf',
  'BsmtUnfSF': 0,
  'TotalBsmtSF': df_test['TotalBsmtSF'].median(),
  'BsmtFullBath': df_test['BsmtFullBath'].median(),
  'KitchenQual': 'TA',
  'FireplaceQu': 'Missing',
  'GarageType': 'Attchd',
  'GarageYrBlt': df_test['GarageYrBlt'].median(),
  'GarageFinish': 'Unf',
  'GarageCars': df_test['GarageCars'].median(),
  'GarageArea': df_test['GarageArea'].median(),
}

for col, fill_value in test_fill_values.items():
  df_test[col] = df_test[col].fillna(fill_value)


In [31]:
# Re-check the selected test columns: print the value counts of any column
# that still has nulls, otherwise report that none are left.
test_null_cols = [name for name in cols if df_test[name].isnull().any()]
has_null_values = bool(test_null_cols)

for name in test_null_cols:
  print(df_test[name].value_counts())
  print('\n<------------------x----------------->\n')

if not has_null_values:
  print("No null values exist.")

No null values exist.


In [32]:
# List the column names currently present in the training frame.
df_train.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [0]:
# Dropping the columns from data sets

# Every column outside `cols` is removed from both frames. 'Id' and the
# 'SalePrice' target are never dropped.
protected = ('Id', 'SalePrice')
unwanted = [name for name in df_train.columns if name not in protected and name not in cols]

df_train.drop(unwanted,axis=1,inplace=True)
df_test.drop(unwanted,axis=1,inplace=True)
      

In [35]:
# Preview the first rows of the training frame.
df_train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,LotShape,LotConfig,Neighborhood,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,Foundation,BsmtQual,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtUnfSF,TotalBsmtSF,HeatingQC,1stFlrSF,2ndFlrSF,GrLivArea,BsmtFullBath,FullBath,HalfBath,BedroomAbvGr,KitchenQual,TotRmsAbvGrd,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,MoSold,YrSold,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Reg,Inside,CollgCr,1Fam,2Story,7,5,2003,2003,Gable,VinylSd,VinylSd,BrkFace,196.0,Gd,PConc,Gd,No,GLQ,706,Unf,150,856,Ex,856,854,1710,1,2,1,3,Gd,8,0,Missing,Attchd,2003.0,RFn,2,548,0,61,0,2,2008,Normal,208500
1,2,20,RL,80.0,9600,Reg,FR2,Veenker,1Fam,1Story,6,8,1976,1976,Gable,MetalSd,MetalSd,,0.0,TA,CBlock,Gd,Gd,ALQ,978,Unf,284,1262,Ex,1262,0,1262,0,2,0,3,TA,6,1,TA,Attchd,1976.0,RFn,2,460,298,0,0,5,2007,Normal,181500
2,3,60,RL,68.0,11250,IR1,Inside,CollgCr,1Fam,2Story,7,5,2001,2002,Gable,VinylSd,VinylSd,BrkFace,162.0,Gd,PConc,Gd,Mn,GLQ,486,Unf,434,920,Ex,920,866,1786,1,2,1,3,Gd,6,1,TA,Attchd,2001.0,RFn,2,608,0,42,0,9,2008,Normal,223500
3,4,70,RL,60.0,9550,IR1,Corner,Crawfor,1Fam,2Story,7,5,1915,1970,Gable,Wd Sdng,Wd Shng,,0.0,TA,BrkTil,TA,No,ALQ,216,Unf,540,756,Gd,961,756,1717,1,1,0,3,Gd,7,1,Gd,Detchd,1998.0,Unf,3,642,0,35,272,2,2006,Abnorml,140000
4,5,60,RL,84.0,14260,IR1,FR2,NoRidge,1Fam,2Story,8,5,2000,2000,Gable,VinylSd,VinylSd,BrkFace,350.0,Gd,PConc,Gd,Av,GLQ,655,Unf,490,1145,Ex,1145,1053,2198,1,2,1,4,Gd,9,1,TA,Attchd,2000.0,RFn,3,836,192,84,0,12,2008,Normal,250000


In [36]:
# (rows, columns) of the training frame.
df_train.shape

(1460, 52)

In [37]:
# Preview the first rows of the test frame.
df_test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,LotShape,LotConfig,Neighborhood,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,Foundation,BsmtQual,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtUnfSF,TotalBsmtSF,HeatingQC,1stFlrSF,2ndFlrSF,GrLivArea,BsmtFullBath,FullBath,HalfBath,BedroomAbvGr,KitchenQual,TotRmsAbvGrd,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,MoSold,YrSold,SaleCondition
0,1461,20,RH,80.0,11622,Reg,Inside,NAmes,1Fam,1Story,5,6,1961,1961,Gable,VinylSd,VinylSd,,0.0,TA,CBlock,TA,No,Rec,468.0,LwQ,270.0,882.0,TA,896,0,896,0.0,1,0,2,TA,5,0,Missing,Attchd,1961.0,Unf,1.0,730.0,140,0,0,6,2010,Normal
1,1462,20,RL,81.0,14267,IR1,Corner,NAmes,1Fam,1Story,6,6,1958,1958,Hip,Wd Sdng,Wd Sdng,BrkFace,108.0,TA,CBlock,TA,No,ALQ,923.0,Unf,406.0,1329.0,TA,1329,0,1329,0.0,1,1,3,Gd,6,0,Missing,Attchd,1958.0,Unf,1.0,312.0,393,36,0,6,2010,Normal
2,1463,60,RL,74.0,13830,IR1,Inside,Gilbert,1Fam,2Story,5,5,1997,1998,Gable,VinylSd,VinylSd,,0.0,TA,PConc,Gd,No,GLQ,791.0,Unf,137.0,928.0,Gd,928,701,1629,0.0,2,1,3,TA,6,1,TA,Attchd,1997.0,Fin,2.0,482.0,212,34,0,3,2010,Normal
3,1464,60,RL,78.0,9978,IR1,Inside,Gilbert,1Fam,2Story,6,6,1998,1998,Gable,VinylSd,VinylSd,BrkFace,20.0,TA,PConc,TA,No,GLQ,602.0,Unf,324.0,926.0,Ex,926,678,1604,0.0,2,1,3,Gd,7,1,Gd,Attchd,1998.0,Fin,2.0,470.0,360,36,0,6,2010,Normal
4,1465,120,RL,43.0,5005,IR1,Inside,StoneBr,TwnhsE,1Story,8,5,1992,1992,Gable,HdBoard,HdBoard,,0.0,Gd,PConc,Gd,No,ALQ,263.0,Unf,1017.0,1280.0,Ex,1280,0,1280,0.0,2,0,2,Gd,5,0,Missing,Attchd,1992.0,RFn,2.0,506.0,0,82,0,1,2010,Normal


In [38]:
# (rows, columns) of the test frame.
df_test.shape

(1459, 51)

In [0]:
# Write both cleaned frames to CSV files, without the row index.
cleaned_outputs = (
  (df_train, 'cleaned_train.csv'),  # For train
  (df_test, 'cleaned_test.csv'),    # For test
)

for frame, filename in cleaned_outputs:
  frame.to_csv(filename,index=False)
