In [129]:
import numpy as np
import pandas as pd

try:
    # Newer scikit-learn releases provide the imputer as sklearn.impute.SimpleImputer
    # (sklearn.preprocessing.Imputer no longer exists there); the alias keeps
    # the `Imputer` name used by the rest of the notebook.
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    # Older scikit-learn releases only ship the original class.
    from sklearn.preprocessing import Imputer

%matplotlib inline

# Data load

In [110]:
# Read the training CSV (path is relative to the notebook's working directory),
# report its dimensions, and preview the first three rows.
train_csv_path = '../dat/train.csv'
train_df = pd.read_csv(train_csv_path)
print(train_df.shape)
train_df.head(n=3)

(1460, 81)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500


# Missing Vals

In [91]:
def check_nulls(df):
    """Count the columns of ``df`` that contain at least one null value.

    Args:
        df: pandas DataFrame to inspect.

    Returns:
        int: number of columns holding one or more null entries
        (0 for a frame without any columns).
    """
    # ``Series.nonzero()`` was deprecated in pandas 0.24 and removed in 1.0;
    # a per-column ``any()`` followed by ``sum()`` yields the same count and
    # works across pandas versions.
    return int(df.isnull().any().sum())

In [93]:
# Count the null-bearing columns of the full training frame, then report it.
null_column_count = check_nulls(train_df)
print('Number of colomns with nulls: ', null_column_count)

Number of colomns with nulls:  19


# NOTE: We don't need to split the data into object and non-object columns and impute only the numerical ones, because we are going to transform the string values anyway. Also, filling all missing values with "None" is not always right.

## Object-type columns: filling in missing values

In [111]:
# Keep only the object-dtype columns and report the subset's dimensions.
object_dtypes = ['object']
train_df_obj = train_df.select_dtypes(include=object_dtypes)
print(train_df_obj.shape)


(1460, 43)


MSZoning         object
Street           object
Alley            object
LotShape         object
LandContour      object
Utilities        object
LotConfig        object
LandSlope        object
Neighborhood     object
Condition1       object
Condition2       object
BldgType         object
HouseStyle       object
RoofStyle        object
RoofMatl         object
Exterior1st      object
Exterior2nd      object
MasVnrType       object
ExterQual        object
ExterCond        object
Foundation       object
BsmtQual         object
BsmtCond         object
BsmtExposure     object
BsmtFinType1     object
BsmtFinType2     object
Heating          object
HeatingQC        object
CentralAir       object
Electrical       object
KitchenQual      object
Functional       object
FireplaceQu      object
GarageType       object
GarageFinish     object
GarageQual       object
GarageCond       object
PavedDrive       object
PoolQC           object
Fence            object
MiscFeature      object
SaleType        

In [117]:
# Count the null-bearing columns in the object-dtype subset, then report it.
null_column_count = check_nulls(train_df_obj)
print('Number of colomns with nulls: ', null_column_count)

Number of colomns with nulls:  0


In [115]:
# Replace every missing entry in the object-dtype subset with the literal
# string "None" (a placeholder category, not Python's None).
train_df_obj = train_df_obj.fillna(value="None")

In [116]:
# Re-check the object-dtype subset after the fill.
null_column_count = check_nulls(train_df_obj)
print('Number of colomns with nulls: ', null_column_count)

Number of colomns with nulls:  0


## Numerical missing values handling:

In [118]:
# Everything that is not object dtype goes into the numeric subset.
train_df_non_obj = train_df.select_dtypes(exclude=['object'])
# Imputer with the library's default settings; it is fitted in a later cell.
imp = Imputer()
print(train_df_non_obj.shape)

(1460, 38)


In [119]:
# Count the null-bearing columns in the non-object subset before imputing.
null_column_count = check_nulls(train_df_non_obj)
print('Number of colomns with nulls: ', null_column_count)

Number of colomns with nulls:  3


In [120]:
# Fit the imputer on the non-object columns and fill their missing values.
# The result of fit_transform is wrapped back into a DataFrame that reuses the
# original index and column labels, so the frame can be re-joined with the
# object-dtype subset later.
imputed_values = imp.fit_transform(train_df_non_obj)
train_df_non_obj = pd.DataFrame(
    imputed_values,
    index=train_df_non_obj.index,
    columns=train_df_non_obj.columns,
)


    Id  MSSubClass  LotFrontage  LotArea  OverallQual  OverallCond  YearBuilt  \
0  1.0        60.0         65.0   8450.0          7.0          5.0     2003.0   
1  2.0        20.0         80.0   9600.0          6.0          8.0     1976.0   
2  3.0        60.0         68.0  11250.0          7.0          5.0     2001.0   

   YearRemodAdd  MasVnrArea  BsmtFinSF1    ...      WoodDeckSF  OpenPorchSF  \
0        2003.0       196.0       706.0    ...             0.0         61.0   
1        1976.0         0.0       978.0    ...           298.0          0.0   
2        2002.0       162.0       486.0    ...             0.0         42.0   

   EnclosedPorch  3SsnPorch  ScreenPorch  PoolArea  MiscVal  MoSold  YrSold  \
0            0.0        0.0          0.0       0.0      0.0     2.0  2008.0   
1            0.0        0.0          0.0       0.0      0.0     5.0  2007.0   
2            0.0        0.0          0.0       0.0      0.0     9.0  2008.0   

   SalePrice  
0   208500.0  
1   181500.

In [121]:
# Re-check the non-object subset after imputation.
null_column_count = check_nulls(train_df_non_obj)
print('Number of colomns with nulls: ', null_column_count)

Number of colomns with nulls:  0


## Now put the two transforms together

In [124]:
# Join the two cleaned subsets side by side (non-object columns first, then
# object columns). Note that this rebinds `train_df` to the combined frame.
cleaned_parts = [train_df_non_obj, train_df_obj]
train_df = pd.concat(cleaned_parts, axis='columns')

In [125]:
# Display the dimensions of the recombined frame as a sanity check on the concat.
train_df.shape

(1460, 81)

In [126]:
# Final check on the recombined frame.
null_column_count = check_nulls(train_df)
print('Number of colomns with nulls: ', null_column_count)

Number of colomns with nulls:  0


# Target variable

In [132]:
# Log-transform the target (log(1 + x)) for official scoring.
# The SalePrice column is overwritten in place, so running this cell a second
# time without reloading the data applies the transform again.
train_df['SalePrice'] = np.log1p(train_df['SalePrice'])
y = train_df['SalePrice']