In [1]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
import matplotlib.pylab as plt
import seaborn as sns
import pandas as pd
import numpy as np
import gc

In [2]:
# Load the training data and put SalePrice on a log1p scale in one step.
train_ori = pd.read_csv("input/train.csv").assign(
    SalePrice=lambda frame: np.log1p(frame["SalePrice"])
)

train_ori.shape

(1460, 81)

In [3]:
# Cast these columns to strings so they are no longer treated as numeric
# (the numeric-only selection that follows drops non-numeric columns).
# A single astype mapping replaces the previous mix of .apply(str) and
# .astype(str), so all four columns are converted the same way.
train_ori = train_ori.astype(
    {col: str for col in ('MSSubClass', 'OverallCond', 'YrSold', 'MoSold')}
)

To keep the example simple, we will consider only numeric features.

In [4]:
# Discard every object-dtype column; only the remaining columns are modelled.
train_ori = train_ori.select_dtypes(exclude='object')
train_ori.shape

(1460, 34)

Let's observe missing values in each feature

In [5]:
# Summarise missing values per feature: absolute count and share of rows.
# Everything is computed in one vectorised pass over the frame instead of
# looking each feature up again with a boolean mask inside a Python loop.
na_counts = train_ori.isna().sum()

missing_df = pd.DataFrame({
    'feature': train_ori.columns,
    # kept as float so the displayed table keeps its existing format
    'count': na_counts.values.astype(float),
    # NOTE: despite the column name this is a fraction in [0, 1], not a percent
    'percentage': na_counts.values / train_ori.shape[0],
})

# Most-affected features first.
missing_df = missing_df.sort_values(by='count', ascending=False)

missing_df.head(5)

Unnamed: 0,feature,count,percentage
1,LotFrontage,259.0,0.177397
23,GarageYrBlt,81.0,0.055479
6,MasVnrArea,8.0,0.005479
25,GarageArea,0.0,0.0
19,BedroomAbvGr,0.0,0.0


In [6]:
# Features: every remaining column except the row id and the target.
X = train_ori.drop(columns=['Id', 'SalePrice'])
# Target: SalePrice (already log1p-transformed when the data was loaded).
Y = train_ori['SalePrice']

In [7]:
# Hold out 25% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_val, Y_train, Y_val = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=42,
)

In [8]:
def train_model(tr_X, tr_Y, vl_X, vl_Y, random_state=12):
    """Fit a random forest on the training split and print the validation RMSE.

    Parameters
    ----------
    tr_X, tr_Y
        Training features and training target.
    vl_X, vl_Y
        Validation features and validation target.
    random_state : int, optional
        Seed passed to ``RandomForestRegressor``. Defaults to 12, so existing
        four-argument calls behave exactly as before.

    Returns
    -------
    None
        The error is printed rather than returned.
    """
    model = RandomForestRegressor(random_state=random_state)
    model.fit(tr_X, tr_Y)
    y_pred = model.predict(vl_X)

    # RMSE is the square root of the MSE, on whatever scale vl_Y is given in.
    mse = mean_squared_error(vl_Y, y_pred)
    rmse = np.sqrt(mse)
    print("Error :",rmse)

# Three ways to handle missing values

# 1) Drop the features with missing values


In [9]:
# Names of the columns with at least one missing value in the training split.
missing_col = X_train.columns[X_train.isna().any().values].tolist()

# drop() hands back new frames, so the shared X_train / X_val stay untouched.
X_train_1 = X_train.drop(columns=missing_col)
X_val_1 = X_val.drop(columns=missing_col)
Y_train_1, Y_val_1 = Y_train.copy(), Y_val.copy()

X_train_1.shape, X_val_1.shape

((1095, 29), (365, 29))

In [10]:
# Label-encode any object-dtype columns. Object columns were excluded from
# train_ori earlier, so with the data as prepared in this notebook cat_feat
# is expected to be empty and the loop does nothing.
cat_feat = [name for name, dtype in X_train_1.dtypes.items() if dtype == 'object']

for c in cat_feat:
    lbl = LabelEncoder()
    # Fit on train and validation values together so both can be transformed.
    lbl.fit(X_train_1[c].tolist() + X_val_1[c].tolist())
    X_train_1[c] = lbl.transform(X_train_1[c].tolist())
    X_val_1[c] = lbl.transform(X_val_1[c].tolist())
    

In [11]:
# Strategy 1: score the model with the incomplete columns dropped.
train_model(tr_X=X_train_1, tr_Y=Y_train_1, vl_X=X_val_1, vl_Y=Y_val_1)

Error : 0.15945825622863088




In [12]:
# Release strategy 1's working copies before the next experiment.
del X_train_1, X_val_1
del Y_train_1, Y_val_1
gc.collect()

55

# 2) Imputation

In [13]:
# Fresh working copies of the splits for the imputation strategy.
X_train_2, X_val_2 = X_train.copy(), X_val.copy()
Y_train_2, Y_val_2 = Y_train.copy(), Y_val.copy()

X_train_2.shape, X_val_2.shape

((1095, 32), (365, 32))

In [15]:
# Replace every missing value with the sentinel -999 in both splits.
X_train_2 = X_train_2.fillna(-999)
X_val_2 = X_val_2.fillna(-999)

In [16]:
# Strategy 2: score the model on the sentinel-filled features.
train_model(tr_X=X_train_2, tr_Y=Y_train_2, vl_X=X_val_2, vl_Y=Y_val_2)

Error : 0.16225894180924758




In [17]:
# Release strategy 2's working copies before the next experiment.
del X_train_2, X_val_2
del Y_train_2, Y_val_2
gc.collect()

12

# 3) Extension to Imputation

In [18]:
# Fresh working copies of the splits for the indicator-plus-imputation strategy.
X_train_3, X_val_3 = X_train.copy(), X_val.copy()
Y_train_3, Y_val_3 = Y_train.copy(), Y_val.copy()

X_train_3.shape, X_val_3.shape

((1095, 32), (365, 32))

In [19]:
# Build the column list eagerly. The loop below adds columns to X_train_3, so
# a lazy generator would evaluate its filter against a frame that is being
# modified during iteration, and it could only be consumed once.
cols_with_missing = [col for col in X_train_3.columns if X_train_3[col].isnull().any()]

for col in cols_with_missing:
    # Boolean indicator: True where the value is missing in that split.
    X_train_3[col + '_missing'] = X_train_3[col].isnull()
    X_val_3[col + '_missing'] = X_val_3[col].isnull()

X_train_3.shape,X_val_3.shape

((1095, 35), (365, 35))

In [20]:
# Replace the remaining missing values with the sentinel -999 in both splits.
X_train_3 = X_train_3.fillna(-999)
X_val_3 = X_val_3.fillna(-999)

In [21]:
# Strategy 3: score the model on sentinel-filled features plus missing indicators.
train_model(tr_X=X_train_3, tr_Y=Y_train_3, vl_X=X_val_3, vl_Y=Y_val_3)

Error : 0.1587026693701065


