In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = 20, 15

In [2]:
train = pd.read_csv('./data/train.csv')
train.drop(columns='Id', inplace=True)
print("Training shape: ", train.shape)
#----------------------
test = pd.read_csv('./data/test.csv')
test.drop(columns='Id', inplace=True)
print("Testing shape: ",test.shape)

Training shape:  (1460, 80)
Testing shape:  (1459, 79)


In [3]:
# assign target
target = train.SalePrice.copy()
target = np.log1p(target)
train.drop(columns='SalePrice', inplace=True)

In [4]:
train.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


In [5]:
numerical = ['LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea',
       'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal']
date_features = ['YearBuilt', 'YearRemodAdd','GarageYrBlt', 'MoSold', 'YrSold']
categorical = [i for i in train.columns if i not in numerical+date_features]

print('Numerical Features : ', len(numerical), '\n',
      train[numerical].dtypes.value_counts(),'\n---------------------')
print('categorical Features : ', len(categorical),  '\n',
      train[categorical].dtypes.value_counts(),'\n---------------------')
print('Date features : ', len(date_features),  '\n',
      train[date_features].dtypes.value_counts(),'\n---------------------')
#--------------------------------------
numerical += date_features

Numerical Features :  30 
 int64      28
float64     2
dtype: int64 
---------------------
categorical Features :  44 
 object    43
int64      1
dtype: int64 
---------------------
Date features :  5 
 int64      4
float64    1
dtype: int64 
---------------------


In [23]:
null_across_rows = pd.concat((test,train)).isnull().sum(axis=1)
# rows having more than 20% null values
print("rows having more than 20% null values\n", null_across_rows[null_across_rows > train.shape[1]*.2])

null_across_columns = pd.concat((test,train)).isnull().sum(axis=0)
# features having more than 50% null values
print("features having more than 50% null values\n", null_across_columns[null_across_columns > train.shape[0]*.5])

rows having more than 20% null values
 Series([], dtype: int64)
features having more than 50% null values
 Alley          2721
FireplaceQu    1420
PoolQC         2909
Fence          2348
MiscFeature    2814
dtype: int64


In [22]:
# Find Null values
#test[numerical+date_features].isnull().sum()
median = pd.concat((train,test)).median()
train[numerical] = train[numerical].fillna(median)
test[numerical] = test[numerical].fillna(median)


In [24]:
# taking log of numerical columns
train_norm = train.copy()
test_norm = test.copy()
train_norm[numerical] = np.log1p(train_norm[numerical])
test_norm[numerical] = np.log1p(test_norm[numerical])

In [25]:
# normalizing them to standard meand and var
meanVarScaler = StandardScaler()
_ = meanVarScaler.fit(pd.concat( (train_norm[numerical],test_norm[numerical]) ))
train_norm[numerical] = meanVarScaler.transform(train_norm[numerical])
test_norm[numerical] = meanVarScaler.transform(test_norm[numerical])
#train_norm = pd.DataFrame(train_norm)

In [26]:
train_norm[numerical].describe()

Unnamed: 0,LotFrontage,LotArea,OverallQual,OverallCond,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,YearBuilt,YearRemodAdd,GarageYrBlt,MoSold,YrSold
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,0.029348,0.031485,0.011625,0.012207,0.015656,0.000923,-0.013617,0.011281,0.014769,0.013225,...,-0.033539,0.033189,-0.027777,0.007485,-0.001057,-0.001431,0.028836,0.015339,0.042554,0.0175
std,0.980018,1.014858,0.974563,0.972615,1.003165,1.003887,0.982343,0.993496,0.974513,0.983995,...,0.975009,1.136098,0.961383,1.069456,0.988739,0.997362,0.988195,0.966188,0.979098,1.010159
min,-3.467074,-3.774121,-6.005905,-6.780599,-0.793461,-1.418228,-0.362565,-3.01546,-5.72708,-3.689017,...,-0.427545,-0.112774,-0.309144,-0.066702,-0.189329,-3.340684,-1.645198,-3.191933,-2.75913,-1.363909
25%,-0.28579,-0.323694,-0.705122,-0.435439,-0.793461,-1.418228,-0.362565,-0.115574,-0.044788,-0.68464,...,-0.427545,-0.112774,-0.309144,-0.066702,-0.189329,-0.563707,-0.822744,-0.644532,-0.231402,-0.602779
50%,0.098625,0.121561,0.038652,-0.435439,-0.793461,0.578756,-0.362565,0.291147,0.142072,-0.037478,...,-0.427545,-0.112774,-0.309144,-0.066702,-0.189329,0.063107,0.469015,0.0399,0.123273,0.157971
75%,0.56005,0.517974,0.682938,0.454875,1.155602,0.786068,-0.362565,0.572553,0.371144,0.726889,...,-0.427545,-0.112774,-0.309144,-0.066702,-0.189329,0.943542,0.943013,0.916964,0.701507,0.918344
max,4.825442,6.24687,2.219471,2.514892,2.022214,1.480152,3.522207,1.141007,1.68809,4.493698,...,3.137249,10.504893,3.922184,16.036152,7.591891,1.266618,1.226278,1.272988,1.547581,1.678337


In [None]:
# plot all numerical 
f, ax = plt.subplots(int(np.ceil(len(numerical/3.))), 3, figsize=(50,40))
f.suptitle('Numerical Features in std normal')
for (i,feat),axx in zip(enumerate(numerical), ax.flat):
    axx.hist(train_norm[i])
    axx.set_title(feat+' - '+str(i))

plt.savefig('./data/numerical.png')
plt.close(f)

In [35]:
# Correlation
fig, ax = plt.subplots()
sns.heatmap(train_norm[categorical].corr(method='pearson'), annot=True, fmt='.4f', 
            cmap=plt.get_cmap('coolwarm'), cbar=False, ax=ax)
ax.set_yticklabels(ax.get_yticklabels(), rotation="horizontal")
plt.savefig('./data/correlation_categorical.png', bbox_inches='tight', pad_inches=0.0)
plt.close(fig)

---------------------

## Now deealing with categorical features


In [27]:
# looking for Nan values in categorical columns
nan_sum = test_norm[categorical].isnull().sum() + train_norm[categorical].isnull().sum()
print(nan_sum[nan_sum>100])

Alley           2721
FireplaceQu     1420
GarageType       157
GarageFinish     159
GarageQual       159
GarageCond       159
PoolQC          2909
Fence           2348
MiscFeature     2814
dtype: int64


In [28]:
to_delete = ['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']
categorical = [i for i in categorical if i not in to_delete]

train_norm.drop(columns=to_delete, inplace=True)
test_norm.drop(columns=to_delete, inplace=True)

mode = pd.concat((train_norm[categorical],test_norm[categorical])).mode().iloc[0]
train_norm[categorical] = train_norm[categorical].fillna(mode)
test_norm[categorical] = test_norm[categorical].fillna(mode)


In [29]:
# What I have in mind now is that I'll convert them using LabelEncoder 
#then I'll Normalize them to zero mean and standard variance
#--------------------------------------------------------------------------
Encoder = LabelEncoder()
for feature in categorical:
    _ = Encoder.fit(pd.concat((train_norm[feature],test_norm[feature])))
    train_norm[feature] = Encoder.transform(train_norm[feature]).astype(np.float64)
    test_norm[feature]  = Encoder.transform(test_norm[feature]).astype(np.float64)
#--------------------------------------------------------------------------
#-------------------------------------------------------------------------
MeanVarScaler = StandardScaler()
_ = MeanVarScaler.fit(pd.concat( (train_norm[categorical],test_norm[categorical]) ))
train_norm[categorical] = MeanVarScaler.transform(train_norm[categorical])
test_norm[categorical]  = MeanVarScaler.transform(test_norm[categorical])

train_norm[categorical].describe()

Unnamed: 0,MSSubClass,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,Electrical,KitchenQual,Functional,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,SaleType,SaleCondition
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,-0.006801,0.001546,2.2e-05,-0.003632,0.000595,0.018499,-0.022854,0.034351,-0.031185,-0.010205,...,-0.00284,-0.008763,-0.011265,-0.003728,-0.000933,0.033736,0.001405,0.047282,0.013651,-0.007872
std,0.992492,0.960159,1.000172,0.999771,1.004822,1.414214,1.011493,1.110673,1.009529,0.993841,...,1.003821,0.995122,1.047601,0.997037,0.992199,0.926829,0.99516,0.924396,0.974185,1.021148
min,-0.982129,-4.59975,-15.564382,-1.381776,-3.943059,-0.018512,-1.904903,-0.21626,-2.087829,-2.334857,...,-3.51873,-2.813414,-6.159311,-1.254776,-1.502693,-6.006155,-7.644866,-3.407932,-4.701938,-3.505415
25%,-0.982129,-0.042157,0.064249,-1.381776,0.316671,-0.018512,-0.658174,-0.21626,-0.912735,-0.046258,...,0.300615,-0.415996,0.256806,-0.688235,-0.280525,0.233534,0.18646,0.31503,0.319311,0.204967
50%,-0.061354,-0.042157,0.064249,0.746669,0.316671,-0.018512,0.588555,-0.21626,-0.073382,-0.046258,...,0.300615,0.782713,0.256806,-0.688235,-0.280525,0.233534,0.18646,0.31503,0.319311,0.204967
75%,0.399034,-0.042157,0.064249,0.746669,0.316671,-0.018512,0.588555,-0.21626,0.76597,-0.046258,...,0.300615,0.782713,0.256806,1.577932,0.941643,0.233534,0.18646,0.31503,0.319311,0.204967
max,2.47078,1.477041,0.064249,0.746669,0.316671,54.018515,0.588555,7.825321,1.941064,6.819538,...,0.300615,0.782713,0.256806,1.577932,0.941643,0.233534,0.18646,0.31503,0.319311,1.132563


------------------------------------
--------------------------------

In [31]:
#
print(train_norm.shape, test_norm.shape, target.shape)

(1460, 74) (1459, 74) (1460,)


In [32]:
# save to disk
np.save('./data/ready_train.npy', train_norm.values)
np.save('./data/ready_test.npy',  test_norm.values)
np.save('./data/ready_target.npy', target.values)

