# Feature Engineering

After analyzing the data, this is the next step of the process where we "Engineer" the features. This is still a part of the pre-processing step. It includes fixing the variables with NaNs, discarding rare labels, fixing the distribution of variables, and also splitting the data into testing and training sets. 

In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [29]:
data = pd.read_csv("train.csv")
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


## Splitting the data into testing and training sets 

We do this step before pre-processing the data. The test-set is not used to learn the transform parameters; it needs to be completely new data. 

In [30]:
x_train, x_test, y_train, y_test = train_test_split(data, data.SalePrice, test_size=0.1, random_state=0)

## Filling up the missing (NaN) values from the categorical and continuous variables

We discovered some variables having "NaN" in their values, we want to remove those values. We perform slightly different operations between categorical and numerical values. 

### Categorical variables 

In [31]:
#Take only the categories in which the type is categorical and there exists some missing values for the variable
cat_vars = [var for var in data.columns if x_train[var].dtypes == 'O' and x_train[var].isnull().sum() > 1 ]
cat_vars

['Alley',
 'MasVnrType',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature']

In [32]:
for var in cat_vars: 
    print(var, " has ", np.round(x_train[var].isnull().mean(),3) , "% values missing")

Alley  has  0.938 % values missing
MasVnrType  has  0.005 % values missing
BsmtQual  has  0.024 % values missing
BsmtCond  has  0.024 % values missing
BsmtExposure  has  0.025 % values missing
BsmtFinType1  has  0.024 % values missing
BsmtFinType2  has  0.025 % values missing
FireplaceQu  has  0.473 % values missing
GarageType  has  0.056 % values missing
GarageFinish  has  0.056 % values missing
GarageQual  has  0.056 % values missing
GarageCond  has  0.056 % values missing
PoolQC  has  0.995 % values missing
Fence  has  0.814 % values missing
MiscFeature  has  0.961 % values missing


In [33]:
def fill_cat_na (data,var):
    df = data.copy()
    df[var] = df[var].fillna("Missing")
    return df

x_train = fill_cat_na(x_train,cat_vars)
x_test = fill_cat_na(x_test,cat_vars)

#we can check if there are any values missing as well
x_train[cat_vars].isnull().sum()

Alley           0
MasVnrType      0
BsmtQual        0
BsmtCond        0
BsmtExposure    0
BsmtFinType1    0
BsmtFinType2    0
FireplaceQu     0
GarageType      0
GarageFinish    0
GarageQual      0
GarageCond      0
PoolQC          0
Fence           0
MiscFeature     0
dtype: int64

### Numerical Variables

For these variables, instead of replacing with just "Missing" we want to actually replace with information that we can use. For numerical variables, we usually want to replace with either the mean or the mode. 

In [34]:
num_vars = [var for var in data.columns if data[var].dtypes != 'O' and data[var].isnull().sum() > 1]
num_vars

['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

In [35]:
#Let's also see how many values are missing for each type 
for var in num_vars: 
    print(var, "has", np.round(data[var].isnull().mean(),3), "% values missing")

LotFrontage has 0.177 % values missing
MasVnrArea has 0.005 % values missing
GarageYrBlt has 0.055 % values missing


In [36]:
for var in num_vars:
    mode = x_train[var].mode()[0] #we capture the mode value 
    x_train[var].fillna(mode, inplace=True) #replace the missing value with the mode 
    x_test[var].fillna(mode,inplace=True)

## Temporal Variables

We take the "Age" instead of the "Year". The age is just a number with respect to when the house was sold.


In [37]:
def elapsed_years(df, var):
    # capture difference between year variable and year the house was sold
    df[var] = df['YrSold'] - df[var]
    return df

for var in ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']:
    x_train = elapsed_years(x_train, var)
    x_test = elapsed_years(x_test, var)
    


## Making numerical variables normally distributed 

Converting numerical values which do not contain 0 to a Gaussian distribution helps linear models converge better

In [38]:
#Finding all the variables that are not of a categorical type, are not in the categorical variables and there is no "yr" or "year" in the variables 
fnum_vars = [var for var in data.columns if data[var].dtypes != 'O' and var not in cat_vars+['Id'] and "Yr" not in var and "Year" not in var]

In [39]:
#from the fnum_var that we got, filter out to only get the columns in which there is no 0 value
nonzero_vars = [var for var in x_train[fnum_vars] if x_train[var].isin([0]).sum() < 1]

In [40]:
#Convert these into a gaussian distribution
for var in nonzero_vars: 
    x_train[var] = np.log(x_train[var])
    x_test[var] = np.log(x_test[var])

In [41]:
#ensuring that we still don't have any null values in the new variables
print([var for var in nonzero_vars if x_test[var].isnull().sum()>0])
print([var for var in nonzero_vars if x_train[var].isnull().sum()>0])

[]
[]


## Categorical variables 

In this section we fix up the categorical variables 

### Removing rare occurences of labels 

We start by removing the rare occurences of certain labels. In categorical variables, we don't want to deal with variables that are present in less than 1% of observation so we can just remove them. 

In [42]:
cat_var = [var for var in data.columns if x_train[var].dtypes == 'O']

In [52]:
len(cat_var)

43

In [53]:
#Find whether an occurence is rare from a category by grouping by variable, and dividing by the length of the dataframe 
def find_freq(data,var,p):
    df = data.copy()
    rareoc = data.groupby(var)['SalePrice'].count()/len(data)
    return rareoc[rareoc>p].index

In [54]:
for var in cat_var:
    frequentindex = find_freq(x_train,var,0.01)
    x_train[var] = np.where(x_train[var].isin(frequentindex), x_train[var], 'rare')
    x_test[var] = np.where(x_test[var].isin(frequentindex),x_test[var], 'rare')

In [55]:
x_train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
930,931,2.995732,RL,4.290459,9.096612,Pave,2,IR1,HLS,AllPub,...,0,0,3,2,0,1.94591,2009,WD,Normal,12.21106
656,657,2.995732,RL,4.276666,9.21104,Pave,2,IR1,Lvl,AllPub,...,0,0,2,2,0,2.079442,2008,WD,Normal,11.887931
45,46,4.787492,RL,4.110874,8.943506,Pave,2,Reg,Lvl,AllPub,...,0,0,3,2,0,0.693147,2010,WD,Normal,12.675764
1348,1349,2.995732,RL,4.094345,9.69252,Pave,2,rare,Low,AllPub,...,0,0,3,2,0,2.079442,2007,WD,Normal,12.278393
55,56,2.995732,RL,4.60517,9.227689,Pave,2,IR1,Lvl,AllPub,...,0,0,3,2,0,1.94591,2008,WD,Normal,12.103486


### Converting strings into categories 



In [57]:
#This function converts the strings into multiple categories (similar to one hot encoding, but based on the mean value)
def replace_categories(train, test, var, target):
    ordered_labels = train.groupby([var])[target].mean().sort_values().index
    ordinal_label = {k:i for i, k in enumerate(ordered_labels, 0)} 
    train[var] = train[var].map(ordinal_label)
    test[var] = test[var].map(ordinal_label)

In [60]:
for var in cat_var:
    replace_categories(x_train, x_test, var, 'SalePrice')

In [61]:
x_train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
930,931,2.995732,3,4.290459,9.096612,1,2,1,3,1,...,0,0,3,2,0,1.94591,2009,2,3,12.21106
656,657,2.995732,3,4.276666,9.21104,1,2,1,1,1,...,0,0,2,2,0,2.079442,2008,2,3,11.887931
45,46,4.787492,3,4.110874,8.943506,1,2,0,1,1,...,0,0,3,2,0,0.693147,2010,2,3,12.675764
1348,1349,2.995732,3,4.094345,9.69252,1,2,2,2,1,...,0,0,3,2,0,2.079442,2007,2,3,12.278393
55,56,2.995732,3,4.60517,9.227689,1,2,1,1,1,...,0,0,3,2,0,1.94591,2008,2,3,12.103486


# Feature Scaling

We now want to scale all the training and testing data. We train the test and train set separately 

In [62]:
train_vars = [var for var in x_train.columns if var not in ['Id', 'SalePrice']]
len(train_vars)

79

In [63]:
scaler = MinMaxScaler()
scaler.fit(x_train[train_vars])


  return self.partial_fit(X, y)


MinMaxScaler(copy=True, feature_range=(0, 1))

In [64]:

#Fit the data on the training variables and concatenate with the ID and Sale Price
X_train = pd.concat([x_train[['Id', 'SalePrice']].reset_index(drop=True), pd.DataFrame(scaler.transform(x_train[train_vars]), columns=train_vars)],axis=1)
X_test = pd.concat([x_test[['Id', 'SalePrice']].reset_index(drop=True), pd.DataFrame(scaler.transform(x_test[train_vars]), columns=train_vars)],axis=1)

In [65]:
X_train

Unnamed: 0,Id,SalePrice,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,931,12.211060,0.000000,0.75,0.461171,0.377048,1.0,1.0,0.333333,1.000000,...,0.000000,0.000000,0.0,0.75,1.0,0.000000,0.783092,0.75,0.666667,0.75
1,657,11.887931,0.000000,0.75,0.456066,0.399443,1.0,1.0,0.333333,0.333333,...,0.000000,0.000000,0.0,0.50,1.0,0.000000,0.836829,0.50,0.666667,0.75
2,46,12.675764,0.795881,0.75,0.394699,0.347082,1.0,1.0,0.000000,0.333333,...,0.000000,0.000000,0.0,0.75,1.0,0.000000,0.278943,1.00,0.666667,0.75
3,1349,12.278393,0.000000,0.75,0.388581,0.493677,1.0,1.0,0.666667,0.666667,...,0.000000,0.000000,0.0,0.75,1.0,0.000000,0.836829,0.25,0.666667,0.75
4,56,12.103486,0.000000,0.75,0.577658,0.402702,1.0,1.0,0.333333,0.333333,...,0.000000,0.000000,0.0,0.75,1.0,0.000000,0.783092,0.50,0.666667,0.75
5,1229,12.813918,0.795881,0.75,0.418208,0.373596,1.0,1.0,0.000000,0.333333,...,0.466667,0.000000,0.0,0.75,1.0,0.000000,0.926628,0.50,1.000000,1.00
6,964,12.384219,0.000000,0.75,0.651261,0.433730,1.0,1.0,0.333333,0.333333,...,0.000000,0.000000,0.0,0.75,1.0,0.000000,0.647685,0.75,0.666667,0.75
7,922,11.890677,0.668095,0.75,0.429425,0.373775,1.0,1.0,0.000000,0.333333,...,0.000000,0.000000,0.0,1.00,1.0,0.000000,0.884228,0.50,0.666667,0.75
8,459,11.989160,0.556464,0.25,0.388581,0.267521,1.0,0.0,0.000000,0.333333,...,0.000000,0.000000,0.0,0.50,1.0,0.000000,0.721057,0.50,0.666667,0.75
9,1387,12.429216,0.487992,0.75,0.495064,0.499581,1.0,1.0,0.333333,0.333333,...,0.916667,0.703252,1.0,0.50,0.0,0.129032,0.783092,0.00,0.666667,0.75


In [66]:
X_train.to_csv('xtrain.csv', index=False)
X_test.to_csv('xtest.csv', index=False)