# Make Features

This notebook makes all the features for both the train and test datasets.

In [1]:
# Libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Preprocessing
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

## 0.0 Get Data

In [2]:
# Get data
df_test = pd.read_csv('../data/interim/test_cleaned.csv',index_col=False)
df_train = pd.read_csv('../data/interim/train_cleaned.csv',index_col=False)

## 1.0 Transformations

In [3]:
# Function to make Panda Series covert into log scale
def feature_log(df,col):
    df['Log'+col] = np.log1p(df[col])
    return df

In [4]:
df_train = feature_log(df_train,'SalePrice')
df_train = feature_log(df_train,'Gr Liv Area')
df_test = feature_log(df_test,'Gr Liv Area')


### 1.1 Datatypes

In [5]:
df_train['MS SubClass'] = df_train['MS SubClass'].apply(lambda x: str(x))
df_test['MS SubClass'] = df_test['MS SubClass'].apply(lambda x: str(x))

## 2.0 Outliers

In [6]:
df_train = df_train[(df_train['Gr Liv Area'] < 4000) & (df_train['LogSalePrice'] > 10)]

## 3.0 Categorical

### 3.1 Ordinal

In [7]:
''' 
Input: Pass in a custom ordered dictionary and the column names you want to apply to the dateframe.
Output: Trasformed dataframe
'''
def feature_ordinal_custom(ordered_dict,column_list,df):
    for col in column_list:
        df[col] = df[col].map(ordered_dict)
    return df

In [8]:
# Make order numeric values for quality
ordered_rating_qual = { "None": 0, "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}

# Not all of them but only a select few columns I want to convert
qual_list = ['Exter Qual',
             'Exter Cond',
             'Bsmt Qual',
             'Kitchen Qual',
             'Fireplace Qu',
             'Garage Qual'
            ]
# Apply to dataframes
df_train = feature_ordinal_custom(ordered_rating_qual,qual_list,df_train)
df_test = feature_ordinal_custom(ordered_rating_qual,qual_list,df_test)

### 3.2 Dummies

In [9]:
'''
Input: Dataframe and list of columns to apply dummy
Output: 
'''
def feature_make_dummies(df,dummy_list):
    # Create dummies on dataset
    for col in dummy_list:
        dummy = pd.get_dummies(df[col], columns=col, prefix=col)
        df = pd.concat([df, dummy], axis=1)
    return df


In [10]:
#Setup dummies on short list, doing all will not be helpful
dummy_list = ['MS Zoning',
              'MS SubClass',
              'Neighborhood',
              'Bldg Type',
              'House Style',
              'Roof Style',
              'Exterior 1st',
              'Exterior 2nd',
              'Foundation',
              'Functional',
              'Garage Type',
              'Garage Finish',
              'Paved Drive']

In [11]:
# We know that doing get dummies on the test set will short change us on the columns so lets prep for that
# DNR AGAIN
for col in dummy_list:
    df_test[col] = df_test[col].fillna("None")
    test_values = sorted(list(df_test[col].unique()))
    train_values = sorted(list(df_train[col].unique()))
    categories = set(train_values + test_values)
    df_test[col] = pd.Categorical(df_test[col], categories=categories)
    df_train[col] = pd.Categorical(df_train[col], categories=categories)

In [12]:
df_train = feature_make_dummies(df_train,dummy_list)
df_test = feature_make_dummies(df_test,dummy_list)

In [13]:
print(df_train.shape)
print(df_test.shape)

(2045, 213)
(879, 213)


## 4.0 Feature Engineering

### 4.1 Age of Remodel

In [14]:
df_train['Age of Remodel'] = df_train['Yr Sold']-df_train['Year Remod/Add']
df_test['Age of Remodel'] = df_test['Yr Sold']-df_test['Year Remod/Add']

### 4.2 Total Baths

In [15]:
df_train['Total Bath'] = df_train['Full Bath'] + df_train['Bsmt Full Bath'] + (df_train['Half Bath'] + df_train['Bsmt Half Bath'])/2
df_test['Total Bath'] = df_test['Full Bath'] + df_test['Bsmt Full Bath'] + (df_test['Half Bath'] + df_test['Bsmt Half Bath'])/2

### 4.3 Polynomial Features

### 4.4 Total Quality

In [16]:
df_train['Sum Qual'] = df_train['Exter Qual'] + df_train['Bsmt Qual'] + df_train['Kitchen Qual'] + df_train['Fireplace Qu'] + df_train['Garage Qual']
df_test['Sum Qual'] = df_test['Exter Qual'] + df_test['Bsmt Qual'] + df_test['Kitchen Qual'] + df_test['Fireplace Qu'] + df_test['Garage Qual']

## 5.0 Preprocessing

In [17]:
# Create matrices
num_columns = list(df_train._get_numeric_data().drop(['SalePrice', 'LogSalePrice'], axis=1).columns)
cat_columns = list(df_train.select_dtypes(include=['object']).columns)

In [18]:
for feature in num_columns:
    df_test[feature] = df_test[feature].fillna(0)

In [29]:
# Scale the data
ss = StandardScaler() # Instantiate Standard Scaler
df_train[num_columns] = ss.fit_transform(df_train[num_columns]) 
df_test[num_columns] = ss.fit_transform(df_test[num_columns])

In [30]:
df_train.head()

Unnamed: 0.1,Unnamed: 0,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,Utilities,...,Garage Finish_Unf,Garage Finish_None,Garage Finish_Fin,Garage Finish_RFn,Paved Drive_N,Paved Drive_P,Paved Drive_Y,Age of Remodel,Total Bath,Sum Qual
0,-1.730802,60,RL,0.011267,0.535035,Pave,,IR1,Lvl,AllPub,...,-0.839146,-0.242975,-0.574151,1.591211,-0.280333,-0.139433,0.31818,-0.88442,0.347332,-0.295543
1,-1.729113,60,RL,-1.258784,0.226401,Pave,,IR1,Lvl,AllPub,...,-0.839146,-0.242975,-0.574151,1.591211,-0.280333,-0.139433,0.31818,-0.551118,1.59128,0.909612
2,-1.727424,20,RL,-0.04226,-0.317709,Pave,,Reg,Lvl,AllPub,...,1.191687,-0.242975,-0.574151,-0.628452,-0.280333,-0.139433,0.31818,-0.979649,-0.274642,-0.596832
3,-1.725735,60,RL,0.201045,-0.031175,Pave,,Reg,Lvl,AllPub,...,-0.839146,-0.242975,1.741703,-0.628452,-0.280333,-0.139433,0.31818,-0.979649,0.347332,-0.596832
4,-1.724046,50,RL,0.638994,0.644467,Pave,,IR1,Lvl,AllPub,...,1.191687,-0.242975,-0.574151,-0.628452,3.567188,-0.139433,-3.142874,-0.313045,-0.274642,-1.199409


In [31]:
df_test.head()

Unnamed: 0.1,Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,...,Garage Finish_Unf,Garage Finish_None,Garage Finish_Fin,Garage Finish_RFn,Paved Drive_N,Paved Drive_P,Paved Drive_Y,Age of Remodel,Total Bath,Sum Qual
0,-1.730081,2658,902301120,190,RM,-0.029251,-0.119395,Pave,Grvl,Reg,...,1.140635,-0.232286,-0.576037,-0.600567,-0.282582,-0.163918,0.333544,1.593181,-0.259157,-0.992884
1,-1.72614,2718,905108090,90,RL,-0.001151,-0.067611,Pave,,IR1,...,-0.876705,-0.232286,1.736001,-0.600567,-0.282582,-0.163918,0.333544,0.274485,-0.259157,-0.992884
2,-1.7222,2414,528218130,60,RL,-0.544416,0.673505,Pave,,IR1,...,-0.876705,-0.232286,-0.576037,1.665092,-0.282582,-0.163918,0.333544,-1.141892,1.5979,1.150067
3,-1.718259,1989,902207150,30,RM,-0.450749,-0.181337,Pave,,Reg,...,1.140635,-0.232286,-0.576037,-0.600567,3.538796,-0.163918,-2.998105,-1.093051,-1.497195,-0.992884
4,-1.714318,625,535105100,20,RL,-0.001151,-0.083744,Pave,,IR1,...,-0.876705,-0.232286,-0.576037,1.665092,-0.282582,-0.163918,0.333544,1.104775,0.359862,0.924493


## Export Data

In [32]:
df_test.to_csv('../data/processed/test_scaled.csv')
df_train.to_csv('../data/processed/train_scaled.csv')