# Make Features

This notebook makes all the features for both the train and test datasets.

In [200]:
# Libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Preprocessing
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

## 0.0 Get Data

In [201]:
# Load the cleaned interim CSVs (test first, then train).
# index_col=False tells pandas not to use the first CSV column as the index.
df_test, df_train = (
    pd.read_csv(csv_path, index_col=False)
    for csv_path in ('../data/interim/test_cleaned.csv',
                     '../data/interim/train_cleaned.csv')
)

## 1.0 Transformations

In [202]:
def feature_log(df,col):
    """Add a log-scaled copy of one column to ``df``.

    The new column is named ``'Log' + col`` and holds ``log(1 + x)``
    (``numpy.log1p``) of every value in ``df[col]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; the new column is written onto this object.
    col : str
        Name of the existing column to transform.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying the extra column.
    """
    log_name = 'Log' + col
    df[log_name] = df[col].pipe(np.log1p)
    return df

In [203]:
# Log-transform the target (train only) and the living area (both frames)
df_train = feature_log(feature_log(df_train, 'SalePrice'), 'Gr Liv Area')
df_test = feature_log(df_test, 'Gr Liv Area')


### 1.1 Datatypes

In [204]:
# Turn every 'MS SubClass' value into its string form in both frames
for frame in (df_train, df_test):
    frame['MS SubClass'] = frame['MS SubClass'].map(str)

## 2.0 Outliers

In [205]:
# Keep only training rows with 'Gr Liv Area' below 4000 and
# LogSalePrice (= log1p(SalePrice)) above 10; everything else is treated as an outlier.
# The test frame is deliberately left untouched.
is_typical_sale = (df_train['Gr Liv Area'] < 4000) & (df_train['LogSalePrice'] > 10)
# .copy() detaches the filtered rows from the original frame, so the column
# assignments made in later cells do not raise SettingWithCopyWarning.
df_train = df_train[is_typical_sale].copy()

## 3.0 Categorical

### 3.1 Ordinal

In [206]:
def feature_ordinal_custom(ordered_dict,column_list,df):
    """Replace category labels with numeric ranks in the given columns.

    Parameters
    ----------
    ordered_dict : dict
        Mapping from each label to the number that should replace it.
    column_list : iterable of str
        Names of the columns of ``df`` to convert.
    df : pandas.DataFrame
        Frame to transform; the listed columns are overwritten on this object.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. Values that have no key in ``ordered_dict``
        become NaN (``Series.map`` behaviour).
    """
    for name in column_list:
        labels = df[name]
        df[name] = labels.map(ordered_dict)
    return df

In [207]:
# Numeric ranks for the quality ratings: "0" and "None" both count as 0,
# then Po < Fa < TA < Gd < Ex are ranked 1 to 5.
ordered_rating_qual = {"0": 0, "None": 0}
ordered_rating_qual.update(
    (grade, rank)
    for rank, grade in enumerate(("Po", "Fa", "TA", "Gd", "Ex"), start=1)
)

# Only a hand-picked subset of the rating columns is converted
qual_list = [
    'Exter Qual', 'Exter Cond', 'Bsmt Qual',
    'Kitchen Qual', 'Fireplace Qu', 'Garage Qual',
]

# Encode the same columns in both frames
df_train, df_test = (
    feature_ordinal_custom(ordered_rating_qual, qual_list, frame)
    for frame in (df_train, df_test)
)

### 3.2 Dummies

In [208]:
def feature_make_dummies(df,dummy_list):
    """Append one-hot indicator columns for each column in ``dummy_list``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame. It is not modified; the source columns are kept.
    dummy_list : iterable of str
        Names of the columns to encode. Each produces columns named
        ``<column>_<category>``.

    Returns
    -------
    pandas.DataFrame
        A new frame: every column of ``df`` followed by the indicator
        columns, in ``dummy_list`` order. If ``dummy_list`` is empty, ``df``
        itself is returned unchanged.
    """
    # No `columns=` argument: get_dummies ignores it when given a Series.
    # dtype is pinned to uint8 because pandas 2.0 changed the default to bool,
    # and this notebook later selects the indicator columns by 'uint8'.
    dummy_frames = [
        pd.get_dummies(df[col], prefix=col, dtype=np.uint8)
        for col in dummy_list
    ]
    if not dummy_frames:
        return df
    # Concatenate once instead of re-copying the growing frame on every iteration.
    return pd.concat([df, *dummy_frames], axis=1)


In [209]:
# Short list of columns to one-hot encode; encoding every column would not be helpful
dummy_list = [
    'MS Zoning', 'MS SubClass', 'Neighborhood',
    'Bldg Type', 'House Style', 'Roof Style',
    'Exterior 1st', 'Exterior 2nd', 'Foundation',
    'Functional', 'Garage Type', 'Garage Finish',
    'Paved Drive',
]

In [210]:
# get_dummies only creates columns for the categories it actually sees, so
# encoding train and test separately would give them different dummy columns.
# Give each column the same explicit category list in both frames first.
# DNR AGAIN  (original author's note, kept as-is)
for col in dummy_list:
    # Treat missing values the same way in both frames
    df_train[col] = df_train[col].fillna("None")
    df_test[col] = df_test[col].fillna("None")
    # Sorted union rather than a bare set: set iteration order for strings can
    # change between kernel restarts, which would reshuffle the dummy columns.
    # key=str keeps the sort from failing if a column mixes value types.
    categories = sorted(
        set(df_train[col].unique()) | set(df_test[col].unique()),
        key=str,
    )
    df_test[col] = pd.Categorical(df_test[col], categories=categories)
    df_train[col] = pd.Categorical(df_train[col], categories=categories)

In [211]:
# One-hot encode the same column list in both frames
df_train, df_test = (
    feature_make_dummies(frame, dummy_list)
    for frame in (df_train, df_test)
)

In [212]:
# (rows, columns) of each frame: train first, then test
for frame in (df_train, df_test):
    print(frame.shape)

(2045, 213)
(879, 213)


## 4.0 Feature Engineering

### 4.1 Age of Remodel

In [213]:
# 'Age of Remodel' = 'Yr Sold' minus 'Year Remod/Add', for both frames
for frame in (df_train, df_test):
    frame['Age of Remodel'] = frame['Yr Sold'].sub(frame['Year Remod/Add'])

### 4.2 Total Baths

In [214]:
# Total baths: each full bath counts as 1, each half bath as 0.5
# (regular and basement columns combined)
for frame in (df_train, df_test):
    full_baths = frame['Full Bath'] + frame['Bsmt Full Bath']
    half_baths = frame['Half Bath'] + frame['Bsmt Half Bath']
    frame['Total Bath'] = full_baths + half_baths / 2

### 4.3 Interaction Terms

### 4.4 Total Quality

In [215]:
# 'Sum Qual' adds up five of the ordinal-encoded rating columns.
# Plain `+` is used on purpose: a NaN in any of them makes the sum NaN.
for frame in (df_train, df_test):
    first, *rest = ('Exter Qual', 'Bsmt Qual', 'Kitchen Qual', 'Fireplace Qu', 'Garage Qual')
    frame['Sum Qual'] = sum((frame[name] for name in rest), frame[first])

## 5.0 Preprocessing

In [216]:
# Pick the columns to scale by dtype.
# uint8 columns are presumably the dummy indicators (verify against the pandas version in use)
num_discrete_col = list(df_train.select_dtypes(include='uint8').columns)
# float64 / int64 columns, with the two target columns left out
num_conti_col = list(
    df_train
    .drop(columns=['SalePrice', 'LogSalePrice'])
    .select_dtypes(include=['float64', 'int64'])
    .columns
)

In [218]:
# Standardise the continuous columns.
# The scaler is fitted on the training rows only and then reused for the test
# rows: fitting on train and test together would leak test-set statistics
# (mean / variance) into the training features.
ss = StandardScaler() # Instantiate Standard Scaler
df_train[num_conti_col] = ss.fit_transform(df_train[num_conti_col])
df_test[num_conti_col] = ss.transform(df_test[num_conti_col])

In [219]:
# Column name lists: numeric features (targets left out) and object-dtype columns
num_columns = (
    df_train._get_numeric_data()
    .drop(columns=['SalePrice', 'LogSalePrice'])
    .columns
    .tolist()
)
cat_columns = df_train.select_dtypes(include=['object']).columns.tolist()

In [220]:
# Replace missing values in the test frame's numeric feature columns with 0
# (the training frame is not touched here)
df_test[num_columns] = df_test[num_columns].fillna(0)

## Export Data

In [221]:
# Write both processed frames to disk (test first, then train).
# to_csv is called with its defaults, so the row index is written as the first column.
for frame, out_path in (
    (df_test, '../data/processed/test_scaled2.csv'),
    (df_train, '../data/processed/train_scaled2.csv'),
):
    frame.to_csv(out_path)