# Make Features

This notebook makes all the features for both the train and test datasets.

In [5]:
# Libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

## 0.0 Get Data

In [6]:
# Get data
df_test = pd.read_csv("../data/interim/test_cleaned.csv")
df_train = pd.read_csv("../data/interim/train_cleaned.csv")

In [7]:
data_frame_list = [df_test, df_train]

## 1.0 Transformations

In [8]:
# Function to make Panda Series covert into log scale
def feature_log(df,col):
    df['Log'+col] = np.log1p(df[col])
    return df

In [10]:
df_train = feature_log(df_train,'SalePrice')
df_train = feature_log(df_train,'Gr Liv Area')
df_test = feature_log(df_test,'Gr Liv Area')


### 1.1 Datatypes

In [24]:
df_train['MS SubClass'] = df_train['MS SubClass'].apply(lambda x: str(x))
df_test['MS SubClass'] = df_test['MS SubClass'].apply(lambda x: str(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


## 2.0 Outliers

In [12]:
df_train = df_train[(df_train['Gr Liv Area'] < 4000) & (df_train['LogSalePrice'] > 10)]

## 3.0 Categorical

### 3.1 Ordinal

In [13]:
''' 
Input: Pass in a custom ordered dictionary and the column names you want to apply to the dateframe.
Output: Trasformed dataframe
'''
def feature_ordinal_custom(ordered_dict,column_list,df):
    for col in column_list:
        df[col] = df[col].map(ordered_dict)
    return df

In [25]:
# Make order numeric values for quality
ordered_rating_qual = { "NA": 0, "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}

# Not all of them but only a select few columns I want to convert
qual_list = ['Exter Qual',
             'Exter Cond',
             'Bsmt Qual',
             'Kitchen Qual',
             'Fireplace Qu',
             'Garage Qual'
            ]
# Apply to dataframes
df_train = feature_ordinal_custom(ordered_rating_qual,qual_list,df_train)
df_test = feature_ordinal_custom(ordered_rating_qual,qual_list,df_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


### 3.2 Dummies

In [30]:
'''
Input: Dataframe and list of columns to apply dummy
Output: 
'''
def feature_make_dummies(df,dummy_list):
    # Create dummies on dataset
    for col in dummy_list:
        dummy = pd.get_dummies(df[col], columns=col, prefix=col)
        df = pd.concat([df, dummy], axis=1)
    return df


In [31]:
#Setup dummies on short list, doing all will not be helpful
dummy_list = ['MS Zoning',
              'MS SubClass',
              'Neighborhood',
              'Bldg Type',
              'House Style',
              'Roof Style',
              'Exterior 1st',
              'Exterior 2nd',
              'Foundation',
              'Functional',
              'Garage Type',
              'Garage Finish',
              'Paved Drive']

In [32]:
df_train = feature_make_dummies(df_train,dummy_list)
df_test = feature_make_dummies(df_test,dummy_list)

In [33]:
print(df_train.shape)
print(df_test.shape)

(2045, 208)
(879, 201)


In [None]:
# We know that doing get dummies on the test set will short change us on the columns so lets prep for that
# DNR AGAIN
for col in dummy_list:
    df_test[col] = df_test[col].fillna("NA")
    test_values = sorted(list(df_test[col].unique()))
    train_values = sorted(list(df_train[col].unique()))
    categories = set(train_values + test_values)
    df_test[col] = pd.Categorical(df_test[col], categories=categories)
    df_train[col] = pd.Categorical(df_train[col], categories=categories)

## 4.0 Feature Engineering

### 4.1 Age of Remodel

In [None]:
df_train['Age of Remodel'] = df_train['Yr Sold']-df_train['Year Remod/Add']

### 4.2 Polynomial Features