In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score
import statsmodels.api as sm

In [2]:
# Load the cleaned train/test splits. keep_default_na=False stops pandas from
# converting its default NA markers (e.g. the literal string 'NA') into NaN,
# and index_col=0 uses the first CSV column as the row index.
ames = pd.read_csv(
    './datasets/train_clean.csv',
    keep_default_na=False,
    index_col=0,
)
ames_test = pd.read_csv(
    './datasets/test_clean.csv',
    keep_default_na=False,
    index_col=0,
)

In [3]:
def sum_features(df, new_col_name, features_list):
    """Return a copy of ``df`` with a new column holding the row-wise sum of features.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is never modified.
    new_col_name : str
        Name of the column to add (an existing column of that name is overwritten
        in the returned copy).
    features_list : sequence of str
        Column labels of ``df`` to add together. May be empty, in which case the
        new column is filled with 0 (the additive identity).

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with ``new_col_name`` added.

    Raises
    ------
    KeyError
        If a label in ``features_list`` is not a column of ``df``.
    """
    dfcopy = df.copy()

    # An empty sum is 0; without this guard features_list[0] raises IndexError.
    if len(features_list) == 0:
        dfcopy[new_col_name] = 0
        return dfcopy

    # sum_col : running total, seeded with zeros of the first feature's dtype
    sum_col = np.zeros_like(dfcopy[features_list[0]])

    for feature in features_list:
        # plain `+` (not DataFrame.sum) so a NaN in any feature propagates to the total
        sum_col = sum_col + dfcopy[feature]

    dfcopy[new_col_name] = sum_col
    return dfcopy

def multiply_features(df, new_col_name, features_list):
    """Return a copy of ``df`` with a new column holding the row-wise product of features.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is never modified.
    new_col_name : str
        Name of the column to add (an existing column of that name is overwritten
        in the returned copy).
    features_list : sequence of str
        Column labels of ``df`` to multiply together. May be empty, in which case
        the new column is filled with 1 (the multiplicative identity).

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with ``new_col_name`` added.

    Raises
    ------
    KeyError
        If a label in ``features_list`` is not a column of ``df``.
    """
    dfcopy = df.copy()

    # An empty product is 1; without this guard features_list[0] raises IndexError.
    if len(features_list) == 0:
        dfcopy[new_col_name] = 1
        return dfcopy

    # product_col : running product, seeded with 1 because 1 * x = x
    product_col = np.zeros_like(dfcopy[features_list[0]]) + 1

    for feature in features_list:
        # read from the copy, matching sum_features; a NaN in any feature propagates
        product_col = product_col * dfcopy[feature]

    dfcopy[new_col_name] = product_col
    return dfcopy

In [4]:
#Create a new column for Total SF: sum of above-ground living area, total basement square feet, and garage area.
#Each column list is defined once so the train and test frames always get identical features.

TOTAL_SF_COLS = ['Gr Liv Area', 'Total Bsmt SF', 'Garage Area']
ames = sum_features(ames, 'Total SF', TOTAL_SF_COLS)
ames_test = sum_features(ames_test, 'Total SF', TOTAL_SF_COLS)

#Create a new column for Porch SF: sum of all deck/porch areas

PORCH_SF_COLS = ['Wood Deck SF', 'Open Porch SF', 'Enclosed Porch', '3Ssn Porch', 'Screen Porch']
ames = sum_features(ames, 'Porch SF', PORCH_SF_COLS)
ames_test = sum_features(ames_test, 'Porch SF', PORCH_SF_COLS)

In [5]:
#Interaction term 'SF Qual' = 'Total SF' * 'Overall Qual', added to both train and test

ames = multiply_features(
    df=ames,
    new_col_name='SF Qual',
    features_list=['Total SF', 'Overall Qual'],
)
ames_test = multiply_features(
    df=ames_test,
    new_col_name='SF Qual',
    features_list=['Total SF', 'Overall Qual'],
)

In [6]:
#'Age' = year sold minus year built, i.e. the home's age (in years) at time of sale

ames['Age'] = ames['Yr Sold'].sub(ames['Year Built'])
ames_test['Age'] = ames_test['Yr Sold'].sub(ames_test['Year Built'])

In [7]:
#Create a new column that is the log of age, with 0.5 years added so that Age == 0 gives a finite value instead of -inf.
#Age is clipped at 0 inside the log: a negative age (sale year earlier than build year) would otherwise
#make the argument negative and produce NaN. For non-negative ages the result is unchanged.

ames['Log Age'] = np.log(ames['Age'].clip(lower=0) + 0.5)
ames_test['Log Age'] = np.log(ames_test['Age'].clip(lower=0) + 0.5)

In [8]:
# Sanity check: (rows, columns) of the training frame after feature engineering.
ames.shape

(2047, 86)

In [9]:
# Sanity check: (rows, columns) of the test frame after feature engineering.
ames_test.shape

(878, 85)

In [10]:
# Persist the feature-engineered frames; the row index is written as the first CSV column.
ames.to_csv(path_or_buf='./datasets/train_features.csv')
ames_test.to_csv(path_or_buf='./datasets/test_features.csv')