In [26]:
#Import needed packages
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import math 
import random

In [27]:
#Load the raw training data; the 'Id' column is discarded right after reading.
train = pd.read_csv('train.csv').drop(columns='Id')

In [28]:
#Creating new features
#remodeled: 1 if the house was remodeled after it was built, else 0.
#           YearRemodAdd repeats YearBuilt when there was no remodel/addition,
#           so the flag is set only when the two years differ.
#newHouse:  1 if the house was sold in the same year it was built, else 0.
#season:    season and year in which the house was sold, e.g. 'Summer-2008'.
#randInit:  a column of random integers in [0, 10000) to filter on later.
train['remodeled'] = (train['YearBuilt'] != train['YearRemodAdd']).astype(int)
train['newHouse'] = (train['YearBuilt'] == train['YrSold']).astype(int)

#Each calendar month maps to exactly one season. The previous if/elif chain
#tested [3, 4, 5] twice, so 'Fall' could never be produced and June-November
#all fell through to 'Winter'. A month outside 1-12 yields a missing season
#instead of being silently labelled.
season_by_month = {
    12: 'Winter', 1: 'Winter', 2: 'Winter',
    3: 'Spring', 4: 'Spring', 5: 'Spring',
    6: 'Summer', 7: 'Summer', 8: 'Summer',
    9: 'Fall', 10: 'Fall', 11: 'Fall',
}
train['season'] = train['MoSold'].map(season_by_month) + '-' + train['YrSold'].astype(str)

#np.random draws from NumPy's own generator, which random.seed() does not
#touch; seed both so the random column is identical on every run.
random.seed(23)
np.random.seed(23)
randInit = np.random.choice(np.arange(0, 10000), size=len(train))
train['randInit'] = randInit

In [29]:
#MiscFeature and PoolQC are removed because they had too many nulls.
sparse_columns = ['MiscFeature', 'PoolQC']
train = train.drop(columns=sparse_columns)

In [30]:
#NOTE: every fill below assigns the result back to the column. Calling
#train[col].fillna(..., inplace=True) mutates a temporary Series; pandas warns
#about it and, with Copy-on-Write enabled, it leaves `train` unchanged.

#Masonry data was missing. Replace with 'None' (type) and 0 (area).
train['MasVnrType'] = train['MasVnrType'].fillna('None')
train['MasVnrArea'] = train['MasVnrArea'].fillna(0)

#Changing some of the columns to ordinal values
ordinal_maps = {
    'LandSlope': {'Gtl': 0, 'Mod': 1, 'Sev': 2},
    'LotShape': {'Reg': 0, 'IR1': 1, 'IR2': 2, 'IR3': 3},
    'Street': {'Grvl': 0, 'Pave': 1},
}
train = train.replace(ordinal_maps)

#In these columns a null is filled with an explicit label meaning the house
#does not have that feature, so it becomes a category of its own rather than
#a missing value.
absent_labels = {
    'Alley': 'NoAlley',
    'BsmtCond': 'NoBasement',
    'BsmtExposure': 'NoBasement',
    'BsmtFinType1': 'NoBasement',
    'BsmtFinType2': 'NoBasement',
    'BsmtQual': 'NoBasement',
    'Fence': 'NoFence',
    'FireplaceQu': 'NoFireplace',
    'GarageCond': 'NoGarage',
    'GarageFinish': 'NoGarage',
    'GarageQual': 'NoGarage',
    'GarageType': 'NoGarage',
}
for column, label in absent_labels.items():
    train[column] = train[column].fillna(label)

#Central Air changed to 1 or 0. A value other than 'N'/'Y' raises KeyError.
centralair = {'N': 0, 'Y': 1}
train['CentralAir'] = [centralair[flag] for flag in train['CentralAir']]

#Further missing values
#randInit is numeric, so the fallback is the number 2138, not the string
#'2138' (a string would turn the column into mixed object dtype).
train['randInit'] = train['randInit'].fillna(2138)
train['Electrical'] = train['Electrical'].fillna('SBrkr')

In [31]:
#Changing the nulls to imputed values
#GarageYrBlt: missing years are replaced with the column mean. The result is
#assigned back because train[col].replace(..., inplace=True) acts on a
#temporary Series and does not update `train` under Copy-on-Write.
train['GarageYrBlt'] = train['GarageYrBlt'].fillna(train['GarageYrBlt'].mean())

#Using Lot Area to fill in values for Lot Frontage: LotArea is cut into 10
#equal-width bins and a missing LotFrontage takes the median LotFrontage of
#its bin. X holds one row per bin with that median.
train['areaBin'] = pd.cut(train['LotArea'], bins=10).astype(str)
X = train.groupby('areaBin')[['LotFrontage']].median().reset_index()
bin_median = train['areaBin'].map(X.set_index('areaBin')['LotFrontage'])
train['LotFrontage'] = train['LotFrontage'].fillna(bin_median)

#Adding together all SF columns to make a total SF column
sf_columns = ['TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'WoodDeckSF', 'OpenPorchSF',
              'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'GarageArea']
total_sf = train[sf_columns[0]]
for column in sf_columns[1:]:
    total_sf = total_sf + train[column]
train['TotalSF'] = total_sf

#A bin whose LotFrontage values are all missing has no median; those rows
#fall back to the fixed value 106.
train['LotFrontage'] = train['LotFrontage'].fillna(106)

In [32]:
#Save the cleaned frame to disk, leaving the row index out of the file.
train.to_csv('train_updated.csv', index=False)