In [10]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import StandardScaler

# Load the training data and discard the columns this script does not use:
#   Id           - not needed
#   YearRemodAdd - easier to leave out
#   GarageYrBlt  - easier to leave out
unused_columns = ["Id", "YearRemodAdd", "GarageYrBlt"]
df = pd.read_csv("data/train.csv").drop(columns=unused_columns)

# Hard-coded feature groups. The order of each list is significant: it
# determines the order of the printed NA report and of the output columns.

# Columns handled as numeric features
numeric_vars = [
    "LotFrontage", "LotArea", "MasVnrArea",
    "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF",
    "1stFlrSF", "2ndFlrSF", "LowQualFinSF", "GrLivArea",
    "BsmtFullBath", "BsmtHalfBath", "FullBath", "HalfBath",
    "BedroomAbvGr", "KitchenAbvGr", "TotRmsAbvGrd",
    "Fireplaces", "GarageCars", "GarageArea",
    "WoodDeckSF", "OpenPorchSF", "EnclosedPorch", "3SsnPorch", "ScreenPorch",
    "PoolArea", "MiscVal",
]

# Columns handled as categorical features
categorical_vars = [
    "MSSubClass", "MSZoning", "Street", "Alley", "LotShape", "LandContour",
    "Utilities", "LotConfig", "LandSlope", "Neighborhood",
    "Condition1", "Condition2", "BldgType", "HouseStyle",
    "OverallQual", "OverallCond", "YearBuilt",
    "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd", "MasVnrType",
    "ExterQual", "ExterCond", "Foundation",
    "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2",
    "Heating", "HeatingQC", "CentralAir", "Electrical",
    "KitchenQual", "Functional", "FireplaceQu",
    "GarageType", "GarageFinish", "GarageQual", "GarageCond", "PavedDrive",
    "PoolQC", "Fence", "MiscFeature",
    "MoSold", "YrSold", "SaleType", "SaleCondition",
]

# Categorical columns that will have their NaN values converted to "None"
cat_nantonone = [
    "Alley",
    "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2",
    "FireplaceQu",
    "GarageType", "GarageFinish", "GarageQual", "GarageCond",
    "PoolQC", "Fence", "MiscFeature",
]


# Impute missing values and report what is still missing in between.
#
# Every fill is assigned back to its column instead of calling
# `fillna(inplace=True)` on the `df[col]` selection: the in-place form is
# chained assignment, which newer pandas versions warn about and which does
# not update `df` at all once Copy-on-Write is enabled.

# Replace NaN values in LotFrontage with 0
df['LotFrontage'] = df['LotFrontage'].fillna(0)

# Replace NaN in the listed categorical columns with the category "None"
for col in cat_nantonone:
    df[col] = df[col].fillna('None')

# Observe remaining NA entries
for col in numeric_vars:
    print(f"{col}: {df[col].isnull().sum()}")

for col in categorical_vars:
    print(f"{col}: {df[col].isnull().sum()}")

print(f"\nTotal remaining NA entries: {df.isnull().sum().sum()}")

# Impute remaining NA values
df['MasVnrType'] = df['MasVnrType'].fillna('None')
df['MasVnrArea'] = df['MasVnrArea'].fillna(0)

# Fill Electrical with its most frequent value. The fill is restricted to the
# Electrical column: applying it to the whole frame would write an Electrical
# category into any other column that still contained a NaN.
electrical_mode = df['Electrical'].value_counts().index[0]
df['Electrical'] = df['Electrical'].fillna(electrical_mode)

# Pass every numeric feature column through pd.to_numeric so it holds a
# numeric dtype (integer or float, whichever pandas infers for the values).
df = df.assign(**{name: pd.to_numeric(df[name]) for name in numeric_vars})


# Cast every categorical feature column to str, so that number-coded
# categories are stored as text labels rather than as numbers.
df = df.astype({name: str for name in categorical_vars})


# Remove rows containing outliers.
#
# The numeric columns are processed one after another, and each column's
# z-scores are computed on the rows that survived the previous columns. A row
# is kept only if its absolute z-score in the current column is below 3.5.
for col in numeric_vars:
    # A column with a single distinct value has zero standard deviation, so
    # all of its z-scores are NaN. `NaN < 3.5` is False, which would discard
    # every remaining row. Such a column cannot contain outliers, so skip it.
    if df[col].nunique(dropna=False) <= 1:
        continue
    df = df[np.abs(stats.zscore(df[col])) < 3.5]


# Standardize numeric variables: StandardScaler centres each column on zero
# mean and scales it to unit variance. It does not map values into [0, 1].
#
# `df` is the result of boolean-mask row filtering at this point. Take an
# explicit copy so the column assignment below writes to an independent frame
# and does not raise SettingWithCopyWarning on pandas versions without
# Copy-on-Write.
df = df.copy()
scaler = StandardScaler()
df[numeric_vars] = scaler.fit_transform(df[numeric_vars])


# Numeric features are carried over unchanged
numeric_data = df.loc[:, numeric_vars]

# One-hot encode the categorical features (one indicator column per category)
encoded_data = pd.get_dummies(df.loc[:, categorical_vars])

# The target is taken positionally: whatever column is last in df.
# NOTE(review): presumably the prediction target - confirm against the CSV layout.
target = df.iloc[:, -1]


# Assemble indicators, numeric features and target side by side, discard any
# row that still holds a missing value, then write the result without the index
df_final = pd.concat(
    (encoded_data, numeric_data, target),
    axis="columns",
).dropna()
df_final.to_csv("data/df_final.csv", index=False)

LotFrontage: 0
LotArea: 0
MasVnrArea: 8
BsmtFinSF1: 0
BsmtFinSF2: 0
BsmtUnfSF: 0
TotalBsmtSF: 0
1stFlrSF: 0
2ndFlrSF: 0
LowQualFinSF: 0
GrLivArea: 0
BsmtFullBath: 0
BsmtHalfBath: 0
FullBath: 0
HalfBath: 0
BedroomAbvGr: 0
KitchenAbvGr: 0
TotRmsAbvGrd: 0
Fireplaces: 0
GarageCars: 0
GarageArea: 0
WoodDeckSF: 0
OpenPorchSF: 0
EnclosedPorch: 0
3SsnPorch: 0
ScreenPorch: 0
PoolArea: 0
MiscVal: 0
MSSubClass: 0
MSZoning: 0
Street: 0
Alley: 0
LotShape: 0
LandContour: 0
Utilities: 0
LotConfig: 0
LandSlope: 0
Neighborhood: 0
Condition1: 0
Condition2: 0
BldgType: 0
HouseStyle: 0
OverallQual: 0
OverallCond: 0
YearBuilt: 0
RoofStyle: 0
RoofMatl: 0
Exterior1st: 0
Exterior2nd: 0
MasVnrType: 8
ExterQual: 0
ExterCond: 0
Foundation: 0
BsmtQual: 0
BsmtCond: 0
BsmtExposure: 0
BsmtFinType1: 0
BsmtFinType2: 0
Heating: 0
HeatingQC: 0
CentralAir: 0
Electrical: 1
KitchenQual: 0
Functional: 0
FireplaceQu: 0
GarageType: 0
GarageFinish: 0
GarageQual: 0
GarageCond: 0
PavedDrive: 0
PoolQC: 0
Fence: 0
MiscFeature: 0
M