In [62]:
import pandas as pd
import numpy as np
pd.options.display.float_format = '{:20,.4f}'.format

## Bring in the Data

In [64]:
train = pd.read_csv("train.csv")
del train["GarageYrBlt"]
del train['Exterior1st']
del train['Exterior2nd']
del train["Condition2"]
del train['MSSubClass']
train_x = train.drop("SalePrice", axis = 1)
train_y = train["SalePrice"]

## Variable Exploration

In [65]:
#impute some missing values
columns = ["PoolQC", "MiscFeature", "Alley", "Fence", "FireplaceQu", 
           "GarageType", "GarageCond", "GarageFinish", "GarageQual", 
           "BsmtExposure", "BsmtFinType2", "BsmtFinType1", "BsmtCond", "BsmtQual"]

for col in columns:
    train_x[col] = train_x[col].fillna("NA")
    test[col] = test[col].fillna("NA")
    
train_x["LotFrontage"] = train_x["LotFrontage"].fillna(0)
test["LotFrontage"] = test["LotFrontage"].fillna(0)

In [66]:
train_con_cols = ['LotFrontage', 'LotArea',  'MasVnrArea','BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF',
                  '1stFlrSF', '2ndFlrSF','LowQualFinSF','GrLivArea','GarageArea','WoodDeckSF', 'OpenPorchSF',
                  'EnclosedPorch','3SsnPorch','ScreenPorch','PoolArea','MiscVal']

train_counts_cols = ['BsmtFullBath','BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'MSZoning', 
                     'KitchenAbvGr', 'TotRmsAbvGrd',  'Fireplaces', 'GarageCars', "remodel_flag"]

train_cat = ["EraBuilt", "SeasonSold", "Neighborhood", "HouseStyle",'Utilities','Street',
             'OverallQual', 'OverallCond','Alley','LotShape','LandContour','MSZoning',
             'LotConfig','LandSlope','Condition1', 'BldgType','HouseStyle','RoofStyle',
             'RoofMatl', 'MasVnrType', 'ExterQual','ExterCond','Foundation','BsmtQual','BsmtCond',
             'BsmtExposure','BsmtFinType1','BsmtFinType2','Heating','HeatingQC','CentralAir','Electrical',
             'KitchenQual','Functional','FireplaceQu','GarageType','GarageFinish','GarageQual','GarageCond',
             'PavedDrive','PoolQC', 'Fence','MiscFeature','SaleType','SaleCondition']

drop_cols = ["YearBuilt", "MoSold", 'YearRemodAdd']

## Categorical Variables

In [67]:
train_x["SaleType"] = np.where(~train_x["SaleType"].isin(["WD", "New"]), "Other", train_x["SaleType"])

In [68]:
train_x["Condition1"] = np.where(train_x["Condition1"].isin(["RRAn", "RRAe", "RRNn", "RRNe"]), "rr_adj",
                           np.where(train_x["Condition1"].isin(["Artery", "Feedr"]), "road_adj",
                              np.where(train_x["Condition1"].isin(["PosA", "PosN"]), "pos_adj", 
                                       "norm"
                           )))

In [69]:
train_x["EraBuilt"] = np.where(train_x["YearBuilt"] < 1950, "pre_1950", 
                          np.where(train_x["YearBuilt"] < 1960, "1950s", 
                             np.where(train_x["YearBuilt"] < 1970, "1960s",
                                np.where(train_x["YearBuilt"] < 1980, "1970s", 
                                   np.where(train_x["YearBuilt"] < 1990, "1980s", 
                                      np.where(train_x["YearBuilt"] < 2000, "1990s", 
                                         "2000s"))))))

In [70]:
train_x["SeasonSold"] = np.where(train_x["MoSold"].isin([12, 1, 2]), "Winter", 
                           np.where(train_x["MoSold"].isin([3, 4, 5]), "Spring", 
                              np.where(train_x["MoSold"].isin([6, 7, 8]), "Summer",
                                       "Fall")))

In [71]:
train_x["remodel_flag"] = np.where(train_x["YearBuilt"] != train_x['YearRemodAdd'], 1, 0)

In [72]:
train_x['MSZoning'] = np.where(train_x['MSZoning'].str.contains("R"), "res", "non_res")

In [73]:
train.groupby(["Neighborhood"])["SalePrice"].describe().reset_index().sort_values("mean",
                                                                                  ascending = False)
highend_hoods = ["NoRidge", "NridgeHt", "StoneBr", "Timber", "Veenker", "Somerst", 
                 "ClearCr", "Crawfor"]
midend_hoods = ["CollgCr", "Blmngtn", "Gilbert", "NWAmes", "SawyerW", "Mitchel", 
                "NAMes", "NPkVill"]
lowend_hoods = ["SWISU", "Blueste", "Sawyer", "OldTown", "Edwards","BrkSide", "BrDale", 
                "IDOTRR", "MeadowV"]
train_x["Neighborhood"] = np.where(train_x["Neighborhood"].isin(highend_hoods), "highend_hood", 
                             np.where(train_x["Neighborhood"].isin(midend_hoods), "midend_hood", 
                                  "lowend_hood"))

In [74]:
train_x["HouseStyle"] = np.where(train_x["HouseStyle"].isin(["1Story", "1.5Fin", "1.5Unf"]), "1Story", 
                             np.where(train_x["HouseStyle"].isin(["2Story", "2.5Fin", "2.5Unf"]), "2Story",
                                     "Split"))

In [75]:
train_x['OverallQual'] = np.where(train_x['OverallQual'] <= 5, "bad_quality", 
                            np.where(train_x['OverallQual'] <= 7, "med_quality", 
                                     "high_quality"))

In [76]:
train_x['OverallCond'] = np.where(train_x['OverallCond'] <= 4, "bad_cond", 
                            np.where(train_x['OverallCond'] == 5, "med_cond", 
                                     "great_cond"))

## Scaling

In [77]:
from sklearn import preprocessing

In [78]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
train_x[train_con_cols] = scaler.fit_transform(train_x[train_con_cols].fillna(0))

## Data For Model

In [82]:
# Get Dummy Variables
train_model = pd.get_dummies(train_x, columns = train_cat)
# Drop Duplicates
train_model = train_model.drop(train_cat, axis= 1).drop(drop_cols, axis = 1)

In [84]:
pd.concat([train_model,
           train_y], axis = 1).to_csv("train_model.csv")

In [86]:
train_model.shape

(1460, 237)