In [1]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline
%config InlineBackend.figure_format='retina'

In [2]:
def cleanHousingData(data_dir, prefix_dummies=False):
    """Clean the housing train/test CSVs and write one-hot encoded copies.

    Reads ``train.csv`` and ``test.csv`` from ``data_dir``, drops every column
    that contains a NaN in either file, one-hot encodes the remaining
    categorical columns over the combined train+test rows (so both outputs
    end up with the same dummy columns), and writes ``clean_train.csv`` and
    ``clean_test.csv`` back into ``data_dir``. ``SalePrice`` is written as the
    last column of ``clean_train.csv`` only.

    Parameters
    ----------
    data_dir : str
        Directory containing ``train.csv`` and ``test.csv``. A trailing path
        separator is optional.
    prefix_dummies : bool, default False
        When False the dummy columns are named after the category value alone,
        which can yield duplicate column names when different source columns
        share category values. When True every dummy column is named
        ``<source column>_<value>`` so the names stay unique per source column.

    Returns
    -------
    None
        The results are written to disk and two confirmation lines are printed.
    """
    # Reading the data (os.path.join tolerates a missing trailing separator)
    train_data = pd.read_csv(os.path.join(data_dir, 'train.csv'))
    test_data = pd.read_csv(os.path.join(data_dir, 'test.csv'))

    # Every column that has at least one NaN in either dataset is discarded
    train_nan_cols = set(train_data.columns[train_data.isna().any()])
    test_nan_cols = set(test_data.columns[test_data.isna().any()])
    del_cols = list(train_nan_cols | test_nan_cols)
    # Note: the Id column is not dropped; it is kept in both outputs.

    # Columns to one-hot encode; names that were dropped above (or are simply
    # absent from the data) are never visited by the encoding loop below.
    categorical_cols = {
        'MSSubClass', 'MSZoning', 'Street', 'LotShape', 'LandContour',
        'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
        'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd',
        'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
        'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical',
        'KitchenQual', 'Functional', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
        'PavedDrive', 'SaleType', 'SaleCondition',
    }

    # errors='ignore' keeps this working when a NaN column exists in only one
    # of the two files.
    clean_train = train_data.drop(columns=del_cols, errors='ignore')
    clean_test = test_data.drop(columns=del_cols, errors='ignore')

    # Set the target aside so it is not encoded or mixed into the test rows
    Y_train = clean_train['SalePrice']
    clean_train = clean_train.drop(columns=['SalePrice'])
    n_train = len(clean_train)

    # Stack train on top of test so the dummy columns are derived from the
    # categories of both. ignore_index gives the combined frame a unique
    # index, and the first n_train rows are exactly the training rows.
    data = pd.concat([clean_train, clean_test], axis=0, ignore_index=True)

    # Collect the encoded / untouched columns and concatenate once, instead of
    # re-copying a growing frame on every iteration.
    pieces = []
    for col in data.columns:
        if col in categorical_cols:
            pieces.append(
                pd.get_dummies(data[col], prefix=col if prefix_dummies else None)
            )
        else:
            pieces.append(data[col])
    encoded = pd.concat(pieces, axis=1)

    # Separating the training and the testing rows again, by position
    final_train = encoded.iloc[:n_train].copy()
    final_test = encoded.iloc[n_train:]

    # Assign by position: row order of the training block is unchanged
    final_train['SalePrice'] = Y_train.values

    # Creates the new csv files in the same directory
    final_train.to_csv(os.path.join(data_dir, 'clean_train.csv'), index=False)
    final_test.to_csv(os.path.join(data_dir, 'clean_test.csv'), index=False)

    print('clean_train.csv is saved in {}'.format(data_dir))
    print('clean_test.csv is saved in {}'.format(data_dir))

In [3]:
# Directory containing train.csv and test.csv. Set the HOUSING_DATA_DIR
# environment variable to point the notebook at a different location; the
# original hard-coded path is kept as the fallback so existing runs behave
# the same when the variable is unset.
data_dir = os.environ.get(
    'HOUSING_DATA_DIR',
    '/Users/adamepstein/Downloads/house-prices-advanced-regression-techniques/',
)
cleanHousingData(data_dir)

clean_train.csv is saved in /Users/adamepstein/Downloads/house-prices-advanced-regression-techniques/
clean_test.csvis saved in /Users/adamepstein/Downloads/house-prices-advanced-regression-techniques/
