# Test Data Preparation

#### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import Lasso, LassoCV
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import Lasso, LassoCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge

**Read in data**

In [2]:
# Raw test set to be cleaned in this notebook.
test = pd.read_csv('datasets/test.csv')

# Two training-set exports; the first is read with pandas' default NA
# detection turned off.
# NOTE(review): neither `ames` nor `ames1` is referenced in the cells shown
# in this notebook -- confirm they are still needed.
ames = pd.read_csv('./datasets/ames_v1.csv', keep_default_na=False)
ames1 = pd.read_csv('datasets/amesv2.csv')

# Keep the original Id column as a one-column DataFrame, captured before any
# cleaning is applied to `test`.
test_id = test[['Id']]

**Ensure the predictors in the test data match the training dataset**

In [3]:
# Normalise the headers: lower-case every column name and swap spaces for
# underscores so later cells can refer to snake_case names.
test.columns = [header.lower().replace(" ", "_") for header in test.columns]

# Remove the first batch of columns that are not used as predictors.
unused_columns = ['pid', 'alley', 'misc_feature', 'mas_vnr_type']
test.drop(columns=unused_columns, inplace=True)

**Apply same cleaning methods from train dataset to test data set**

In [4]:
# Columns removed outright instead of being imputed or engineered.
test.drop(columns=['pool_qc', 'fence', 'garage_yr_blt', 'mo_sold', 'misc_val'], inplace=True)
test.rename(columns={'kitchen_abvgr': 'kitchen'}, inplace=True)

# Value used to fill the missing entries of each column.
fill_values = {
    # Categorical quality / finish columns.
    'bsmt_qual': 'NA',
    'bsmt_cond': 'NA',
    'bsmt_exposure': 'NA',
    'bsmtfin_type_1': 'NA',
    'bsmtfin_type_2': 'NA',
    'garage_finish': 'NA',
    'garage_qual': 'NA',
    # NOTE(review): garage_cond is the only categorical column filled with
    # 'TA' instead of 'NA' -- confirm this matches the training notebook.
    'garage_cond': 'TA',
    'fireplace_qu': 'NA',
    # Counts and areas where a missing value is filled with zero.
    'bsmt_full_bath': 0,
    'bsmt_half_bath': 0,
    'garage_cars': 0,
    'mas_vnr_area': 0,
    'bsmtfin_sf_1': 0,
    'bsmtfin_sf_2': 0,
    'garage_area': 0,
    # Fixed constants; presumably statistics taken from the training data --
    # TODO confirm against the training notebook.
    'lot_frontage': 69,
    'bsmt_unf_sf': 568,
    'total_bsmt_sf': 1058,
}

# Assign each filled column back to the frame. Calling
# test[col].fillna(..., inplace=True) is chained assignment on a column
# selection: pandas warns about it, and with Copy-on-Write enabled it no
# longer updates `test`, which would silently leave the NaNs in place.
for column, fill in fill_values.items():
    test[column] = test[column].fillna(fill)

# Engineered features (half baths count as 0.5 of a bathroom).
test['total_bath'] = test['bsmt_full_bath'] + test['bsmt_half_bath']*0.5 + test['full_bath'] + test['half_bath']*0.5
# Rooms above grade that are neither kitchens nor bedrooms.
test['extra_rms'] = test['totrms_abvgrd'] - test['kitchen'] - test['bedroom_abvgr']
# Basement area plus above-grade living area.
test['total_sf'] = test['total_bsmt_sf'] + test['gr_liv_area']
# Combined area of every porch / deck column.
test['porch_sf'] = test['screen_porch']+test['3ssn_porch']+test['enclosed_porch']+test['open_porch_sf']+test['wood_deck_sf']

In [5]:
def binarize(col, threshold=1):
    """Convert a numeric column into a list of 0/1 flags.

    Parameters
    ----------
    col : iterable of numbers
        Values to flag, e.g. a DataFrame column.
    threshold : number, default 1
        A value is flagged 1 only when it is strictly greater than this.

    Returns
    -------
    list of int
        One flag per input value, in the original order.

    Notes
    -----
    The comparison is strict, so with the default threshold a value of
    exactly 1 maps to 0. NaN compares False against any number and therefore
    also maps to 0. Pass ``threshold=0`` to flag every positive value.
    """
    return [1 if value > threshold else 0 for value in col]

In [6]:
# 0/1 indicator features derived from the numeric columns.
# NOTE(review): binarize flags values strictly greater than 1, so a house
# with exactly one fireplace (or exactly one extra room) gets a 0 flag.
# Confirm this matches how the training features were built before changing it.
test['has_bsmt'] = binarize(test['total_bsmt_sf'])
test['has_porch'] = binarize(test['porch_sf'])
test['has_2fl'] = binarize(test['2nd_flr_sf'])
test['has_garage'] = binarize(test['garage_area'])
test['has_fireplace'] = binarize(test['fireplaces'])
test['has_extrms'] = binarize(test['extra_rms'])

# total_sf is not recomputed here: the cleaning cell earlier in this notebook
# already derives it from total_bsmt_sf + gr_liv_area, and neither input has
# changed since.

# Drop everything that is no longer needed in a single call.
test.drop(
    columns=[
        # Raw area / room / bath columns already folded into the engineered
        # features (porch_sf, total_sf, total_bath, extra_rms, has_* flags).
        'screen_porch', '3ssn_porch', 'enclosed_porch', 'open_porch_sf', 'wood_deck_sf',
        'total_bsmt_sf', 'gr_liv_area', 'bsmtfin_sf_1', 'bsmtfin_sf_2', 'bsmt_unf_sf',
        '1st_flr_sf', '2nd_flr_sf', 'low_qual_fin_sf', 'totrms_abvgrd',
        'bsmt_full_bath', 'bsmt_half_bath', 'full_bath', 'half_bath',
        # Further columns excluded from the predictors.
        'condition_2', 'roof_matl', 'exterior_1st', 'exterior_2nd', 'heating',
        'pool_area', 'electrical', 'street', 'garage_qual', 'bsmt_exposure',
        'sale_type',
    ],
    inplace=True,
)

Dropped columns that had differing values from the train data set.

In [7]:
# Remove this batch of categorical columns with one call instead of one call
# per column.
test.drop(
    columns=[
        'bsmt_cond',
        'bsmt_qual',
        'bsmtfin_type_1',
        'bsmtfin_type_2',
        'functional',
        'fireplace_qu',
        'heating_qc',
        'utilities',
        'garage_type',
        'garage_finish',
        'exter_cond',
        'ms_zoning',
    ],
    inplace=True,
)

Dropped columns identified from the Pearson correlation matrix.

In [8]:
# Remove the remaining numeric predictors and helper flags that are not kept
# for the final feature set.
test.drop(
    columns=[
        'fireplaces',
        'garage_area',
        'year_built',
        'overall_qual',
        'has_garage',
        'has_bsmt',
        'overall_cond',
        'extra_rms',
        'year_remod/add',
    ],
    inplace=True,
)

In [9]:
# Replace values that are inconsistent with the training data with the mode of the testing data.

In [10]:
# Collapse category values into the replacement chosen for each column.
# The result is assigned back to the column: calling
# test[col].replace(..., inplace=True) is chained assignment on a column
# selection, which pandas warns about and which stops updating `test` once
# Copy-on-Write is enabled.
test['kitchen_qual'] = test['kitchen_qual'].replace('Po', 'TA')
test['foundation'] = test['foundation'].replace({'Slab': 'PConc', 'Wood': 'PConc'})

# Disabled replacements, kept for reference. Every column below except
# `neighborhood` is dropped earlier in this notebook.
#test['bsmt_cond'].replace('Po','TA',inplace=True)
#test['utilities'].replace('NoSewr','AllPub',inplace=True)
#test['ms_zoning'].replace('I (all)','RL',inplace=True)
#test['exter_cond'].replace('Po','TA',inplace=True)
#test['bsmt_qual'].replace('Po','TA',inplace=True)
#test['neighborhood'].replace('BrDale','NAmes',inplace=True)
#test['ms_zoning'].replace('FV','RL',inplace=True)
#test['garage_type'].replace('CarPort','Attchd',inplace=True)

In [11]:
# Persist the cleaned test set; the DataFrame index is not written to the file.
test.to_csv(path_or_buf='datasets/test_clean.csv', index=False)