# Project 2: Features, Galore: A Linear Regression Analysis on Home Features Predicting Sale Price
---

## Test Data Cleaning and Pre-processing

This section includes the test data that will be run through the optimal Linear Regression model. The data is cleaned and pre-processed with the same steps as the training data. The steps are outlined below. 

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

In [2]:
#Reading in the test.csv data (the rows that will be cleaned below and run through the model)
data=pd.read_csv('../datasets/test.csv')
#Reading in the un-scaled training features; used further down to align columns and fit the scaler
X_train=pd.read_csv('../datasets/X_train_pre_ss.csv')

In [3]:
data.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,0,0,0,,,,0,4,2006,WD
1,2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,...,0,0,0,,,,0,8,2006,WD
2,2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,...,0,0,0,,,,0,9,2006,New
3,1989,902207150,30,RM,60.0,8520,Pave,,Reg,Lvl,...,0,0,0,,,,0,7,2007,WD
4,625,535105100,20,RL,,9500,Pave,,IR1,Lvl,...,0,185,0,,,,0,7,2009,WD


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878 entries, 0 to 877
Data columns (total 80 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Id               878 non-null    int64  
 1   PID              878 non-null    int64  
 2   MS SubClass      878 non-null    int64  
 3   MS Zoning        878 non-null    object 
 4   Lot Frontage     718 non-null    float64
 5   Lot Area         878 non-null    int64  
 6   Street           878 non-null    object 
 7   Alley            58 non-null     object 
 8   Lot Shape        878 non-null    object 
 9   Land Contour     878 non-null    object 
 10  Utilities        878 non-null    object 
 11  Lot Config       878 non-null    object 
 12  Land Slope       878 non-null    object 
 13  Neighborhood     878 non-null    object 
 14  Condition 1      878 non-null    object 
 15  Condition 2      878 non-null    object 
 16  Bldg Type        878 non-null    object 
 17  House Style     

### Converting column names to lowercase snake case

In [5]:
#Function from Ben's brain to convert column names to snakecase 
import re 

def snake_case(df):
    """Rename the columns of ``df`` to lowercase snake_case, in place.

    A space is inserted at every lowercase-to-uppercase boundary
    (``SubClass`` -> ``Sub Class``), every space becomes an underscore and
    the result is lowercased, e.g. ``MS SubClass`` -> ``ms_sub_class``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are strings. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so both ``snake_case(df)`` and
        ``df = snake_case(df)`` work. (Previously the function returned the
        result of ``rename(..., inplace=True)``, which is always ``None``.)
    """
    camel_boundary = re.compile(r"([a-z])([A-Z])")
    new_cols = {
        col: camel_boundary.sub(r"\1 \2", col).replace(" ", "_").lower()
        for col in df.columns
    }
    # rename(..., inplace=True) returns None, so return the frame explicitly
    df.rename(columns=new_cols, inplace=True)
    return df

In [6]:
#Applying function to dataframe 
snake_case(data)

In [7]:
data.head()

Unnamed: 0,id,pid,ms_sub_class,ms_zoning,lot_frontage,lot_area,street,alley,lot_shape,land_contour,...,3ssn_porch,screen_porch,pool_area,pool_qc,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,0,0,0,,,,0,4,2006,WD
1,2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,...,0,0,0,,,,0,8,2006,WD
2,2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,...,0,0,0,,,,0,9,2006,New
3,1989,902207150,30,RM,60.0,8520,Pave,,Reg,Lvl,...,0,0,0,,,,0,7,2007,WD
4,625,535105100,20,RL,,9500,Pave,,IR1,Lvl,...,0,185,0,,,,0,7,2009,WD


#### Initial Data Cleaning 

In [8]:
#Storing ms_sub_class as strings (object dtype); it is one-hot encoded as a nominal column further down
sub_class_as_text = data['ms_sub_class'].astype('str')
data['ms_sub_class'] = sub_class_as_text

In [9]:
#Dropping rows that have a null in any of the columns below (only a minimal number of rows)
# NOTE(review): any row dropped here will be absent from the saved test set — confirm that is intended
columns_without_nulls_allowed = [
    'bsmt_fin_sf_1',
    'bsmt_fin_sf_2',
    'bsmt_unf_sf',
    'total_bsmt_sf',
    'garage_cars',
    'garage_area',
]
data.dropna(subset=columns_without_nulls_allowed, inplace=True)

In [10]:
#Changing central air to binary (1 for 'Y', 0 otherwise) b/c the column has no missing values
has_central_air = data['central_air'] == 'Y'
data['central_air'] = has_central_air.astype(int)

In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 878 entries, 0 to 877
Data columns (total 80 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               878 non-null    int64  
 1   pid              878 non-null    int64  
 2   ms_sub_class     878 non-null    object 
 3   ms_zoning        878 non-null    object 
 4   lot_frontage     718 non-null    float64
 5   lot_area         878 non-null    int64  
 6   street           878 non-null    object 
 7   alley            58 non-null     object 
 8   lot_shape        878 non-null    object 
 9   land_contour     878 non-null    object 
 10  utilities        878 non-null    object 
 11  lot_config       878 non-null    object 
 12  land_slope       878 non-null    object 
 13  neighborhood     878 non-null    object 
 14  condition_1      878 non-null    object 
 15  condition_2      878 non-null    object 
 16  bldg_type        878 non-null    object 
 17  house_style     

#### Feature Engineering New Columns/Dropping redundancies  

In [12]:
#Making new yes/no columns (1 for yes, 0 for no): a feature counts as present when its source column is not 0.
#No values need to be filled in these source columns, so this is okay to do now.
flag_sources = {
    'has_wood_deck': 'wood_deck_sf',
    'has_open_porch': 'open_porch_sf',
    'has_enclosed_porch': 'enclosed_porch',
    'has_3season_porch': '3ssn_porch',
    'has_screen_porch': 'screen_porch',
    'has_pool': 'pool_area',
}
for flag_name, source_name in flag_sources.items():
    data[flag_name] = np.where(data[source_name] != 0, 1, 0)

In [13]:
#Dropping the original size/area columns now that the has_* flags replace them
replaced_by_flags = [
    'wood_deck_sf',
    'open_porch_sf',
    'enclosed_porch',
    '3ssn_porch',
    'screen_porch',
    'pool_area',
]
data = data.drop(columns=replaced_by_flags)

In [14]:
#Dropping 'garage_area' due to redundancy with 'garage_cars'
data = data.drop(columns='garage_area')

In [15]:
#Dropping 'garage_cond' because it is redundant with 'garage_qual'
data = data.drop(columns='garage_cond')

In [16]:
#Dropping 'mas_vnr_area' because it is likely correlated with 'mas_vnr_type', which I think is more important
data = data.drop(columns='mas_vnr_area')

In [17]:
data.head()

Unnamed: 0,id,pid,ms_sub_class,ms_zoning,lot_frontage,lot_area,street,alley,lot_shape,land_contour,...,misc_val,mo_sold,yr_sold,sale_type,has_wood_deck,has_open_porch,has_enclosed_porch,has_3season_porch,has_screen_porch,has_pool
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,0,4,2006,WD,0,1,1,0,0,0
1,2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,...,0,8,2006,WD,1,0,0,0,0,0
2,2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,...,0,9,2006,New,1,1,0,0,0,0
3,1989,902207150,30,RM,60.0,8520,Pave,,Reg,Lvl,...,0,7,2007,WD,0,0,1,0,0,0
4,625,535105100,20,RL,,9500,Pave,,IR1,Lvl,...,0,7,2009,WD,0,1,0,0,1,0


#### Replacing numerical null values with median 

In [18]:
#Replace null values with the column's own median in:
#'Lot Frontage', 'Bsmt Full Bath', 'Bsmt Half Bath' and 'Garage Yr Blt'
median_imputed_columns = ['lot_frontage', 'bsmt_full_bath', 'bsmt_half_bath', 'garage_yr_blt']

for numeric_column in median_imputed_columns:
    column_median = data[numeric_column].median()
    data[numeric_column] = data[numeric_column].replace(np.nan, column_median)

#### Replacing categorical null values with 0

In [19]:
#Filling nulls in categorical columns with 0
zero_filled_columns = [
    'alley',
    'mas_vnr_type',
    'bsmt_qual',
    'bsmt_cond',
    'bsmt_exposure',
    'bsmt_fin_type_1',
    'bsmt_fin_type_2',
    'fireplace_qu',
    'garage_type',
    'garage_finish',
    'garage_qual',
    'pool_qc',
    'fence',
    'misc_feature',
]
for categorical_column in zero_filled_columns:
    data[categorical_column] = data[categorical_column].replace(np.nan, 0)

In [20]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 878 entries, 0 to 877
Data columns (total 77 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  878 non-null    int64  
 1   pid                 878 non-null    int64  
 2   ms_sub_class        878 non-null    object 
 3   ms_zoning           878 non-null    object 
 4   lot_frontage        878 non-null    float64
 5   lot_area            878 non-null    int64  
 6   street              878 non-null    object 
 7   alley               878 non-null    object 
 8   lot_shape           878 non-null    object 
 9   land_contour        878 non-null    object 
 10  utilities           878 non-null    object 
 11  lot_config          878 non-null    object 
 12  land_slope          878 non-null    object 
 13  neighborhood        878 non-null    object 
 14  condition_1         878 non-null    object 
 15  condition_2         878 non-null    object 
 16  bldg_typ

#### Ordering Ordinal Columns 

In [21]:
#Handling ordinal columns

#Each ordinal column is keyed directly to its category -> rank mapping (higher rank = better).
#The 0 key is the placeholder written by the null-filling step for columns that can be missing.
ordinal_dicts = {
    'lot_shape': {'IR3': 1, 'IR2': 2, 'IR1': 3, 'Reg': 4},
    'utilities': {'AllPub': 4, 'NoSewr': 3, 'NoSeWa': 2, 'ELO': 1},
    'land_slope': {'Gtl': 3, 'Mod': 2, 'Sev': 1},
    'exter_qual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1},
    'exter_cond': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1},
    'bsmt_qual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 0: 0},
    'bsmt_cond': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 0: 0},
    'bsmt_exposure': {'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1, 0: 0},
    'bsmt_fin_type_1': {'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 0: 0},
    'bsmt_fin_type_2': {'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 0: 0},
    'heating_qc': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1},
    'electrical': {'SBrkr': 5, 'FuseA': 4, 'FuseF': 3, 'FuseP': 2, 'Mix': 1},
    'kitchen_qual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1},
    'functional': {'Typ': 8, 'Min1': 7, 'Min2': 6, 'Mod': 5, 'Maj1': 4, 'Maj2': 3, 'Sev': 2, 'Sal': 1},
    'fireplace_qu': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 0: 0},
    'garage_finish': {'Fin': 3, 'RFn': 2, 'Unf': 1, 0: 0},
    'garage_qual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 0: 0},
    'paved_drive': {'Y': 3, 'P': 2, 'N': 1},
    'pool_qc': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 0: 0},
    'fence': {'GdPrv': 4, 'MnPrv': 3, 'GdWo': 2, 'MnWw': 1, 0: 0},
}

#Parallel list views of the mapping above: column names, and their mappings in the same order
ordinal_columns = list(ordinal_dicts.keys())
ordinal_dict = list(ordinal_dicts.values())

def ordinal_replace(df,columns):
    """Encode the given ordinal columns of ``df`` using ``ordinal_dicts``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to encode; the columns are overwritten.
    columns : iterable of str
        Column names; each must be a key of the module-level ``ordinal_dicts``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with each listed column passed through
        ``Series.map`` and its mapping. Values that are not keys of the
        mapping therefore come out as NaN.
    """
    for name in columns:
        ranking = ordinal_dicts[name]
        encoded = df[name].map(ranking)
        df[name] = encoded
    return df

In [22]:
#Calling function on the test data (maps every column in ordinal_columns through its ordinal mapping)
data=ordinal_replace(data, ordinal_columns)
data.head()

Unnamed: 0,id,pid,ms_sub_class,ms_zoning,lot_frontage,lot_area,street,alley,lot_shape,land_contour,...,misc_val,mo_sold,yr_sold,sale_type,has_wood_deck,has_open_porch,has_enclosed_porch,has_3season_porch,has_screen_porch,has_pool
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,4,Lvl,...,0,4,2006,WD,0,1,1,0,0,0
1,2718,905108090,90,RL,68.0,9662,Pave,0,3,Lvl,...,0,8,2006,WD,1,0,0,0,0,0
2,2414,528218130,60,RL,58.0,17104,Pave,0,3,Lvl,...,0,9,2006,New,1,1,0,0,0,0
3,1989,902207150,30,RM,60.0,8520,Pave,0,4,Lvl,...,0,7,2007,WD,0,0,1,0,0,0
4,625,535105100,20,RL,68.0,9500,Pave,0,3,Lvl,...,0,7,2009,WD,0,1,0,0,1,0


#### One-Hot-Encoding Categorical Columns with "Get_dummies"

In [23]:
#Dummy test.csv nominal columns
# NOTE(review): drop_first drops the first category found in *this* dataset, which may not be
# the category that was dropped when the training data was encoded — confirm the baselines match
nominal_columns = [
    'ms_sub_class', 'ms_zoning',
    'street', 'alley', 'land_contour', 'lot_config',
    'neighborhood', 'condition_1', 'condition_2',
    'bldg_type', 'house_style', 'roof_style', 'roof_matl',
    'exterior_1st', 'exterior_2nd', 'mas_vnr_type',
    'foundation', 'heating', 'garage_type', 'misc_feature',
    'sale_type',
]
data = pd.get_dummies(data, columns=nominal_columns, drop_first=True)

#### Bringing in X_train Un-scaled Dataset 

This X_train dataset is what the previous models were built upon. It has already been cleaned, had its missing values imputed, its ordinal columns encoded and its dummy columns created. The scaler will now be fit on this training data and applied to the test data, and all other pre-modeling steps will be repeated on both X_train and the test data accordingly. 


In [24]:
X_train.head()

Unnamed: 0.1,Unnamed: 0,id,pid,lot_frontage,lot_area,lot_shape,utilities,land_slope,overall_qual,overall_cond,...,misc_feature_Shed,misc_feature_TenC,sale_type_CWD,sale_type_Con,sale_type_ConLD,sale_type_ConLI,sale_type_ConLw,sale_type_New,sale_type_Oth,sale_type_WD
0,1957,239,905452110,50.0,9350,4,4,3,4,6,...,0,0,0,0,0,0,0,0,0,1
1,1966,2443,528315030,82.0,9452,4,4,3,8,5,...,0,0,0,0,0,0,0,0,0,1
2,838,441,528120170,77.0,10872,3,4,3,9,5,...,0,0,0,0,0,0,0,0,0,1
3,427,1379,905103030,80.0,13014,4,4,3,6,5,...,0,0,0,0,0,0,0,0,0,1
4,1489,1218,534428020,68.0,12493,3,4,3,4,5,...,0,0,0,0,0,0,0,0,0,1


In [25]:
# The saved row index comes back from the CSV as "Unnamed: 0"; give it a usable name
X_train = X_train.rename(columns={"Unnamed: 0": "index"})

In [26]:
# Use the 'index' column as the row index of X_train
X_train.set_index('index', inplace=True)

In [27]:
X_train.head()

Unnamed: 0_level_0,id,pid,lot_frontage,lot_area,lot_shape,utilities,land_slope,overall_qual,overall_cond,year_built,...,misc_feature_Shed,misc_feature_TenC,sale_type_CWD,sale_type_Con,sale_type_ConLD,sale_type_ConLI,sale_type_ConLw,sale_type_New,sale_type_Oth,sale_type_WD
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1957,239,905452110,50.0,9350,4,4,3,4,6,1947,...,0,0,0,0,0,0,0,0,0,1
1966,2443,528315030,82.0,9452,4,4,3,8,5,1997,...,0,0,0,0,0,0,0,0,0,1
838,441,528120170,77.0,10872,3,4,3,9,5,2006,...,0,0,0,0,0,0,0,0,0,1
427,1379,905103030,80.0,13014,4,4,3,6,5,1978,...,0,0,0,0,0,0,0,0,0,1
1489,1218,534428020,68.0,12493,3,4,3,4,5,1960,...,0,0,0,0,0,0,0,0,0,1


#### Dropping all columns that are not in both datasets from X_train and test data 

In [28]:
#Getting datasets to match 
set(X_train.columns)-set(data.columns)

{'condition_2_Feedr',
 'condition_2_PosN',
 'condition_2_RRAe',
 'condition_2_RRNn',
 'exterior_1st_CBlock',
 'exterior_1st_ImStucc',
 'exterior_1st_Stone',
 'exterior_2nd_Stone',
 'heating_OthW',
 'heating_Wall',
 'misc_feature_TenC',
 'ms_sub_class_150',
 'ms_zoning_C (all)',
 'neighborhood_GrnHill',
 'neighborhood_Landmrk',
 'roof_matl_Membran'}

In [29]:
#Dropping columns in X_train that are not in test data

# Computed from the two frames instead of a hand-copied list, so the cell keeps working
# (and cannot silently go stale or raise KeyError) if either dataset changes.
train_only_columns = X_train.columns.difference(data.columns)
X_train.drop(columns=train_only_columns, inplace=True)

In [30]:
#Getting datasets to match 
set(data.columns)-set(X_train.columns)

{'exterior_1st_AsphShn',
 'exterior_1st_PreCast',
 'exterior_2nd_Other',
 'exterior_2nd_PreCast',
 'heating_GasA',
 'mas_vnr_type_CBlock',
 'ms_zoning_I (all)',
 'roof_matl_Metal',
 'roof_matl_Roll',
 'sale_type_VWD'}

In [31]:
#Dropping columns in test data that are not in X_train

# Computed from the two frames instead of a hand-copied list, so the cell keeps working
# (and cannot silently go stale or raise KeyError) if either dataset changes.
test_only_columns = data.columns.difference(X_train.columns)
data.drop(columns=test_only_columns, inplace=True)

In [32]:
X_train.shape

(1637, 189)

In [33]:
data.shape

(878, 189)

#### Scaling the data using StandardScaler Class 

In [34]:
#Scaling data, dropping ID and PID so that they don't scale
id_columns = ['id', 'pid']

X_train_pre_ss=X_train.drop(columns=id_columns)
X_train_ids=X_train[id_columns]

data_ids=data[id_columns]

# The scaler is fit on X_train_pre_ss and then applied to the test features, so both frames
# must hold the same features in the same order. Fail loudly if the column sets differ,
# then select the test features in exactly the training column order.
missing_in_test = X_train_pre_ss.columns.difference(data.columns)
extra_in_test = data.columns.difference(X_train.columns)
if len(missing_in_test) > 0 or len(extra_in_test) > 0:
    raise ValueError(
        "Train/test feature columns do not match; "
        f"missing in test: {list(missing_in_test)}, extra in test: {list(extra_in_test)}"
    )
data_pre_ss=data.loc[:, X_train_pre_ss.columns]

#Instantiating Standard Scaler 
ss=StandardScaler()

In [35]:
#Fitting the scaler on the training features only, then applying that fitted scaler to the test features

train_scaled = ss.fit_transform(X_train_pre_ss)
Z_train_noid = pd.DataFrame(
    train_scaled,
    columns=X_train_pre_ss.columns,
    index=X_train_pre_ss.index,
)

test_scaled = ss.transform(data_pre_ss)
data_noid = pd.DataFrame(
    test_scaled,
    columns=data_pre_ss.columns,
    index=data_pre_ss.index,
)

In [36]:
#Join the id/pid columns back onto the scaled features (side by side, matched on row index)
train_parts = [X_train_ids, Z_train_noid]
test_parts = [data_ids, data_noid]

Z_train = pd.concat(train_parts, axis=1)
Z_test_data = pd.concat(test_parts, axis=1)

In [37]:
Z_train.head(2)

Unnamed: 0_level_0,id,pid,lot_frontage,lot_area,lot_shape,utilities,land_slope,overall_qual,overall_cond,year_built,...,misc_feature_Othr,misc_feature_Shed,sale_type_CWD,sale_type_Con,sale_type_ConLD,sale_type_ConLI,sale_type_ConLw,sale_type_New,sale_type_Oth,sale_type_WD
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1957,239,905452110,-0.921358,-0.106446,0.718504,0.0,0.218298,-1.47884,0.389398,-0.815609,...,-0.042848,-0.14781,-0.065532,-0.034975,-0.08947,-0.055351,-0.049492,-0.298588,-0.049492,0.395061
1966,2443,528315030,0.643738,-0.089214,0.718504,0.0,0.218298,1.337753,-0.505889,0.838116,...,-0.042848,-0.14781,-0.065532,-0.034975,-0.08947,-0.055351,-0.049492,-0.298588,-0.049492,0.395061


In [38]:
Z_test_data.head(2)

Unnamed: 0,id,pid,lot_frontage,lot_area,lot_shape,utilities,land_slope,overall_qual,overall_cond,year_built,...,misc_feature_Othr,misc_feature_Shed,sale_type_CWD,sale_type_Con,sale_type_ConLD,sale_type_ConLI,sale_type_ConLw,sale_type_New,sale_type_Oth,sale_type_WD
0,2658,902301120,0.007918,-0.141587,0.718504,0.0,0.218298,-0.070544,2.179972,-2.039365,...,-0.042848,-0.14781,-0.065532,-0.034975,-0.08947,-0.055351,-0.049492,-0.298588,-0.049492,0.395061
1,2718,905108090,-0.040992,-0.053735,-1.039629,0.0,0.218298,-0.774692,-1.401176,0.176626,...,-0.042848,-0.14781,-0.065532,-0.034975,-0.08947,-0.055351,-0.049492,-0.298588,-0.049492,0.395061


#### Dropping VIF columns from both datasets 

In [39]:
#Dropping from the training data the columns identified by the previous VIF
vif_drop_columns = [
    'ms_sub_class_20', 'exterior_2nd_VinylSd', 'garage_type_Attchd',
    'house_style_1Story', 'roof_style_Gable', 'exterior_2nd_CmentBd',
    'exterior_1st_MetalSd', 'ms_zoning_RL', 'house_style_2Story', 'house_style_SLvl',
    'mas_vnr_type_None', 'bldg_type_2fmCon', 'pool_qc', 'exterior_1st_VinylSd',
    'year_built', 'neighborhood_NAmes', 'exterior_1st_HdBoard',
    'exterior_2nd_Wd Sdng', 'ms_sub_class_60', 'foundation_PConc',
    'exterior_2nd_Brk Cmn', 'ms_sub_class_75', 'neighborhood_Somerst',
    'bsmt_qual', 'fireplace_qu',
]
Z_train = Z_train.drop(columns=vif_drop_columns)

In [40]:
#Dropping from the test data the same columns identified by the previous VIF
vif_drop_columns = [
    'ms_sub_class_20', 'exterior_2nd_VinylSd', 'garage_type_Attchd',
    'house_style_1Story', 'roof_style_Gable', 'exterior_2nd_CmentBd',
    'exterior_1st_MetalSd', 'ms_zoning_RL', 'house_style_2Story', 'house_style_SLvl',
    'mas_vnr_type_None', 'bldg_type_2fmCon', 'pool_qc', 'exterior_1st_VinylSd',
    'year_built', 'neighborhood_NAmes', 'exterior_1st_HdBoard',
    'exterior_2nd_Wd Sdng', 'ms_sub_class_60', 'foundation_PConc',
    'exterior_2nd_Brk Cmn', 'ms_sub_class_75', 'neighborhood_Somerst',
    'bsmt_qual', 'fireplace_qu',
]
Z_test_data = Z_test_data.drop(columns=vif_drop_columns)

In [41]:
Z_train.shape

(1637, 164)

In [42]:
Z_test_data.shape

(878, 164)

In [43]:
#Saving dataframes for model (written to the current working directory)
outputs = (
    (Z_train, 'Z_train_final.csv'),
    (Z_test_data, 'clean_test.csv'),
)
for frame, filename in outputs:
    frame.to_csv(filename)