In [1]:
#Import libraries for use

import numpy as np
import pandas as pd
import patsy

from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.model_selection import cross_val_score
from sklearn.impute import SimpleImputer

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

%config InlineBackend.figure_format = 'retina'
%matplotlib inline


In [2]:
# Load the raw test set from the datasets folder one level above this notebook.
test = pd.read_csv("../datasets/test.csv")

In [3]:
# Print the dtype and non-null count of every column in the raw test set.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 879 entries, 0 to 878
Data columns (total 80 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Id               879 non-null    int64  
 1   PID              879 non-null    int64  
 2   MS SubClass      879 non-null    int64  
 3   MS Zoning        879 non-null    object 
 4   Lot Frontage     719 non-null    float64
 5   Lot Area         879 non-null    int64  
 6   Street           879 non-null    object 
 7   Alley            58 non-null     object 
 8   Lot Shape        879 non-null    object 
 9   Land Contour     879 non-null    object 
 10  Utilities        879 non-null    object 
 11  Lot Config       879 non-null    object 
 12  Land Slope       879 non-null    object 
 13  Neighborhood     879 non-null    object 
 14  Condition 1      879 non-null    object 
 15  Condition 2      879 non-null    object 
 16  Bldg Type        879 non-null    object 
 17  House Style     

In [4]:
#Code to clean test set
# NOTE(review): every imputer below is re-fit on the test frame itself. The
# constant fills do not depend on the data, but the 'most_frequent' fill learns
# its value from the test set -- confirm this matches how the training set was
# cleaned.

#Values that underwent imp3 (missing Lot Frontage -> constant 69.0)
imp3 = SimpleImputer(missing_values=np.nan, fill_value=69.0, strategy='constant')
# The imputer returns an (n, 1) array; flatten it so a plain 1-D array is
# written back into the single column.
test.loc[:, "Lot Frontage"] = imp3.fit_transform(test[["Lot Frontage"]]).ravel()

#Values that underwent imp2 (missing category -> the literal string 'None')
imp2 = SimpleImputer(missing_values=np.nan, fill_value='None', strategy='constant')
basement_cols = ['Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin Type 2']
garage_cols = ['Garage Finish', 'Garage Qual', 'Garage Cond']
# Copies of the basement / garage columns taken before they are imputed,
# kept under the names this cell has always defined.
basement_test = test.loc[:, basement_cols]
garage_test = test.loc[:, garage_cols]

none_fill_cols = (
    ["Alley", "Garage Type", "Misc Feature"]
    + basement_cols
    + ["Fireplace Qu"]
    + garage_cols
    + ["Pool QC", "Fence"]
)
# A constant fill is independent per column, so imputing one column at a time
# gives the same values as imputing the grouped columns together.
for col in none_fill_cols:
    test.loc[:, col] = imp2.fit_transform(test[[col]]).ravel()

#15 columns imputed so far (Lot Frontage + the 14 'None'-filled columns above)

#Values that underwent imp1 (missing value -> most frequent value of the column)
imp1 = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
mas_vnr = test.loc[:, "Mas Vnr Type":"Mas Vnr Area"]
# The imputer hands back one ndarray for the whole slice; rebuild a frame and
# restore each column's original dtype before writing it back so the numeric
# column is not left holding Python objects.
mas_vnr_filled = pd.DataFrame(
    imp1.fit_transform(mas_vnr),
    columns=mas_vnr.columns,
    index=mas_vnr.index,
).astype(mas_vnr.dtypes.to_dict())
test.loc[:, "Mas Vnr Type":"Mas Vnr Area"] = mas_vnr_filled

#Values that underwent fillna:
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection: the chained in-place form is deprecated and does not update the
# parent frame once pandas copy-on-write is active.
test['Garage Yr Blt'] = test['Garage Yr Blt'].fillna(1978)
# NOTE(review): this puts the integer 0 into a column of category strings; the
# ordinal mapping applied to 'Electrical' later in the notebook is keyed on
# strings only, so it must also handle this 0 or these rows end up as NaN.
test['Electrical'] = test['Electrical'].fillna(0)


In [5]:
#Prepare pandas to be able to view all the columns after one-hot encoding
# Use the fully qualified option name: the bare 'max_columns' pattern matches
# more than one option on recent pandas releases and raises OptionError.
pd.set_option('display.max_columns', 250)

#One-hot encode the non-null nominal variables
nominal_columns = [
    "MS SubClass",
    "MS Zoning",
    "Street",
    "Land Contour",
    "Lot Config",
    "Neighborhood",
    "Condition 1",
    "Condition 2",
    "Bldg Type",
    "House Style",
    "Roof Style",
    "Roof Matl",
    "Exterior 1st",
    "Exterior 2nd",
    "Foundation",
    "Heating",
    "Central Air",
    "Sale Type",
    "Alley",
    "Mas Vnr Type",
    "Garage Type",
    "Misc Feature",
]

# drop_first=True drops the first level of each variable to avoid perfectly
# collinear dummy columns.
# NOTE(review): the dummy columns are derived from the categories present in
# the test set only -- presumably they must be re-aligned to the training
# columns before predicting; verify against the modelling notebook.
test = pd.get_dummies(test, columns=nominal_columns, drop_first=True)

In [9]:
# Ordinal encoding: replace each rating string with a hand-picked numeric score.
# Run this once on the freshly cleaned frame -- Series.map turns every value
# that is not a key of the mapping into NaN, so re-running the cell on already
# encoded columns would blank them out.

#The next four variables follow the same rating system according to the data description
#I give higher weight to the extreme variables, as they will likely have bigger influence
quality_scale = {'Ex': 5, 'Gd': 2, 'TA': 0, 'Fa': -2, 'Po': -5}

# Same rating labels plus 'None' (feature absent, filled in during cleaning),
# shared by the basement, fireplace and garage quality/condition columns.
quality_scale_with_none = {'Ex': 5, 'Gd': 2, 'TA': 0, 'Fa': -1, 'Po': -2, 'None': -5}

#These two variables follow the same rating system according to the data description
bsmt_finish_scale = {'GLQ': 5, 'ALQ': 2, 'BLQ': 1, 'Rec': 0, 'LwQ': -1, 'Unf': -2, 'None': -5}

ordinal_maps = {
    'Lot Shape': {'Reg': 2, 'IR1': 1, 'IR2': 0, 'IR3': -1},
    'Utilities': {'AllPub': 2, 'NoSewr': 1, 'NoSeWa': 0},
    #Most properties are flat, so i penalize for slopes/severe slopes
    'Land Slope': {'Gtl': 0, 'Mod': -1, 'Sev': -2},
    'Exter Qual': quality_scale,
    'Exter Cond': quality_scale,
    'Heating QC': quality_scale,
    'Kitchen Qual': quality_scale,
    'Bsmt Qual': quality_scale_with_none,
    'Bsmt Cond': quality_scale_with_none,
    'Bsmt Exposure': {'Gd': 5, 'Av': 2, 'Mn': 0, 'No': -2, 'None': -5},
    'BsmtFin Type 1': bsmt_finish_scale,
    'BsmtFin Type 2': bsmt_finish_scale,
    # The cleaning cell fills missing 'Electrical' with the integer 0; keep that
    # placeholder as a neutral 0 instead of letting map() turn it into NaN.
    'Electrical': {'SBrkr': 5, 'FuseA': 2, 'FuseF': 0, 'FuseP': -2, 'Mix': -5, 0: 0},
    #I weigh negative factors more heavily as I believe home owners expect typical functionality to be present
    'Functional': {'Typ': 3, 'Min1': 2, 'Min2': 1, 'Mod': 0, 'Maj1': -1, 'Maj2': -2, 'Sev': -5, 'Sal': -8},
    'Fireplace Qu': quality_scale_with_none,
    'Garage Qual': quality_scale_with_none,
    'Garage Cond': quality_scale_with_none,
    #I did not include 0 as I do not consider any of the response neutral
    'Garage Finish': {'Fin': 5, 'RFn': 2, 'Unf': -2, 'None': -5},
    # 'N' used to be scored 3, identical to 'Y', which made the two categories
    # indistinguishable after encoding; it is now the negative end of the scale.
    # NOTE(review): the training notebook must use the same mapping.
    'Paved Drive': {'Y': 3, 'P': 0, 'N': -3},
    #I consider pools to be a luxury item, and hence the scales are
    #largely positive
    'Pool QC': {'Ex': 6, 'Gd': 3, 'TA': 2, 'Fa': 1, 'None': 0},
    'Fence': {'GdPrv': 5, 'MnPrv': 2, 'GdWo': 1, 'MnWw': -1, 'None': -3},
}

for column, scale in ordinal_maps.items():
    test[column] = test[column].map(scale)

In [11]:
# Shape of the describe() summary table for the processed test set.
# NOTE(review): describe() summarises only numeric columns by default, so the
# second number presumably counts the columns that are numeric at this point --
# confirm it equals the full column count after encoding.
test.describe().shape

(8, 202)

In [6]:
# Summarise the dtypes and memory usage of the processed test set.
# NOTE(review): the recorded execution count of this cell (In [6]) is lower than
# that of the ordinal-encoding cell (In [9]), so the saved output appears to
# predate that encoding -- restart the kernel and run all cells to refresh it.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 879 entries, 0 to 878
Columns: 202 entries, Id to Misc Feature_Shed
dtypes: float64(3), int64(34), object(21), uint8(144)
memory usage: 522.0+ KB


In [7]:
# (rows, columns) of the test frame at this point in the notebook.
test.shape

(879, 202)