### Part III: Scaling

First, import the libraries and read the data.

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

X_train_exp = pd.read_csv("X_train_exp.csv")
X_test_exp = pd.read_csv("X_test_exp.csv")
X_train_OH = pd.read_csv("X_train_OH.csv")
X_test_OH = pd.read_csv("X_test_OH.csv")

Drop the unnecessary column that has been generated by the CSV.

In [2]:
X_train_exp = X_train_exp.drop('Unnamed: 0', axis=1)
X_test_exp = X_test_exp.drop('Unnamed: 0', axis=1)
X_train_OH = X_train_OH.drop('Unnamed: 0', axis=1)
X_test_OH = X_test_OH.drop('Unnamed: 0', axis=1)

Create an array of column names to be used later.

In [3]:
X_train_exp_col = X_train_exp.columns.tolist()
X_test_exp_col = X_test_exp.columns.tolist()
X_train_OH_col = X_train_OH.columns.tolist()
X_test_OH_col = X_test_OH.columns.tolist()

Use standard scaler to scale and transform the dataframes into an array.

In [4]:
X_test_OH.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146 entries, 0 to 145
Columns: 234 entries, Id to GA_binned
dtypes: float64(178), int64(56)
memory usage: 267.0 KB


In [5]:
scaler = StandardScaler()

X_train_exp = scaler.fit_transform(X_train_exp)
X_test_exp = scaler.transform(X_test_exp)

X_train_OH = scaler.fit_transform(X_train_OH)
X_test_OH = scaler.transform(X_test_OH)

Create the new scaled dataframes.

In [6]:
X_train_exp = pd.DataFrame(X_train_exp, columns = X_train_exp_col)
X_test_exp = pd.DataFrame(X_test_exp, columns = X_test_exp_col)
X_train_OH = pd.DataFrame(X_train_OH, columns = X_train_OH_col)
X_test_OH = pd.DataFrame(X_test_OH, columns  = X_test_OH_col)

In [7]:
X_train_exp.to_csv("X_train_exp_scaled.csv")
X_test_exp.to_csv("X_test_exp_scaled.csv")
X_train_OH.to_csv("X_train_OH_scaled.csv")
X_test_OH.to_csv("X_test_OH_scaled.csv")

### Part IV: Feature Selection

Filtering the variables of the One-hot encoded dataframe using PCA.

In [8]:
# copied from: https://www.youtube.com/watch?v=Lsue2gEM9D0

from sklearn.decomposition import PCA

pca = PCA()
OH_scaled_pca = pca.fit_transform(X_train_OH)

In [9]:
# top 187 variables for X_train_OH
OH_vars = X_train_OH.columns.tolist()

loading_scores = pd.Series(pca.components_[0], index=OH_vars)
sorted_loading_scores = loading_scores.abs().sort_values(ascending=False)
top_187_var = sorted_loading_scores[0:187].index.values

Impt_var_OH = top_187_var.tolist()
#print(Impt_var_OH)

In [10]:
SalePrice_train = X_train_OH['SalePrice']
SalePrice_test = X_test_OH['SalePrice']

X_train_OH = X_train_OH[Impt_var_OH].copy()
pd.concat([X_train_OH, SalePrice_train], axis = 1)

X_test_OH = X_test_OH[Impt_var_OH].copy()
pd.concat([X_test_OH, SalePrice_test], axis = 1)

Unnamed: 0,SalePrice,OverallQual,House_Age,ExterQual,GA_binned,GarageCars,GarageFinish,KitchenQual,BsmtQual,GarageArea,...,RoofMatlWdShake,Condition1PosA,SaleTypeCon,Exterior2ndCBlock,Exterior1stCBlock,LotConfigCorner,YrSold2010,Condition2RRNn,Exterior2ndBrkFace,SalePrice.1
0,0.583948,0.637627,-1.008324,1.030083,-0.886177,0.307098,0.328976,0.733780,0.573100,0.318823,...,-0.055279,-0.073211,-0.027608,-0.027608,-0.027608,-0.464479,-0.364518,-0.027608,-0.12437,0.583948
1,-0.877287,-0.800869,-0.447854,-0.697247,2.513532,-2.363232,-1.920739,-0.778706,-0.563577,-2.207766,...,-0.055279,-0.073211,-0.027608,-0.027608,-0.027608,2.152950,-0.364518,-0.027608,-0.12437,-0.877287
2,-0.648572,0.637627,-1.074261,1.030083,-0.886177,0.307098,-0.795882,0.733780,0.573100,-0.442869,...,-0.055279,-0.073211,-0.027608,-0.027608,-0.027608,-0.464479,-0.364518,-0.027608,-0.12437,-0.648572
3,-0.769282,-1.520117,0.013710,-0.697247,-0.036250,0.307098,-0.795882,-0.778706,-0.563577,0.244512,...,-0.055279,-0.073211,-0.027608,-0.027608,-0.027608,-0.464479,-0.364518,-0.027608,-0.12437,-0.769282
4,-0.217825,0.637627,-0.678635,-0.697247,-0.886177,0.307098,1.453833,0.733780,0.573100,-0.387136,...,-0.055279,-0.073211,-0.027608,-0.027608,-0.027608,-0.464479,-0.364518,-0.027608,-0.12437,-0.217825
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141,-0.167000,-0.081621,-0.184103,-0.697247,-0.036250,0.307098,0.328976,-0.778706,0.573100,0.718247,...,-0.055279,-0.073211,-0.027608,-0.027608,-0.027608,-0.464479,-0.364518,-0.027608,-0.12437,-0.167000
142,1.841880,2.076124,-1.074261,1.030083,-0.886177,1.642263,1.453833,0.733780,1.709777,1.377762,...,-0.055279,-0.073211,-0.027608,-0.027608,-0.027608,-0.464479,-0.364518,-0.027608,-0.12437,1.841880
143,0.507709,0.637627,-1.008324,1.030083,-0.886177,0.307098,0.328976,0.733780,0.573100,0.253801,...,-0.055279,-0.073211,-0.027608,-0.027608,-0.027608,-0.464479,-0.364518,-0.027608,-0.12437,0.507709
144,-1.163180,-0.081621,1.695120,-0.697247,2.513532,-2.363232,-1.920739,-2.291192,-0.563577,-2.207766,...,-0.055279,-0.073211,-0.027608,-0.027608,-0.027608,-0.464479,-0.364518,-0.027608,-0.12437,-1.163180


Filtering the variables of the experimentally-encoded dataframe using PCA.

In [11]:
OH_scaled_pca = pca.fit_transform(X_train_exp)

In [12]:
# top 187 variables for X_train_exp
exp_vars = X_train_exp.columns.tolist()

loading_scores = pd.Series(pca.components_[0], index=exp_vars)
sorted_loading_scores = loading_scores.abs().sort_values(ascending=False)
top_187_var = sorted_loading_scores[0:187].index.values

Impt_var_exp = top_187_var.tolist()

In [13]:
X_train_exp = X_train_exp[Impt_var_exp].copy()
X_test_exp = X_test_exp[Impt_var_exp].copy()

## Model Training

In [14]:
print('X_train_exp ranges from', X_train_exp['SalePrice'].min(), 'to', X_train_exp['SalePrice'].max())
print('X_test_exp ranges from', X_test_exp['SalePrice'].min(), 'to', X_test_exp['SalePrice'].max())
print('X_train_OH ranges from', X_train_OH['SalePrice'].min(), 'to', X_train_OH['SalePrice'].max())
print('X_test_OH ranges from', X_test_OH['SalePrice'].min(), 'to', X_test_OH['SalePrice'].max())

X_train_exp ranges from -1.85694899874938 to 7.1658556396680115
X_test_exp ranges from -1.601550641852901 to 7.292919498820488
X_train_OH ranges from -1.85694899874938 to 7.1658556396680115
X_test_OH ranges from -1.601550641852901 to 7.292919498820488


### Part I: Experimental Dataset

**Mean Absolute Error**

In [15]:
y_train_exp = X_train_exp['SalePrice'].copy()

In [16]:
y_test_exp = X_test_exp['SalePrice'].copy()
X_test_exp = X_test_exp.drop(['SalePrice'], axis = 1)

In [17]:
# LOOK AT THIS

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error

average_saleprice = X_train_exp["SalePrice"].mean()
baseline_test_predictions = [average_saleprice]*len(X_test_exp)

MAE = mean_absolute_error(baseline_test_predictions, y_test_exp)

**Linear Regression**

In [18]:
X_train_exp = X_train_exp.drop(['SalePrice'], axis = 1)

In [19]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(X_train_exp, y_train_exp)
lr_predictions_exp = lr.predict(X_test_exp)

lr_MAE = mean_absolute_error(baseline_test_predictions, y_test_exp)

**Random Forest Regressor**

In [20]:
from sklearn.ensemble import RandomForestRegressor

rfr = RandomForestRegressor()
rfr.fit(X_train_exp, y_train_exp)
rfr_predictions = rfr.predict(X_test_exp)

rfr_MAE = mean_absolute_error(baseline_test_predictions, y_test_exp)

### Part II: One-hot Encoded Dataset

**Mean Absolute Error**

In [21]:
y_train_OH = X_train_OH['SalePrice'].copy()

In [22]:
y_test_OH = X_test_OH['SalePrice'].copy()
X_test_OH = X_test_OH.drop(['SalePrice'], axis = 1)

In [23]:
average_saleprice = X_train_OH["SalePrice"].mean()
baseline_test_predictions = [average_saleprice]*len(X_test_OH)

mean_absolute_error(baseline_test_predictions, y_test_OH)

0.7157420768476682

**Linear Regression**

In [24]:
X_train_OH = X_train_OH.drop(['SalePrice'], axis = 1)

In [25]:
lr.fit(X_train_OH, y_train_OH)
lr_predictions_OH = lr.predict(X_test_OH)

lr_perc_OH = mean_absolute_error(lr_predictions_OH, y_test_OH)
print(lr_perc_OH)

6242599217.667765


**Random Forest Regressor**

In [26]:
rfr.fit(X_train_OH, y_train_OH)
rfr_predictions = rfr.predict(X_test_OH)

rfr_perc = mean_absolute_error(rfr_predictions, y_test_OH)
print(rfr_perc)

0.21743034074089718
