In [3]:
#import libraries
import pandas as pd
import numpy as np
import sklearn

#import package to split data into training/test sets
from sklearn.model_selection import train_test_split

#import evaluation metric libraries
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

#import sklearn one-hot encoder
from sklearn.preprocessing import OneHotEncoder

#import feature scaling libraries
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

#import PCA (principal component analysis) library
from sklearn.decomposition import PCA

#import linear regression from sklearn
from sklearn.linear_model import LinearRegression



In [4]:
#import and look at the data
df_houses = pd.read_csv('housing_data.csv')
print(df_houses.shape)
df_houses.head()

(1460, 81)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [7]:
df_houses.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [8]:
#remove colums missing more than 40% of data
X = df_houses.drop(['Id', 'SalePrice', 'MSSubClass'], axis = 1)
y = df_houses['SalePrice']
cols_to_keep = X.columns[(X.isna().sum()/X.shape[0] < 0.4)]
X = X[cols_to_keep]
X.shape

(1460, 73)

In [23]:
#split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = 1)
X_train = X_train.reset_index(drop = True)
X_test = X_test.reset_index(drop = True)
y_train = y_train.reset_index(drop = True)
y_test = y_test.reset_index(drop = True)

In [24]:
#split the columns into numerical and categorical data frames for preprocessing
X_train_num = X_train[X_train.columns[X_train.dtypes != 'object']]
X_test_num = X_test[X_train.columns[X_train.dtypes != 'object']]
X_train_cat = X_train[X_train.columns[X_train.dtypes == 'object']]
X_test_cat = X_test[X_train.columns[X_train.dtypes == 'object']]

#get medians of numerical training columns 
train_medians = X_train_num.median()
#get modes of training columns (includes categorical columns)
train_modes = X_train_cat.mode().iloc[0]

#replace missing numerical values with the training column median
X_train_num.fillna(train_medians, inplace = True)
X_test_num.fillna(train_medians, inplace = True)

#replace missing categorical value with most common training value in column
X_train_cat.fillna(train_modes, inplace = True)
X_test_cat.fillna(train_modes, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


In [25]:
#create a one-hot encoder
one_hot = OneHotEncoder(drop = 'first', handle_unknown = 'ignore', feature_name_combiner = 'concat')
#fit and transform the categorical training features to the one-hot encoder
X_train_onehot = one_hot.fit_transform(X_train_cat)
#only transform the categorical test features to the one-hot encoder
X_test_onehot = one_hot.transform(X_test_cat)



In [28]:
#concatenate the numerical and dummy encoded features
X_train_final = pd.concat([X_train_num.reset_index(drop = True), pd.DataFrame(X_train_onehot.toarray())], axis = 1)
X_train_final.columns = X_train_final.columns.astype(str)
X_test_final = pd.concat([X_test_num.reset_index(drop = True), pd.DataFrame(X_test_onehot.toarray())], axis = 1)
X_test_final.columns = X_test_final.columns.astype(str)
#check the shapes of the final feature matrices
print(X_train_final.shape)
print(X_test_final.shape)

(1168, 228)
(292, 228)


In [29]:
#fit the model
reg_model = LinearRegression()
reg_model.fit(X_train_final, y_train)

In [37]:
#get the model predictions on the test features
reg_model_train_preds = reg_model.predict(X_train_final)
reg_model_test_preds = reg_model.predict(X_test_final)
#get R^2 and RMSE
print("R^2:", r2_score(y_train, reg_model_train_preds))
print("RMSE:", np.sqrt(mean_squared_error(y_train, reg_model_train_preds)))
print("R^2:", r2_score(y_test, reg_model_test_preds))
print("RMSE:", np.sqrt(mean_squared_error(y_test, reg_model_test_preds)))

R^2: 0.9329606474446183
RMSE: 20209.359897483617
R^2: 0.6430297149639179
RMSE: 50456.83164824453


In [31]:
#initiate scaler and pca instances
scaler = StandardScaler()
pca = PCA(n_components = 0.9)

In [32]:
#fit and apply the scaler and pca transformations
X_train_scaled = scaler.fit_transform(X_train_final)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_scaled = scaler.transform(X_test_final)
X_test_pca = pca.transform(X_test_scaled)

In [33]:
#checck the shapes of the final pca feature matrices
print(X_train_pca.shape)
print(X_test_pca.shape)

(1168, 126)
(292, 126)


In [34]:
#fit the pca model
pca_reg_model = LinearRegression()
pca_reg_model.fit(X_train_pca, y_train)

In [38]:
#get the model predictions on the test features
pca_reg_model_train_preds = pca_reg_model.predict(X_train_pca)
pca_reg_model_test_preds = pca_reg_model.predict(X_test_pca)
#get R^2 and RMSE
print("R^2:", r2_score(y_train, pca_reg_model_train_preds))
print("RMSE:", np.sqrt(mean_squared_error(y_train, pca_reg_model_train_preds)))
print("R^2:", r2_score(y_test, pca_reg_model_test_preds))
print("RMSE:", np.sqrt(mean_squared_error(y_test, pca_reg_model_test_preds)))

R^2: 0.8753038423832521
RMSE: 27562.219565937437
R^2: 0.8285106832409114
RMSE: 34972.16174051531
