**Consulting Project 2**

Goal: Construct a model to predict the sale price of houses in the test data set


---------------------------------------------
---------------------------------------------

In [None]:
#Mount Google Drive (Colab only) so the CSV files under /content/drive/MyDrive can be read below
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#importing all our packages
import pandas as pd
import numpy as np
#statsmodels
import statsmodels.api as smapi
import statsmodels.formula.api as smf
import statsmodels.stats as stats
#all our sklearn tools
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score
from sklearn import linear_model
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
#seaborn, mpl for visualizations
import seaborn as sns
import matplotlib.pyplot as plt
from mlxtend.plotting import plot_confusion_matrix



In [None]:
#Load the three per-neighborhood training files from the mounted Drive folder
drive_folder = '/content/drive/MyDrive/'
a = pd.read_csv(drive_folder + 'CollegeCr.csv')
b = pd.read_csv(drive_folder + 'Edwards.csv')
c = pd.read_csv(drive_folder + 'OldTown.csv')

Data Cleaning for a (CollegeCr)

In [None]:
a.head()

In [None]:
a.duplicated().sum()

In [None]:
a1 = a.drop_duplicates()

In [None]:
a1.duplicated().sum()

In [None]:
#getting a read on the missing values of the dataset
a1.isna().sum()

In [None]:
#we're going to drop the observations with missing values
#.copy() makes a2 an independent frame, so the column assignments in the following
#cells write to it directly instead of triggering SettingWithCopyWarning
a2 = a1.dropna().copy()

In [None]:
#double checking the outcome
a2.isna().sum() #none, nice

In [None]:
#descriptive statistics
a2.describe()

In [None]:
a2.info()

Splitting the LotInfo into values: LotConfig, LotShape, LotArea, and LotFrontage

In [None]:
#LotInfo packs four ';'-separated fields into one text column; unpack them
lot_fields = ['LotConfig', 'LotShape', 'LotArea', 'LotFrontage']
a2[lot_fields] = a2['LotInfo'].str.split(';',expand=True)
#str.split returns text, so convert the two measurements to numbers; entries that
#cannot be parsed (for example a literal 'NA') become NaN instead of raising
for measurement in ['LotArea', 'LotFrontage']:
    a2[measurement] = pd.to_numeric(a2[measurement], errors='coerce')
a2.head()

Splitting the Exterior into values: Exterior1st, ExterQual, and ExterCond

In [None]:
#Exterior packs three ';'-separated fields; unpack them into their own columns
exterior_parts = a2['Exterior'].str.split(';',expand=True)
a2[['Exterior1st', 'ExterQual', 'ExterCond']] = exterior_parts
a2

Adding a column that's a neighborhood Identifier

In [None]:
#Tag every row with its source file so rows can still be told apart after the three frames are combined
a2['Neighborhood'] = "CollegeCr"
a2

Cleaning for b (Edwards)

In [None]:
b.head()

In [None]:
b.duplicated().sum() #no duplicates

In [None]:
#getting a read on the missing values of the dataset
b.isna().sum()

In [None]:
#we're going to drop the observations with missing values
#.copy() makes b1 an independent frame, so the column assignments in the following
#cells write to it directly instead of triggering SettingWithCopyWarning
b1 = b.dropna().copy()

In [None]:
#double checking the outcome
b1.isna().sum() #none, nice

In [None]:
#descriptive statistics
b1.describe()

In [None]:
b1.info()

Splitting the LotInfo into values: LotConfig, LotShape, LotArea, and LotFrontage

In [None]:
#LotInfo packs four ';'-separated fields into one text column; unpack them
lot_fields = ['LotConfig', 'LotShape', 'LotArea', 'LotFrontage']
b1[lot_fields] = b1['LotInfo'].str.split(';',expand=True)
#str.split returns text, so convert the two measurements to numbers; entries that
#cannot be parsed (for example a literal 'NA') become NaN instead of raising
for measurement in ['LotArea', 'LotFrontage']:
    b1[measurement] = pd.to_numeric(b1[measurement], errors='coerce')
b1.head()

Splitting the Exterior into values: Exterior1st, ExterQual, and ExterCond

In [None]:
#Exterior packs three ';'-separated fields; unpack them into their own columns
exterior_parts = b1['Exterior'].str.split(';',expand=True)
b1[['Exterior1st', 'ExterQual', 'ExterCond']] = exterior_parts
b1

Adding a column that's a neighborhood Identifier

In [None]:
#Tag every row with its source file so rows can still be told apart after the three frames are combined
b1['Neighborhood'] = "Edwards"
b1

Finally onto cleaning for c (OldTown)

In [None]:
c.head()

In [None]:
c.duplicated().sum() #one duplicate

In [None]:
c1 = c.drop_duplicates()

In [None]:
c1.duplicated().sum()

In [None]:
#getting a read on the missing values of the dataset
c1.isna().sum()

In [None]:
#we're going to drop the observations with missing values
#.copy() makes c2 an independent frame, so the column assignments in the following
#cells write to it directly instead of triggering SettingWithCopyWarning
c2 = c1.dropna().copy()

In [None]:
#double checking the outcome
c2.isna().sum() #none, nice

In [None]:
#descriptive statistics
c2.describe()

In [None]:
c2.info()

Splitting the LotInfo into values: LotConfig, LotShape, LotArea, and LotFrontage

In [None]:
#LotInfo packs four ';'-separated fields into one text column; unpack them
lot_fields = ['LotConfig', 'LotShape', 'LotArea', 'LotFrontage']
c2[lot_fields] = c2['LotInfo'].str.split(';',expand=True)
#str.split returns text, so convert the two measurements to numbers; entries that
#cannot be parsed (for example a literal 'NA') become NaN instead of raising
for measurement in ['LotArea', 'LotFrontage']:
    c2[measurement] = pd.to_numeric(c2[measurement], errors='coerce')
c2.head()

Splitting the Exterior into values: Exterior1st, ExterQual, and ExterCond

In [None]:
#Exterior packs three ';'-separated fields; unpack them into their own columns
exterior_parts = c2['Exterior'].str.split(';',expand=True)
c2[['Exterior1st', 'ExterQual', 'ExterCond']] = exterior_parts
c2

Adding a column that's a neighborhood Identifier

In [None]:
#Tag every row with its source file so rows can still be told apart after the three frames are combined
c2['Neighborhood'] = "OldTown"
c2

Now we're going to restrict the three datasets to the same set of columns, in the same order, so that they can be stacked into a single training set.

In [None]:
#Columns kept for modelling, including the fields unpacked from LotInfo and Exterior
common_columns = [
    'Neighborhood', 'OverallQual', 'BedroomAbvGr', 'CentralAir', 'BsmtQual',
    'Fireplaces', 'YrSold', 'HouseStyle', 'HeatingQC', 'GarageType',
    'RoofMatl', 'PavedDrive', 'SaleType', 'FullBath', 'OpenPorchSF',
    'RoofStyle', 'Utilities', 'BsmtFinSF1', 'Heating', 'KitchenQual',
    'HalfBath', 'BsmtFinType1', 'WoodDeckSF', 'TotRmsAbvGrd', 'SalePrice',
    'YearBuilt', 'BsmtCond', 'Foundation', 'Electrical', 'BldgType',
    'OverallCond', 'GrLivArea', 'LotConfig', 'LotShape', 'LotArea',
    'LotFrontage', 'Exterior1st', 'ExterQual', 'ExterCond',
]
a3 = a2[common_columns]

In [None]:
a3.head()

In [None]:
a3.shape

In [None]:
#Same columns, in the same order, as the CollegeCr frame a3
b2 = b1[a3.columns]

In [None]:
b2.head()

In [None]:
b2.shape

In [None]:
#Same columns, in the same order, as the CollegeCr frame a3
c3 = c2[a3.columns]

In [None]:
c3.head()

In [None]:
c3.shape

In [None]:
#Stack the three neighborhood frames row-wise. Each frame keeps its own row labels
#(ignore_index is not set), so index values can repeat in the combined frame.
train = pd.concat([a3, b2, c3])

In [None]:
train.shape

Now that we have a combined dataset with only complete observations and the columns common to all three data sets, we'll perform the final selection/cleaning process before we begin the predictive analysis.

In [None]:
#Heatplot to explore some relationships
#Correlation is only defined for numeric columns, so select them explicitly:
#recent pandas raises on text columns instead of silently dropping them
sns.heatmap(data = train.select_dtypes(include='number').corr()) #there's not really any glaring correlations that will influence the analysis, but a couple of situations to watch out for

In [None]:
sns.pairplot(data = train)

Looking into outliers

In [None]:
train.describe()

From the above, we can see clear outliers in OpenPorchSF, BsmtFinSF1, WoodDeckSF, TotRmsAbvGrd, and GrLivArea. We'll start with these first.

In [None]:
#OpenPorchSF outlier removal: keep rows inside the 1st-99th percentile band
q_low = train["OpenPorchSF"].quantile(0.01)
q_hi  = train["OpenPorchSF"].quantile(0.99)
#between() is inclusive at both ends. Strict < and > would also throw away every row
#that is exactly equal to a quantile (all the zeros, if the 1% quantile is 0).
train1 = train[train["OpenPorchSF"].between(q_low, q_hi)]

In [None]:
train.shape

In [None]:
train1.shape

In [None]:
#BsmtFinSF1 outlier removal: keep rows inside the 1st-99th percentile band
q_low = train1["BsmtFinSF1"].quantile(0.01)
q_hi  = train1["BsmtFinSF1"].quantile(0.99)
#between() is inclusive at both ends. Strict < and > would also throw away every row
#that is exactly equal to a quantile (all the zeros, if the 1% quantile is 0).
train1 = train1[train1["BsmtFinSF1"].between(q_low, q_hi)]

In [None]:
train1.shape

In [None]:
#WoodDeckSF outlier removal: keep rows inside the 1st-99th percentile band
q_low = train1["WoodDeckSF"].quantile(0.01)
q_hi  = train1["WoodDeckSF"].quantile(0.99)
#between() is inclusive at both ends. Strict < and > would also throw away every row
#that is exactly equal to a quantile (all the zeros, if the 1% quantile is 0).
train1 = train1[train1["WoodDeckSF"].between(q_low, q_hi)]

In [None]:
train1.shape

In [None]:
#TotRmsAbvGrd outlier removal: keep rows inside the 1st-99th percentile band
q_low = train1["TotRmsAbvGrd"].quantile(0.01)
q_hi  = train1["TotRmsAbvGrd"].quantile(0.99)
#between() is inclusive at both ends. Strict < and > would also throw away every row
#that is exactly equal to a quantile, which for a small integer count can be many rows.
train1 = train1[train1["TotRmsAbvGrd"].between(q_low, q_hi)]

In [None]:
train1.shape

In [None]:
#GrLivArea outlier removal: keep rows inside the 1st-99th percentile band
q_low = train1["GrLivArea"].quantile(0.01)
q_hi  = train1["GrLivArea"].quantile(0.99)
#between() is inclusive at both ends, so rows exactly equal to a quantile are kept
train1 = train1[train1["GrLivArea"].between(q_low, q_hi)]

In [None]:
train1.shape

Since this is clearly not going the way we're hoping for, we'll need to remove those at the model stage using the Bonferroni test:

"The Bonferroni Outlier Tests uses a t distribution to test whether the model's largest studentized residual value's outlier status is statistically different from the other observations in the model. A significant p-value indicates an extreme outlier that warrants further examination."

model.outlier_test(method='bonf', alpha=0.5, cutoff=1)
df.drop(df.index[[outlier indices identified in the above test]])

The final steps will be completed after we run our initial model: model assumption checking, recategorization of categorical variables, outlier removal, etc. so we'll wrap up by exporting the combined dataset.

In [None]:
#Export the combined training frame `train` (not the percentile-trimmed `train1`).
#The file goes to the current working directory and includes the row index as an unnamed first column.
#NOTE(review): the reload in Iteration 2 reads /content/drive/MyDrive/TrainFinal.csv, a different
#path - presumably the file is edited offline and uploaded in between; confirm.
train.to_csv('TrainFinal.csv')

----------------------
----------------------
----------------------

Iteration 2:
Starting the analysis again, there were some offline modifications made to the data set so we'll reimport.

- ID column added: concatenated index column and Neighborhood column
- observations where YrSold < YrBuilt

In [None]:
train2 = pd.read_csv('/content/drive/MyDrive/TrainFinal.csv')

In [None]:
safe_copy_train2 = pd.read_csv('/content/drive/MyDrive/TrainFinal.csv')

In [None]:
test = pd.read_csv('/content/drive/MyDrive/TestFinal.csv')

In [None]:
safe_copy_test = pd.read_csv('/content/drive/MyDrive/TestFinal.csv')

We need to take a few steps to transform our Test set into the same formatting of the train:

In [None]:
#LotInfo packs four ';'-separated fields into one text column; unpack them
lot_fields = ['LotConfig', 'LotShape', 'LotArea', 'LotFrontage']
test[lot_fields] = test['LotInfo'].str.split(';',expand=True)
#str.split returns text, while these two fields are numeric in the reloaded training
#CSV; convert them so the models receive numbers (unparseable entries become NaN)
for measurement in ['LotArea', 'LotFrontage']:
    test[measurement] = pd.to_numeric(test[measurement], errors='coerce')

In [None]:
#Exterior packs three ';'-separated fields; unpack them into their own columns
exterior_parts = test['Exterior'].str.split(';',expand=True)
test[['Exterior1st', 'ExterQual', 'ExterCond']] = exterior_parts

In [None]:
test = test[['SalePrice','Neighborhood','OverallQual','BedroomAbvGr','CentralAir','BsmtQual','Fireplaces','YrSold','HouseStyle','HeatingQC','GarageType','RoofMatl','PavedDrive','SaleType','FullBath','OpenPorchSF','RoofStyle','Utilities','BsmtFinSF1','Heating','KitchenQual','HalfBath','BsmtFinType1','WoodDeckSF','TotRmsAbvGrd','YearBuilt','BsmtCond','Foundation','Electrical','BldgType','OverallCond','GrLivArea','LotConfig','LotShape','LotArea','Exterior1st','ExterQual','ExterCond']]

In [None]:
test.info()

In [None]:
#`test` was already cut down to exactly these columns in the cell above, so an explicit
#copy gives the same frame. Copying (rather than slicing again) means the category
#mapping below writes to test_mapped itself and leaves `test` untouched.
test_mapped = test.copy()

In [None]:
#Encode every categorical column of the test set as integer codes.
#Work on an independent copy so the assignments below never write into a slice of `test`.
test_mapped = test_mapped.copy()

#Shared scale for the quality-style columns.
#NOTE(review): read_csv parses the literal text 'NA' as a missing value by default, so the
#'NA' key presumably never matches unless the file is read with keep_default_na=False - confirm.
quality_codes = {'Ex': 0, 'Gd': 1, 'TA': 2, 'Fa': 3, 'NA': 4}

#column name -> {category label: integer code}
category_codes = {
    'Neighborhood': {'CollegeCr': 0, 'Edwards': 1, 'OldTown': 2},
    'CentralAir': {'Y': 1, 'N': 0},
    'BsmtQual': quality_codes,
    'HouseStyle': {'1Story': 0, '1.5Story': 1, '2Story': 2, '2.5Fin': 3},
    'HeatingQC': quality_codes,
    'GarageType': {'Detchd': 0, 'Attchd': 1},
    'RoofMatl': {'notCompShg': 0, 'CompShg': 1},
    'PavedDrive': {'N': 0, 'Y': 1, 'P': 2},
    'SaleType': {'notWD': 0, 'WD': 1},
    'RoofStyle': {'Gable': 1, 'notGable': 0},
    'Utilities': {'AllPub': 1},
    'Heating': {'GasA': 0, 'GasW': 1, 'Grav': 2},
    'KitchenQual': quality_codes,
    'BsmtFinType1': {'ALQ': 0, 'BLQ': 1, 'GLQ': 2, 'Unf': 3},
    'BsmtCond': {'TA': 1},
    'Foundation': {'BrkTil': 0, 'CBlock': 1, 'PConc': 2},
    'Electrical': {'SBrkr': 0, 'Fuse': 1},
    'BldgType': {'1Fam': 0, '2fmCon': 1, 'Duplex': 2, 'Twnhs': 3},
    'LotConfig': {'Corner': 0, 'CulDSac': 1, 'FR2': 2, 'Inside': 3},
    'LotShape': {'IR1': 0, 'IR2': 1, 'IR3': 2, 'Reg': 3},
    'Exterior1st': {'MetalSd': 0, 'OtherSd': 1, 'VinylSd': 2},
    'ExterQual': {'Gd': 1, 'TA': 0},
    'ExterCond': {'Gd': 1, 'TA': 0},
}

#Series.map turns any label that is not a key of its mapping into NaN without warning,
#so count how many previously present values each column loses.
newly_missing = {}
for column, codes in category_codes.items():
    had_value = test_mapped[column].notna()
    test_mapped[column] = test_mapped[column].map(codes)
    newly_missing[column] = int((had_value & test_mapped[column].isna()).sum())

pd.Series(newly_missing, name='values lost to unmapped categories')

**EDA for the combined set**:

In [None]:
sns.set(rc={'figure.figsize':(11.7,8.27)})
#displot is a figure-level function: call it once with the data (its return value is a
#FacetGrid, not data to plot again) and size it with height/aspect, because
#figure.figsize is not applied to figure-level plots
g = sns.displot(train2, x="SalePrice", hue="Neighborhood", height=8.27, aspect=11.7/8.27)

In [None]:
sns.set(rc={'figure.figsize':(11.7,8.27)})
#displot is a figure-level function: call it once with the data (its return value is a
#FacetGrid, not data to plot again) and size it with height/aspect, because
#figure.figsize is not applied to figure-level plots
g = sns.displot(train2, x="SalePrice", hue="TotRmsAbvGrd", height=8.27, aspect=11.7/8.27)

In [None]:
sns.set(rc={'figure.figsize':(11.7,8.27)})
#displot is a figure-level function: call it once with the data (its return value is a
#FacetGrid, not data to plot again) and size it with height/aspect, because
#figure.figsize is not applied to figure-level plots
g = sns.displot(train2, x="SalePrice", hue="YrSold", height=8.27, aspect=11.7/8.27)

In [None]:
#train2 still holds text columns (ID, Neighborhood, CentralAir, ...); correlation is only
#defined for numeric ones, and recent pandas raises instead of silently skipping text
sns.heatmap(data = train2.select_dtypes(include='number').corr())

In [None]:
train2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 218 entries, 0 to 217
Data columns (total 41 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    218 non-null    int64  
 1   ID            218 non-null    object 
 2   Neighborhood  218 non-null    object 
 3   OverallQual   218 non-null    int64  
 4   BedroomAbvGr  218 non-null    int64  
 5   CentralAir    218 non-null    object 
 6   BsmtQual      218 non-null    object 
 7   Fireplaces    218 non-null    int64  
 8   YrSold        218 non-null    int64  
 9   HouseStyle    218 non-null    object 
 10  HeatingQC     218 non-null    object 
 11  GarageType    218 non-null    object 
 12  RoofMatl      218 non-null    object 
 13  PavedDrive    218 non-null    object 
 14  SaleType      218 non-null    object 
 15  FullBath      218 non-null    int64  
 16  OpenPorchSF   218 non-null    int64  
 17  RoofStyle     218 non-null    object 
 18  Utilities     218 non-null    

Dimension Reduction: ANOVA

In [None]:
train2.info()

In [None]:
#ANOVA with all variables (except ID): assemble the formula from the list of predictors
anova_terms = [
    'Neighborhood', 'OverallQual', 'BedroomAbvGr', 'CentralAir', 'BsmtQual',
    'Fireplaces', 'YrSold', 'HouseStyle', 'HeatingQC', 'GarageType',
    'RoofMatl', 'PavedDrive', 'SaleType', 'FullBath', 'OpenPorchSF',
    'RoofStyle', 'Utilities', 'BsmtFinSF1', 'Heating', 'KitchenQual',
    'HalfBath', 'BsmtFinType1', 'WoodDeckSF', 'TotRmsAbvGrd', 'YearBuilt',
    'BsmtCond', 'Foundation', 'Electrical', 'BldgType', 'OverallCond',
    'GrLivArea', 'LotConfig', 'LotShape', 'LotArea', 'LotFrontage',
    'Exterior1st', 'ExterQual', 'ExterCond',
]
anova_formula = 'SalePrice ~ ' + ' + '.join(anova_terms)
model = smf.ols(formula=anova_formula, data=train2)

In [None]:
Fit_model = model.fit()
Fit_model.summary()

In [None]:
#Type III ANOVA table for the fitted multi-predictor model (typ=3 requests Type III sums of squares)
smapi.stats.anova_lm(Fit_model, typ=3)

We want to look at 3 different models to predict our response SalePrice: multiple linear regression, support vector regression, and random forest.

Before we get into any of that, we need to work on our dimension reduction and check model assumptions.

Dimension Reduction: PCA on our continuous data

In [None]:
train2.info()

In [None]:
pca_prep = train2[['OverallQual', 'BedroomAbvGr', 'Fireplaces', 'YrSold', 'FullBath', 'OpenPorchSF', 'BsmtFinSF1', 'HalfBath', 'WoodDeckSF', 'TotRmsAbvGrd', 'SalePrice', 'YearBuilt', 'OverallCond', 'GrLivArea', 'LotArea']]

Let's get to the bit about scaling:

In [None]:
#Standardize every PCA input column (zero mean, unit variance): fit and transform in one step
scaler = StandardScaler()
train_scaled = scaler.fit_transform(pca_prep)

In [None]:
#we're starting with 10 before we can make our selection for the optimal number
pca = PCA(n_components=10)

In [None]:
pca_model = pca_fit = pca.fit(train_scaled)

Now onto the PC scores:

In [None]:
#Scores: each row of the scaled data projected onto the ten principal components
score_columns = ['PC {}'.format(k) for k in range(1, 11)]
score_matrix = pca.fit_transform(train_scaled)
PC_scores = pd.DataFrame(score_matrix, columns = score_columns)
PC_scores.head(6)

Quite a lot of info above. Let's take a look at the loading scores:

In [None]:
#Loadings: one row per input variable (the columns of pca_prep), one column per component
component_labels = ['PC ' + str(k) for k in range(1, 11)]
loadings = pd.DataFrame(pca.components_.T, index=list(pca_prep.columns), columns=component_labels)
loadings

Next step is the scree plot. This is our step to figure out actually how many principal components we need before it becomes disproportionately extra calculations with not much help.

In [None]:
#Scree plot: share of variance explained by each principal component, in order
PC_values = np.arange(1, pca.n_components_ + 1)
fig, ax = plt.subplots()
ax.plot(PC_values, pca.explained_variance_ratio_, 'o-', linewidth=2, color='blue')
ax.set_title('Scree Plot')
ax.set_xlabel('Principal Component')
ax.set_ylabel('Variance Explained')
plt.show()

The plot shows significant variance being explained for the first PC, the second PC, and arguably the third PC. But after the fourth, the variance explained contribution drops fairly considerably, there's not that much being contributed per component for how much extra work it involves.

Going forward, we're going to be using details from the first three PC's.

Now, onto the Biplot:

In [None]:
#The PCA is already fitted, so project the data once and read both score columns
#from that single result instead of refitting the model for each column
biplot_scores = pca.transform(train_scaled)
PC1 = biplot_scores[:,0]
PC2 = biplot_scores[:,1]
#rows of components_ are the principal axes, columns are the input variables
ldngs = pca.components_

In [None]:
scalePC1 = 1.0/(PC1.max() - PC1.min())
scalePC2 = 1.0/(PC2.max() - PC2.min())
features = ['OverallQual', 'BedroomAbvGr', 'Fireplaces', 'YrSold', 'FullBath', 'OpenPorchSF', 'BsmtFinSF1', 'HalfBath', 'WoodDeckSF', 'TotRmsAbvGrd', 'SalePrice', 'YearBuilt', 'OverallCond', 'GrLivArea', 'LotArea']

In [None]:
fig, ax = plt.subplots(figsize=(14, 9))

#One arrow and label per input variable, placed at its (PC1, PC2) loadings
for feature, (load_pc1, load_pc2) in zip(features, ldngs[:2].T):
    ax.arrow(0, 0, load_pc1, load_pc2)
    ax.text(load_pc1 * 1.15, load_pc2 * 1.15, feature, fontsize=18)

#Observation scores, each axis rescaled by its own range
ax.scatter(PC1 * scalePC1,PC2 * scalePC2)

ax.set_xlabel('PC1', fontsize=20)
ax.set_ylabel('PC2', fontsize=20)
ax.set_title('PCA Biplot', fontsize=20)

----------------------------------------------------------------------------

----------------------------------------------------------------------------

----------------------------------------------------------------------------

Multiple Correspondence Analysis

----------------------------------------------------------------------------

----------------------------------------------------------------------------

In this section, we map EVERYTHING for future use

In [None]:
train3 = train2[['SalePrice','Neighborhood','OverallQual','BedroomAbvGr','CentralAir','BsmtQual','Fireplaces','YrSold','HouseStyle','HeatingQC','GarageType','RoofMatl','PavedDrive','SaleType','FullBath','OpenPorchSF','RoofStyle','Utilities','BsmtFinSF1','Heating','KitchenQual','HalfBath','BsmtFinType1','WoodDeckSF','TotRmsAbvGrd','YearBuilt','BsmtCond','Foundation','Electrical','BldgType','OverallCond','GrLivArea','LotConfig','LotShape','LotArea','Exterior1st','ExterQual','ExterCond']]

In [None]:
#variable mapping: encode every categorical column of the training set as integer codes.
#Work on an independent copy so the assignments below never write into a slice of train2.
train3 = train3.copy()

#Shared scale for the quality-style columns.
#NOTE(review): read_csv parses the literal text 'NA' as a missing value by default, so the
#'NA' key presumably never matches unless the file is read with keep_default_na=False - confirm.
quality_codes = {'Ex': 0, 'Gd': 1, 'TA': 2, 'Fa': 3, 'NA': 4}

#column name -> {category label: integer code}
category_codes = {
    'Neighborhood': {'CollegeCr': 0, 'Edwards': 1, 'OldTown': 2},
    'CentralAir': {'Y': 1, 'N': 0},
    'BsmtQual': quality_codes,
    'HouseStyle': {'1Story': 0, '1.5Story': 1, '2Story': 2, '2.5Fin': 3},
    'HeatingQC': quality_codes,
    'GarageType': {'Detchd': 0, 'Attchd': 1},
    'RoofMatl': {'notCompShg': 0, 'CompShg': 1},
    'PavedDrive': {'N': 0, 'Y': 1, 'P': 2},
    'SaleType': {'notWD': 0, 'WD': 1},
    'RoofStyle': {'Gable': 1, 'notGable': 0},
    'Utilities': {'AllPub': 1},
    'Heating': {'GasA': 0, 'GasW': 1, 'Grav': 2},
    'KitchenQual': quality_codes,
    'BsmtFinType1': {'ALQ': 0, 'BLQ': 1, 'GLQ': 2, 'Unf': 3},
    'BsmtCond': {'TA': 1},
    'Foundation': {'BrkTil': 0, 'CBlock': 1, 'PConc': 2},
    'Electrical': {'SBrkr': 0, 'Fuse': 1},
    'BldgType': {'1Fam': 0, '2fmCon': 1, 'Duplex': 2, 'Twnhs': 3},
    'LotConfig': {'Corner': 0, 'CulDSac': 1, 'FR2': 2, 'Inside': 3},
    'LotShape': {'IR1': 0, 'IR2': 1, 'IR3': 2, 'Reg': 3},
    'Exterior1st': {'MetalSd': 0, 'OtherSd': 1, 'VinylSd': 2},
    'ExterQual': {'Gd': 1, 'TA': 0},
    'ExterCond': {'Gd': 1, 'TA': 0},
}

#Series.map turns any label that is not a key of its mapping into NaN without warning,
#so count how many previously present values each column loses.
newly_missing = {}
for column, codes in category_codes.items():
    had_value = train3[column].notna()
    train3[column] = train3[column].map(codes)
    newly_missing[column] = int((had_value & train3[column].isna()).sum())

pd.Series(newly_missing, name='values lost to unmapped categories')

In [None]:
train3.info() #checking the data types

In [None]:
#Drop the rows with missing values. One source of them is the mapping step: Series.map
#returns NaN for any category label that is not a key of its mapping dict.
#.copy() gives train4 its own data so the HouseStyle cast below modifies it cleanly.
train4 = train3.dropna().copy()

In [None]:
train4.isna().sum()

In [None]:
train4['HouseStyle'] = train4['HouseStyle'].astype(int) #convert float to in as they are 2.0 => 2

In [None]:
train4.head(20) #looks good

----------------------------------------------------------------------------

----------------------------------------------------------------------------

Multiple Linear Regression

In [None]:
train4.info()

In [None]:
#First with the entire set: every train4 column except the response and HouseStyle
mlr_predictors = train4.drop(columns=['SalePrice', 'HouseStyle'])
x = smapi.add_constant(mlr_predictors)
y = train4['SalePrice']


In [None]:
#fit linear regression model
model = smapi.OLS(y, x).fit()
print(model.summary())

In [None]:
print(model.mse_total)

In [None]:
print(model.mse_resid)

**From the above results, we have the following model with **

In [None]:
#a different model attempt: the same predictors, fitted with scikit-learn
y = train4['SalePrice']
x = train4.drop(columns=['SalePrice', 'HouseStyle'])

regr = linear_model.LinearRegression()
regr.fit(x,y)

In [None]:
#Take the test predictors by the columns of the training matrix x fitted just above,
#so both frames have the same columns in the same order
test_MLR = test_mapped[list(x.columns)]

In [None]:
test_MLR.isna().sum()

In [None]:
test_MLR_na = test_MLR.dropna()

In [None]:
#Predict SalePrice for the test rows that survived dropna(); rows removed there get no prediction
predict = regr.predict(test_MLR_na)
predict

array([278893.68127161, 141868.94806995, 129505.3938858 , 236530.59122743,
       217140.13431111, 206124.48189637, 201943.57956215, 261236.12394682,
       248368.98857277, 237036.44659846, 295985.59890618, 194373.55988347,
       132414.13193629, 176057.46004612, 257477.2973875 , 222461.89622824,
       267278.89606749, 194693.30524999, 221330.42370985, 227232.90606983,
       159780.57777804, 225717.0502303 , 201371.43497828, 126099.74467429,
       254420.16879828, 215752.56242974, 157451.3237438 , 143231.72826874,
       160325.05076867,  81535.43250542, 146858.70396315, 569866.73777331,
       126808.93592888, 101653.71652756, 108800.80210007, 120064.99543059,
        97354.97341579,  97673.95220332, 164763.54822056,  85559.08922329,
       126143.18062701, 140394.83703696, 157158.62393333, 122081.24642878,
       169106.6644168 , 114169.79718369,  61005.07869064, 163169.0009764 ,
       110546.41110737, 153041.46712305, 100756.60096453,  68733.6419543 ,
        75248.4606408 ])

This one gives us results but is not robust enough for our predictions with missing values. Ultimately, we have to throw these results out.

----------------------------------------------------------------------------

----------------------------------------------------------------------------

**Random Forest/ Decision Tree**:

Since this model only uses numerical values, we'll use the mapping from above. Now onto the good stuff:

In [None]:
train3[['SalePrice','Neighborhood','OverallQual','BedroomAbvGr','CentralAir','BsmtQual','Fireplaces','YrSold','HouseStyle','HeatingQC','GarageType','RoofMatl','PavedDrive','SaleType','FullBath','OpenPorchSF','RoofStyle','Utilities','BsmtFinSF1','Heating','KitchenQual','HalfBath','BsmtFinType1','WoodDeckSF','TotRmsAbvGrd','YearBuilt','BsmtCond','Foundation','Electrical','BldgType','OverallCond','GrLivArea','LotConfig','LotShape','LotArea','Exterior1st','ExterQual','ExterCond']]

In [None]:
train3[['Neighborhood','OverallQual','BedroomAbvGr','CentralAir','BsmtQual','Fireplaces','YrSold','HouseStyle','HeatingQC','GarageType','RoofMatl','PavedDrive','SaleType','FullBath','OpenPorchSF','RoofStyle','Utilities','BsmtFinSF1','Heating','KitchenQual','HalfBath','BsmtFinType1','WoodDeckSF','TotRmsAbvGrd','YearBuilt','BsmtCond','Foundation','Electrical','BldgType','OverallCond','GrLivArea','LotConfig','LotShape','LotArea','Exterior1st','ExterQual','ExterCond']].isna().sum()

In [None]:
#the predictors: every mapped column except HouseStyle (train3 has no LotFrontage column)
features = ['Neighborhood','OverallQual','BedroomAbvGr','CentralAir','BsmtQual','Fireplaces','YrSold','HeatingQC','GarageType','RoofMatl','PavedDrive','SaleType','FullBath','OpenPorchSF','RoofStyle','Utilities','BsmtFinSF1','Heating','KitchenQual','HalfBath','BsmtFinType1','WoodDeckSF','TotRmsAbvGrd','YearBuilt','BsmtCond','Foundation','Electrical','BldgType','OverallCond','GrLivArea','LotConfig','LotShape','LotArea','Exterior1st','ExterQual','ExterCond']

x = train3[features]
y = train3['SalePrice']

#SalePrice is a continuous amount, so this is a regression problem. A classifier treats
#every distinct price as its own unordered class and can only ever return a price it saw.
#random_state fixes the choice between equally good splits so reruns give the same tree.
dtree = DecisionTreeRegressor(random_state=0)
dtree = dtree.fit(x, y)

In [None]:
#Label the split nodes with the predictor names the tree was fitted on
tree.plot_tree(dtree, feature_names=features)

We need to import our test data set to see what we can get out of this:

In [None]:
test_MLR_na.head(70)

In [None]:
#Prediction
print(dtree.predict(test_MLR_na))

[260000 144000 112500 192500 204900 194000 173900 475000 191000 255900
 295493 196500 136500 220000 203000 287000 230000 176432 194000 222500
 155900 211000 192000 149900 228500 191000 130500 130000 122000 129000
 206000 320000 185000 113000 110000 187100 125000 105000 123000 125000
 104900  95000 112000 107000 130000 163000 100000 143000 130000 224900
 122500 100000 194000]


This one gives us results but is not robust enough for our predictions with missing values. Ultimately, we have to throw these results out.

----------------------------------------------------------------------------

----------------------------------------------------------------------------

**Histogram-Based Gradient Boosting Classifier**

In [None]:
#Same predictor list as the decision tree section
y = train3['SalePrice']
x = train3[features]

In [None]:
#SalePrice is continuous, so use the regressor variant: a classifier treats every distinct
#price as a separate class and can only return prices that occur in the training data.
#random_state makes the fit repeatable.
hgb_regressor = HistGradientBoostingRegressor(random_state=0)
#keep the original name bound so the prediction cell below keeps working unchanged
hgb_classifier = hgb_regressor
hgb_regressor.fit(x,y)

In [None]:
#Test predictors: the same columns, in the same order, as the training matrix
test_HBGC = test_mapped[features]

In [None]:
y_pred_hgb = hgb_classifier.predict(test_HBGC)

In [None]:
y_pred_hgb

array([236500, 197900, 127000, 223500, 212900, 194000, 112500, 195000,
       228500, 235000, 185500, 111000, 383970, 213000, 144000, 173900,
       287000, 275000, 320000, 163990, 214000, 233230, 235000, 275000,
       235000, 163990, 127000, 230000, 235000, 130250,  93000,  94750,
       130000, 159500, 127500, 115000, 100000, 155000, 112000, 160000,
       108000, 100000, 130250, 125000, 105900, 116000, 105000, 200500,
       116000, 126000, 245350, 107900, 125000, 119000, 157000, 116000,
       159500, 115000, 116900, 100000, 125500, 131000, 107000, 122000,
       115000, 112000, 100000])

This looks like our best prediction.

----------------------------------------------------------------------------

----------------------------------------------------------------------------

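**Support Vector Regression**: a supervised machine learning method for predicting a continuous response. It fits a function that keeps as many observations as possible within a margin (epsilon) of the prediction; with a kernel such as RBF, the fitted function can be non-linear in the original features.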

https://www.analyticsvidhya.com/blog/2020/03/support-vector-regression-tutorial-for-machine-learning/

In [None]:
x = train2[['OverallQual', 'BedroomAbvGr', 'Fireplaces', 'YrSold', 'FullBath', 'OpenPorchSF', 'BsmtFinSF1', 'HalfBath', 'WoodDeckSF', 'TotRmsAbvGrd', 'YearBuilt', 'OverallCond', 'GrLivArea', 'LotArea']].values.astype(float)
y = train2['SalePrice'].values.astype(float)

In [None]:
#normalizing the data
sc_x = StandardScaler()
sc_y = StandardScaler()
x = sc_x.fit_transform(x)
y = sc_y.fit_transform(y.reshape(-1,1))

In [None]:
x.shape

In [None]:
y.shape

In [None]:
#Fitting SVR to the data
regressor = SVR(kernel = 'rbf') #kernel can be 'linear', 'poly', 'rbf', 'sigmoid'
#the scaled target is a column of shape (n, 1); SVR expects a 1-D target, so flatten it
regressor.fit(x, y.ravel())

In [None]:
test_SVR = test_mapped[['OverallQual', 'BedroomAbvGr', 'Fireplaces', 'YrSold', 'FullBath', 'OpenPorchSF', 'BsmtFinSF1', 'HalfBath', 'WoodDeckSF', 'TotRmsAbvGrd', 'YearBuilt', 'OverallCond', 'GrLivArea', 'LotArea']]

In [None]:
#Prediction
#The model was trained on standardized inputs and a standardized target, so the test
#features must go through the same fitted scaler (sc_x) before predicting...
test_SVR_scaled = sc_x.transform(test_SVR.values.astype(float))
y_pred_scaled = regressor.predict(test_SVR_scaled)
#...and the output must be mapped back from standardized units to the SalePrice scale
y_pred = sc_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).ravel()
y_pred



array([0.24942747, 0.24942747, 0.24942747, 0.24942747, 0.24942747,
       0.24942747, 0.24942747, 0.24942747, 0.24942747, 0.24942747,
       0.24942747, 0.24942747, 0.24942747, 0.24942747, 0.24942747,
       0.24942747, 0.24942747, 0.24942747, 0.24942747, 0.24942747,
       0.24942747, 0.24942747, 0.24942747, 0.24942747, 0.24942747,
       0.24942747, 0.24942747, 0.24942747, 0.24942747, 0.24942747,
       0.24942747, 0.24942747, 0.24942747, 0.24942747, 0.24942747,
       0.24942747, 0.24942747, 0.24942747, 0.24942747, 0.24942747,
       0.24942747, 0.24942747, 0.24942747, 0.24942747, 0.24942747,
       0.24942747, 0.24942747, 0.24942747, 0.24942747, 0.24942747,
       0.24942747, 0.24942747, 0.24942747, 0.24942747, 0.24942747,
       0.24942747, 0.24942747, 0.24942747, 0.24942747, 0.24942747,
       0.24942747, 0.24942747, 0.24942747, 0.24942747, 0.24942747,
       0.24942747, 0.24942747])

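This is obviously nonsensical: every prediction is the same value, and it is on the standardized scale rather than in sale-price units. The model was trained on standardized inputs (`sc_x`) and a standardized target (`sc_y`), but the test features were passed in unscaled and the output was never transformed back with `sc_y.inverse_transform`.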