# Ensemble Model

### Import Packages and Data

In [36]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostRegressor
import matplotlib.pyplot as plt

In [2]:
# Import train data
train_data_url = 'https://raw.githubusercontent.com/cal-dortiz/W207_Applied-_Machine_Learning/main/Final_Project/Data/train.csv'
df_train = pd.read_csv(train_data_url)

### General Data Cleansing

In [3]:
# Check missing data
df_train.isnull().sum().sort_values(ascending = False).head(20)

PoolQC          1453
MiscFeature     1406
Alley           1369
Fence           1179
FireplaceQu      690
LotFrontage      259
GarageCond        81
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
BsmtExposure      38
BsmtFinType2      38
BsmtFinType1      37
BsmtCond          37
BsmtQual          37
MasVnrArea         8
MasVnrType         8
Electrical         1
Utilities          0
dtype: int64

In [4]:
# Remove "Junk" columns
df_train = df_train.drop(columns=['Id', 'MiscFeature'])

# Drop the row with the missing electrical data
df_train = df_train.dropna( how='any', subset=['Electrical'])

In [5]:
#Describe numerical variables
NumDataSet = df_train.select_dtypes(exclude='O')


#Describe categorical variables
CatDataSet = df_train.select_dtypes(include='O')
Categorical_Cols = CatDataSet.columns.tolist()
Categorical_Cols

Numerical_Cols = NumDataSet.columns.tolist()
Numerical_Cols 


#Within categorical variables, there are several categories that are not 
#purely nominal/unordered but insteted are ordered categories - In other
#words, they are ordinal but have text values, not numerical values. We separate
#these into a new category

Ordered_Category = CatDataSet[['ExterQual','ExterCond','BsmtQual','BsmtCond',
                   'HeatingQC','KitchenQual','FireplaceQu','GarageQual','GarageCond']].copy()

    
#Remove these columns from the categorical dataset, which will now include only
#unordered categories
CatDataSet = CatDataSet.drop(['ExterQual','ExterCond','BsmtQual','BsmtCond', 'HeatingQC','KitchenQual','FireplaceQu','GarageQual','GarageCond'], axis = 'columns')


#4.2.2 Convert all cateogrical values to numeric values
#For ordered category data, use the ordinal encoder from sklearn.
#Given a dataset with unique features, we let the encoder find the unique values per feature and transform the data to an ordinal encoding.
Ordered_Category['ExterQual'].replace(to_replace=['Ex', 'Gd', 'TA', 'Fa', 'Po', np.nan], value=[5, 4, 3, 2, 1, 0], inplace=True)
Ordered_Category['ExterCond'].replace(to_replace=['Ex', 'Gd', 'TA', 'Fa', 'Po', np.nan], value=[5, 4, 3, 2, 1, 0], inplace=True)
Ordered_Category['BsmtQual'].replace(to_replace=['Ex', 'Gd', 'TA', 'Fa', 'Po', np.nan], value=[5, 4, 3, 2, 1, 0], inplace=True)
Ordered_Category['BsmtCond'].replace(to_replace=['Ex', 'Gd', 'TA', 'Fa', 'Po', np.nan], value=[5, 4, 3, 2, 1, 0], inplace=True)
Ordered_Category['HeatingQC'].replace(to_replace=['Ex', 'Gd', 'TA', 'Fa', 'Po', np.nan], value=[5, 4, 3, 2, 1, 0], inplace=True)
Ordered_Category['KitchenQual'].replace(to_replace=['Ex', 'Gd', 'TA', 'Fa', 'Po', np.nan], value=[5, 4, 3, 2, 1, 0], inplace=True)
Ordered_Category['FireplaceQu'].replace(to_replace=['Ex', 'Gd', 'TA', 'Fa', 'Po', np.nan], value=[5, 4, 3, 2, 1, 0], inplace=True)
Ordered_Category['GarageQual'].replace(to_replace=['Ex', 'Gd', 'TA', 'Fa', 'Po', np.nan], value=[5, 4, 3, 2, 1, 0], inplace=True)
Ordered_Category['GarageCond'].replace(to_replace=['Ex', 'Gd', 'TA', 'Fa', 'Po', np.nan], value=[5, 4, 3, 2, 1, 0], inplace=True)
NumDataSet['GarageYrBlt'].replace(to_replace=[np.nan], value=[0], inplace=True)
NumDataSet['LotFrontage'].replace(to_replace=[np.nan], value=[0], inplace=True)
NumDataSet['MasVnrArea'].replace(to_replace=[np.nan], value=[0], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().replace(


In [6]:
# Encode Catagorical data

le = preprocessing.LabelEncoder()

#For categorical data, use LabelEncoder to categorize the variables.
for col in CatDataSet:
    i = CatDataSet.columns.get_loc(col)
    CatDataSet.iloc[:,i] = CatDataSet.apply(lambda i:le.fit_transform(i.astype(str)), axis=0, result_type='expand')

# glue data sets together
df_train2 = pd.concat((CatDataSet, Ordered_Category, NumDataSet), axis=1, join='inner')

In [7]:
#See if there are any nulls left.
df_train2.isnull().sum().sort_values(ascending = False).head(20)

SalePrice        0
PavedDrive       0
BsmtFinType2     0
Heating          0
CentralAir       0
Electrical       0
Functional       0
GarageType       0
GarageFinish     0
PoolQC           0
BsmtExposure     0
Fence            0
SaleType         0
SaleCondition    0
ExterQual        0
ExterCond        0
BsmtQual         0
BsmtCond         0
BsmtFinType1     0
Foundation       0
dtype: int64

## KNN

### KNN Model Preprocessing

In [11]:
# Copy Dataset for KNN
knn_train = df_train2.copy()

In [12]:
# Encode labels
#knn_enc = preprocessing.LabelEncoder()
#knn_train['SalePrice'] = knn_enc.fit_transform(knn_train['SalePrice'])

use_cols=list(knn_train.columns)
use_cols.remove('SalePrice')

knn_x_train, knn_x_test, knn_y_train, knn_y_test = train_test_split(knn_train[use_cols],knn_train['SalePrice'], 
                                                    test_size = 0.25, random_state = 2)

In [13]:
knn_x_train.shape, knn_x_test.shape

((1094, 78), (365, 78))

In [14]:
knn_x_train = knn_x_train.to_numpy()
knn_x_test = knn_x_test.to_numpy()
knn_y_train = knn_y_train.to_numpy()
knny_test = knn_y_test.to_numpy()
k_values = range(1,len(knn_x_train[0]))

### KNN Model

In [15]:
#Initiate KNN Classifier and set the number of neighbors 
nn = KNeighborsClassifier(n_neighbors = 1)
    
# Fit KNN Model
nn.fit(knn_x_train, knn_y_train)
    
# Predict Lable
knn_y_predict_values = nn.predict(knn_x_test)

In [16]:
knn_rmse = mean_squared_error(np.log(knn_y_test),
                              np.log(knn_y_predict_values))
                                     
print('KNN RMSE =', knn_rmse)


KNN RMSE = 0.09560935737002291


## Random Forest

### Random Forest Model Data PreProcessing

In [21]:
# Copy Dataset for RF
rf_train = df_train2.copy()

In [22]:
# Encode labels
#lab_enc = preprocessing.LabelEncoder()
#knn_train['SalePrice'] = lab_enc.fit_transform(knn_train['SalePrice'])

use_cols=list(rf_train.columns)
use_cols.remove('SalePrice')

rf_x_train, rf_x_test, rf_y_train, rf_y_test = train_test_split(rf_train[use_cols],rf_train['SalePrice'], 
                                                    test_size = 0.25, random_state = 2)

In [23]:
rf_x_train.shape, rf_x_test.shape

((1094, 78), (365, 78))

### RF Model

In [26]:
# Initiate model
clf = RandomForestClassifier(n_estimators=123)

# Train Model
clf.fit(rf_x_train, rf_y_train)

# Predict Labels
rf_prediction = clf.predict(rf_x_test)


In [27]:
rf_rmse = mean_squared_error(np.log(knn_y_test),
                              np.log(knn_y_predict_values))
                                     
print('Random Forrest RMSE =', rf_rmse)

KNN RMSE = 0.09560935737002291


## AdaBoost Classifier

### AdaBoost PreProcess

In [33]:
# Copy Dataset for RF
abc_train = df_train2.copy()

In [34]:
# Encode labels
#lab_enc = preprocessing.LabelEncoder()
#knn_train['SalePrice'] = lab_enc.fit_transform(knn_train['SalePrice'])

use_cols=list(abc_train.columns)
use_cols.remove('SalePrice')

abc_x_train, abc_x_test, abc_y_train, abc_y_test = train_test_split(abc_train[use_cols],abc_train['SalePrice'], 
                                                                    test_size = 0.25, random_state = 2)

###  Adaboost Model

In [39]:
# Initiate Model
clf = AdaBoostRegressor(n_estimators=28)

# Train Model
clf.fit(abc_x_train, abc_y_train)

# Predict Labels
abc_prediction = clf.predict(abc_x_test)

In [40]:
abc_rmse = mean_squared_error(np.log(knn_y_test),
                              np.log(knn_y_predict_values))
                                     
print('Random Forrest RMSE =', abc_rmse)

Random Forrest RMSE = 0.09560935737002291


## Linear Regression

## Ensemble

In [41]:
ensemble_value = []

for i in range(len(knn_y_test)):
    ensemble_value.append((knn_y_predict_values[i] + 
                           rf_prediction[i] +
                           abc_prediction[i])/3)

In [42]:
rmse = mean_squared_error(np.log(knn_y_test),
                          np.log(ensemble_value))

print(rmse)

0.034662754242944745
