# Ensemble Model

In this notebook we pulled together all of our models, averaged the results, and then run the root-mean-squared calculation.

### Import Packages and Data

In [70]:
#brew install libomp
#pip install xgboost

In [71]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, ElasticNet, Lasso
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model import LassoCV
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
import matplotlib.pyplot as plt

In [72]:
# Import train data
train_data_url = 'https://raw.githubusercontent.com/cal-dortiz/W207_Applied-_Machine_Learning/main/Final_Project/Data/train.csv'
df_train = pd.read_csv(train_data_url)

### General Data Cleansing

In [73]:
# Check missing data
df_train.isnull().sum().sort_values(ascending = False).head(20)

PoolQC          1453
MiscFeature     1406
Alley           1369
Fence           1179
FireplaceQu      690
LotFrontage      259
GarageCond        81
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
BsmtExposure      38
BsmtFinType2      38
BsmtFinType1      37
BsmtCond          37
BsmtQual          37
MasVnrArea         8
MasVnrType         8
Electrical         1
Utilities          0
dtype: int64

In [74]:
# Remove "Junk" columns
df_train = df_train.drop(columns=['Id', 'MiscFeature'])

# Drop the row with the missing electrical data
df_train = df_train.dropna( how='any', subset=['Electrical'])

In [75]:
# Remove Outliers (based on data documentation pencil notes)
df_train.drop(df_train[df_train.GrLivArea >= 4000].index, inplace=True)

In [76]:
#Describe numerical variables
NumDataSet = df_train.select_dtypes(exclude='O')


#Describe categorical variables
CatDataSet = df_train.select_dtypes(include='O')
Categorical_Cols = CatDataSet.columns.tolist()
Categorical_Cols

Numerical_Cols = NumDataSet.columns.tolist()
Numerical_Cols 


#Within categorical variables, there are several categories that are not 
#purely nominal/unordered but insteted are ordered categories - In other
#words, they are ordinal but have text values, not numerical values. We separate
#these into a new category

Ordered_Category = CatDataSet[['ExterQual','ExterCond','BsmtQual','BsmtCond',
                   'HeatingQC','KitchenQual','FireplaceQu','GarageQual','GarageCond']].copy()

    
#Remove these columns from the categorical dataset, which will now include only
#unordered categories
CatDataSet = CatDataSet.drop(['ExterQual','ExterCond','BsmtQual','BsmtCond', 'HeatingQC','KitchenQual','FireplaceQu','GarageQual','GarageCond'], axis = 'columns')


#4.2.2 Convert all cateogrical values to numeric values
#For ordered category data, use the ordinal encoder from sklearn.
#Given a dataset with unique features, we let the encoder find the unique values per feature and transform the data to an ordinal encoding.
Ordered_Category['ExterQual'].replace(to_replace=['Ex', 'Gd', 'TA', 'Fa', 'Po', np.nan], value=[5, 4, 3, 2, 1, 0], inplace=True)
Ordered_Category['ExterCond'].replace(to_replace=['Ex', 'Gd', 'TA', 'Fa', 'Po', np.nan], value=[5, 4, 3, 2, 1, 0], inplace=True)
Ordered_Category['BsmtQual'].replace(to_replace=['Ex', 'Gd', 'TA', 'Fa', 'Po', np.nan], value=[5, 4, 3, 2, 1, 0], inplace=True)
Ordered_Category['BsmtCond'].replace(to_replace=['Ex', 'Gd', 'TA', 'Fa', 'Po', np.nan], value=[5, 4, 3, 2, 1, 0], inplace=True)
Ordered_Category['HeatingQC'].replace(to_replace=['Ex', 'Gd', 'TA', 'Fa', 'Po', np.nan], value=[5, 4, 3, 2, 1, 0], inplace=True)
Ordered_Category['KitchenQual'].replace(to_replace=['Ex', 'Gd', 'TA', 'Fa', 'Po', np.nan], value=[5, 4, 3, 2, 1, 0], inplace=True)
Ordered_Category['FireplaceQu'].replace(to_replace=['Ex', 'Gd', 'TA', 'Fa', 'Po', np.nan], value=[5, 4, 3, 2, 1, 0], inplace=True)
Ordered_Category['GarageQual'].replace(to_replace=['Ex', 'Gd', 'TA', 'Fa', 'Po', np.nan], value=[5, 4, 3, 2, 1, 0], inplace=True)
Ordered_Category['GarageCond'].replace(to_replace=['Ex', 'Gd', 'TA', 'Fa', 'Po', np.nan], value=[5, 4, 3, 2, 1, 0], inplace=True)
NumDataSet['GarageYrBlt'].replace(to_replace=[np.nan], value=[0], inplace=True)
NumDataSet['LotFrontage'].replace(to_replace=[np.nan], value=[0], inplace=True)
NumDataSet['MasVnrArea'].replace(to_replace=[np.nan], value=[0], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().replace(


In [77]:
# Encode Catagorical data

le = preprocessing.LabelEncoder()

#For categorical data, use LabelEncoder to categorize the variables.
for col in CatDataSet:
    i = CatDataSet.columns.get_loc(col)
    CatDataSet.iloc[:,i] = CatDataSet.apply(lambda i:le.fit_transform(i.astype(str)), axis=0, result_type='expand')
    
# glue data sets together
df_train2 = pd.concat((CatDataSet, Ordered_Category, NumDataSet), axis=1, join='inner')

In [78]:
#See if there are any nulls left.
df_train2.isnull().sum().sort_values(ascending = False).head(20)

SalePrice        0
PavedDrive       0
BsmtFinType2     0
Heating          0
CentralAir       0
Electrical       0
Functional       0
GarageType       0
GarageFinish     0
PoolQC           0
BsmtExposure     0
Fence            0
SaleType         0
SaleCondition    0
ExterQual        0
ExterCond        0
BsmtQual         0
BsmtCond         0
BsmtFinType1     0
Foundation       0
dtype: int64

## KNN

### KNN Model Preprocessing - NEED TO SCALE THIS!!!!

In [79]:
# Copy Dataset for KNN
knn_train = df_train2.copy()

In [80]:
# Encode labels
#knn_enc = preprocessing.LabelEncoder()
#knn_train['SalePrice'] = knn_enc.fit_transform(knn_train['SalePrice'])

use_cols=list(knn_train.columns)
use_cols.remove('SalePrice')

knn_x_train, knn_x_test, knn_y_train, knn_y_test = train_test_split(knn_train[use_cols],knn_train['SalePrice'], 
                                                    test_size = 0.25, random_state = 2)

In [81]:
knn_x_train.shape, knn_x_test.shape

((1091, 78), (364, 78))

In [82]:
knn_x_train = knn_x_train.to_numpy()
knn_x_test = knn_x_test.to_numpy()
knn_y_train = knn_y_train.to_numpy()
knny_test = knn_y_test.to_numpy()
k_values = range(1,len(knn_x_train[0]))

### KNN Model

In [83]:
#Initiate KNN Classifier and set the number of neighbors 
nn = KNeighborsClassifier(n_neighbors = 1)
    
# Fit KNN Model
nn.fit(knn_x_train, knn_y_train)
    
# Predict Lable
knn_y_predict_values = nn.predict(knn_x_test)

In [84]:
knn_rmse = mean_squared_error(np.log(knn_y_test),
                              np.log(knn_y_predict_values))
                                     
print('KNN RMSE =', knn_rmse)


KNN RMSE = 0.09322723804857086


## Random Forest

### Random Forest Model Data PreProcessing

In [16]:
# Copy Dataset for RF
rf_train = df_train2.copy()

In [17]:
# Encode labels
#lab_enc = preprocessing.LabelEncoder()
#knn_train['SalePrice'] = lab_enc.fit_transform(knn_train['SalePrice'])

use_cols=list(rf_train.columns)
use_cols.remove('SalePrice')

rf_x_train, rf_x_test, rf_y_train, rf_y_test = train_test_split(rf_train[use_cols],rf_train['SalePrice'], 
                                                    test_size = 0.25, random_state = 2)

In [18]:
rf_x_train.shape, rf_x_test.shape

((1091, 78), (364, 78))

### RF Model

In [19]:
# Initiate model
clf = RandomForestClassifier(n_estimators=123)

# Train Model
clf.fit(rf_x_train, rf_y_train)

# Predict Labels
rf_prediction = clf.predict(rf_x_test)


In [20]:
rf_rmse = mean_squared_error(np.log(knn_y_test),
                              np.log(knn_y_predict_values))
                                     
print('Random Forrest RMSE =', rf_rmse)

Random Forrest RMSE = 0.09322723804857086


## AdaBoost Classifier

### AdaBoost Classifier PreProcess

In [21]:
# Copy Dataset for RF
abc_train = df_train2.copy()

In [22]:
# Encode labels
#lab_enc = preprocessing.LabelEncoder()
#knn_train['SalePrice'] = lab_enc.fit_transform(knn_train['SalePrice'])

use_cols=list(abc_train.columns)
use_cols.remove('SalePrice')

abc_x_train, abc_x_test, abc_y_train, abc_y_test = train_test_split(abc_train[use_cols],abc_train['SalePrice'], 
                                                                    test_size = 0.25, random_state = 2)

###  Adaboost Classifier Model

In [23]:
# Initiate Model
clf = AdaBoostRegressor(n_estimators=28)

# Train Model
clf.fit(abc_x_train, abc_y_train)

# Predict Labels
abc_prediction = clf.predict(abc_x_test)

In [24]:
abc_rmse = mean_squared_error(np.log(knn_y_test),
                              np.log(knn_y_predict_values))
                                     
print('Random Forrest RMSE =', abc_rmse)

Random Forrest RMSE = 0.09322723804857086


# Regression

## Regression Preprocessing

In [25]:
r_data = df_train2.copy()

In [26]:
r_data = r_data.drop(columns=['PoolQC','Alley','Fence','TotalBsmtSF'])

In [27]:
# Combine SqFt
r_data['TotSqFt'] = r_data['1stFlrSF'] + r_data['2ndFlrSF']
r_Data = r_data.drop(columns=['1stFlrSF','2ndFlrSF'])

In [28]:
# Log Transforms of the data (more normal data distribution)
r_data['SalePrice'] = np.log(r_data['SalePrice'])

### Linear Regression Model

#### Select Attributes for the model

In [29]:
# Select model dependent and independent variables
LR1Xtrain = pd.DataFrame(data=r_data,columns=['OverallQual', 'TotSqFt', 'GrLivArea', 'GarageCars', 'GarageArea', 'BsmtQual', 'FullBath', 'YearBuilt', 'YearRemodAdd'])
LR1Ytrain = pd.DataFrame(data= r_data,columns=['SalePrice'])

In [30]:
# Scale Data
lr_ss = StandardScaler()
lr_ss.fit_transform(LR1Xtrain)

array([[ 0.66501519,  0.42527296,  0.40945634, ...,  0.80080009,
         1.05468111,  0.88141452],
       [-0.06524677, -0.48507456, -0.49230299, ...,  0.80080009,
         0.16027966, -0.42647479],
       [ 0.66501519,  0.57970691,  0.56243337, ...,  0.80080009,
         0.98842915,  0.83297417],
       ...,
       [ 0.66501519,  1.70544916,  1.67755539, ...,  0.80080009,
        -0.99912964,  1.02673555],
       [-0.79550874, -0.8589673 , -0.86266842, ..., -1.02547598,
        -0.70099582,  0.5423321 ],
       [-0.79550874, -0.49726672, -0.50438012, ..., -1.02547598,
        -0.20410612, -0.95931858]])

In [31]:
# Cut data into train and test
lr_x_train, lr_x_test, lr_y_train, lr_y_test = train_test_split(LR1Xtrain, LR1Ytrain,
                                                    test_size=0.25,random_state=2)

lr_x_train.shape, lr_x_test.shape

((1091, 9), (364, 9))

#### linear Regression Model

In [32]:
# Initiate Model
lr = LinearRegression()

# Train Model
lr.fit(lr_x_train, lr_y_train)

# Predict Label
lr_predict = lr.predict(lr_x_test)

# De-scale data
# Verify this is right
lr_prediction = np.exp(lr_predict)

In [33]:
lr_rmse = mean_squared_error(np.log(lr_prediction),
                              lr_y_test)
                                     
print('Linear Regression RMSE =', lr_rmse)

Linear Regression RMSE = 0.026558292311187268


## Ridge Regression

### Ridge Regression Preprocessing

In [34]:
# Select model dependent and independent variables
rr1Xtrain = pd.DataFrame(data=r_data,columns=['OverallQual', 'TotSqFt', 'GrLivArea', 'GarageCars', 'GarageArea', 'BsmtQual', 'FullBath', 'YearBuilt', 'YearRemodAdd'])
rr1Ytrain = pd.DataFrame(data= r_data,columns=['SalePrice'])

# Scale Data
rr_ss = StandardScaler()
rr_ss.fit_transform(rr1Xtrain)

# Cut data into train and test
rr_x_train, rr_x_test, rr_y_train, rr_y_test = train_test_split(rr1Xtrain, rr1Ytrain,
                                                    test_size=0.25,random_state=2)

rr_x_train.shape, rr_x_test.shape

((1091, 9), (364, 9))

### Ridge Regression Model

In [35]:
# Initiate model
ridge = RidgeCV(alphas=np.linspace(1, 200, 100))

# Fit Model
ridge_model = ridge.fit(rr_x_train, rr_y_train)

# Predict label
rr_predict = ridge_model.predict(rr_x_test)

# Convert back to data value
rr_prediction = np.exp(rr_predict)

In [36]:
rr_rmse = mean_squared_error(np.log(rr_prediction),
                             rr_y_test)
                                     
print('Linear Regression RMSE =', rr_rmse)

Linear Regression RMSE = 0.02648945867526529


## Elasticnet Regression

### Elasticnet Preprocess

In [37]:
# Select model dependent and independent variables
er1Xtrain = pd.DataFrame(data=r_data,columns=['OverallQual', 'TotSqFt', 'GrLivArea', 'GarageCars', 'GarageArea', 'BsmtQual', 'FullBath', 'YearBuilt', 'YearRemodAdd'])
er1Ytrain = pd.DataFrame(data= r_data,columns=['SalePrice'])

# Scale Data
er_ss = StandardScaler()
er_ss.fit_transform(er1Xtrain)

# Cut data into train and test
er_x_train, er_x_test, er_y_train, er_y_test = train_test_split(er1Xtrain, er1Ytrain,
                                                    test_size=0.25,random_state=2)

er_x_train.shape, er_x_test.shape

((1091, 9), (364, 9))

### Elasticnet Model

In [38]:
# Initiate Model
enet_alphas = np.arange(0.01, 1.0, 0.005)
enet_ratio = 0.5
enet = ElasticNetCV(alphas=enet_alphas, l1_ratio=enet_ratio, cv=5, max_iter=5_000)

# Train Model
enet = enet.fit(er_x_train, np.ravel(er_y_train))

# Predict Labels
er_predict = enet.predict(er_x_test)

# Convert back to data value
er_prediction = np.exp(er_predict)

In [39]:
er_rmse = mean_squared_error(np.log(er_prediction),
                             er_y_test)
                                     
print('Elasticnet Model Regression RMSE =', er_rmse)

Elasticnet Model Regression RMSE = 0.026504229524769187


## Lasso Model

### Lasso Model Preprocessing

In [40]:
# Select model dependent and independent variables
lm1Xtrain = pd.DataFrame(data=r_data,columns=['OverallQual', 'TotSqFt', 'GrLivArea', 'GarageCars', 'GarageArea', 'BsmtQual', 'FullBath', 'YearBuilt', 'YearRemodAdd'])
lm1Ytrain = pd.DataFrame(data= r_data,columns=['SalePrice'])

# Scale Data
lm_ss = StandardScaler()
lm_ss.fit_transform(lm1Xtrain)

# Cut data into train and test
lm_x_train, lm_x_test, lm_y_train, lm_y_test = train_test_split(lm1Xtrain, lm1Ytrain,
                                                    test_size=0.25,random_state=2)

lm_x_train.shape, lm_x_test.shape

((1091, 9), (364, 9))

### Lasso Model

In [41]:
# Initiate Model
lasso = LassoCV(n_alphas=100)

# Trian Model
lasso.fit(lm_x_train,np.ravel(lm_y_train))

# Predict Labels
lm_predict = lasso.predict(lm_x_test)

# Convert back to data value
lm_prediction = np.exp(lm_predict)

In [42]:
lm_rmse = mean_squared_error(np.log(lm_prediction),
                             lm_y_test)
                                     
print('Lasso Model Regression RMSE =', lm_rmse)

Lasso Model Regression RMSE = 0.035909927409052335


## Gradient Boost Model

### Gradient Boost Preprocessing

In [43]:
# Select model dependent and independent variables
gb1Xtrain = pd.DataFrame(data=r_data,columns=['OverallQual', 'TotSqFt', 'GrLivArea', 'GarageCars', 'GarageArea', 'BsmtQual', 'FullBath', 'YearBuilt', 'YearRemodAdd'])
gb1Ytrain = pd.DataFrame(data= r_data,columns=['SalePrice'])

# Scale Data
gb_ss = StandardScaler()
gb_ss.fit_transform(gb1Xtrain)

# Cut data into train and test
gb_x_train, gb_x_test, gb_y_train, gb_y_test = train_test_split(gb1Xtrain, gb1Ytrain,
                                                    test_size=0.25,random_state=2)

gb_x_train.shape, gb_x_test.shape

((1091, 9), (364, 9))

### Gradient Boost Model

In [44]:
# Initiate the model
gbr = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05, random_state =85)

# Train the Model
gbr.fit(gb_x_train,np.ravel(gb_y_train))

# Predict the labels
gb_predict = gbr.predict(gb_x_test)

# Transform back to values
gb_prediction = np.exp(gb_predict)

In [45]:
gb_rmse = mean_squared_error(np.log(gb_prediction),
                             gb_y_test)
                                     
print('Gradient Boost Model Regression RMSE =', gb_rmse)

Gradient Boost Model Regression RMSE = 0.03256724983099581


## XG Boost Model

## XG Boost Preprocessing

In [46]:
# Select model dependent and independent variables
xg1Xtrain = pd.DataFrame(data=r_data,columns=['OverallQual', 'TotSqFt', 'GrLivArea', 'GarageCars', 'GarageArea', 'BsmtQual', 'FullBath', 'YearBuilt', 'YearRemodAdd'])
xg1Ytrain = pd.DataFrame(data= r_data,columns=['SalePrice'])

# Scale Dataa
xg_ss = StandardScaler()
xg_ss.fit_transform(xg1Xtrain)

# Cut data into train and test
xg_x_train, xg_x_test, xg_y_train, xg_y_test = train_test_split(xg1Xtrain, xg1Ytrain,
                                                    test_size=0.25,random_state=2)

xg_x_train.shape, xg_x_test.shape

((1091, 9), (364, 9))

## XG Boost Model

In [51]:
# Initiate Model
xgboost = XGBRegressor(learning_rate=0.01, n_estimators=3000)

# Train Model
xgboost.fit(xg_x_train,xg_y_train)

# Predict labels
xg_predict = xgboost.predict(xg_x_test)

# Transform back to values
xg_prediction = np.exp(xg_predict)

In [52]:
xg_rmse = mean_squared_error(np.log(xg_prediction),
                             gb_y_test)
                                     
print('XG Boost Model Regression RMSE =', xg_rmse)

XG Boost Model Regression RMSE = 0.028530390677932824


## Ensemble

In [53]:
ensemble_value = []

for i in range(len(knn_y_test)):
    ensemble_value.append((knn_y_predict_values[i] + 
                           rf_prediction[i] +
                           abc_prediction[i] +
                           lr_prediction[i] +
                           rr_prediction[i] +
                           er_prediction[i] +
                           lm_prediction[i] +
                           gb_prediction[i] +
                           xg_prediction[i])/9)

In [54]:
rmse = mean_squared_error(np.log(knn_y_test),
                          np.log(ensemble_value))

print(rmse)

0.021387022743764444
