# Ensemble Modelling

In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import modelling as ml
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
import xgboost as xgb
import modelling as ml
sns.set_theme(context='notebook', style='darkgrid', palette='Set3')
import pickle
import gzip, pickle

In [2]:
# Load the train/test wine table and the modelling table, each keyed by the
# wine name so later column assignments can align on that index.
df = pd.read_csv('../data/clean_test_train.csv').set_index('name')
fillabv = pd.read_csv('../data/modelling_wines.csv').set_index('name')

In [3]:
# One-hot encode the selected categoricals (region, country, vintage),
# dropping the first level of each and storing the indicators as ints.
categorical_cols = ['region', 'country', 'vintage']
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True, dtype=int)

# Bring in the imputed abv values from workbook 02; the assignment aligns the
# two frames on their shared index.
df_encoded['abv'] = fillabv['abv']

# Engineered features: squared age, and rating interacted with the log of the
# number of ratings.
df_encoded['age^2'] = df_encoded['age'].pow(2)
df_encoded['rating * log_rating_qty'] = df_encoded['rating'].mul(df_encoded['log_rating_qty'])

# Linear Regression Model

In [4]:
# Columns left out of the feature matrix (this includes the target itself and
# the raw price).
non_feature_cols = ['log_price', 'reviewed_by', 'producer', 'wine_variety',
                    'grape_variety', 'price', 'from_vivino']
X = df_encoded.drop(columns=non_feature_cols)
y = df_encoded['log_price']

# No feature scaling is applied before the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.85, random_state=42
)

# Baseline: ordinary least squares on the encoded features.
# NOTE(review): the recorded output of this cell shows two cross-validation
# folds with hugely negative R2 while the others sit near 0.76 - presumably an
# ill-conditioned fit on sparse dummy columns; worth investigating.
lr = LinearRegression()
lr_model, y_pred = ml.model_summary(
    lr, X_train, X_test, y_train, y_test, log_target=True
)

Exponent Root Mean Squared Error (RMSE): 1.517
Root Mean Squared Error (RMSE): 0.417
Train R-squared (R2): 0.769
Test R-squared (R2): 0.762
High varience noted in cross validation: [-3.22956837e+06  7.62924410e-01  7.66956281e-01  7.57706231e-01
 -8.59826455e+13]
Cross validated r2: -17196529736415.965


Exponent Root Mean Squared Error (RMSE): 1.517<br>
Root Mean Squared Error (RMSE): 0.417<br>
Train R-squared (R2): 0.769<br>
Test R-squared (R2): 0.762<br>
High variance noted in cross validation: [-3.22956837e+06  7.62924410e-01  7.66956281e-01  7.57706231e-01
 -8.59826455e+13]<br>
Cross validated r2: -17196529736415.965<br>

In [5]:
# getcols = df_encoded.drop(columns = ['log_price', 'reviewed_by', 'producer', 'wine_variety',
#                                 'grape_variety', 'price', 'from_vivino'])
# # Extract coefficients and return a dataframe
# coefficients = lr_model.coef_
# coeff_df = pd.DataFrame({'Feature': getcols.columns, 'Coefficient': coefficients})

In [6]:
# xg_reg = xgb.XGBRegressor()

# xg_model, y_pred_gx = ml.model_summary(xg_reg,X_train, X_test, y_train, y_test, log_target=True)

**XGBoost Results:**<br>
Exponent Root Mean Squared Error (RMSE): 1.385<br>
Root Mean Squared Error (RMSE): 0.326<br>
Train R-squared (R2): 0.88<br>
Test R-squared (R2): 0.855<br>
Cross validated r2: 0.853<br>

### With PCA

In [7]:
# One-hot encode the producer column as well, on top of the earlier encoding.
df_encoded_again = pd.get_dummies(df_encoded, columns=['producer'],  drop_first=True, dtype=int)

X = df_encoded_again.drop(columns = ['log_price', 'reviewed_by', 'wine_variety',
                                'grape_variety', 'price', 'from_vivino'])
# Take the target from this cell's own frame instead of relying on the `y`
# left over from the earlier linear-regression cell (same column, same rows).
y = df_encoded_again['log_price']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, train_size=0.85)

# Seed PCA so the projection is reproducible from run to run. The scores
# recorded in this notebook for 1000 components differ slightly between runs
# (e.g. RMSE 0.381 vs 0.382), which is consistent with PCA picking a
# randomized SVD solver for an input of this size.
pca = PCA(n_components=1000, random_state=42)

# Fit the projection on the training split only, then apply it to both splits.
pca.fit(X_train)
Z_train = pca.transform(X_train)
Z_test = pca.transform(X_test)

# Use a fresh estimator rather than the `lr` instance that was already passed
# to model_summary in the earlier cell, so the two evaluations do not share
# one estimator object.
lr_model, y_pred = ml.model_summary(LinearRegression(), Z_train, Z_test, y_train, y_test, log_target=True)

Exponent Root Mean Squared Error (RMSE): 1.464
Root Mean Squared Error (RMSE): 0.381
Train R-squared (R2): 0.815
Test R-squared (R2): 0.801
Cross validated r2: 0.805


**PCA and LR with Producers - 2000 components:**<br>
Exponent Root Mean Squared Error (RMSE): 1.426<br>
Root Mean Squared Error (RMSE): 0.355<br>
Train R-squared (R2): 0.849<br>
Test R-squared (R2): 0.828<br>
Cross validated r2: 0.832<br>

**PCA and LR with Producers - 1000 components:**<br>
Exponent Root Mean Squared Error (RMSE): 1.465<br>
Root Mean Squared Error (RMSE): 0.382<br>
Train R-squared (R2): 0.815<br>
Test R-squared (R2): 0.8<br>
Cross validated r2: 0.805<br>

In [8]:
# ml.plot_residuals(y_test, y_pred)

In [11]:
# Load the price-prediction model exported from notebook 04 (decision tree
# model); the file is a gzip-compressed pickle.
# NOTE(review): unpickling can execute code embedded in the file, so only
# load model files that come from a trusted source.
filepath = '../models/casi_rf_production.pkl'
with gzip.open(filepath, 'rb') as model_file:
    model_rf = pickle.load(model_file)

### Stacking Models

In [15]:
# Base learners for the stack, as (name, estimator) pairs.
base_models = [('linear_regression', lr_model), ('random_forest', model_rf)]

# A plain linear regression combines the base-model predictions.
meta_model = LinearRegression()

# Build the stack and fit it on the training split in one step.
# NOTE(review): with its default settings StackingRegressor presumably refits
# clones of the base estimators on X_train, so neither the PCA-fitted state of
# lr_model nor the state loaded into model_rf would be reused - confirm that
# this is the intended behaviour.
stacked_model = StackingRegressor(
    estimators=base_models,
    final_estimator=meta_model,
).fit(X_train, y_train)

# Predictions of the stacked ensemble on the held-out split.
final_predictions = stacked_model.predict(X_test)

In [1]:
# ml.plot_residuals(y_test, final_predictions)