# Beer Prediction Final Model and Pipeline

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, LabelEncoder

import torch
import torch.nn as nn
import torch.nn.functional as F

## Data Transformations

In [2]:
#Solution
%load_ext autoreload
%autoreload 2

*Import from CSV*

In [3]:
df_raw_beer = pd.read_csv('/wd/data/raw/beer_reviews.csv')

In [4]:
df_cleaned = df_raw_beer.copy()
df_cleaned.drop(['brewery_id',
                 'review_profilename', 
                 'review_time',
                 'beer_name',
                 'beer_beerid',
                 'review_overall', # temp exclude
                 'brewery_name' # not include in the model
                ], axis=1, inplace=True)
df_cleaned.head()

Unnamed: 0,review_aroma,review_appearance,beer_style,review_palate,review_taste,beer_abv
0,2.0,2.5,Hefeweizen,1.5,1.5,5.0
1,2.5,3.0,English Strong Ale,3.0,3.0,6.2
2,2.5,3.0,Foreign / Export Stout,3.0,3.0,6.5
3,3.0,3.5,German Pilsener,2.5,3.0,5.0
4,4.5,4.0,American Double / Imperial IPA,4.0,4.5,7.7


In [5]:
# Standardise the numeric features 


num_cols = ['review_aroma',
            'review_appearance',
            'review_palate',
            'review_taste',
            'beer_abv']
sc = StandardScaler()
df_cleaned[num_cols] = sc.fit_transform(df_cleaned[num_cols])

df_cleaned.head()


Unnamed: 0,review_aroma,review_appearance,beer_style,review_palate,review_taste,beer_abv
0,-2.487952,-2.177663,Hefeweizen,-3.288833,-3.132454,-0.879382
1,-1.771225,-1.366096,English Strong Ale,-1.090123,-1.083188,-0.362703
2,-1.771225,-1.366096,Foreign / Export Stout,-1.090123,-1.083188,-0.233533
3,-1.054499,-0.55453,German Pilsener,-1.823026,-1.083188,-0.879382
4,1.095679,0.257037,American Double / Imperial IPA,0.375684,0.966078,0.283146


In [6]:
# Create a numeric (integer) version of the target variable 

cats_dict = dict(enumerate(df_cleaned.beer_style.unique()))
le = LabelEncoder()
df_cleaned['beer_style'] = le.fit_transform(df_cleaned['beer_style'])
df_cleaned

Unnamed: 0,review_aroma,review_appearance,beer_style,review_palate,review_taste,beer_abv
0,-2.487952,-2.177663,65,-3.288833,-3.132454,-0.879382
1,-1.771225,-1.366096,51,-1.090123,-1.083188,-0.362703
2,-1.771225,-1.366096,59,-1.090123,-1.083188,-0.233533
3,-1.054499,-0.554530,61,-1.823026,-1.083188,-0.879382
4,1.095679,0.257037,9,0.375684,0.966078,0.283146
...,...,...,...,...,...,...
1586609,0.378953,-0.554530,85,0.375684,0.282989,-0.793269
1586610,1.812405,-2.177663,85,-2.555929,0.282989,-0.793269
1586611,-0.337773,-1.366096,85,-0.357219,0.282989,-0.793269
1586612,1.095679,1.068603,85,1.108588,0.966078,-0.793269


## Split the data

In [7]:
from src.data.sets import split_sets_random, save_sets

# Split the data into training and testing sets with 80-20 ratio
X_train, y_train, X_val, y_val, X_test, y_test = split_sets_random(df_cleaned,
                                                                   target_col='beer_style',
                                                                   test_ratio=0.2)


In [8]:
X_train

# Note: contains 5 features: 
            # 'review_aroma',
            # 'review_appearance',
            # 'review_palate',
            # 'review_taste',
            # 'beer_abv'

Unnamed: 0,review_aroma,review_appearance,review_palate,review_taste,beer_abv
1388520,-3.204678,-1.366096,-1.090123,-0.400099,-1.094665
574735,-1.054499,0.257037,-0.357219,-0.400099,0.627598
235720,-1.054499,-0.554530,0.375684,-0.400099,0.067863
1087888,-0.337773,0.257037,-1.823026,-0.400099,1.187334
282858,0.378953,-0.554530,-0.357219,-0.400099,-0.922439
...,...,...,...,...,...
912020,1.095679,0.257037,0.375684,0.282989,0.326202
1083305,-0.337773,0.257037,-0.357219,0.282989,-0.448816
460341,0.378953,0.257037,0.375684,0.966078,1.144277
626118,0.378953,-0.554530,-1.090123,0.282989,-0.018250


Note: contains 5 features: 
            'review_aroma',
            'review_appearance',
            'review_palate',
            'review_taste',
            'beer_abv'

## Baseline Model

In [9]:
# Import NullModel from src.models.null
from src.models.null import NullModel

baseline_model = NullModel(target_type='classification')
y_base = baseline_model.fit_predict(y_train)

  self.pred_value = mode(y)[0][0]


In [10]:
# Import print_class_perf from src.models.performance
from src.models.performance import print_class_perf

# Print the classification metrics for this baseline model
print_class_perf(y_base, y_train, set_name='Training', average='weighted')

Accuracy Training: 0.0742157299405022
F1 Training: 0.01025487603110527


## Architecture

In [11]:
# Convert all sets to PytorchDataset

from src.models.pytorch import PytorchDataset

train_dataset = PytorchDataset(X=X_train, y=y_train)
val_dataset = PytorchDataset(X=X_val, y=y_val)
test_dataset = PytorchDataset(X=X_test, y=y_test)

In [12]:
# Instantiate PytorchMultiClass with the correct number of input feature 
# and save it into a variable called model

from src.models.pytorch import PytorchMultiClass
model = PytorchMultiClass(X_train.shape[1])


# Set model to use the device available
from src.models.pytorch import get_device

device = get_device()
model.to(device)


PytorchMultiClass(
  (layer_1): Linear(in_features=5, out_features=5, bias=True)
  (layer_out): Linear(in_features=5, out_features=104, bias=True)
)

In [13]:
# Print the architecture of model
print(model)

PytorchMultiClass(
  (layer_1): Linear(in_features=5, out_features=5, bias=True)
  (layer_out): Linear(in_features=5, out_features=104, bias=True)
)


## Train
Neural Network Multi-Class Classification with Pytorch

**learning rate = 0.1** 

**batch size = 500,000**

In [14]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
N_EPOCHS = 5
BATCH_SIZE = 500000


from src.models.pytorch import train_classification, test_classification

for epoch in range(N_EPOCHS):
    train_loss, train_acc = train_classification(train_dataset,
                                                 model=model,
                                                 criterion=criterion,
                                                 optimizer=optimizer,
                                                 batch_size=BATCH_SIZE,
                                                 device=device
                                                )
    
    valid_loss, valid_acc = test_classification(val_dataset,
                                                model=model,
                                                criterion=criterion,
                                                batch_size=BATCH_SIZE,
                                                device=device
                                               )

    print(f'Epoch: {epoch}')
    print(f'\t(train)\t|\tLoss: {train_loss:.4f}\t|\tAcc: {train_acc * 100:.3f}%')
    print(f'\t(valid)\t|\tLoss: {valid_loss:.4f}\t|\tAcc: {valid_acc * 100:.3f}%')

Epoch: 0
	(train)	|	Loss: nan	|	Acc: 0.769%
	(valid)	|	Loss: nan	|	Acc: 0.514%
Epoch: 1
	(train)	|	Loss: nan	|	Acc: 0.482%
	(valid)	|	Loss: nan	|	Acc: 0.514%
Epoch: 2
	(train)	|	Loss: nan	|	Acc: 0.482%
	(valid)	|	Loss: nan	|	Acc: 0.514%
Epoch: 3
	(train)	|	Loss: nan	|	Acc: 0.482%
	(valid)	|	Loss: nan	|	Acc: 0.514%
Epoch: 4
	(train)	|	Loss: nan	|	Acc: 0.482%
	(valid)	|	Loss: nan	|	Acc: 0.514%


In [15]:
torch.save(model, "/wd/models/pytorch_beer_type_prediction.pt")

In [16]:
test_loss, test_acc = test_classification(test_dataset, model=model, criterion=criterion, batch_size=BATCH_SIZE, device=device)
print(f'\tLoss: {test_loss:.4f}\t|\tAccuracy: {test_acc:.4f}')


	Loss: nan	|	Accuracy: 0.0048


## Build Pipeline

In [17]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


In [18]:
num_transformer = Pipeline(
    steps=[
        ('scaler', StandardScaler())
    ]
)

In [29]:
cat_transformer = Pipeline(
    steps=[
        ('label_encoder', LabelEncoder())
    ]
)

In [33]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num_cols', num_transformer, num_cols),
        (['brewery_name'], cat_transformer, ['brewery_name']),
        (['beer_style'], cat_transformer, ['beer_style'])
    ]
)

**I dont know how to create a pipeline that can output the prediction of Pytorch multiclass**

In [32]:
nn_pipe = Pipeline(
    steps=[
        ('preprocessor', preprocessor),       
        ('nn_multiclass', train_classification(train_dataset,
                                                 model=model,
                                                 criterion=nn.CrossEntropyLoss(),
                                                 optimizer=torch.optim.Adam(model.parameters(), lr=0.1),
                                                 batch_size=500000,
                                                 device=torch.device('cpu')
                                                ))
    ]
)

In [34]:
nn_pipe.fit(X_train)

TypeError: Last step of Pipeline should implement fit or be the string 'passthrough'. '(nan, 0.004820540186224747)' (type <class 'tuple'>) doesn't

In [30]:
nn_pipe1 = Pipeline(
    steps=[
        ('preprocessor', preprocessor),       
        ('layer_1', nn.Linear(in_features=5, out_features=5, bias=True),
            'layer_out' ,  nn.Linear(in_features=5, out_features=104, bias=True)
        ),
    ]
)

In [31]:
nn_pipe1.predict(X_train)

AttributeError: 'Linear' object has no attribute 'predict'

In [None]:
obs = pd.DataFrame(X_train.iloc[0]).transpose()
nn_pipe.predict(obs)

In [38]:
from joblib import dump 

dump(nn_pipe,  '/wd/models/nn_pipeline.joblib')

['/wd/models/nn_pipeline.joblib']