# Beer Prediction Full Model Process

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, LabelEncoder

import torch
import torch.nn as nn
import torch.nn.functional as F

## Data Transformations (A)

In [2]:
#Solution
%load_ext autoreload
%autoreload 2

In [3]:
df_raw_beer = pd.read_csv('/wd/data/raw/beer_reviews.csv')

In [34]:
df_cleaned = df_raw_beer.copy()
df_cleaned.drop(['brewery_id',
                 'review_profilename', 
                 'review_time',
                 'beer_name',
                 'beer_beerid',
                 'review_overall' # temp exclude
                ], axis=1, inplace=True)
df_cleaned.head()

Unnamed: 0,brewery_name,review_aroma,review_appearance,beer_style,review_palate,review_taste,beer_abv
0,Vecchio Birraio,2.0,2.5,Hefeweizen,1.5,1.5,5.0
1,Vecchio Birraio,2.5,3.0,English Strong Ale,3.0,3.0,6.2
2,Vecchio Birraio,2.5,3.0,Foreign / Export Stout,3.0,3.0,6.5
3,Vecchio Birraio,3.0,3.5,German Pilsener,2.5,3.0,5.0
4,Caldera Brewing Company,4.5,4.0,American Double / Imperial IPA,4.0,4.5,7.7


In [37]:
# Create a numeric version of the categorical features 

le = LabelEncoder()

cats_dict = dict(enumerate(df_cleaned.brewery_name.unique()))
df_cleaned['brewery_name'] = le.fit_transform(df_cleaned['brewery_name'])

In [38]:
# Standardise the numeric features 

num_cols = ['brewery_name',
            'review_aroma',
            'review_appearance',
            'review_palate',
            'review_taste',
            'beer_abv']

sc = StandardScaler()
df_cleaned[num_cols] = sc.fit_transform(df_cleaned[num_cols])

In [39]:
# Create a numeric (integer) version of the target variable 

cats_dict = dict(enumerate(df_cleaned.beer_style.unique()))
le = LabelEncoder()
df_cleaned['beer_style'] = le.fit_transform(df_cleaned['beer_style'])

In [40]:
df_cleaned

Unnamed: 0,brewery_name,review_aroma,review_appearance,beer_style,review_palate,review_taste,beer_abv
0,1.464266,-2.487952,-2.177663,65,-3.288833,-3.132454,-0.879382
1,1.464266,-1.771225,-1.366096,51,-1.090123,-1.083188,-0.362703
2,1.464266,-1.771225,-1.366096,59,-1.090123,-1.083188,-0.233533
3,1.464266,-1.054499,-0.554530,61,-1.823026,-1.083188,-0.879382
4,-0.823942,1.095679,0.257037,9,0.375684,0.966078,0.283146
...,...,...,...,...,...,...,...
1586609,1.285049,0.378953,-0.554530,85,0.375684,0.282989,-0.793269
1586610,1.285049,1.812405,-2.177663,85,-2.555929,0.282989,-0.793269
1586611,1.285049,-0.337773,-1.366096,85,-0.357219,0.282989,-0.793269
1586612,1.285049,1.095679,1.068603,85,1.108588,0.966078,-0.793269


## Split the data (A)

In [9]:
from src.data.sets import split_sets_random, save_sets

# Split the data into training and testing sets with 80-20 ratio
X_train, y_train, X_val, y_val, X_test, y_test = split_sets_random(df_cleaned,
                                                                   target_col='beer_style',
                                                                   test_ratio=0.2)

In [10]:
save_sets(X_train=X_train,
          y_train=y_train,
          X_val=X_val,
          y_val=y_val,
          X_test=X_test,
          y_test=y_test,
          path='/wd/data/processed/beer_type/'
         )


In [11]:
# Convert all sets to PytorchDataset

from src.models.pytorch import PytorchDataset

train_dataset = PytorchDataset(X=X_train, y=y_train)
val_dataset = PytorchDataset(X=X_val, y=y_val)
test_dataset = PytorchDataset(X=X_test, y=y_test)

## Baseline Model (A)

In [12]:
# Import NullModel from src.models.null
from src.models.null import NullModel

baseline_model = NullModel(target_type='classification')
y_base = baseline_model.fit_predict(y_train)

  self.pred_value = mode(y)[0][0]


In [13]:
# Import print_class_perf from src.models.performance
from src.models.performance import print_class_perf

# Print the classification metrics for this baseline model
print_class_perf(y_base, y_train, set_name='Training', average='weighted')

Accuracy Training: 0.0742157299405022
F1 Training: 0.01025487603110527


## Load Stored Data (A)

In [14]:
from src.data.sets import load_sets
X_train, y_train, X_val, y_val, X_test, y_test = load_sets(path='../data/processed/beer_type/'
                                                          )
                                                           
from src.models.pytorch import PytorchDataset

train_dataset = PytorchDataset(X=X_train, y=y_train)
val_dataset = PytorchDataset(X=X_val, y=y_val)
test_dataset = PytorchDataset(X=X_test, y=y_test)

In [15]:
X_train

# Note: contains 6 features: 
            # 'brewery_name',
            # 'review_aroma',
            # 'review_appearance',
            # 'review_palate',
            # 'review_taste',
            # 'beer_abv'

array([[ 0.08775722, -3.20467771, -1.36609615, -1.09012252, -0.40009933,
        -1.09466486],
       [-0.05561711, -1.05449924,  0.25703654, -0.35721912, -0.40009933,
         0.62759846],
       [ 1.11334619, -1.05449924, -0.55452981,  0.37568428, -0.40009933,
         0.06786288],
       ...,
       [ 1.34054826,  0.37895308,  0.25703654,  0.37568428,  0.96607804,
         1.14427745],
       [ 0.56586437,  0.37895308, -0.55452981, -1.09012252,  0.28298936,
        -0.01825029],
       [ 0.40630261,  0.37895308,  1.06860288,  0.37568428, -0.40009933,
         0.41231554]])

nommesen_april-beer-## Training Experiments (A)

Note: contains 6 features: 
            'brewery_name',
            'review_aroma',
            'review_appearance',
            'review_palate',
            'review_taste',
            'beer_abv'

In [16]:
# Instantiate PytorchMultiClass with the correct number of input feature 
# and save it into a variable called model

from src.models.pytorch import PytorchMultiClass
model = PytorchMultiClass(X_train.shape[1])


# Set model to use the device available
from src.models.pytorch import get_device

device = get_device()
model.to(device)


PytorchMultiClass(
  (layer_1): Linear(in_features=6, out_features=50, bias=True)
  (layer_out): Linear(in_features=50, out_features=104, bias=True)
)

In [17]:
# Print the architecture of model
print(model)

PytorchMultiClass(
  (layer_1): Linear(in_features=6, out_features=50, bias=True)
  (layer_out): Linear(in_features=50, out_features=104, bias=True)
)


### Train (X)
Neural Network Multi-Class Classification with Pytorch

**learning rate = 0.1, 0.01, 0.001** 

**batch size = 500,000**

In [18]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
N_EPOCHS = 5
BATCH_SIZE = 500000


from src.models.pytorch import train_classification, test_classification

for epoch in range(N_EPOCHS):
    train_loss, train_acc = train_classification(train_dataset,
                                                 model=model,
                                                 criterion=criterion,
                                                 optimizer=optimizer,
                                                 batch_size=BATCH_SIZE,
                                                 device=device
                                                )
    
    valid_loss, valid_acc = test_classification(val_dataset,
                                                model=model,
                                                criterion=criterion,
                                                batch_size=BATCH_SIZE,
                                                device=device
                                               )

    print(f'Epoch: {epoch}')
    print(f'\t(train)\t|\tLoss: {train_loss:.4f}\t|\tAcc: {train_acc * 100:.3f}%')
    print(f'\t(valid)\t|\tLoss: {valid_loss:.4f}\t|\tAcc: {valid_acc * 100:.3f}%')

Epoch: 0
	(train)	|	Loss: nan	|	Acc: 0.671%
	(valid)	|	Loss: nan	|	Acc: 0.514%
Epoch: 1
	(train)	|	Loss: nan	|	Acc: 0.482%
	(valid)	|	Loss: nan	|	Acc: 0.514%
Epoch: 2
	(train)	|	Loss: nan	|	Acc: 0.482%
	(valid)	|	Loss: nan	|	Acc: 0.514%
Epoch: 3
	(train)	|	Loss: nan	|	Acc: 0.482%
	(valid)	|	Loss: nan	|	Acc: 0.514%
Epoch: 4
	(train)	|	Loss: nan	|	Acc: 0.482%
	(valid)	|	Loss: nan	|	Acc: 0.514%


In [59]:
torch.save(model, "/wd/models/pytorch_beer_type_prediction.pt")

In [20]:
test_loss, test_acc = test_classification(test_dataset, model=model, criterion=criterion, batch_size=BATCH_SIZE, device=device)
print(f'\tLoss: {test_loss:.4f}\t|\tAccuracy: {test_acc:.4f}')


	Loss: nan	|	Accuracy: 0.0048


In [21]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


In [22]:
num_transformer = Pipeline(
    steps=[
        ('scaler', StandardScaler())
    ]
)

In [23]:
cat_transformer = Pipeline(
    steps=[
        ('label_encoder', LabelEncoder())
    ]
)

In [43]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num_cols', num_transformer, num_cols),
        (['brewery_name'], cat_transformer, ['brewery_name']),
        (['beer_style'], cat_transformer, ['beer_style'])
    ]
)

In [51]:
nn_pipe = Pipeline(
    steps=[
        ('preprocessor', preprocessor),       
        ('nn_multiclass', train_classification(train_dataset,
                                                 model=model,
                                                 criterion=nn.CrossEntropyLoss(),
                                                 optimizer=torch.optim.Adam(model.parameters(), lr=0.1),
                                                 batch_size=500000,
                                                 device=torch.device('cpu')
                                                ))
    ]
)

In [52]:
nn_pipe.fit(X_train)

TypeError: Last step of Pipeline should implement fit or be the string 'passthrough'. 'PytorchMultiClass(
  (layer_1): Linear(in_features=6, out_features=50, bias=True)
  (layer_out): Linear(in_features=50, out_features=104, bias=True)
)' (type <class 'src.models.pytorch.PytorchMultiClass'>) doesn't

In [56]:
nn_pipe1 = Pipeline(
    steps=[
        ('preprocessor', preprocessor),       
        ('nn_multiclass', PytorchMultiClass(X_train.shape[1])),
    ]
)

In [57]:
nn_pipe1.predict(X_train)

AttributeError: 'PytorchMultiClass' object has no attribute 'predict'

In [None]:
obs = pd.DataFrame(X_train.iloc[0]).transpose()
nn_pipe.predict(obs)

In [58]:
from joblib import dump 

dump(nn_pipe,  '../models/nn_pipeline.joblib')

['../models/nn_pipeline.joblib']