# Beer Prediction Full Model Process

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, LabelEncoder

import torch
import torch.nn as nn
import torch.nn.functional as F

## Data Transformations (A)

In [2]:
#Solution
%load_ext autoreload
%autoreload 2

In [3]:
df_raw_beer = pd.read_csv('/wd/data/raw/beer_reviews.csv')

In [4]:
df_cleaned = df_raw_beer.copy()
df_cleaned.drop(['brewery_id',
                 'review_profilename', 
                 'review_time',
                 'beer_name',
                 'beer_beerid',
                 'review_overall' # temp exclude
                ], axis=1, inplace=True)
df_cleaned.head()

Unnamed: 0,brewery_name,review_aroma,review_appearance,beer_style,review_palate,review_taste,beer_abv
0,Vecchio Birraio,2.0,2.5,Hefeweizen,1.5,1.5,5.0
1,Vecchio Birraio,2.5,3.0,English Strong Ale,3.0,3.0,6.2
2,Vecchio Birraio,2.5,3.0,Foreign / Export Stout,3.0,3.0,6.5
3,Vecchio Birraio,3.0,3.5,German Pilsener,2.5,3.0,5.0
4,Caldera Brewing Company,4.5,4.0,American Double / Imperial IPA,4.0,4.5,7.7


In [30]:
# Create a numeric version of the categorical features 

le = LabelEncoder()

cats_dict = dict(enumerate(df_cleaned.brewery_name.unique()))
df_cleaned['brewery_name'] = le.fit_transform(df_cleaned['brewery_name'])

In [31]:
# Standardise the numeric features 

num_cols = ['brewery_name',
            'review_aroma',
            'review_appearance',
            'review_palate',
            'review_taste',
            'beer_abv']

sc = StandardScaler()
df_cleaned[num_cols] = sc.fit_transform(df_cleaned[num_cols])

In [32]:
# Create a numeric (integer) version of the target variable 

cats_dict = dict(enumerate(df_cleaned.beer_style.unique()))
le = LabelEncoder()
df_cleaned['beer_style'] = le.fit_transform(df_cleaned['beer_style'])

In [None]:
df_cleaned

## Split the data (A)

In [27]:
from src.data.sets import split_sets_random, save_sets

# Split the data into training and testing sets with 80-20 ratio
X_train, y_train, X_val, y_val, X_test, y_test = split_sets_random(df_cleaned,
                                                                   target_col='beer_style',
                                                                   test_ratio=0.2)

In [28]:
save_sets(X_train=X_train,
          y_train=y_train,
          X_val=X_val,
          y_val=y_val,
          X_test=X_test,
          y_test=y_test,
          path='/wd/data/processed/beer_type/'
         )


In [29]:
# Convert all sets to PytorchDataset

from src.models.pytorch import PytorchDataset

train_dataset = PytorchDataset(X=X_train, y=y_train)
val_dataset = PytorchDataset(X=X_val, y=y_val)
test_dataset = PytorchDataset(X=X_test, y=y_test)

## Baseline Model (A)

In [39]:
# Import NullModel from src.models.null
from src.models.null import NullModel

baseline_model = NullModel(target_type='classification')
y_base = baseline_model.fit_predict(y_train)

In [40]:
# Import print_class_perf from src.models.performance
from src.models.performance import print_class_perf

# Print the classification metrics for this baseline model
print_class_perf(y_base, y_train, set_name='Training', average='weighted')

Accuracy Training: 0.0742157299405022
F1 Training: 0.01025487603110527


## Load Stored Data (A)

In [42]:
from src.data.sets import load_sets
X_train, y_train, X_val, y_val, X_test, y_test = load_sets(path='../data/processed/beer_type/'
                                                          )
                                                           
from src.models.pytorch import PytorchDataset

train_dataset = PytorchDataset(X=X_train, y=y_train)
val_dataset = PytorchDataset(X=X_val, y=y_val)
test_dataset = PytorchDataset(X=X_test, y=y_test)

In [43]:
X_train

# Note: contains 6 features: 
            # 'brewery_name',
            # 'review_aroma',
            # 'review_appearance',
            # 'review_palate',
            # 'review_taste',
            # 'beer_abv'

array([[ 0.08775722, -3.20467771, -1.36609615, -1.09012252, -0.40009933,
        -1.09466486],
       [-0.05561711, -1.05449924,  0.25703654, -0.35721912, -0.40009933,
         0.62759846],
       [ 1.11334619, -1.05449924, -0.55452981,  0.37568428, -0.40009933,
         0.06786288],
       ...,
       [ 1.34054826,  0.37895308,  0.25703654,  0.37568428,  0.96607804,
         1.14427745],
       [ 0.56586437,  0.37895308, -0.55452981, -1.09012252,  0.28298936,
        -0.01825029],
       [ 0.40630261,  0.37895308,  1.06860288,  0.37568428, -0.40009933,
         0.41231554]])

## Training Experiments (A)

Note: contains 6 features: 
            'brewery_name',
            'review_aroma',
            'review_appearance',
            'review_palate',
            'review_taste',
            'beer_abv'

### Architecture (X)

In [33]:
# Instantiate PytorchMultiClass with the correct number of input feature 
# and save it into a variable called model

from src.models.pytorch import PytorchMultiClass
model = PytorchMultiClass(X_train.shape[1])


# Set model to use the device available
from src.models.pytorch import get_device

device = get_device()
model.to(device)


PytorchMultiClass(
  (layer_1): Linear(in_features=6, out_features=50, bias=True)
  (layer_out): Linear(in_features=50, out_features=104, bias=True)
)

In [34]:
# Print the architecture of model
print(model)

PytorchMultiClass(
  (layer_1): Linear(in_features=6, out_features=50, bias=True)
  (layer_out): Linear(in_features=50, out_features=104, bias=True)
)


### Train (X)
Neural Network Multi-Class Classification with Pytorch

**learning rate = 0.1, 0.01, 0.001** 

**batch size = 500,000**

In [36]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
N_EPOCHS = 5
BATCH_SIZE = 500000


from src.models.pytorch import train_classification, test_classification

for epoch in range(N_EPOCHS):
    train_loss, train_acc = train_classification(train_dataset,
                                                 model=model,
                                                 criterion=criterion,
                                                 optimizer=optimizer,
                                                 batch_size=BATCH_SIZE,
                                                 device=device
                                                )
    
    valid_loss, valid_acc = test_classification(val_dataset,
                                                model=model,
                                                criterion=criterion,
                                                batch_size=BATCH_SIZE,
                                                device=device
                                               )

    print(f'Epoch: {epoch}')
    print(f'\t(train)\t|\tLoss: {train_loss:.4f}\t|\tAcc: {train_acc * 100:.3f}%')
    print(f'\t(valid)\t|\tLoss: {valid_loss:.4f}\t|\tAcc: {valid_acc * 100:.3f}%')

Epoch: 0
	(train)	|	Loss: nan	|	Acc: 0.731%
	(valid)	|	Loss: nan	|	Acc: 0.514%
Epoch: 1
	(train)	|	Loss: nan	|	Acc: 0.482%
	(valid)	|	Loss: nan	|	Acc: 0.514%
Epoch: 2
	(train)	|	Loss: nan	|	Acc: 0.482%
	(valid)	|	Loss: nan	|	Acc: 0.514%
Epoch: 3
	(train)	|	Loss: nan	|	Acc: 0.482%
	(valid)	|	Loss: nan	|	Acc: 0.514%
Epoch: 4
	(train)	|	Loss: nan	|	Acc: 0.482%
	(valid)	|	Loss: nan	|	Acc: 0.514%


In [5]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
N_EPOCHS = 5
BATCH_SIZE = 500000


from src.models.pytorch import train_classification, test_classification

for epoch in range(N_EPOCHS):
    train_loss, train_acc = train_classification(train_dataset,
                                                 model=model,
                                                 criterion=criterion,
                                                 optimizer=optimizer,
                                                 batch_size=BATCH_SIZE,
                                                 device=device
                                                )
    
    valid_loss, valid_acc = test_classification(val_dataset,
                                                model=model,
                                                criterion=criterion,
                                                batch_size=BATCH_SIZE,
                                                device=device
                                               )

    print(f'Epoch: {epoch}')
    print(f'\t(train)\t|\tLoss: {train_loss:.4f}\t|\tAcc: {train_acc * 100:.3f}%')
    print(f'\t(valid)\t|\tLoss: {valid_loss:.4f}\t|\tAcc: {valid_acc * 100:.3f}%')

Epoch: 0
	(train)	|	Loss: nan	|	Acc: 0.691%
	(valid)	|	Loss: nan	|	Acc: 0.514%
Epoch: 1
	(train)	|	Loss: nan	|	Acc: 0.482%
	(valid)	|	Loss: nan	|	Acc: 0.514%
Epoch: 2
	(train)	|	Loss: nan	|	Acc: 0.482%
	(valid)	|	Loss: nan	|	Acc: 0.514%
Epoch: 3
	(train)	|	Loss: nan	|	Acc: 0.482%
	(valid)	|	Loss: nan	|	Acc: 0.514%
Epoch: 4
	(train)	|	Loss: nan	|	Acc: 0.482%
	(valid)	|	Loss: nan	|	Acc: 0.514%


In [5]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
N_EPOCHS = 5
BATCH_SIZE = 500000


from src.models.pytorch import train_classification, test_classification

for epoch in range(N_EPOCHS):
    train_loss, train_acc = train_classification(train_dataset,
                                                 model=model,
                                                 criterion=criterion,
                                                 optimizer=optimizer,
                                                 batch_size=BATCH_SIZE,
                                                 device=device
                                                )
    
    valid_loss, valid_acc = test_classification(val_dataset,
                                                model=model,
                                                criterion=criterion,
                                                batch_size=BATCH_SIZE,
                                                device=device
                                               )

    print(f'Epoch: {epoch}')
    print(f'\t(train)\t|\tLoss: {train_loss:.4f}\t|\tAcc: {train_acc * 100:.3f}%')
    print(f'\t(valid)\t|\tLoss: {valid_loss:.4f}\t|\tAcc: {valid_acc * 100:.3f}%')

Epoch: 0
	(train)	|	Loss: nan	|	Acc: 0.691%
	(valid)	|	Loss: nan	|	Acc: 0.514%
Epoch: 1
	(train)	|	Loss: nan	|	Acc: 0.482%
	(valid)	|	Loss: nan	|	Acc: 0.514%
Epoch: 2
	(train)	|	Loss: nan	|	Acc: 0.482%
	(valid)	|	Loss: nan	|	Acc: 0.514%
Epoch: 3
	(train)	|	Loss: nan	|	Acc: 0.482%
	(valid)	|	Loss: nan	|	Acc: 0.514%
Epoch: 4
	(train)	|	Loss: nan	|	Acc: 0.482%
	(valid)	|	Loss: nan	|	Acc: 0.514%


In [18]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
N_EPOCHS = 2
BATCH_SIZE = 500000


from src.models.pytorch import train_classification, test_classification

for epoch in range(N_EPOCHS):
    train_loss, train_acc = train_classification(train_dataset,
                                                 model=model,
                                                 criterion=criterion,
                                                 optimizer=optimizer,
                                                 batch_size=BATCH_SIZE,
                                                 device=device
                                                )
    
    valid_loss, valid_acc = test_classification(val_dataset,
                                                model=model,
                                                criterion=criterion,
                                                batch_size=BATCH_SIZE,
                                                device=device
                                               )

    print(f'Epoch: {epoch}')
    print(f'\t(train)\t|\tLoss: {train_loss:.4f}\t|\tAcc: {train_acc * 100:.3f}%')
    print(f'\t(valid)\t|\tLoss: {valid_loss:.4f}\t|\tAcc: {valid_acc * 100:.3f}%')

Epoch: 0
	(train)	|	Loss: nan	|	Acc: 0.482%
	(valid)	|	Loss: nan	|	Acc: 0.514%
Epoch: 1
	(train)	|	Loss: nan	|	Acc: 0.482%
	(valid)	|	Loss: nan	|	Acc: 0.514%


**learning rate = 0.1** 

**batch size = 800,000**

In [15]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
N_EPOCHS = 2
BATCH_SIZE = 800000


from src.models.pytorch import train_classification, test_classification

for epoch in range(N_EPOCHS):
    train_loss, train_acc = train_classification(train_dataset,
                                                 model=model,
                                                 criterion=criterion,
                                                 optimizer=optimizer,
                                                 batch_size=BATCH_SIZE,
                                                 device=device
                                                )
    
    valid_loss, valid_acc = test_classification(val_dataset,
                                                model=model,
                                                criterion=criterion,
                                                batch_size=BATCH_SIZE,
                                                device=device
                                               )

    print(f'Epoch: {epoch}')
    print(f'\t(train)\t|\tLoss: {train_loss:.4f}\t|\tAcc: {train_acc * 100:.3f}%')
    print(f'\t(valid)\t|\tLoss: {valid_loss:.4f}\t|\tAcc: {valid_acc * 100:.3f}%')

Epoch: 0
	(train)	|	Loss: nan	|	Acc: 1.097%
	(valid)	|	Loss: nan	|	Acc: 0.514%
Epoch: 1
	(train)	|	Loss: nan	|	Acc: 0.482%
	(valid)	|	Loss: nan	|	Acc: 0.514%


**learning rate = 0.1** 

**batch size = 700,000**

In [17]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
N_EPOCHS = 2
BATCH_SIZE = 700000


from src.models.pytorch import train_classification, test_classification

for epoch in range(N_EPOCHS):
    train_loss, train_acc = train_classification(train_dataset,
                                                 model=model,
                                                 criterion=criterion,
                                                 optimizer=optimizer,
                                                 batch_size=BATCH_SIZE,
                                                 device=device
                                                )
    
    valid_loss, valid_acc = test_classification(val_dataset,
                                                model=model,
                                                criterion=criterion,
                                                batch_size=BATCH_SIZE,
                                                device=device
                                               )

    print(f'Epoch: {epoch}')
    print(f'\t(train)\t|\tLoss: {train_loss:.4f}\t|\tAcc: {train_acc * 100:.3f}%')
    print(f'\t(valid)\t|\tLoss: {valid_loss:.4f}\t|\tAcc: {valid_acc * 100:.3f}%')

Epoch: 0
	(train)	|	Loss: nan	|	Acc: 0.482%
	(valid)	|	Loss: nan	|	Acc: 0.514%
Epoch: 1
	(train)	|	Loss: nan	|	Acc: 0.482%
	(valid)	|	Loss: nan	|	Acc: 0.514%


**learning rate = 0.1, 0.01, 0.001** 

**batch size = 50,000**

In [7]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
N_EPOCHS = 2
BATCH_SIZE = 50000


from src.models.pytorch import train_classification, test_classification

for epoch in range(N_EPOCHS):
    train_loss, train_acc = train_classification(train_dataset,
                                                 model=model,
                                                 criterion=criterion,
                                                 optimizer=optimizer,
                                                 batch_size=BATCH_SIZE,
                                                 device=device
                                                )
    
    valid_loss, valid_acc = test_classification(val_dataset,
                                                model=model,
                                                criterion=criterion,
                                                batch_size=BATCH_SIZE,
                                                device=device
                                               )

    print(f'Epoch: {epoch}')
    print(f'\t(train)\t|\tLoss: {train_loss:.4f}\t|\tAcc: {train_acc * 100:.4f}%')
    print(f'\t(valid)\t|\tLoss: {valid_loss:.4f}\t|\tAcc: {valid_acc * 100:.4f}%')

Epoch: 0
	(train)	|	Loss: nan	|	Acc: 0.4970%
	(valid)	|	Loss: nan	|	Acc: 0.5140%
Epoch: 1
	(train)	|	Loss: nan	|	Acc: 0.4821%
	(valid)	|	Loss: nan	|	Acc: 0.5140%


In [9]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
N_EPOCHS = 5
BATCH_SIZE = 50000


from src.models.pytorch import train_classification, test_classification

for epoch in range(N_EPOCHS):
    train_loss, train_acc = train_classification(train_dataset,
                                                 model=model,
                                                 criterion=criterion,
                                                 optimizer=optimizer,
                                                 batch_size=BATCH_SIZE,
                                                 device=device
                                                )
    
    valid_loss, valid_acc = test_classification(val_dataset,
                                                model=model,
                                                criterion=criterion,
                                                batch_size=BATCH_SIZE,
                                                device=device
                                               )

    print(f'Epoch: {epoch}')
    print(f'\t(train)\t|\tLoss: {train_loss:.4f}\t|\tAcc: {train_acc * 100:.4f}%')
    print(f'\t(valid)\t|\tLoss: {valid_loss:.4f}\t|\tAcc: {valid_acc * 100:.4f}%')

Epoch: 0
	(train)	|	Loss: nan	|	Acc: 0.4821%
	(valid)	|	Loss: nan	|	Acc: 0.5140%
Epoch: 1
	(train)	|	Loss: nan	|	Acc: 0.4821%
	(valid)	|	Loss: nan	|	Acc: 0.5140%
Epoch: 2
	(train)	|	Loss: nan	|	Acc: 0.4821%
	(valid)	|	Loss: nan	|	Acc: 0.5140%
Epoch: 3
	(train)	|	Loss: nan	|	Acc: 0.4821%
	(valid)	|	Loss: nan	|	Acc: 0.5140%
Epoch: 4
	(train)	|	Loss: nan	|	Acc: 0.4821%
	(valid)	|	Loss: nan	|	Acc: 0.5140%


In [45]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
N_EPOCHS = 3
BATCH_SIZE = 50000


from src.models.pytorch import train_classification, test_classification

for epoch in range(N_EPOCHS):
    train_loss, train_acc = train_classification(train_dataset,
                                                 model=model,
                                                 criterion=criterion,
                                                 optimizer=optimizer,
                                                 batch_size=BATCH_SIZE,
                                                 device=device
                                                )
    
    valid_loss, valid_acc = test_classification(val_dataset,
                                                model=model,
                                                criterion=criterion,
                                                batch_size=BATCH_SIZE,
                                                device=device
                                               )

    print(f'Epoch: {epoch}')
    print(f'\t(train)\t|\tLoss: {train_loss:.4f}\t|\tAcc: {train_acc * 100:.4f}%')
    print(f'\t(valid)\t|\tLoss: {valid_loss:.4f}\t|\tAcc: {valid_acc * 100:.4f}%')

Epoch: 0
	(train)	|	Loss: nan	|	Acc: 0.4821%
	(valid)	|	Loss: nan	|	Acc: 0.5140%
Epoch: 1
	(train)	|	Loss: nan	|	Acc: 0.4821%
	(valid)	|	Loss: nan	|	Acc: 0.5140%
Epoch: 2
	(train)	|	Loss: nan	|	Acc: 0.4821%
	(valid)	|	Loss: nan	|	Acc: 0.5140%


**learning rate = 0.1** 

**batch size = 50**

In [11]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
N_EPOCHS = 3
BATCH_SIZE = 50


from src.models.pytorch import train_classification, test_classification

for epoch in range(N_EPOCHS):
    train_loss, train_acc = train_classification(train_dataset,
                                                 model=model,
                                                 criterion=criterion,
                                                 optimizer=optimizer,
                                                 batch_size=BATCH_SIZE,
                                                 device=device
                                                )
    
    valid_loss, valid_acc = test_classification(val_dataset,
                                                model=model,
                                                criterion=criterion,
                                                batch_size=BATCH_SIZE,
                                                device=device
                                               )

    print(f'Epoch: {epoch}')
    print(f'\t(train)\t|\tLoss: {train_loss:.4f}\t|\tAcc: {train_acc * 100:.4f}%')
    print(f'\t(valid)\t|\tLoss: {valid_loss:.4f}\t|\tAcc: {valid_acc * 100:.4f}%')

Epoch: 0
	(train)	|	Loss: nan	|	Acc: 0.4821%
	(valid)	|	Loss: nan	|	Acc: 0.5140%
Epoch: 1
	(train)	|	Loss: nan	|	Acc: 0.4821%
	(valid)	|	Loss: nan	|	Acc: 0.5140%
Epoch: 2
	(train)	|	Loss: nan	|	Acc: 0.4821%
	(valid)	|	Loss: nan	|	Acc: 0.5140%


## Training Experiments (B)

Note: contains 6 features: 
            'brewery_name',
            'review_aroma',
            'review_appearance',
            'review_palate',
            'review_taste',
            'beer_abv'

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [3]:
from src.data.sets import load_sets
X_train, y_train, X_val, y_val, X_test, y_test = load_sets(path='../data/processed/beer_type/'
                                                          )
                                            
from src.models.pytorch import PytorchDataset

train_dataset = PytorchDataset(X=X_train, y=y_train)
val_dataset = PytorchDataset(X=X_val, y=y_val)
test_dataset = PytorchDataset(X=X_test, y=y_test)                                                           
                                                           

### Architecture (Y)

In [4]:
# Instantiate PytorchMultiClass with the correct number of input feature 
# and save it into a variable called model

from src.models.pytorch import PytorchMultiClass
model = PytorchMultiClass(X_train.shape[1])


# Set model to use the device available
from src.models.pytorch import get_device

device = get_device()
model.to(device)

PytorchMultiClass(
  (layer_1): Linear(in_features=6, out_features=500, bias=True)
  (layer_out): Linear(in_features=500, out_features=104, bias=True)
)

**learning rate = 0.1** 

**batch size = 500000**

In [None]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
N_EPOCHS = 5
BATCH_SIZE = 500000


from src.models.pytorch import train_classification, test_classification

for epoch in range(N_EPOCHS):
    train_loss, train_acc = train_classification(train_dataset,
                                                 model=model,
                                                 criterion=criterion,
                                                 optimizer=optimizer,
                                                 batch_size=BATCH_SIZE,
                                                 device=device
                                                )
    
    valid_loss, valid_acc = test_classification(val_dataset,
                                                model=model,
                                                criterion=criterion,
                                                batch_size=BATCH_SIZE,
                                                device=device
                                               )

    print(f'Epoch: {epoch}')
    print(f'\t(train)\t|\tLoss: {train_loss:.4f}\t|\tAcc: {train_acc * 100:.3f}%')
    print(f'\t(valid)\t|\tLoss: {valid_loss:.4f}\t|\tAcc: {valid_acc * 100:.3f}%')

## Training Experiments (C)

Note: contains **5 features**: 
            'review_aroma',
            'review_appearance',
            'review_palate',
            'review_taste',
            'beer_abv'

#### New Data Transformations

In [5]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, LabelEncoder

In [6]:
df_raw_beer = pd.read_csv('/wd/data/raw/beer_reviews.csv')

In [7]:
df_cleaned = df_raw_beer.copy()
df_cleaned.drop(['brewery_id',
                 'review_profilename', 
                 'review_time',
                 'beer_name',
                 'beer_beerid',
                 'review_overall', # temp exclude
                 'brewery_name' # not include in the model
                ], axis=1, inplace=True)
df_cleaned.head()

Unnamed: 0,review_aroma,review_appearance,beer_style,review_palate,review_taste,beer_abv
0,2.0,2.5,Hefeweizen,1.5,1.5,5.0
1,2.5,3.0,English Strong Ale,3.0,3.0,6.2
2,2.5,3.0,Foreign / Export Stout,3.0,3.0,6.5
3,3.0,3.5,German Pilsener,2.5,3.0,5.0
4,4.5,4.0,American Double / Imperial IPA,4.0,4.5,7.7


In [8]:
# Standardise the numeric features 


num_cols = ['review_aroma',
            'review_appearance',
            'review_palate',
            'review_taste',
            'beer_abv']
sc = StandardScaler()
df_cleaned[num_cols] = sc.fit_transform(df_cleaned[num_cols])

df_cleaned.head()


Unnamed: 0,review_aroma,review_appearance,beer_style,review_palate,review_taste,beer_abv
0,-2.487952,-2.177663,Hefeweizen,-3.288833,-3.132454,-0.879382
1,-1.771225,-1.366096,English Strong Ale,-1.090123,-1.083188,-0.362703
2,-1.771225,-1.366096,Foreign / Export Stout,-1.090123,-1.083188,-0.233533
3,-1.054499,-0.55453,German Pilsener,-1.823026,-1.083188,-0.879382
4,1.095679,0.257037,American Double / Imperial IPA,0.375684,0.966078,0.283146


In [9]:
# Create a numeric (integer) version of the target variable 

cats_dict = dict(enumerate(df_cleaned.beer_style.unique()))
le = LabelEncoder()
df_cleaned['beer_style'] = le.fit_transform(df_cleaned['beer_style'])
df_cleaned

Unnamed: 0,review_aroma,review_appearance,beer_style,review_palate,review_taste,beer_abv
0,-2.487952,-2.177663,65,-3.288833,-3.132454,-0.879382
1,-1.771225,-1.366096,51,-1.090123,-1.083188,-0.362703
2,-1.771225,-1.366096,59,-1.090123,-1.083188,-0.233533
3,-1.054499,-0.554530,61,-1.823026,-1.083188,-0.879382
4,1.095679,0.257037,9,0.375684,0.966078,0.283146
...,...,...,...,...,...,...
1586609,0.378953,-0.554530,85,0.375684,0.282989,-0.793269
1586610,1.812405,-2.177663,85,-2.555929,0.282989,-0.793269
1586611,-0.337773,-1.366096,85,-0.357219,0.282989,-0.793269
1586612,1.095679,1.068603,85,1.108588,0.966078,-0.793269


#### Split the data

In [10]:
from src.data.sets import split_sets_random, save_sets

# Split the data into training and testing sets with 80-20 ratio
X_train, y_train, X_val, y_val, X_test, y_test = split_sets_random(df_cleaned,
                                                                   target_col='beer_style',
                                                                   test_ratio=0.2)


In [11]:
save_sets(X_train=X_train,
          y_train=y_train,
          X_val=X_val,
          y_val=y_val,
          X_test=X_test,
          y_test=y_test,
          path='/wd/data/processed/beer_type/'
         )



In [12]:
from src.data.sets import load_sets
X_train, y_train, X_val, y_val, X_test, y_test = load_sets(path='../data/processed/beer_type/'
                                                          )
                                                           
from src.models.pytorch import PytorchDataset

train_dataset = PytorchDataset(X=X_train, y=y_train)
val_dataset = PytorchDataset(X=X_val, y=y_val)
test_dataset = PytorchDataset(X=X_test, y=y_test)

In [13]:
X_train

array([[-3.20467771, -1.36609615, -1.09012252, -0.40009933, -1.09466486],
       [-1.05449924,  0.25703654, -0.35721912, -0.40009933,  0.62759846],
       [-1.05449924, -0.55452981,  0.37568428, -0.40009933,  0.06786288],
       ...,
       [ 0.37895308,  0.25703654,  0.37568428,  0.96607804,  1.14427745],
       [ 0.37895308, -0.55452981, -1.09012252,  0.28298936, -0.01825029],
       [ 0.37895308,  1.06860288,  0.37568428, -0.40009933,  0.41231554]])

#### Baseline Model

In [20]:
# Import NullModel from src.models.null
from src.models.null import NullModel

baseline_model = NullModel(target_type='classification')
y_base = baseline_model.fit_predict(y_train)

  self.pred_value = mode(y)[0][0]


In [21]:
# Import print_class_perf from src.models.performance
from src.models.performance import print_class_perf

# Print the classification metrics for this baseline model
print_class_perf(y_base, y_train, set_name='Training', average='weighted')

Accuracy Training: 0.0742157299405022
F1 Training: 0.01025487603110527


### Architecture (Z)

In [14]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

In [32]:
# Instantiate PytorchMultiClass with the correct number of input feature 
# and save it into a variable called model

from src.models.pytorch import PytorchMultiClass
model = PytorchMultiClass(X_train.shape[1])


# Set model to use the device available
from src.models.pytorch import get_device

device = get_device()
model.to(device)


PytorchMultiClass(
  (layer_1): Linear(in_features=5, out_features=500, bias=True)
  (layer_out): Linear(in_features=500, out_features=104, bias=True)
)

In [5]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
N_EPOCHS = 5
BATCH_SIZE = 500000


from src.models.pytorch import train_classification, test_classificatiocn

for epoch in range(N_EPOCHS):
    train_loss, train_acc = train_classification(train_dataset,
                                                model=model,
                                                 criterion=criterion,
                                                 optimizer=optimizer,
                                                 batch_size=BATCH_SIZE,
                                                 device=device
                                                )
    
    valid_loss, valid_acc = test_classification(val_dataset,
                                                model=model,
                                                criterion=criterion,
                                                batch_size=BATCH_SIZE,
                                                device=device
                                               )

    print(f'Epoch: {epoch}')
    print(f'\t(train)\t|\tLoss: {train_loss:.4f}\t|\tAcc: {train_acc * 100:.3f}%')
    print(f'\t(valid)\t|\tLoss: {valid_loss:.4f}\t|\tAcc: {valid_acc * 100:.3f}%')

Epoch: 0
	(train)	|	Loss: nan	|	Acc: 0.834%
	(valid)	|	Loss: nan	|	Acc: 0.514%
Epoch: 1
	(train)	|	Loss: nan	|	Acc: 0.482%
	(valid)	|	Loss: nan	|	Acc: 0.514%
Epoch: 2
	(train)	|	Loss: nan	|	Acc: 0.482%
	(valid)	|	Loss: nan	|	Acc: 0.514%
Epoch: 3
	(train)	|	Loss: nan	|	Acc: 0.482%
	(valid)	|	Loss: nan	|	Acc: 0.514%
Epoch: 4
	(train)	|	Loss: nan	|	Acc: 0.482%
	(valid)	|	Loss: nan	|	Acc: 0.514%


### Architecture (W)

In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
from src.data.sets import load_sets
X_train, y_train, X_val, y_val, X_test, y_test = load_sets(path='../data/processed/beer_type/'
                                                          )
                                                           
from src.models.pytorch import PytorchDataset

train_dataset = PytorchDataset(X=X_train, y=y_train)
val_dataset = PytorchDataset(X=X_val, y=y_val)
test_dataset = PytorchDataset(X=X_test, y=y_test)

In [3]:
# Instantiate PytorchMultiClass with the correct number of input feature 
# and save it into a variable called model

from src.models.pytorch import PytorchMultiClass
model = PytorchMultiClass(X_train.shape[1])


# Set model to use the device available
from src.models.pytorch import get_device

device = get_device()
model.to(device)


PytorchMultiClass(
  (layer_1): Linear(in_features=5, out_features=50, bias=True)
  (layer_out): Linear(in_features=50, out_features=104, bias=True)
)

In [7]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
N_EPOCHS = 5
BATCH_SIZE = 500000


from src.models.pytorch import train_classification, test_classification

for epoch in range(N_EPOCHS):
    train_loss, train_acc = train_classification(train_dataset,
                                                model=model,
                                                 criterion=criterion,
                                                 optimizer=optimizer,
                                                 batch_size=BATCH_SIZE,
                                                 device=device
                                                )
    
    valid_loss, valid_acc = test_classification(val_dataset,
                                                model=model,
                                                criterion=criterion,
                                                batch_size=BATCH_SIZE,
                                                device=device
                                               )

    print(f'Epoch: {epoch}')
    print(f'\t(train)\t|\tLoss: {train_loss:.4f}\t|\tAcc: {train_acc * 100:.3f}%')
    print(f'\t(valid)\t|\tLoss: {valid_loss:.4f}\t|\tAcc: {valid_acc * 100:.3f}%')

Epoch: 0
	(train)	|	Loss: nan	|	Acc: 1.009%
	(valid)	|	Loss: nan	|	Acc: 0.514%
Epoch: 1
	(train)	|	Loss: nan	|	Acc: 0.482%
	(valid)	|	Loss: nan	|	Acc: 0.514%
Epoch: 2
	(train)	|	Loss: nan	|	Acc: 0.482%
	(valid)	|	Loss: nan	|	Acc: 0.514%
Epoch: 3
	(train)	|	Loss: nan	|	Acc: 0.482%
	(valid)	|	Loss: nan	|	Acc: 0.514%
Epoch: 4
	(train)	|	Loss: nan	|	Acc: 0.482%
	(valid)	|	Loss: nan	|	Acc: 0.514%


In [9]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
N_EPOCHS = 2
BATCH_SIZE = 100000


from src.models.pytorch import train_classification, test_classification

for epoch in range(N_EPOCHS):
    train_loss, train_acc = train_classification(train_dataset,
                                                model=model,
                                                 criterion=criterion,
                                                 optimizer=optimizer,
                                                 batch_size=BATCH_SIZE,
                                                 device=device
                                                )
    
    valid_loss, valid_acc = test_classification(val_dataset,
                                                model=model,
                                                criterion=criterion,
                                                batch_size=BATCH_SIZE,
                                                device=device
                                               )

    print(f'Epoch: {epoch}')
    print(f'\t(train)\t|\tLoss: {train_loss:.4f}\t|\tAcc: {train_acc * 100:.3f}%')
    print(f'\t(valid)\t|\tLoss: {valid_loss:.4f}\t|\tAcc: {valid_acc * 100:.3f}%')

Epoch: 0
	(train)	|	Loss: nan	|	Acc: 0.482%
	(valid)	|	Loss: nan	|	Acc: 0.514%
Epoch: 1
	(train)	|	Loss: nan	|	Acc: 0.482%
	(valid)	|	Loss: nan	|	Acc: 0.514%


In [10]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
N_EPOCHS = 2
BATCH_SIZE = 50000


from src.models.pytorch import train_classification, test_classification

for epoch in range(N_EPOCHS):
    train_loss, train_acc = train_classification(train_dataset,
                                                model=model,
                                                 criterion=criterion,
                                                 optimizer=optimizer,
                                                 batch_size=BATCH_SIZE,
                                                 device=device
                                                )
    
    valid_loss, valid_acc = test_classification(val_dataset,
                                                model=model,
                                                criterion=criterion,
                                                batch_size=BATCH_SIZE,
                                                device=device
                                               )

    print(f'Epoch: {epoch}')
    print(f'\t(train)\t|\tLoss: {train_loss:.4f}\t|\tAcc: {train_acc * 100:.3f}%')
    print(f'\t(valid)\t|\tLoss: {valid_loss:.4f}\t|\tAcc: {valid_acc * 100:.3f}%')

Epoch: 0
	(train)	|	Loss: nan	|	Acc: 0.482%
	(valid)	|	Loss: nan	|	Acc: 0.514%
Epoch: 1
	(train)	|	Loss: nan	|	Acc: 0.482%
	(valid)	|	Loss: nan	|	Acc: 0.514%


In [19]:
# Instantiate PytorchMultiClass with the correct number of input feature 
# and save it into a variable called model

from src.models.pytorch import PytorchMultiClass
model = PytorchMultiClass(X_train.shape[1])


# Set model to use the device available
from src.models.pytorch import get_device

device = get_device()
model.to(device)


PytorchMultiClass(
  (layer_1): Linear(in_features=5, out_features=500, bias=True)
  (layer_out): Linear(in_features=500, out_features=104, bias=True)
)

In [4]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
N_EPOCHS = 2
BATCH_SIZE = 50000


from src.models.pytorch import train_classification, test_classification

for epoch in range(N_EPOCHS):
    train_loss, train_acc = train_classification(train_dataset,
                                                model=model,
                                                 criterion=criterion,
                                                 optimizer=optimizer,
                                                 batch_size=BATCH_SIZE,
                                                 device=device
                                                )
    
    valid_loss, valid_acc = test_classification(val_dataset,
                                                model=model,
                                                criterion=criterion,
                                                batch_size=BATCH_SIZE,
                                                device=device
                                               )

    print(f'Epoch: {epoch}')
    print(f'\t(train)\t|\tLoss: {train_loss:.4f}\t|\tAcc: {train_acc * 100:.3f}%')
    print(f'\t(valid)\t|\tLoss: {valid_loss:.4f}\t|\tAcc: {valid_acc * 100:.3f}%')

Epoch: 0
	(train)	|	Loss: nan	|	Acc: 0.503%
	(valid)	|	Loss: nan	|	Acc: 0.514%
Epoch: 1
	(train)	|	Loss: nan	|	Acc: 0.482%
	(valid)	|	Loss: nan	|	Acc: 0.514%


In [None]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
N_EPOCHS = 2
BATCH_SIZE = 500000


from src.models.pytorch import train_classification, test_classification

for epoch in range(N_EPOCHS):
    train_loss, train_acc = train_classification(train_dataset,
                                                model=model,
                                                 criterion=criterion,
                                                 optimizer=optimizer,
                                                 batch_size=BATCH_SIZE,
                                                 device=device
                                                )
    
    valid_loss, valid_acc = test_classification(val_dataset,
                                                model=model,
                                                criterion=criterion,
                                                batch_size=BATCH_SIZE,
                                                device=device
                                               )

    print(f'Epoch: {epoch}')
    print(f'\t(train)\t|\tLoss: {train_loss:.4f}\t|\tAcc: {train_acc * 100:.3f}%')
    print(f'\t(valid)\t|\tLoss: {valid_loss:.4f}\t|\tAcc: {valid_acc * 100:.3f}%')

### Architecture (V)

In [15]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

In [16]:
from src.data.sets import load_sets
X_train, y_train, X_val, y_val, X_test, y_test = load_sets(path='../data/processed/beer_type/'
                                                          )
                                                           
from src.models.pytorch import PytorchDataset

train_dataset = PytorchDataset(X=X_train, y=y_train)
val_dataset = PytorchDataset(X=X_val, y=y_val)
test_dataset = PytorchDataset(X=X_test, y=y_test)

In [17]:
# Instantiate PytorchMultiClass with the correct number of input feature 
# and save it into a variable called model

from src.models.pytorch import PytorchMultiClass
model = PytorchMultiClass(X_train.shape[1])


# Set model to use the device available
from src.models.pytorch import get_device

device = get_device()
model.to(device)


PytorchMultiClass(
  (layer_1): Linear(in_features=5, out_features=5, bias=True)
  (layer_out): Linear(in_features=5, out_features=104, bias=True)
)

In [18]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
N_EPOCHS = 5
BATCH_SIZE = 500000


from src.models.pytorch import train_classification, test_classification

for epoch in range(N_EPOCHS):
    train_loss, train_acc = train_classification(train_dataset,
                                                model=model,
                                                 criterion=criterion,
                                                 optimizer=optimizer,
                                                 batch_size=BATCH_SIZE,
                                                 device=device
                                                )
    
    valid_loss, valid_acc = test_classification(val_dataset,
                                                model=model,
                                                criterion=criterion,
                                                batch_size=BATCH_SIZE,
                                                device=device
                                               )

    print(f'Epoch: {epoch}')
    print(f'\t(train)\t|\tLoss: {train_loss:.4f}\t|\tAcc: {train_acc * 100:.3f}%')
    print(f'\t(valid)\t|\tLoss: {valid_loss:.4f}\t|\tAcc: {valid_acc * 100:.3f}%')

Epoch: 0
	(train)	|	Loss: nan	|	Acc: 0.780%
	(valid)	|	Loss: nan	|	Acc: 0.514%
Epoch: 1
	(train)	|	Loss: nan	|	Acc: 0.482%
	(valid)	|	Loss: nan	|	Acc: 0.514%
Epoch: 2
	(train)	|	Loss: nan	|	Acc: 0.482%
	(valid)	|	Loss: nan	|	Acc: 0.514%
Epoch: 3
	(train)	|	Loss: nan	|	Acc: 0.482%
	(valid)	|	Loss: nan	|	Acc: 0.514%
Epoch: 4
	(train)	|	Loss: nan	|	Acc: 0.482%
	(valid)	|	Loss: nan	|	Acc: 0.514%
