In [428]:
import torch
import numpy as np
from torchtext import data

SEED = 1452 # for reproducibility
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

TEXT = data.Field(tokenize= 'spacy') # for CNN
# TEXT = data.Field(tokenize= 'spacy', include_lengths=True) # For RNN
LABEL = data.LabelField()


In [593]:
# Load dataset
import csv

STOI = {
    'country': 1, 
    'province': 6, 
    'taster_name': 9,
    'variety': 12,
}

CURRENT_LABEL = 'country'
MIN_SAMPLE_NUMBER = 150

value = STOI[CURRENT_LABEL]

with open('datasets/winemag-data-130k-v2.csv') as f:
    reader = csv.reader(f)
    lines_uncontrolled = []
    
    counts = {}

    for row in reader:
        if not row[value]:
            continue
        if CURRENT_LABEL == 'province':
            # Fix the issue where "Bordeaux" is also sometimes called "Burgundy"
            if province == "Burgundy":
                row[6] = "Bordeaux"
        if not row[value] in counts.keys():
            counts[row[value]] = 1
        else:
            counts[row[value]] += 1
        lines_uncontrolled.append(row)
        
lines = []

# Remove the rows where the label is too rare
for row in lines_uncontrolled:
    if counts[row[value]] >= MIN_SAMPLE_NUMBER:
        lines.append(row)
    
            
            
print("Removed " + str(len(lines_uncontrolled) - len(lines)) + " rows")

print(len(counts.keys()))
# print(counts)

print(len(lines_uncontrolled))    
print(len(lines))
print([k for k in counts.keys() if counts[k] >= MIN_SAMPLE_NUMBER])

Removed 1277 rows
44
129909
128632
['Chile', 'Austria', 'South Africa', 'Canada', 'US', 'Spain', 'Argentina', 'Italy', 'Germany', 'France', 'Australia', 'Israel', 'Greece', 'Portugal', 'New Zealand']


In [594]:
# Split in train and test

TEST_SET_SIZE = .3
VALIDATION_SET_SIZE = .2

indices = list(range(1, len(lines)))
np.random.seed(SEED)
np.random.shuffle(indices)

first_split_index = int(TEST_SET_SIZE * len(lines))
second_split_index = int((TEST_SET_SIZE+VALIDATION_SET_SIZE) * len(lines))

print(first_split_index)
print(second_split_index)

test_indices = indices[:first_split_index]
validation_indices = indices[first_split_index:second_split_index]
train_indices = indices[second_split_index:]

train_set = [lines[k] for k in train_indices]
test_set = [lines[k] for k in test_indices]
validation_set = [lines[k] for k in validation_indices]

print(len(train_set))
print(len(test_set))
print(len(validation_set))
print(train_set[0:3])

38589
64316
64315
38589
25727
[['93313', 'US', 'A pleasant sipper for drinking now, with citrus fruit, Asian-pear and peach flavors, accented by acidity. This 100% Sauvignon was unoaked.', 'Honker Blanc', '86', '15.0', 'California', 'Napa Valley', 'Napa', '', '', 'Tudal 2012 Honker Blanc White (Napa Valley)', 'White Blend', 'Tudal'], ['73193', 'US', 'Dusty, chalky tones accent fresh apple and pear flavors throughout this minerally off-dry wine. Honed and steely on the palate, it finishes briskly on a tart, lemony note. Refreshing and quaffable in style.', 'Red Oak Vineyard', '87', '20.0', 'New York', 'Finger Lakes', 'Finger Lakes', 'Anna Lee C. Iijima', '', 'Lamoreaux Landing 2012 Red Oak Vineyard Riesling (Finger Lakes)', 'Riesling', 'Lamoreaux Landing'], ['41887', 'Spain', 'Briny citrus and petrol aromas are unusual for Macabeo. This basic white tastes like lemon-infused water. Yeasty, leesy white-fruit flavors are low on steam. Drink Now.', 'Luzón Blanco', '85', '8.0', 'Levante', 'J

In [595]:
# Write split sets

with open('preprocessed_datasets/train.csv', 'w') as train_file:
    writer = csv.writer(train_file)
    writer.writerows(train_set)
    
with open('preprocessed_datasets/test.csv', 'w') as test_file:
    writer = csv.writer(test_file)
    writer.writerows(test_set)
with open('preprocessed_datasets/validation.csv', 'w') as validation_file:
    writer = csv.writer(validation_file)
    writer.writerows(validation_set)

In [598]:
# Build the dataset

# Put the thing we want to predict as a label
tv_datafields = [("id", None),
                 ("country", LABEL),
                 ("description", TEXT),
                 ("designation", None),
                 ("points", None),
                 ("price", None),
                 ("province", None),
                 ("region_1", None),
                 ("region_2", None),
                 ("taster_name", None),
                 ("taster_twitter_handle", None),
                 ("title", None),
                 ("variety", None),
                 ("winery", None)]

trn, vld, tst = data.TabularDataset.splits(path='preprocessed_datasets',
                                     format="csv",
                                     train= 'train.csv',
                                     validation='validation.csv',
                                     test='test.csv',
                                     fields=tv_datafields)

In [599]:
# Prepare the vocab (BEWARE: this also downloads the vectors, ~800MB)
MAX_VOCAB_SIZE = 25000
TEXT.build_vocab(trn,
                 max_size=MAX_VOCAB_SIZE,
                 vectors = "glove.6B.100d", 
                 unk_init = torch.Tensor.normal_)
LABEL.build_vocab(trn)

print(len(TEXT.vocab)) # 25002 (and not 25000) because of <pad> and <unk>
print(len(LABEL.vocab))

25002
15


In [600]:
print(LABEL.vocab.freqs.most_common(100))
# print(LABEL.vocab.stoi)

[('US', 27365), ('France', 10982), ('Italy', 9763), ('Spain', 3341), ('Portugal', 2866), ('Chile', 2219), ('Argentina', 1876), ('Austria', 1681), ('Australia', 1143), ('Germany', 1066), ('New Zealand', 735), ('South Africa', 671), ('Israel', 255), ('Greece', 226), ('Canada', 126)]


In [601]:
# Set up the iterators
BATCH_SIZE = 64

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # TODO: back to cpu


train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (trn, vld, tst), 
    batch_size = BATCH_SIZE,
#     sort_within_batch=True, # For RNN
    sort_key=lambda x: len(x.description), # Sort the examples so the ones with similar lengths are close to each other
    device = device)

In [602]:
import torch.nn as nn
import torch.nn.functional as F

class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, 
                 dropout, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.convs = nn.ModuleList([
                                    nn.Conv2d(in_channels = 1, 
                                              out_channels = n_filters, 
                                              kernel_size = (fs, embedding_dim)) 
                                    for fs in filter_sizes
                                    ])
        
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        #text = [sent len, batch size]

        text = text.permute(1, 0)        
        #text = [batch size, sent len]
        
        embedded = self.embedding(text)
        #embedded = [batch size, sent len, emb dim]
        
        embedded = embedded.unsqueeze(1)
        #embedded = [batch size, 1, sent len, emb dim]
        
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        #conv_n = [batch size, n_filters, sent len - filter_sizes[n]]
        
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        #pooled_n = [batch size, n_filters]
        
        cat = self.dropout(torch.cat(pooled, dim = 1))
        #cat = [batch size, n_filters * len(filter_sizes)]
            
        return self.fc(cat)

In [603]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
N_FILTERS = 100
FILTER_SIZES = [2,3,4]
OUTPUT_DIM = len(LABEL.vocab)
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)

In [604]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(count_parameters(model))

2595015


In [605]:
pretrained_embeddings = TEXT.vocab.vectors


model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.3460,  0.7065,  0.1639,  ..., -1.4077,  1.7792, -0.9527],
        [-0.9241, -1.4135, -0.8655,  ...,  0.0169, -0.8565, -0.1619],
        [-0.1077,  0.1105,  0.5981,  ..., -0.8316,  0.4529,  0.0826],
        ...,
        [-0.4288, -0.0500, -0.3499,  ..., -1.2627,  0.1444, -0.8879],
        [-1.0038,  0.6452, -0.3984,  ..., -0.6172, -0.0960,  0.2449],
        [ 0.3714, -1.2620, -0.1996,  ..., -0.2593,  1.2749,  1.0969]])

In [606]:
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

In [607]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())

criterion = nn.CrossEntropyLoss()

model = model.to(device)
criterion = criterion.to(device)

In [608]:
def categorical_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    max_preds = preds.argmax(dim = 1, keepdim = True) # get the index of the max probability
    correct = max_preds.squeeze(1).eq(y)
    return correct.sum() / torch.FloatTensor([y.shape[0]])

In [612]:
LABEL_NAME = 'country'

def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
#     print('start training')
    
    for batch in iterator:
#         print(epoch_loss)
        optimizer.zero_grad()
        
        predictions = model(batch.description)
        
        loss = criterion(predictions, getattr(batch, LABEL_NAME))
        
        acc = categorical_accuracy(predictions, getattr(batch, LABEL_NAME))
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.description)
            
            loss = criterion(predictions, getattr(batch, LABEL_NAME))
            
            acc = categorical_accuracy(predictions, getattr(batch, LABEL_NAME))

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)


In [613]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [614]:
N_EPOCHS = 10

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'wine-prediction-model.pt')
    
    print('Epoch: ' + str(epoch+1.02) + ' | Epoch Time: ' + str(epoch_mins) + 'm '+ str(epoch_secs) + 's')
    print('\tTrain Loss: ' + str(train_loss) + ' | Train Acc: ' + str(train_acc*100) + '%')
    print('\tVal. Loss: ' + str(valid_loss) + ' |  Val. Acc: ' + str(valid_acc*100) + '%')

Epoch: 1.02 | Epoch Time: 1m 23s
	Train Loss: 1.0312679250738515 | Train Acc: 68.61107597896708%
	Val. Loss: 0.6479807856367595 |  Val. Acc: 78.97497878145816%
Epoch: 2.02 | Epoch Time: 1m 43s
	Train Loss: 0.6399591669810946 | Train Acc: 79.38793848403057%
	Val. Loss: 0.5278927133376918 |  Val. Acc: 82.23280917056164%
Epoch: 3.02 | Epoch Time: 1m 50s
	Train Loss: 0.5153121326100174 | Train Acc: 82.79821760025784%
	Val. Loss: 0.4710242266011475 |  Val. Acc: 84.09052209474555%
Epoch: 4.02 | Epoch Time: 1m 47s
	Train Loss: 0.4371247465782498 | Train Acc: 84.90787587355618%
	Val. Loss: 0.45896594141104924 |  Val. Acc: 84.39764224771244%
Epoch: 5.02 | Epoch Time: 1m 47s
	Train Loss: 0.38324987962471313 | Train Acc: 86.60134180861327%
	Val. Loss: 0.43887938610949917 |  Val. Acc: 85.26816562337068%
Epoch: 6.02 | Epoch Time: 1m 49s
	Train Loss: 0.3407266934490322 | Train Acc: 87.95163483168949%
	Val. Loss: 0.45016990779940763 |  Val. Acc: 85.4316586730492%
Epoch: 7.02 | Epoch Time: 1m 45s
	Tra

In [615]:
model.load_state_dict(torch.load('wine-prediction-model.pt'))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print('Test Loss: ' + str(test_loss) + ' | Test Acc: '+ str(test_acc*100) + '%')

Test Loss: 0.4363667101384593 | Test Acc: 85.35604932612645%


In [491]:
import spacy
nlp = spacy.load('en_core_web_sm')

def predict_class(model, sentence, min_len = 4):
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    if len(tokenized) < min_len:
        tokenized += ['<pad>'] * (min_len - len(tokenized))
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)
    preds = model(tensor)
    max_preds = preds.argmax(dim = 1)
    return max_preds.item()

In [492]:
# Live testing: change the description to see how the model classifies it
description = "This cooperative, based in Aÿ, has benefited from the fine Pinot Noir in the village to produce a ripe red fruited wine. With balanced acidity and a soft aftertaste, it is ready to drink."
pred_class = predict_class(model, description)
print('Predicted class is: ' + str(pred_class) + ' = ' + str(LABEL.vocab.itos[pred_class]))

Predicted class is: 0 = Pinot Noir


In [591]:
# Check how the model performs on a batch
import csv

LIMIT = 1000
SHOW_ONLY_WRONG = True

with open('preprocessed_datasets/test.csv') as f:
    reader = csv.reader(f)

    i = 0
    for row in reader:
        if i > LIMIT:
            break
        sentence = row[2]
        real_value = row[6]
        pred_value = predict_class(model, sentence)
        if not SHOW_ONLY_WRONG or real_value != LABEL.vocab.itos[pred_value]:
            print(sentence)
            print("Actual: " + str(real_value) + ", predicted: " + str(LABEL.vocab.itos[pred_value]) + "\n")
        i += 1
    
# for i in range(30):
#     row = rows[i]
#     sentence = row[2]
#     real_value = row[-2]
#     pred_value = predict_class(model, sentence)
#     print(sentence)
#     print("Actual: " + str(real_value) + ", predicted: " + str(LABEL.vocab.itos[pred_value]) + "\n")


This is a very zesty Grüner where yeasty, earthy notes are completely cut through with the wonderful, bracing freshness of grapefruit peel. The more usual fruit of green pears lurks underneath but the overall picture is one of lip-smacking refreshment. Wonderfully clean and zingy finish.
Actual: Kamptal, predicted: Kremstal

Full-bodied and supple in texture, this plummy Shiraz offers plenty of enjoyment at a realistic price. Vanilla and brown sugar notes frame blueberries and spice, with just a hint of bitterness apparent on the finish. Drink now–2012.
Actual: South Australia, predicted: Australia Other

This flowery, perfumed wine is full of red currants and berries that balance a solid, dry element. With the potential of rounding out and becoming rich as well as structured, it should be cellared. Drink from 2017.
Actual: Southwest France, predicted: Loire Valley

Packed in an extremely heavy bottle, this has aromas of French oak, vanilla and coconut. The firm palate offers dried sou

Dry, earthy and leathery on the nose, with finely etched aromas of raspberries, plums and other red berries. This feels fresh and pure, with balance, weight, tannic grip and perfect acidity. It's bright, forward and juicy, with elevated flavors of plums, berries and peppery spices. A blend of 85% Cab Franc and 15% Cab Sauvignon.
Actual: Catalonia, predicted: Mendoza Province

Full of delicious dark-fruit flavors, this is smooth and deep, comforting but sophisticated. The fruit flavors are ripe and satisfying, the texture is velvety and broad, the body is full but not heavy.
Actual: California, predicted: Alsace

Subtle notes of both green, crisp Conference pear and ripe Delicious apples yield to some zesty grapefruit notes. There is structure and a resonant background of herbal spice, yeasty notes and fresh white pepper. The palate is streamlined and crisp yet cushioned. Ott once again shows his mastery of this variety—this time with all the freshness of a cooler vintage.
Actual: Niede

A earthy element creeps into the peach notes on the nose. The medium-sweet palate comes in with refreshing phenolic zestiness. The flavors are full and bright, with just a hint of spice. Drink soon.
Actual: Alsace, predicted: Kremstal

This is a boldly extracted and concentrated expression of Sangiovese (with 10% of the Bursona Longanesi grape) that delivers steady and bold oak aromas. The wood influences are almost too much but will surely integrate as the wine ages in the bottle. Beyond all that spice and toast is a solid core of black berry fruit and prune.
Actual: Central Italy, predicted: Tuscany

This ultrarich, opulent wine has such purity of fruit. With its pepper and spice flavors layered over mango and ripe nectarine, it is a straight line of pure flavor, rich and complex.
Actual: Alsace, predicted: Bordeaux

A lean, structured wine, with dense tannins, jammy fruit and dark berry flavors. The wood comes through strongly, offering high spice, and giving the wine a dry core.
Ac

The cuttings for this vineyard were taken from the Old Garden Vineyard, so despite it's relatively young age, it has pedigree. Sage, smoke and black olive notes mingle easily on the nose, backed by flavors of vanilla, black cherry and mocha. It's medium to full in body and creamy in texture, with a smooth, silky finish.
Actual: South Australia, predicted: California

The first whiff brings a noseful of sulfur, although most of it blows off on airing. The underlying flavors are lovely, though, suggesting citrus fruits and apple, supported by crisp acidity. Finishes clean and crisp. Would score higher except for the opening smell.
Actual: Oregon, predicted: Burgundy

Rich and concentrated for an entry-level wine, with earthy, bold aromas and a lush, well-balanced palate feel. Deep and sweet in terms of flavor, with a mix of ripe black fruits and herbs. Shows a bit of faux oak flavor on a mocha-tinged finish. Exemplary for this price range.
Actual: Central Valley, predicted: Mendoza Provi

Lush and forward, with complex sour-plum, espresso, cola, vanilla and fudge aromas. Round berry flavors, licorice, Asian spice and pepper accents mark the palate. Full, but not heavy, with a long, smooth plum and cocoa finish. This is a stylish and elegant black knight.
Actual: California, predicted: Mendoza Province

Lovely, dense aromas of dried flowers, herbs, cherries and chocolate immediately capture attention. This is a superbly elegant wine, with both precision and power. Orange-peel flavors highlight deep cherry compote, with tension, balance and length in abundance.
Actual: British Columbia, predicted: Washington

Aromas of dark-skinned berry, blue flowers and a whiff of baking spices lead the nose on this vibrant red. The juicy palate doles out black cherry, red raspberry, white pepper and star anise alongside vibrant acidity. It's made to be drunk young so enjoy soon.
Actual: Piedmont, predicted: Tuscany

Intense acidity, with the sweetness held in check by a bright, crisp c

Toasted oak, citrus and yellow apple aromas lead the nose. They follow through to the medium-bodied palate,. along with a hint of candied lemon. A bitter almond note marks the finish.
Actual: Lombardy, predicted: Northeastern Italy

Big and bold, this opens with aromas of mature dark-skinned berry, exotic spice, leather and resin. The concentrated full-bodied palate delivers black cherry steeped in spirits, raspberry compote, vanilla and nutmeg framed in velvety tannins. There is just enough fresh acidity lifts the finish.
Actual: Veneto, predicted: Piedmont

Opens with aromas of lettuce, underripe peach and mineral water. The palate is healthy and crisp, with lemony acids and fresh flavors of green fruits and lettuce leaf. Clean but lightly bitter.
Actual: Casablanca Valley, predicted: Mendoza Province

Tarry berry aromas come with notes of olive, rubber and eucalyptus. A tight, racy palate is driven by acidity, while this tastes of herbal plum and lightly salted tomato sauce. Woody, 

Cherry and plum aromas dominate the rather basic bouquet of this wine, which has a saucy, barbecued element to it. It shows tannic grip along with flavors of savory berry and barbeque sauce. The finish tastes spicy and roasted, with grip and push.
Actual: Colchagua Valley, predicted: Mendoza Province

Scents of waxy lemon peels and candles meld with delicate peach on this refreshing off-dry Riesling. On the palate, hints of lemon verbena and lime zest lend a herbal tone that highlights the wine's revitalizing lime-juice finish.
Actual: Mosel, predicted: New York

There seems to be a bit of residual sugar in this wine, just enough to offset the searing acidity. Some nice flower notes highlight the aromas, with flavors of tart peach and citrus filling in the rest.
Actual: Oregon, predicted: Washington

Easy, fresh and true to the variety, this expression offers blue flower and forest berry notes, backed by almond and pistachio accents. The finish is crisp, tight and lean.
Actual: Sicily 

In [None]:
# Same for RNN
def trainRNN(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for (index, batch) in enumerate(iterator):
        print(index/len(iterator))
        optimizer.zero_grad()
        
        text, text_lengths = batch.description
        
        predictions = model(text, text_lengths).squeeze(1)
        
        loss = criterion(predictions, batch.province)
        
        acc = categorical_accuracy(predictions, batch.province)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluateRNN(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            text, text_lengths = batch.description
            
            predictions = model(text, text_lengths).squeeze(1)
            
            loss = criterion(predictions, batch.province)
            
            acc = categorical_accuracy(predictions, batch.province)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

N_EPOCHS = 2
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = trainRNN(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluateRNN(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'wine-prediction-model.pt')
    
    print('Epoch: ' + str(epoch+1.02) + ' | Epoch Time: ' + str(epoch_mins) + 'm '+ str(epoch_secs) + 's')
    print('\tTrain Loss: ' + str(train_loss) + ' | Train Acc: ' + str(train_acc*100) + '%')
    print('\tVal. Loss: ' + str(valid_loss) + ' |  Val. Acc: ' + str(valid_acc*100) + '%')

In [381]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout, pad_idx):
        
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        self.rnn = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout)
        
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text, text_lengths):
        
        #text = [sent len, batch size]
        
        embedded = self.dropout(self.embedding(text))
        
        #embedded = [sent len, batch size, emb dim]
        
        #pack sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths)
        
        packed_output, (hidden, cell) = self.rnn(packed_embedded)
        
        #unpack sequence
        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)

        #output = [sent len, batch size, hid dim * num directions]
        #output over padding tokens are zero tensors
        
        #hidden = [num layers * num directions, batch size, hid dim]
        #cell = [num layers * num directions, batch size, hid dim]
        
        #concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden layers
        #and apply dropout
        
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
                
        #hidden = [batch size, hid dim * num directions]
            
        return self.fc(hidden.squeeze(0))

INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = len(LABEL.vocab)
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = RNN(INPUT_DIM, 
            EMBEDDING_DIM, 
            HIDDEN_DIM, 
            OUTPUT_DIM, 
            N_LAYERS, 
            BIDIRECTIONAL, 
            DROPOUT, 
            PAD_IDX)

In [None]:
# ========== ONLY NOTES BELOW ==============

In [None]:
import random
print(vars(full_dataset.examples[0]))
train_and_valid_data, test_data = full_dataset.split(random_state = random.seed(SEED))

train_data, valid_data = train_and_valid_data.split(random_state = random.seed(SEED))
print(len(train_data))
print(len(test_data))
print(len(valid_data))