In [150]:
import torch
import numpy as np
from torchtext import data

SEED = 1452 # for reproducibility
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

TEXT = data.Field(tokenize= 'spacy')
LABEL = data.LabelField()


In [204]:
# Load dataset
import csv

with open('datasets/winemag-data-130k-v2.csv') as f:
    reader = csv.reader(f)
    rows = []
    
    removed = 0

    for row in reader:
        variety = row[-2]
        if not variety:
            print(row)
            removed +=1
        else:
            rows.append(row)
            
            
print("Removed " + str(removed) + " rows")
    
    
print(len(lines))

['86909', 'Chile', "A chalky, dusty mouthfeel nicely balances this Petite Syrah's bright, full blackberry and blueberry fruit. Wheat-flour and black-pepper notes add interest to the bouquet; the wine finishes with herb and an acorny nuttiness. A good first Chilean wine for those more comfortable with the Californian style. It's got tannins to lose, but it's very good.", '', '88', '17.0', 'Maipo Valley', '', '', '', '', 'Carmen 1999  (Maipo Valley)', '', 'Carmen']
Removed 1 rows
129975


In [69]:
# Split in train and test

TEST_SET_SIZE = .3
VALIDATION_SET_SIZE = .2

indices = list(range(1, len(lines)))
np.random.seed(SEED)
np.random.shuffle(indices)

first_split_index = int(TEST_SET_SIZE * len(lines))
second_split_index = int((TEST_SET_SIZE+VALIDATION_SET_SIZE) * len(lines))

print(first_split_index)
print(second_split_index)

test_indices = indices[:first_split_index]
validation_indices = indices[first_split_index:second_split_index]
train_indices = indices[second_split_index:]

train_set = [lines[k] for k in train_indices]
test_set = [lines[k] for k in test_indices]
validation_set = [lines[k] for k in validation_indices]

print(len(train_set))
print(len(test_set))
print(len(validation_set))
print(train_set[0:3])

38992
64988
64987
38992
25996
['18579,US,"Flavors of candied lemon, lime and pineapple are brightened by crisp acidity in this unoaked  Sauvignon Blanc. The wine reflects this cool-climate Monterey appellation with its clean, brisk character.",,85,16.0,California,Arroyo Seco,Central Coast,,,Mercy 2010 Sauvignon Blanc (Arroyo Seco),Sauvignon Blanc,Mercy\n', '89421,US,"There\'s a lot of oak on this Chardonnay, to judge by the buttered toast and butterscotch richness. Underneath all that is a wine ripe in tropical fruits and green apples, brightened by excellent, mouthwatering acidity. The oak stands out now, but give the wine until 2015 or 2016 in the cellar to let the parts integrate.",Sierra Mar Vineyard,90,40.0,California,Santa Lucia Highlands,Central Coast,,,Loring Wine Company 2012 Sierra Mar Vineyard Chardonnay (Santa Lucia Highlands),Chardonnay,Loring Wine Company\n', '56113,US,"Almost mauve in color, this widely distributed wine (named after the time the winemaking crew pops open

In [70]:
# Write split sets
with open('preprocessed_datasets/train.csv', 'w') as train_file:
    train_file.write(''.join(train_set))
with open('preprocessed_datasets/test.csv', 'w') as test_file:
    test_file.write(''.join(test_set))
with open('preprocessed_datasets/validation.csv', 'w') as validation_file:
    validation_file.write(''.join(validation_set))

In [185]:
# Build the dataset

# Put the thing we want to predict as a label
tv_datafields = [("id", None),
                 ("country", LABEL),
                 ("description", TEXT),
                 ("designation", None),
                 ("points", None),
                 ("price", None),
                 ("province", None),
                 ("region_1", None),
                 ("region_2", None),
                 ("taster_name", None),
                 ("taster_twitter_handle", None),
                 ("title", None),
                 ("variety", None),
                 ("winery", None)]

trn, vld, tst = data.TabularDataset.splits(path='preprocessed_datasets',
                                     format="csv",
                                     train= 'train.csv',
                                     validation='validation.csv',
                                     test='test.csv',
                                     fields=tv_datafields)

In [152]:
# Prepare the vocab (BEWARE: this also downloads the vectors, ~800MB)
MAX_VOCAB_SIZE = 25000
TEXT.build_vocab(trn,
                 max_size=MAX_VOCAB_SIZE,
                 vectors = "glove.6B.100d", 
                 unk_init = torch.Tensor.normal_)
LABEL.build_vocab(trn)

print(len(TEXT.vocab))
# 25002 because of <pad> and <unk>
print(len(LABEL.vocab))

25002
47


In [153]:
# print(LABEL.vocab.freqs.most_common(10))
print(LABEL.vocab.stoi)

defaultdict(<function _default_unk_index at 0x7f85ef13fae8>, {'': 21, 'Chile': 5, 'China': 44, 'Serbia': 35, 'Austria': 7, 'South Africa': 10, 'Czech Republic': 32, 'Switzerland': 37, 'Canada': 14, 'US': 0, 'Greece': 13, 'Bosnia and Herzegovina': 38, 'Georgia': 22, 'Brazil': 27, 'Argentina': 6, 'Bulgaria': 16, 'Italy': 2, '90': 41, 'Bordeaux-style Red Blend': 43, 'Ukraine': 30, 'Peru': 31, 'Lebanon': 28, 'Mexico': 25, 'Macedonia': 34, 'Cyprus': 36, 'Romania': 17, 'Armenia': 42, 'Croatia': 24, 'Luxembourg': 39, 'India': 33, 'Germany': 9, 'Morocco': 29, 'England': 23, 'Australia': 8, 'Slovenia': 19, 'France': 1, 'Hungary': 15, 'Uruguay': 18, ' fine and extremely polished; hold for 10 years."': 40, 'Israel': 12, 'Egypt': 45, 'Turkey': 20, 'Spain': 3, 'Portugal': 4, 'Moldova': 26, 'New Zealand': 11, 'Slovakia': 46})


In [186]:
# Set up the iterators
BATCH_SIZE = 64

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (trn, vld, tst), 
    batch_size = BATCH_SIZE,
    sort_key=lambda x: len(x.description), # Sort the examples so the ones with similar lengths are close to each other
    device = device)

In [129]:
import torch.nn as nn
import torch.nn.functional as F

class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, 
                 dropout, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.convs = nn.ModuleList([
                                    nn.Conv2d(in_channels = 1, 
                                              out_channels = n_filters, 
                                              kernel_size = (fs, embedding_dim)) 
                                    for fs in filter_sizes
                                    ])
        
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        #text = [sent len, batch size]

        text = text.permute(1, 0)        
        #text = [batch size, sent len]
        
        embedded = self.embedding(text)
        #embedded = [batch size, sent len, emb dim]
        
        embedded = embedded.unsqueeze(1)
        #embedded = [batch size, 1, sent len, emb dim]
        
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        #conv_n = [batch size, n_filters, sent len - filter_sizes[n]]
        
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        #pooled_n = [batch size, n_filters]
        
        cat = self.dropout(torch.cat(pooled, dim = 1))
        #cat = [batch size, n_filters * len(filter_sizes)]
            
        return self.fc(cat)

In [155]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
N_FILTERS = 100
FILTER_SIZES = [2,3,4]
OUTPUT_DIM = len(LABEL.vocab)
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)

In [156]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(count_parameters(model))

2604647


In [157]:
pretrained_embeddings = TEXT.vocab.vectors


model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.5130,  1.2094,  0.6369,  ..., -0.4520, -0.4385,  0.0610],
        [ 0.2348,  0.8024,  2.4040,  ..., -0.0284,  0.4618, -1.6748],
        [-0.1077,  0.1105,  0.5981,  ..., -0.8316,  0.4529,  0.0826],
        ...,
        [-0.3868,  0.9669, -0.1847,  ...,  0.0047, -0.1571,  0.4996],
        [ 0.5564, -2.1407,  1.5627,  ...,  0.2384, -0.6747, -0.3413],
        [ 0.6508,  2.4764, -0.3841,  ..., -1.1424, -1.8912,  0.5773]])

In [158]:
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

In [159]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())

criterion = nn.CrossEntropyLoss()

model = model.to(device)
criterion = criterion.to(device)

In [160]:
def categorical_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    max_preds = preds.argmax(dim = 1, keepdim = True) # get the index of the max probability
    correct = max_preds.squeeze(1).eq(y)
    return correct.sum() / torch.FloatTensor([y.shape[0]])

In [180]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
#     print('start training')
    
    for batch in iterator:
#         print(epoch_loss)
        optimizer.zero_grad()
        
        predictions = model(batch.description)
        
        loss = criterion(predictions, batch.country)
        
        acc = categorical_accuracy(predictions, batch.country)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [194]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.description)
            
            loss = criterion(predictions, batch.country)
            
            acc = categorical_accuracy(predictions, batch.country)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)


In [168]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [181]:
N_EPOCHS = 2

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'wine-prediction-model.pt')
    
    print('Epoch: ' + str(epoch+1.02) + ' | Epoch Time: ' + str(epoch_mins) + 'm '+ str(epoch_secs) + 's')
    print('\tTrain Loss: ' + str(train_loss) + ' | Train Acc: ' + str(train_acc*100) + '%')
    print('\tVal. Loss: ' + str(valid_loss) + ' |  Val. Acc: ' + str(valid_acc*100) + '%')

Epoch: 1.02 | Epoch Time: 1m 28s
	Train Loss: 0.3627316837279698 | Train Acc: 87.80347768833317%
	 Val. Loss: 0.4833411257485207 |  Val. Acc: 84.78194103194103%
Epoch: 2.02 | Epoch Time: 1m 30s
	Train Loss: 0.31593928699506313 | Train Acc: 89.2988230006432%
	 Val. Loss: 0.507915745496164 |  Val. Acc: 84.59254709742872%


In [195]:
model.load_state_dict(torch.load('wine-prediction-model.pt'))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print('Test Loss: ' + str(test_loss) + ' | Test Acc: '+ str(test_acc*100) + '%')

Test Loss: 0.47960158129695984 | Test Acc: 85.12670765157606%


In [187]:
import spacy
nlp = spacy.load('en_core_web_sm')

def predict_class(model, sentence, min_len = 4):
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    if len(tokenized) < min_len:
        tokenized += ['<pad>'] * (min_len - len(tokenized))
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)
    preds = model(tensor)
    max_preds = preds.argmax(dim = 1)
    return max_preds.item()

In [190]:
# Live testing: change the description to see how the model classifies it
description = "This opens with aromas suggesting resin, overripe plum, raisin, menthol and a whiff of nail polish remover. The palate showstart cranberry, star anise and a hint of dark baking spice alongside assertive, close-grained tannins leave an astringent finish. You'll also notice the slight warmth of alcohol on the finish."
pred_class = predict_class(model, description)
print('Predicted class is: ' + str(pred_class) + ' = ' + str(LABEL.vocab.itos[pred_class]))

Predicted class is: 2 = Italy


In [202]:
# Check how the model performs on a batch
import csv

with open('preprocessed_datasets/test.csv') as f:
    reader = csv.reader(f)
    rows = []
    for row in reader:
        rows.append(row)

for i in range(10):
    row = rows[i]
    sentence = row[2]
    real_value = row[1]
    pred_value = predict_class(model, sentence)
    print(sentence)
    print("Actual: " + str(real_value) + ", predicted: " + str(LABEL.vocab.itos[pred_value]) + "\n")

Dark orange-pink in color, this meaty, substantial wine doesn't skimp on flavor or body. It presents a rewarding and intense celebration of raspberry and cherry that delights on the palate and will do well at the table, indoor or out.
Actual: US, predicted: US

This light and fruity wine offers fresh peach flavors, with a burst of lemon candy. It's off dry and finishes with a sugary kick.
Actual: US, predicted: US

Smooth, wood-polished wine, packed with a ripe, comfortable texture, very ripe red fruits, highlights of tannins. There are black figs, balanced with sweet acidity and fattened with some bacon flavors.
Actual: Portugal, predicted: Portugal

Produced by the team from classed growth Château Giscours, this is a successfully ripe wine with smoothly integrated tannins. Cushioned by the ripe fruit, the structure is concentrated and impressive. Drink this wine from 2022.
Actual: France, predicted: France

Bold and beautiful, this powerful Zin has a deep, dark color that's practical

In [None]:
# ========== ONLY NOTES BELOW ==============

In [None]:
import random
print(vars(full_dataset.examples[0]))
train_and_valid_data, test_data = full_dataset.split(random_state = random.seed(SEED))

train_data, valid_data = train_and_valid_data.split(random_state = random.seed(SEED))
print(len(train_data))
print(len(test_data))
print(len(valid_data))