# 🚀 GPT

In this notebook, we'll walk through the steps required to train your own GPT model on the wine review dataset

The code is adapted from the excellent [GPT tutorial](https://keras.io/examples/generative/text_generation_with_miniature_gpt/) created by Apoorv Nandan available on the Keras website.

In [1]:
%load_ext autoreload
%autoreload 2
import re
import math
import json
import string
import numpy as np
from typing import List, Union
from collections import Counter
from IPython.display import display, HTML

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

## 0. Parameters <a name="parameters"></a>

In [2]:
VOCAB_SIZE = 10000
MAX_LEN = 80
EMBEDDING_DIM = 256
KEY_DIM = 256
N_HEADS = 2
N_BLOCKS = 1
FEED_FORWARD_DIM = 256
VALIDATION_SPLIT = 0.2
SEED = 42
TRAIN_MODEL = True
LOAD_MODEL = False
BATCH_SIZE = 32
EPOCHS = 5

NUM_WORKERS = 24
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## 1. Load and explore the data <a name="load"></a>

In [3]:
# Load the full dataset
with open("../../../../data/wine-reviews/winemag-data-130k-v2.json") as json_data:
    wine_data = json.load(json_data)

In [4]:
wine_data[10]

{'points': '87',
 'title': 'Kirkland Signature 2011 Mountain Cuvée Cabernet Sauvignon (Napa Valley)',
 'description': 'Soft, supple plum envelopes an oaky structure in this Cabernet, supported by 15% Merlot. Coffee and chocolate complete the picture, finishing strong at the end, resulting in a value-priced wine of attractive flavor and immediate accessibility.',
 'taster_name': 'Virginie Boone',
 'taster_twitter_handle': '@vboone',
 'price': 19,
 'designation': 'Mountain Cuvée',
 'variety': 'Cabernet Sauvignon',
 'region_1': 'Napa Valley',
 'region_2': 'Napa',
 'province': 'California',
 'country': 'US',
 'winery': 'Kirkland Signature'}

In [5]:
class WineReviewsTextDataset(Dataset):
    def __init__(self, json_file_path):
        
        self.wine_reviews = None
        self.n_wine_reviews = 0

        def filter_and_format(data):
            return [
                "wine review : "
                + x["country"]
                + " : "
                + x["province"]
                + " : "
                + x["variety"]
                + " : "
                + x["description"]
                for x in data
                if x["country"] is not None
                and x["province"] is not None
                and x["variety"] is not None
                and x["description"] is not None
            ]
        
        # Load the full dataset
        with open(json_file_path) as json_data:
            self.wine_reviews = json.load(json_data)
            self.wine_reviews = filter_and_format(self.wine_reviews)
            self.n_wine_reviews = len(self.wine_reviews)
    

    def __len__(self):
        return self.n_wine_reviews
    
    def __getitem__(self, index):
        return self.wine_reviews[index]

In [6]:
text_ds = WineReviewsTextDataset("../../../../data/wine-reviews/winemag-data-130k-v2.json")

In [7]:
# Count the recipes
n_wines = len(text_ds)
print(f"{n_wines} recipes loaded")

129907 recipes loaded


In [8]:
example = text_ds[25]
print(example)

wine review : US : California : Pinot Noir : Oak and earth intermingle around robust aromas of wet forest floor in this vineyard-designated Pinot that hails from a high-elevation site. Small in production, it offers intense, full-bodied raspberry and blackberry steeped in smoky spice and smooth texture.


## 2. Tokenize the data <a name="tokenize"></a>

In [9]:
class SimpleTokenizer:
    def __init__(self, max_size_vocab: int, max_len_seq: int):
        self.max_len_seq = max_len_seq
        self.max_size_vocab = max_size_vocab

        self.PAD = "<PAD>"
        self.UNKNOWN = "<UNK>"

        self.vocab = []
        self.id_to_token = {}
        self.token_to_id = {self.PAD: 0, self.UNKNOWN: 1}

    def tokenize(self, text: str) -> List[str]:
        text = text.lower()
        text = re.sub(f"([{string.punctuation}, '\n'])", r" \1 ", text)
        text = re.sub(" +", " ", text)
        return text.split()
    
    def adapt(self, texts: List[str]):
        counter = Counter()
        for text in texts:
            tokens = self.tokenize(text)
            counter.update(tokens)
        
        most_common = counter.most_common(self.max_size_vocab - len(self.token_to_id))
        for idx, (word, _) in enumerate(most_common, start=len(self.token_to_id)):
            self.token_to_id[word] = idx

        for (token, id) in self.token_to_id.items():
            self.id_to_token[id] = token

        self.vocab = list(self.token_to_id.keys())
    
    def _encode_one(self, text: str, pad) -> List[int]:
        tokens = self.tokenize(text)
        ids = [self.token_to_id.get(token, self.token_to_id[self.UNKNOWN]) for token in tokens]
        if len(ids) < self.max_len_seq:
            if pad:
                ids += [self.token_to_id[self.PAD]] * (self.max_len_seq - len(ids))
        else:
            ids = ids[:self.max_len_seq]
        
        return ids
    
    def encode(self, texts: Union[str, List[str]], pad=True) -> Union[List[int], List[List[int]]]:
        if isinstance(texts, str):
            return self._encode_one(texts, pad)
        elif isinstance(texts, list) and all(isinstance(t, str) for t in texts):
            return [self._encode_one(text, pad) for text in texts]
        else:
            raise TypeError("Input must be either a str or a List[str].")
    
    def _decode_one(self, text_ids: List[int]) -> str:
        return ' '.join([self.id_to_token.get(token_id, self.UNKNOWN) for token_id in text_ids])
    
    def decode(self, texts_ids: Union[List[int], List[List[int]]]) -> Union[str, List[str]]:
        def is_list_of_ints(l):
            return isinstance(l, list) and all(isinstance(x, int) for x in l)
        
        if is_list_of_ints(texts_ids):
            return self._decode_one(texts_ids)
        elif isinstance(texts_ids, list) and all(is_list_of_ints(x) for x in texts_ids):
            return [self._decode_one(text_ids) for text_ids in texts_ids]
        else:
            raise ValueError("Input must be either a List[int] or a List[List[int]].")

In [10]:
tokenizer = SimpleTokenizer(max_size_vocab=VOCAB_SIZE, max_len_seq=MAX_LEN)
tokenizer.adapt([s for s in text_ds])

In [11]:
tokenizer.vocab[:10]

['<PAD>', '<UNK>', ':', ',', '.', 'and', 'the', 'wine', 'a', 'of']

In [12]:
# Display some token:word mappings
from itertools import islice
for token, id in islice(tokenizer.token_to_id.items(), 10):
    print(f"{id} : {token}")

0 : <PAD>
1 : <UNK>
2 : :
3 : ,
4 : .
5 : and
6 : the
7 : wine
8 : a
9 : of


In [13]:
# Display the same example converted to ints
example_tokenised = tokenizer.encode(example)
print(np.array(example_tokenised))

[   7   10    2   20    2   29    2   43   62    2   55    5  243 4118
  453  634   26    9  497  498  666   17   12  142   14 2210   43   25
 2481   32    8  223   14 2211  948    4  594   17  987    3   15   75
  237    3   64   14   82   97    5   74 2631   17  198   49    5  125
   77    4    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0]


## 3. Create the Training Set <a name="create"></a>

In [14]:
class WineReviewsDataset(Dataset):

    def __init__(self, text_ds, tokenizer, transform=None, target_transform=None):
        self.transform = transform
        self.target_transform = target_transform
        self.tokenizer = tokenizer
        self.tokenizer.adapt([s for s in text_ds])
        self.enc_ds = [self.tokenizer.encode(t) for t in text_ds]
    
    def __len__(self):
        return len(self.enc_ds)
    
    def __getitem__(self, index):
        x = self.enc_ds[index][:-1]
        y = self.enc_ds[index][1:]
        if self.transform:
            x = self.transform(x)
        if self.target_transform:
            y = self.target_transform(y)
        return x, y

In [15]:
class ToTensor:
    def __call__(self, x):
        return torch.tensor(x)

In [16]:
train_ds = WineReviewsDataset(text_ds, tokenizer, transform=ToTensor(), target_transform=ToTensor())

train_ds, test_ds = torch.utils.data.random_split(train_ds, [0.9, 0.1])
train_ds, valid_ds = torch.utils.data.random_split(train_ds, [(1 - VALIDATION_SPLIT), VALIDATION_SPLIT])

In [17]:
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS, pin_memory=True, pin_memory_device=str(DEVICE))
valid_loader = DataLoader(valid_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True, pin_memory_device=str(DEVICE))
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True, pin_memory_device=str(DEVICE))

In [18]:
example_input_output = train_ds[0]

In [19]:
# Example Input
print(np.array(example_input_output[0]))

[   7   10    2   85    2  208   85    2 1471  542 1704    2   74    5
  194   26   39  634    5  726   46 1704    4   17    6  133    3 1040
    3   12   13 1151  137    5  111 1699    4   78   59   16   39   71
 1078    3   80    6   31  245    9   74    5  107   33  210  137   51
 3427    4   35  121  672  296   12  113  852  122    8  860    7    4
    0    0    0    0    0    0    0    0    0]


In [20]:
# Example Output (shifted by one token)
print(np.array(example_input_output[1]))

[  10    2   85    2  208   85    2 1471  542 1704    2   74    5  194
   26   39  634    5  726   46 1704    4   17    6  133    3 1040    3
   12   13 1151  137    5  111 1699    4   78   59   16   39   71 1078
    3   80    6   31  245    9   74    5  107   33  210  137   51 3427
    4   35  121  672  296   12  113  852  122    8  860    7    4    0
    0    0    0    0    0    0    0    0    0]


## 5. Create the causal attention mask function <a name="causal"></a>

In [21]:
def attn_mask(len_q, len_k, device):
    return torch.triu(torch.ones(len_q, len_k, device=device), diagonal=1).bool()

## 6. Create a Transformer Block layer <a name="transformer"></a>

In [22]:
class TransformerDecoderBlock(nn.Module):
    def __init__(self, n_heads, d_key, d_embed, d_ff, rate_dropout = 0.1):
        super().__init__()

        self.attn = nn.MultiheadAttention(embed_dim=d_embed, num_heads=n_heads, dropout=rate_dropout, kdim=d_key, batch_first=True)
        self.ln1 = nn.LayerNorm(d_embed)
        self.ff = nn.Sequential(
            nn.Linear(d_embed, d_ff),
            nn.ReLU(),
            nn.Linear(d_ff, d_embed),
        )
        self.ln2 = nn.LayerNorm(d_embed)
        self.dropout = nn.Dropout(rate_dropout)
    
    def forward(self, x):
        len_seq = x.shape[1]
        attn_out, attn_out_wts = self.attn(x, x, x, attn_mask=attn_mask(len_seq, len_seq, device=x.device), is_causal=True)
        x = self.ln1(x + self.dropout(attn_out))
        ff_out = self.ff(x)
        x = self.ln2(x + self.dropout(ff_out))
        return x, attn_out_wts

## 7. Positional Embedding <a name="embedder"></a>

In [23]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_embed, max_len_seq=MAX_LEN):
        super().__init__()
        pe = torch.zeros(max_len_seq, d_embed)  # [max_len_seq, d_embed]
        position = torch.arange(0, max_len_seq, dtype=torch.float).unsqueeze(1)  # [max_len_seq, 1]
        div_term = torch.exp(torch.arange(0, d_embed, 2).float() * (-math.log(10000.0) / d_embed))  # [d_embed/2]

        pe[:, 0::2] = torch.sin(position * div_term)  # even dims
        pe[:, 1::2] = torch.cos(position * div_term)  # odd dims
        pe = pe.unsqueeze(0)  # [1, max_len_seq, d_embed]
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

## 8. Build the Transformer model <a name="transformer_decoder"></a>

In [24]:
class DecoderOnlyTransformer(nn.Module):
    def __init__(self, size_vocab, d_embed, n_heads, n_blocks, d_ff, max_len_seq):
        super().__init__()
        self.n_blocks = n_blocks
        self.embed = nn.Embedding(size_vocab, d_embed)
        self.pos_enc = PositionalEncoding(d_embed=d_embed, max_len_seq=max_len_seq)
        self.blocks = nn.ModuleList([
            TransformerDecoderBlock(n_heads=n_heads, d_key=d_embed, d_embed=d_embed, d_ff=d_ff) 
            for _ in range(n_blocks)
        ])
        self.head = nn.Linear(d_embed, size_vocab)
    
    def forward(self, x):

        # len_seq = x.size(1)
        attn_out_wts = None

        x = self.embed(x)
        x = self.pos_enc(x)
        for block in self.blocks:
            x, attn_out_wts = block(x)
        logits = self.head(x)
        
        return logits, attn_out_wts


In [25]:
gpt = DecoderOnlyTransformer(
    size_vocab=VOCAB_SIZE, 
    d_embed=EMBEDDING_DIM, 
    n_heads=N_HEADS, 
    n_blocks=N_BLOCKS, 
    d_ff=FEED_FORWARD_DIM, 
    max_len_seq=MAX_LEN
)

In [26]:
if LOAD_MODEL:
    gpt.load_state_dict(torch.load('./models/gpt.pth'))
    gpt.to(DEVICE)

## 9. Train the Transformer <a name="train"></a>

In [27]:
# Create a TextGenerator
class TextGenerator():
    def __init__(self, model, tokenizer, max_tokens=MAX_LEN, top_k=10):
        self.model = model
        self.tokenizer = tokenizer
        self.max_tokens = max_tokens
        self.id_to_token = tokenizer.id_to_token
        self.token_to_id = tokenizer.token_to_id

    def sample_from(self, probs, temperature):
        probs = probs ** (1 / temperature)
        probs = probs / np.sum(probs)
        return np.random.choice(len(probs), p=probs), probs

    def generate(self, prompt, max_tokens, temperature, return_info=False):
        generated_ids = self.tokenizer.encode(prompt, pad = False)
        generated_text = prompt
        sample_token = None
        info = []
        device = next(self.model.parameters()).device
        while len(generated_ids) < max_tokens and sample_token != tokenizer.PAD:
            logits, att = self.model(torch.tensor(generated_ids).unsqueeze(0).to(device))
            probs = F.softmax(logits, dim=-1)
            sample_token, probs = self.sample_from(probs[0][-1].detach().cpu().numpy(), temperature)
            if return_info == True:
                info.append(
                    {
                        "prompt": generated_text,
                        "word_probs": probs,
                        "atts": att[0, -1, :],
                    }
                )
            generated_ids.append(sample_token)
            generated_text = self.tokenizer.decode(generated_ids)
        # print(f"\ngenerated text:\n{generated_text}\n")
        if return_info:
            return generated_text, info
        else:
            return generated_text

    def on_epoch_end(self, epoch, logs=None):
        print(f'Generated text: "{self.generate("wine review", max_tokens=self.max_tokens, temperature=1.0)}"\n')

In [28]:
text_generator = TextGenerator(model=gpt, tokenizer=tokenizer, max_tokens=MAX_LEN)

In [29]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(gpt.parameters(), lr=0.001)

In [30]:
# Learning Loop

if TRAIN_MODEL:
    gpt.to(DEVICE)
    gpt.train()
    for epoch in range(EPOCHS):
        total_loss = 0.0
        for input_ids, target_ids in train_loader:
            input_ids = input_ids.to(DEVICE)
            target_ids = target_ids.to(DEVICE)

            optimizer.zero_grad()

            logits, _ = gpt(input_ids)          # logits: [Batch, Seq, Vocab]
            
            logits = logits.permute(0, 2, 1)    # logits: [Batch, Vocab, Seq] ==> For CrossEntropyLoss (refer to multidimensional cases)
            loss = criterion(logits, target_ids)

            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {total_loss / len(train_loader):.4f}")
        text_generator.on_epoch_end(epoch)

Epoch 1/5, Loss: 2.5905
Generated text: "wine review : us : washington : merlot : this falls too often <UNK> - vineyard , attractively takes nearly aromas of purple fruit flavors . taste floor , the a compact effort are balanced mix of barrel notes work with power and bark flavors . it drinks for chewy , it ' s all varietal and perfectly quaffable in burgenland . <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>"

Epoch 2/5, Loss: 2.2150
Generated text: "wine review : chile : cachapoal valley : cabernet sauvignon : leathery , jumpy and lactic smelling , with sweet , spicy aromas . feels round , ripe and feels round , loose , like all brunello , while acids rises up thick , and rubbery aromas and feels smooth , with flavors of blackberry and coconut . finishes definitely could benefit from a torrontés . <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>"

Epoch 3/5, Loss: 2.1095
Generated text: "w

In [31]:
# Save the final model
torch.save(gpt.state_dict(), './models/gpt.pth')

# 3. Generate text using the Transformer

In [32]:
def print_probs(info, vocab, top_k=5):
    for i in info:
        highlighted_text = []
        for word, att_score in zip(
            # i["prompt"].split(), np.mean(i["atts"], axis=0)
            i["prompt"].split(), i["atts"]
        ):
            highlighted_text.append(
                '<span style="background-color:rgba(135,206,250,'
                # + str(att_score / max(np.mean(i["atts"], axis=0)))
                + str(att_score / max(i["atts"]))
                + ');">'
                + word
                + "</span>"
            )
        highlighted_text = " ".join(highlighted_text)
        display(HTML(highlighted_text))

        word_probs = i["word_probs"]
        p_sorted = np.sort(word_probs)[::-1][:top_k]
        i_sorted = np.argsort(word_probs)[::-1][:top_k]
        for p, i in zip(p_sorted, i_sorted):
            print(f"{vocab[i]}:   \t{np.round(100*p,2)}%")
        print("--------\n")

In [33]:
gen_text, info = text_generator.generate("wine review : us", max_tokens=80, temperature=1.0, return_info=True)

In [34]:
gen_text, info = text_generator.generate("wine review : italy", max_tokens=80, temperature=0.5, return_info=True)

In [35]:
gen_text, info = text_generator.generate("wine review : germany", max_tokens=80, temperature=0.5, return_info=True)
print_probs(info, tokenizer.vocab)

::   	100.0%
zealand:   	0.0%
-:   	0.0%
in:   	0.0%
<UNK>:   	0.0%
--------



mosel:   	96.91%
rheinhessen:   	1.74%
rheingau:   	0.92%
nahe:   	0.26%
pfalz:   	0.08%
--------



::   	99.87%
-:   	0.13%
<UNK>:   	0.0%
grosso:   	0.0%
blend:   	0.0%
--------



riesling:   	99.99%
pinot:   	0.0%
gewürztraminer:   	0.0%
chardonnay:   	0.0%
rosé:   	0.0%
--------



::   	100.0%
grosso:   	0.0%
-:   	0.0%
is:   	0.0%
giallo:   	0.0%
--------



a:   	33.59%
while:   	12.79%
this:   	10.85%
hints:   	6.41%
ripe:   	3.05%
--------



aromas:   	66.52%
peach:   	12.03%
white:   	4.31%
,:   	3.24%
yellow:   	2.58%
--------



of:   	99.87%
and:   	0.05%
are:   	0.02%
include:   	0.01%
hint:   	0.01%
--------



pressed:   	19.92%
sweet:   	12.73%
ripe:   	12.12%
freshly:   	9.53%
crushed:   	8.86%
--------



cut:   	51.33%
pressed:   	35.32%
crushed:   	12.79%
picked:   	0.19%
baked:   	0.13%
--------



apples:   	52.15%
apple:   	37.41%
yellow:   	8.72%
peach:   	0.39%
white:   	0.31%
--------



flowers:   	49.83%
flower:   	35.98%
peach:   	5.82%
apples:   	3.18%
apple:   	1.85%
--------



and:   	79.72%
,:   	20.21%
with:   	0.02%
from:   	0.02%
on:   	0.01%
--------



white:   	15.79%
peach:   	13.02%
apple:   	7.12%
citrus:   	7.07%
freesia:   	5.84%
--------



peach:   	83.44%
stone:   	11.94%
flowers:   	1.99%
flower:   	1.31%
-:   	0.66%
--------



and:   	81.53%
,:   	12.39%
with:   	1.34%
are:   	1.07%
on:   	1.0%
--------



citrus:   	16.19%
pear:   	9.19%
white:   	9.14%
lemon:   	8.51%
honeydew:   	7.21%
--------



melon:   	96.33%
melons:   	0.66%
mango:   	0.56%
fruit:   	0.54%
flavors:   	0.44%
--------



lead:   	39.77%
are:   	18.31%
flavors:   	16.14%
rind:   	6.88%
in:   	5.05%
--------



are:   	30.57%
in:   	23.9%
.:   	11.0%
this:   	8.83%
make:   	5.63%
--------



dry:   	80.71%
medium:   	8.42%
off:   	5.28%
wine:   	2.9%
riesling:   	0.4%
--------



,:   	81.0%
riesling:   	15.02%
off:   	2.89%
yet:   	0.65%
and:   	0.13%
--------



.:   	99.13%
from:   	0.42%
,:   	0.19%
a:   	0.05%
are:   	0.04%
--------



it:   	68.42%
the:   	29.34%
a:   	0.51%
this:   	0.43%
dry:   	0.34%
--------



palate:   	99.84%
flavors:   	0.06%
mouth:   	0.04%
medium:   	0.01%
acidity:   	0.01%
--------



is:   	96.57%
offers:   	0.94%
shows:   	0.64%
has:   	0.61%
boasts:   	0.38%
--------



medium:   	12.99%
juicy:   	11.2%
a:   	10.34%
full:   	9.39%
dry:   	7.96%
--------



of:   	93.06%
and:   	3.26%
,:   	1.58%
but:   	1.07%
bodied:   	0.51%
--------



the:   	64.1%
sweet:   	10.49%
a:   	9.14%
this:   	3.09%
fresh:   	3.04%
--------



mouth:   	77.15%
flavors:   	11.53%
palate:   	6.52%
same:   	1.63%
flavor:   	1.28%
--------



,:   	58.12%
-:   	17.34%
with:   	12.91%
and:   	8.76%
but:   	1.84%
--------



with:   	88.43%
but:   	8.33%
yet:   	0.98%
accented:   	0.39%
however:   	0.3%
--------



a:   	97.24%
just:   	0.64%
the:   	0.23%
an:   	0.22%
hints:   	0.2%
--------



hint:   	50.63%
touch:   	14.88%
lingering:   	14.68%
long:   	2.43%
creamy:   	1.93%
--------



of:   	100.0%
that:   	0.0%
,:   	0.0%
on:   	0.0%
and:   	0.0%
--------



sweet:   	36.43%
lime:   	14.73%
tangerine:   	12.17%
fresh:   	5.52%
zesty:   	5.02%
--------



zest:   	55.14%
.:   	21.03%
and:   	13.91%
acidity:   	4.31%
peel:   	1.98%
--------



.:   	50.73%
and:   	37.8%
,:   	6.56%
that:   	2.82%
on:   	1.79%
--------



a:   	79.83%
citrus:   	3.61%
lemon:   	3.31%
honey:   	2.08%
lime:   	1.47%
--------



long:   	39.61%
lingering:   	22.46%
touch:   	12.52%
hint:   	7.25%
steely:   	3.97%
--------



mineral:   	26.28%
pollen:   	19.24%
finish:   	15.61%
,:   	11.31%
hint:   	6.31%
--------



note:   	51.52%
finish:   	22.12%
texture:   	12.58%
backbone:   	3.67%
edge:   	2.48%
--------



.:   	99.48%
that:   	0.39%
,:   	0.03%
on:   	0.03%
and:   	0.03%
--------



it:   	67.56%
finishes:   	7.74%
the:   	6.85%
drink:   	6.43%
<PAD>:   	5.87%
--------



':   	99.88%
finishes:   	0.11%
is:   	0.01%
lingers:   	0.0%
has:   	0.0%
--------



s:   	100.0%
ll:   	0.0%
<UNK>:   	0.0%
a:   	0.0%
shows:   	0.0%
--------



a:   	89.61%
an:   	2.66%
easy:   	0.8%
dry:   	0.68%
medium:   	0.68%
--------



bit:   	93.77%
touch:   	1.9%
fine:   	0.9%
little:   	0.82%
wine:   	0.75%
--------



lean:   	26.56%
demure:   	15.36%
disjointed:   	8.84%
rustic:   	4.69%
soft:   	4.26%
--------



,:   	56.15%
in:   	22.55%
and:   	9.66%
but:   	5.47%
.:   	2.99%
--------



sweet:   	18.54%
delicate:   	16.19%
filigreed:   	10.36%
a:   	10.01%
easy:   	6.41%
--------



,:   	74.2%
.:   	20.48%
but:   	1.9%
now:   	1.79%
in:   	0.83%
--------



with:   	58.37%
but:   	37.78%
yet:   	1.29%
it:   	1.12%
and:   	0.91%
--------



a:   	99.4%
an:   	0.48%
just:   	0.08%
fresh:   	0.01%
crisp:   	0.0%
--------



hint:   	36.6%
touch:   	29.93%
lingering:   	19.64%
note:   	3.82%
long:   	1.67%
--------



of:   	100.0%
and:   	0.0%
,:   	0.0%
.:   	0.0%
that:   	0.0%
--------



sweet:   	36.88%
lime:   	10.74%
sweetness:   	10.5%
acidity:   	7.03%
fresh:   	3.64%
--------



.:   	77.89%
and:   	12.22%
on:   	4.25%
,:   	2.57%
that:   	1.62%
--------



<PAD>:   	98.77%
drink:   	1.14%
it:   	0.04%
enjoy:   	0.03%
a:   	0.01%
--------



<PAD>:   	100.0%
enjoy:   	0.0%
.:   	0.0%
drink:   	0.0%
give:   	0.0%
--------



<PAD>:   	100.0%
.:   	0.0%
drink:   	0.0%
by:   	0.0%
enjoy:   	0.0%
--------



<PAD>:   	100.0%
.:   	0.0%
<UNK>:   	0.0%
drink:   	0.0%
enjoy:   	0.0%
--------



<PAD>:   	100.0%
drink:   	0.0%
.:   	0.0%
enjoy:   	0.0%
give:   	0.0%
--------



<PAD>:   	100.0%
drink:   	0.0%
enjoy:   	0.0%
<UNK>:   	0.0%
.:   	0.0%
--------



<PAD>:   	100.0%
enjoy:   	0.0%
drink:   	0.0%
—m:   	0.0%
<UNK>:   	0.0%
--------



<PAD>:   	100.0%
drink:   	0.0%
owner:   	0.0%
enjoy:   	0.0%
<UNK>:   	0.0%
--------



<PAD>:   	100.0%
.:   	0.0%
<UNK>:   	0.0%
owner:   	0.0%
s:   	0.0%
--------



<PAD>:   	100.0%
<UNK>:   	0.0%
drink:   	0.0%
hold:   	0.0%
enjoy:   	0.0%
--------



<PAD>:   	100.0%
.:   	0.0%
<UNK>:   	0.0%
drink:   	0.0%
,:   	0.0%
--------



<PAD>:   	100.0%
<UNK>:   	0.0%
s:   	0.0%
.:   	0.0%
drink:   	0.0%
--------



<PAD>:   	100.0%
drink:   	0.0%
<UNK>:   	0.0%
.:   	0.0%
enjoy:   	0.0%
--------



<PAD>:   	100.0%
<UNK>:   	0.0%
drink:   	0.0%
.:   	0.0%
the:   	0.0%
--------



<PAD>:   	100.0%
drink:   	0.0%
<UNK>:   	0.0%
.:   	0.0%
enjoy:   	0.0%
--------



<PAD>:   	100.0%
.:   	0.0%
drink:   	0.0%
<UNK>:   	0.0%
enjoy:   	0.0%
--------



<PAD>:   	100.0%
.:   	0.0%
drink:   	0.0%
<UNK>:   	0.0%
the:   	0.0%
--------



<PAD>:   	100.0%
drink:   	0.0%
the:   	0.0%
enjoy:   	0.0%
.:   	0.0%
--------

