In [None]:
import random
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import spacy
from utils import translate_sentence, bleu, save_checkpoint, load_checkpoint
from torch.utils.tensorboard import SummaryWriter  # to print to tensorboard
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator

"""
To install spacy languages do:
python -m spacy download en
python -m spacy download de
"""
spacy_ger = spacy.load("de")
spacy_eng = spacy.load("en")


def tokenize_ger(text):
    return [tok.text for tok in spacy_ger.tokenizer(text)]


def tokenize_eng(text):
    return [tok.text for tok in spacy_eng.tokenizer(text)]


german = Field(tokenize=tokenize_ger, lower=True, init_token="<sos>", eos_token="<eos>")

english = Field(
    tokenize=tokenize_eng, lower=True, init_token="<sos>", eos_token="<eos>"
)

train_data, valid_data, test_data = Multi30k.splits(
    exts=(".de", ".en"), fields=(german, english)
)

german.build_vocab(train_data, max_size=10000, min_freq=2)
english.build_vocab(train_data, max_size=10000, min_freq=2)


class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, bidirectional=True)

        self.fc_hidden = nn.Linear(hidden_size * 2, hidden_size)
        self.fc_cell = nn.Linear(hidden_size * 2, hidden_size)
        self.dropout = nn.Dropout(p)

    def forward(self, x):
        # x: (seq_length, N) where N is batch size

        embedding = self.dropout(self.embedding(x))
        # embedding shape: (seq_length, N, embedding_size)

        encoder_states, (hidden, cell) = self.rnn(embedding)
        # outputs shape: (seq_length, N, hidden_size)

        # Use forward, backward cells and hidden through a linear layer
        # so that it can be input to the decoder which is not bidirectional
        # Also using index slicing ([idx:idx+1]) to keep the dimension
        hidden = self.fc_hidden(torch.cat((hidden[0:1], hidden[1:2]), dim=2))
        cell = self.fc_cell(torch.cat((cell[0:1], cell[1:2]), dim=2))

        return encoder_states, hidden, cell


class Decoder(nn.Module):
    def __init__(
        self, input_size, embedding_size, hidden_size, output_size, num_layers, p
    ):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(hidden_size * 2 + embedding_size, hidden_size, num_layers)

        self.energy = nn.Linear(hidden_size * 3, 1)
        self.fc = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(p)
        self.softmax = nn.Softmax(dim=0)
        self.relu = nn.ReLU()

    def forward(self, x, encoder_states, hidden, cell):
        x = x.unsqueeze(0)
        # x: (1, N) where N is the batch size

        embedding = self.dropout(self.embedding(x))
        # embedding shape: (1, N, embedding_size)

        sequence_length = encoder_states.shape[0]
        h_reshaped = hidden.repeat(sequence_length, 1, 1)
        # h_reshaped: (seq_length, N, hidden_size*2)

        energy = self.relu(self.energy(torch.cat((h_reshaped, encoder_states), dim=2)))
        # energy: (seq_length, N, 1)

        attention = self.softmax(energy)
        # attention: (seq_length, N, 1)

        # attention: (seq_length, N, 1), snk
        # encoder_states: (seq_length, N, hidden_size*2), snl
        # we want context_vector: (1, N, hidden_size*2), i.e knl
        context_vector = torch.einsum("snk,snl->knl", attention, encoder_states)

        rnn_input = torch.cat((context_vector, embedding), dim=2)
        # rnn_input: (1, N, hidden_size*2 + embedding_size)

        outputs, (hidden, cell) = self.rnn(rnn_input, (hidden, cell))
        # outputs shape: (1, N, hidden_size)

        predictions = self.fc(outputs).squeeze(0)
        # predictions: (N, hidden_size)

        return predictions, hidden, cell


class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_force_ratio=0.5):
        batch_size = source.shape[1]
        target_len = target.shape[0]
        target_vocab_size = len(english.vocab)

        outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(device)
        encoder_states, hidden, cell = self.encoder(source)

        # First input will be <SOS> token
        x = target[0]

        for t in range(1, target_len):
            # At every time step use encoder_states and update hidden, cell
            output, hidden, cell = self.decoder(x, encoder_states, hidden, cell)

            # Store prediction for current time step
            outputs[t] = output

            # Get the best word the Decoder predicted (index in the vocabulary)
            best_guess = output.argmax(1)

            # With probability of teacher_force_ratio we take the actual next word
            # otherwise we take the word that the Decoder predicted it to be.
            # Teacher Forcing is used so that the model gets used to seeing
            # similar inputs at training and testing time, if teacher forcing is 1
            # then inputs at test time might be completely different than what the
            # network is used to. This was a long comment.
            x = target[t] if random.random() < teacher_force_ratio else best_guess

        return outputs


### We're ready to define everything we need for training our Seq2Seq model ###
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
load_model = False
save_model = True

# Training hyperparameters
num_epochs = 100
learning_rate = 3e-4
batch_size = 32

# Model hyperparameters
input_size_encoder = len(german.vocab)
input_size_decoder = len(english.vocab)
output_size = len(english.vocab)
encoder_embedding_size = 300
decoder_embedding_size = 300
hidden_size = 1024
num_layers = 1
enc_dropout = 0.0
dec_dropout = 0.0

# Tensorboard to get nice loss plot
writer = SummaryWriter(f"runs/loss_plot")
step = 0

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=batch_size,
    sort_within_batch=True,
    sort_key=lambda x: len(x.src),
    device=device,
)

encoder_net = Encoder(
    input_size_encoder, encoder_embedding_size, hidden_size, num_layers, enc_dropout
).to(device)

decoder_net = Decoder(
    input_size_decoder,
    decoder_embedding_size,
    hidden_size,
    output_size,
    num_layers,
    dec_dropout,
).to(device)

model = Seq2Seq(encoder_net, decoder_net).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

pad_idx = english.vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

if load_model:
    load_checkpoint(torch.load("my_checkpoint.pth.tar"), model, optimizer)

sentence = (
    "ein boot mit mehreren männern darauf wird von einem großen"
    "pferdegespann ans ufer gezogen."
)

for epoch in range(num_epochs):
    print(f"[Epoch {epoch} / {num_epochs}]")

    if save_model:
        checkpoint = {
            "state_dict": model.state_dict(),
            "optimizer": optimizer.state_dict(),
        }
        save_checkpoint(checkpoint)

    model.eval()

    translated_sentence = translate_sentence(
        model, sentence, german, english, device, max_length=50
    )

    print(f"Translated example sentence: \n {translated_sentence}")

    model.train()

    for batch_idx, batch in enumerate(train_iterator):
        # Get input and targets and get to cuda
        inp_data = batch.src.to(device)
        target = batch.trg.to(device)

        # Forward prop
        output = model(inp_data, target)

        # Output is of shape (trg_len, batch_size, output_dim) but Cross Entropy Loss
        # doesn't take input in that form. For example if we have MNIST we want to have
        # output to be: (N, 10) and targets just (N). Here we can view it in a similar
        # way that we have output_words * batch_size that we want to send in into
        # our cost function, so we need to do some reshapin. While we're at it
        # Let's also remove the start token while we're at it
        output = output[1:].reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        optimizer.zero_grad()
        loss = criterion(output, target)

        # Back prop
        loss.backward()

        # Clip to avoid exploding gradient issues, makes sure grads are
        # within a healthy range
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        # Gradient descent step
        optimizer.step()

        # Plot to tensorboard
        writer.add_scalar("Training loss", loss, global_step=step)
        step += 1

# running on entire test data takes a while
score = bleu(test_data[1:100], model, german, english, device)
print(f"Bleu score {score * 100:.2f}")


In [2]:
import pandas as pd

In [8]:
data = pd.read_csv("cornell/preprocessed_movie_lines.txt",error_bad_lines=False)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [5]:
!ls cornell

chameleons.pdf		       movie_titles_metadata.txt
movie_characters_metadata.txt  preprocessed_movie_lines.txt
movie_conversations.txt        raw_script_urls.txt
movie_lines.txt		       README.txt


In [9]:
len(data)

197615

In [10]:
data.head()

Unnamed: 0,they do not !
0,they do to !
1,i hope so .
2,she okay ?
3,let 's go .
4,wow


In [65]:
from data_reader import DataReader, PAD_TOKEN, EOS_TOKEN, GO_TOKEN
import random
class MovieDialogReader():
    """
    DataReader used to read and tokenize data from the Cornell open movie
    dialog dataset.
    """

    UNKNOWN_TOKEN = "UNK"

    DROPOUT_TOKENS = {"a", "an", "the", "'ll", "'s", "'m", "'ve", "to"}  # Add "to"

    REPLACEMENTS = {"there": "their", "their": "there", "then": "than",
                    "than": "then"}
    # Add: "be":"to"

    def __init__(self, train_path=None, token_to_id=None,
                 dropout_prob=0.37,replacement_prob=0.37, dataset_copies=2):
#         super(MovieDialogReader, self).__init__(
#             config, train_path=train_path, token_to_id=token_to_id,
#             special_tokens=[
#                 PAD_TOKEN, GO_TOKEN, EOS_TOKEN,
#                 MovieDialogReader.UNKNOWN_TOKEN],
#             dataset_copies=dataset_copies)

        self.dropout_prob = dropout_prob
        self.replacement_prob = replacement_prob

#         self.UNKNOWN_ID = self.token_to_id[MovieDialogReader.UNKNOWN_TOKEN]

    def read_samples_by_string(self, path):
        for tokens in self.read_tokens(path):
            source = []
            target = []

            for token in tokens:
                target.append(token)

                # Randomly dropout some words from the input.
                dropout_token = (token in MovieDialogReader.DROPOUT_TOKENS and
                                random.random() < self.dropout_prob)
                replace_token = (token in MovieDialogReader.REPLACEMENTS and
                                random.random() < self.replacement_prob)

                if replace_token:
                    source.append(MovieDialogReader.REPLACEMENTS[token])
                elif not dropout_token:
                    source.append(token)

            yield source, target

    def unknown_token(self):
        return MovieDialogReader.UNKNOWN_TOKEN

    def read_tokens(self, path):
        for line in range(len(path)):
            linep = path.iloc[line,:][0]
            yield linep.lower().strip().split()

In [107]:
m =  MovieDialogReader()

In [108]:
pairs = m.read_samples_by_string(data) 

In [109]:
pairs

<generator object MovieDialogReader.read_samples_by_string at 0x7fb695444258>

In [81]:
for i in pairs:
    print(i)

(['they', 'do', 'to', '!'], ['they', 'do', 'to', '!'])
(['i', 'hope', 'so', '.'], ['i', 'hope', 'so', '.'])
(['she', 'okay', '?'], ['she', 'okay', '?'])
(['let', "'s", 'go', '.'], ['let', "'s", 'go', '.'])
(['wow'], ['wow'])
(['okay', '--', 'you', "'re", 'gon', 'na', 'need', 'to', 'learn', 'how', 'to', 'lie', '.'], ['okay', '--', 'you', "'re", 'gon', 'na', 'need', 'to', 'learn', 'how', 'to', 'lie', '.'])
(['no'], ['no'])
(['i', "'m", 'kidding', '.', 'you', 'know', 'how', 'sometimes', 'you', 'just', 'become', 'this', '``', 'persona', "''", '?', 'and', 'you', 'do', "n't", 'know', 'how', 'quit', '?'], ['i', "'m", 'kidding', '.', 'you', 'know', 'how', 'sometimes', 'you', 'just', 'become', 'this', '``', 'persona', "''", '?', 'and', 'you', 'do', "n't", 'know', 'how', 'to', 'quit', '?'])
(['like', 'my', 'fear', 'of', 'wearing', 'pastels', '?'], ['like', 'my', 'fear', 'of', 'wearing', 'pastels', '?'])
(['the', '``', 'real', 'you', "''", '.'], ['the', '``', 'real', 'you', "''", '.'])
(['what', 

(["c'mon", '.', 'you', 'intimidate', 'her', "'cause", 'you', "'re", 'a', 'celebrity', '.', 'she', 'sees', 'me', 'differently', '.'], ["c'mon", '.', 'you', 'intimidate', 'her', "'cause", 'you', "'re", 'a', 'celebrity', '.', 'she', 'sees', 'me', 'differently', '.'])
(['so', 'am', 'i', '.'], ['so', 'am', 'i', '.'])
(['i', 'serious', 'here', '.'], ['i', "'m", 'serious', 'here', '.'])
(['my', 'point', 'exactly', '.'], ['my', 'point', 'exactly', '.'])
(['i', 'do', "n't", 'have', 'a', 'girlfriend', '.'], ['i', 'do', "n't", 'have', 'a', 'girlfriend', '.'])
(['what', 'would', 'your', 'girlfriend', 'think', 'of', 'that', '?'], ['what', 'would', 'your', 'girlfriend', 'think', 'of', 'that', '?'])
(['yeah', '.'], ['yeah', '.'])
(['you', 'wan', 'na', 'talk', 'to', 'her', 'alone', '?'], ['you', 'wan', 'na', 'talk', 'to', 'her', 'alone', '?'])
(['raw', 'deal', '.'], ['raw', 'deal', '.'])
(['she', "'s", 'fucked', '.', 'even', 'if', 'that', 'story', 'is', 'true', '.'], ['she', "'s", 'fucked', '.', 'even

(['must', 'be', 'billy', "'s", 'girl', '.'], ['must', 'be', 'billy', "'s", 'girl', '.'])
(['there', '.'], ['there', '.'])
(['my', 'kind', 'of', 'place', '.', 'i', 'always', 'liked', 'country', 'boys', '.'], ['my', 'kind', 'of', 'place', '.', 'i', 'always', 'liked', 'country', 'boys', '.'])
(['this', 'place', 'do', "n't", 'seem', 'real', 'popular', 'with', 'the', 'brothers', '.'], ['this', 'place', 'do', "n't", 'seem', 'real', 'popular', 'with', 'the', 'brothers', '.'])
(['i', 'thought', 'you', 'said', 'bullshit', 'and', 'experience', 'are', 'all', 'it', 'takes', '.'], ['i', 'thought', 'you', 'said', 'bullshit', 'and', 'experience', 'are', 'all', 'it', 'takes', '.'])
(['bet', "'s", 'off', '.'], ['the', 'bet', "'s", 'off', '.'])
(['i', 'do', "n't", 'know', 'what', 'you', "'re", 'talking', 'about', '.', 'i', 'just', 'wan', 'na', 'see', 'ganz', 'nailed', '.'], ['i', 'do', "n't", 'know', 'what', 'you', "'re", 'talking', 'about', '.', 'i', 'just', 'wan', 'na', 'see', 'ganz', 'nailed', '.'])


(['it', "'s", 'money', '.', 'people', 'use', 'it', 'to', 'purchase', 'goods', 'and', 'services', '.'], ['it', "'s", 'money', '.', 'people', 'use', 'it', 'to', 'purchase', 'goods', 'and', 'services', '.'])
(['what', "'s", 'this', '?'], ['what', "'s", 'this', '?'])
(['nope', '.'], ['nope', '.'])
(['still', 'do', "n't", 'want', 'tell', 'me', 'what', 'you', "'re", 'doing', '?'], ['still', 'do', "n't", 'want', 'to', 'tell', 'me', 'what', 'you', "'re", 'doing', '?'])
(['i', "'m", 'trying', 'to', 'figure', 'that', 'out', 'myself', '.', 'i', 'have', 'to', 'see', 'machine', 'without', 'his', 'mask', '.'], ['i', "'m", 'trying', 'to', 'figure', 'that', 'out', 'myself', '.', 'i', 'have', 'to', 'see', 'machine', 'without', 'his', 'mask', '.'])
(['what', "'s", 'next', '?'], ['what', "'s", 'next', '?'])
(['what', 'kind', 'of', 'junior', 'p.i', '.', 'would', 'i', 'be', 'if', 'i', 'did', "n't", 'go', 'with', 'you', '?'], ['what', 'kind', 'of', 'junior', 'p.i', '.', 'would', 'i', 'be', 'if', 'i', 'did',

(['sorry', 'about', 'the', 'long', 'lunch', '.', 'my', 'clutch', 'is', 'going', 'out', 'again', '.'], ['sorry', 'about', 'the', 'long', 'lunch', '.', 'my', 'clutch', 'is', 'going', 'out', 'again', '.'])
(['you', 'rolfe', '?', 'i', 'remember', 'you', 'from', 'high', 'school', '.', 'you', "'re", 'a', 'teacher', 'now', '?', 'harvard', '?'], ['you', 'rolfe', '?', 'i', 'remember', 'you', 'from', 'high', 'school', '.', 'you', "'re", 'a', 'teacher', 'now', '?', 'harvard', '?'])
(['yeah', '.', 'a', 'few', 'things', '.'], ['yeah', '.', 'a', 'few', 'things', '.'])
(['something', 'bugging', 'you', '?'], ['something', 'bugging', 'you', '?'])
(['what', 'the', 'fuck', '.', 'my', 'day', 'already', 'ruined', '.', 'give', 'me', 'the', 'keys', '.', 'you', 'can', 'go', 'back', 'with', 'jack', '.', 'you', 'still', 'got', 'shitload', 'of', 'plowing', 'to', 'do', '.'], ['what', 'the', 'fuck', '.', 'my', 'day', "'s", 'already', 'ruined', '.', 'give', 'me', 'the', 'keys', '.', 'you', 'can', 'go', 'back', 'wit

(['no', '.', 'no', '.', 'no', '.', 'wait', '.', 'wait', '.'], ['no', '.', 'no', '.', 'no', '.', 'wait', '.', 'wait', '.'])
(['thank', 'you', 'for', 'calling', 'the', 'white', 'house', '...'], ['thank', 'you', 'for', 'calling', 'the', 'white', 'house', '...'])
(['you', 'do', "n't", 'understand', '.', 'this', 'is', 'an', 'emergency', '.', 'let', 'me', 'talk', 'to', 'anyone', '.'], ['you', 'do', "n't", 'understand', '.', 'this', 'is', 'an', 'emergency', '.', 'let', 'me', 'talk', 'to', 'anyone', '.'])
(['do', "n't", 'cut', 'me', 'off', '.', 'this', 'is', 'an', 'emergency', '.'], ['do', "n't", 'cut', 'me', 'off', '.', 'this', 'is', 'an', 'emergency', '.'])
(['this', 'is', 'the', 'president', '.'], ['this', 'is', 'the', 'president', '.'])
(['who', 'can', 'i', 'say', 'is', 'calling', '?'], ['who', 'can', 'i', 'say', 'is', 'calling', '?'])
(['white', 'house', 'switchboard', '.', 'how', 'may', 'i', 'direct', 'your', 'call', '.'], ['white', 'house', 'switchboard', '.', 'how', 'may', 'i', 'direct

(['your', 'home', 'is', 'quite', 'disordered', '.', 'i', 'thought', 'perhaps', 'you', 'had', 'been', 'burglarized', 'when', 'i', 'walked', 'in', '.'], ['your', 'home', 'is', 'quite', 'disordered', '.', 'i', 'thought', 'perhaps', 'you', 'had', 'been', 'burglarized', 'when', 'i', 'walked', 'in', '.'])
(['we', 'mate', 'for', 'life', '.', 'divorce', '...', 'is', 'strange', 'concept', 'to', 'us', '.'], ['we', 'mate', 'for', 'life', '.', 'divorce', '...', 'is', 'a', 'strange', 'concept', 'to', 'us', '.'])
(['was', '.', 'divorced', '.'], ['was', '.', 'divorced', '.'])
(['you', 'are', 'married', '?'], ['you', 'are', 'married', '?'])
(['yeah', '...', 'i', 'know', 'the', 'routine', '.'], ['yeah', '...', 'i', 'know', 'the', 'routine', '.'])
(['my', 'wife', '?', 'she', 'worries', 'about', 'me', '.'], ['my', 'wife', '?', 'she', 'worries', 'about', 'me', '.'])
(['i', 'believe', 'he', 'is', 'probably', 'lying', '.'], ['i', 'believe', 'he', 'is', 'probably', 'lying', '.'])
(['when', 'will', 'strader',

(['i', 'not', 'leaving', 'you', '.'], ['i', "'m", 'not', 'leaving', 'you', '.'])
(['would', 'you', 'stay', 'with', 'me', 'while', 'i', 'sleep', 'little', '?'], ['would', 'you', 'stay', 'with', 'me', 'while', 'i', 'sleep', 'a', 'little', '?'])
(['we', "'ll", 'stop', 'for', 'just', 'a', 'moment', '.', 'than', 'we', "'ll", 'do', 'the', 'lacrimosa', '.'], ['we', "'ll", 'stop', 'for', 'just', 'a', 'moment', '.', 'then', 'we', "'ll", 'do', 'the', 'lacrimosa', '.'])
(['oh', 'no', '.', 'i', "'m", 'not', 'tired', 'at', 'all', '.'], ['oh', 'no', '.', 'i', "'m", 'not', 'tired', 'at', 'all', '.'])
(['do', 'you', 'want', 'rest', 'a', 'bit', '?'], ['do', 'you', 'want', 'to', 'rest', 'a', 'bit', '?'])
(['then', 'let', 'me', 'hear', 'it', '.', 'all', 'of', 'it', '.', 'the', 'whole', 'thing', 'from', 'the', 'beginning', '-', 'now', '!'], ['then', 'let', 'me', 'hear', 'it', '.', 'all', 'of', 'it', '.', 'the', 'whole', 'thing', 'from', 'the', 'beginning', '-', 'now', '!'])
(['yes', '.'], ['yes', '.'])
([

(['wait', 'a', 'minute', '.', 'answer', 'my', 'question', '.', 'was', 'mrs.', 'dickson', 'their', '?'], ['wait', 'a', 'minute', '.', 'answer', 'my', 'question', '.', 'was', 'mrs.', 'dickson', 'there', '?'])
(['was', 'mrs.', 'dickson', 'there', '?'], ['was', 'mrs.', 'dickson', 'there', '?'])
(['were', 'you', 'in', 'cluett', "'s", 'apartment', '?'], ['were', 'you', 'in', 'cluett', "'s", 'apartment', '?'])
(['why', 'would', "n't", 'you', 'tell', 'me', 'where', 'you', 'were', 'last', 'night', '?'], ['why', 'would', "n't", 'you', 'tell', 'me', 'where', 'you', 'were', 'last', 'night', '?'])
(['you', "'re", 'protecting', 'somebody', '.'], ['you', "'re", 'protecting', 'somebody', '.'])
(['no', '.', 'i', 'wo', "n't", '.'], ['no', '.', 'i', 'wo', "n't", '.'])
(['i', 'ca', "n't", 'tell', 'you', '.'], ['i', 'ca', "n't", 'tell', 'you', '.'])
(['their', 'you', 'are', '.'], ['there', 'you', 'are', '.'])
(['i', 'already', 'told', 'him', 'i', 'was', 'home', '.'], ['i', 'already', 'told', 'him', 'i', 'w

(['i', 'want', 'firm', 'commitment', '.'], ['i', 'want', 'a', 'firm', 'commitment', '.'])
(['it', "'s", 'not', '.'], ['it', "'s", 'not', '.'])
(['your', 'hairline', 'looks', 'like', 'it', 'receding', '.'], ['your', 'hairline', 'looks', 'like', 'it', "'s", 'receding', '.'])
(['no', '.', 'i', "'m", 'not', '.', 'why', 'should', 'i', '?'], ['no', '.', 'i', "'m", 'not', '.', 'why', 'should', 'i', '?'])
(['are', 'you', 'using', 'minoxidil', '?'], ['are', 'you', 'using', 'minoxidil', '?'])
(['everybody', 'has', 'a', 'great', 'body', 'now', '.'], ['everybody', 'has', 'a', 'great', 'body', 'now', '.'])
(['he', 'has', 'great', 'body'], ['he', 'has', 'a', 'great', 'body'])
(['he', 'good-looking', '.'], ['he', "'s", 'good-looking', '.'])
(['everybody', "'s", 'rich', '.'], ['everybody', "'s", 'rich', '.'])
(['he', "'s", 'rich', '.'], ['he', "'s", 'rich', '.'])
(['why', 'don', 't', 'you', 'just', 'go', 'for', 'price', '?'], ['why', 'don', 't', 'you', 'just', 'go', 'for', 'price', '?'])
(['hi', '.', 

(['...', 'were', 'you', '?'], ['...', 'were', 'you', '?'])
(['i', 'would', "n't", 'worry', 'about', 'it', '.', 'you', "'ve", 'got', 'ta', 'figure', 'most', 'people', 'around', 'here', 'were', 'their', 'teachers', "'", 'pets', '.'], ['i', 'would', "n't", 'worry', 'about', 'it', '.', 'you', "'ve", 'got', 'ta', 'figure', 'most', 'people', 'around', 'here', 'were', 'their', 'teachers', "'", 'pets', '.'])
(['i', "'m", 'getting', 'teacher', "'s", 'pet', 'rep', '.'], ['i', "'m", 'getting', 'a', 'teacher', "'s", 'pet', 'rep', '.'])
(['you', "'re", 'famous', 'around', 'here', '.'], ['you', "'re", 'famous', 'around', 'here', '.'])
(['you', 'know', 'mine', '.'], ['you', 'know', 'mine', '.'])
(['you', 'know', 'my', 'name', '.'], ['you', 'know', 'my', 'name', '.'])
(['lisa', '.'], ['lisa', '.'])
(['i', 'did', "n't", 'plan', 'this', '.'], ['i', 'did', "n't", 'plan', 'this', '.'])
(['what', 'does', 'that', 'mean', '?'], ['what', 'does', 'that', 'mean', '?'])
(['i', 'dunno', '.', 'i', 'guess', 'larry'

(['after', 'you', '--', 'mrs.', 'peel', '...'], ['after', 'you', '--', 'mrs.', 'peel', '...'])
(['please', 'do', "n't", 'tell', 'me', 'it', "'s", 'how', 'you', 'play', 'the', 'game', '.'], ['please', 'do', "n't", 'tell', 'me', 'it', "'s", 'how', 'you', 'play', 'the', 'game', '.'])
(['winning', 'is', "n't", 'everything', '.'], ['winning', 'is', "n't", 'everything', '.'])
(['i', "'m", 'playing', 'to', 'win', '.'], ['i', "'m", 'playing', 'to', 'win', '.'])
(['i', 'thought', 'you', 'did', "n't", '.'], ['i', 'thought', 'you', 'did', "n't", '.'])
(['i', 'thought', 'you', 'played', 'by', 'the', 'rules', '.'], ['i', 'thought', 'you', 'played', 'by', 'the', 'rules', '.'])
(['you', "'re", 'playing', 'games', '.'], ['you', "'re", 'playing', 'games', '.'])
(['not', 'for', 'moment', '.'], ['not', 'for', 'a', 'moment', '.'])
(['but', 'you', 'did', 'suspect', 'me', '.'], ['but', 'you', 'did', 'suspect', 'me', '.'])
(['is', 'that', 'all', '?'], ['is', 'that', 'all', '?'])
(['keeping', 'a', 'stiff', 'u

(['--', 'i', 'work', 'for', 'the', 'city', '.'], ['--', 'i', 'work', 'for', 'the', 'city', '.'])
(['leave', 'stephen', 'out', 'of', 'this', '--'], ['leave', 'stephen', 'out', 'of', 'this', '--'])
(['yeah', '?', 'and', 'just', 'how', 'much', 'of', 'all', 'this', 'has', 'been', '``', 'doing', 'your', 'job', "''", '?'], ['yeah', '?', 'and', 'just', 'how', 'much', 'of', 'all', 'this', 'has', 'been', '``', 'doing', 'your', 'job', "''", '?'])
(['i', 'was', 'doing', 'my', 'job', '.'], ['i', 'was', 'doing', 'my', 'job', '.'])
(['--', 'you', 'could', 'have', 'scared', 'son', 'of', 'a', 'bitch', 'off', '.', 'we', 'may', 'never', 'bust', 'him', 'now', '.', 'all', 'for', 'a', 'couple', 'political', 'points', '.'], ['--', 'you', 'could', 'have', 'scared', 'the', 'son', 'of', 'a', 'bitch', 'off', '.', 'we', 'may', 'never', 'bust', 'him', 'now', '.', 'all', 'for', 'a', 'couple', "'s", 'political', 'points', '.'])
(['--', 'bullshit', '!', 'you', 'knew', 'what', 'i', 'told', 'you', 'was', "n't", 'ready

(['i', 'am', 'only', 'sorry', 'that', 'i', 'was', 'unable', 'prevent', 'villain', 'from', 'carrying', 'off', 'all', 'her', 'ladyship', 'money', 'and', 'pearls', '.'], ['i', 'am', 'only', 'sorry', 'that', 'i', 'was', 'unable', 'to', 'prevent', 'the', 'villain', 'from', 'carrying', 'off', 'all', 'her', 'ladyship', "'s", 'money', 'and', 'pearls', '.'])
(['i', 'have', 'said', 'all', 'there', 'is', 'be', 'said', '.', 'i', 'am', 'at', 'your', 'disposal', 'for', 'whatever', 'purposes', 'you', 'wish', '.', 'good', 'night', '.'], ['i', 'have', 'said', 'all', 'there', 'is', 'to', 'be', 'said', '.', 'i', 'am', 'at', 'your', 'disposal', 'for', 'whatever', 'purposes', 'you', 'wish', '.', 'good', 'night', '.'])
(['i', 'do', "n't", 'know', '.'], ['i', 'do', "n't", 'know', '.'])
(['gentleman', 'has', 'nothing', 'contraband', '.'], ['the', 'gentleman', 'has', 'nothing', 'contraband', '.'])
(['i', 'have', 'no', 'luggage', '.'], ['i', 'have', 'no', 'luggage', '.'])
(['as', 'you', 'please', '.'], ['as', '

(['what', 'did', 'she', 'look', 'like', '?'], ['what', 'did', 'she', 'look', 'like', '?'])
(['did', 'you', 'see', 'that', 'woman', '?'], ['did', 'you', 'see', 'that', 'woman', '?'])
(['and', 'i', 'like', 'you', '.'], ['and', 'i', 'like', 'you', '.'])
(['i', 'like', 'your', 'stories', '.'], ['i', 'like', 'your', 'stories', '.'])
(['you', 'were', 'talking', 'about', 'your', 'wedding', '.'], ['you', 'were', 'talking', 'about', 'your', 'wedding', '.'])
(['there', 'was', 'one', 'time', 'when', 'i', 'was', 'eleven', '...'], ['there', 'was', 'one', 'time', 'when', 'i', 'was', 'eleven', '...'])
(['is', 'it', 'medicine', 'that', 'making', 'you', 'thirsty', '?'], ['is', 'it', 'the', 'medicine', 'that', "'s", 'making', 'you', 'thirsty', '?'])
(['i', 'thought', 'you', 'said', 'you', 'did', "n't", 'have', 'a', 'church', 'wedding', '.'], ['i', 'thought', 'you', 'said', 'you', 'did', "n't", 'have', 'a', 'church', 'wedding', '.'])
(['will', 'never', 'told', 'you', 'that', '?', 'probably', 'just', 'as'

(['yes', '?'], ['yes', '?'])
(['angela', '?'], ['angela', '?'])
(['well', '.', 'it', "'s", 'good', 'see', 'you', '.'], ['well', '.', 'it', "'s", 'good', 'to', 'see', 'you', '.'])
(['been', 'on', 'crop', '.', 'may', 'need', 'to', 'get', 'some', 'extra', 'hands', 'if', 'i', 'do', "n't", 'want', 'work', 'on', 'sundays', '.'], ['been', 'on', 'the', 'crop', '.', 'may', 'need', 'to', 'get', 'some', 'extra', 'hands', 'if', 'i', 'do', "n't", 'want', 'to', 'work', 'on', 'sundays', '.'])
(['i', 'standing', 'most', 'of', 'my', 'day', '.', 'they', "'re", 'for', 'support', '.', 'did', "n't", 'see', 'you', 'in', 'church', 'this', 'mornin', "'", '.'], ['i', "'m", 'standing', 'most', 'of', 'my', 'day', '.', 'they', "'re", 'for', 'support', '.', 'did', "n't", 'see', 'you', 'in', 'church', 'this', 'mornin', "'", '.'])
(['you', 'hurt', 'it', '?'], ['you', 'hurt', 'it', '?'])
(['i', "'m", 'sorry', '.', 'it', 'these', 'soft', 'shoes', 'i', 'wear', 'for', 'my', 'back', '.'], ['i', "'m", 'sorry', '.', 'it', 

(['i', 'was', 'just', 'about', 'to', 'have', 'my', 'dinner', '.'], ['i', 'was', 'just', 'about', 'to', 'have', 'my', 'dinner', '.'])
(['yeah', '.'], ['yeah', '.'])
(['garbage', 'man', '?'], ['garbage', 'man', '?'])
(['industrial', 'refuse', '.'], ['industrial', 'refuse', '.'])
(['what', 'was', 'the', 'cover', 'on', 'one', 'that', 'got', 'holden', '?'], ['what', 'was', 'the', 'cover', 'on', 'the', 'one', 'that', 'got', 'holden', '?'])
(['where', 'do', 'we', 'start', '?'], ['where', 'do', 'we', 'start', '?'])
(['we', "'re", 'using', 'esper', '--', 'a', '231', '--', 'that', 'picked', 'up', 'holden', 'alarm', '.', 'its', 'guess', 'is', 'that', 'all', 'five', 'are', 'in', 'city', '.'], ['we', "'re", 'using', 'esper', '--', 'a', '231', '--', 'that', 'picked', 'up', 'holden', "'s", 'alarm', '.', 'its', 'guess', 'is', 'that', 'all', 'five', 'are', 'in', 'the', 'city', '.'])
(['you', 'got', 'a', 'machine', 'on', 'it', 'yet', '?'], ['you', 'got', 'a', 'machine', 'on', 'it', 'yet', '?'])
(['what'

(['for', 'good', '?', 'i', 'ca', "n't", 'mom', '.', 'not', 'right', 'in', 'the', 'middle', 'of', 'the', 'term', '.'], ['for', 'good', '?', 'i', 'ca', "n't", 'mom', '.', 'not', 'right', 'in', 'the', 'middle', 'of', 'the', 'term', '.'])
(['i', "'ll", 'get', 'a', 'coat', 'for', 'her', '.'], ['i', "'ll", 'get', 'a', 'coat', 'for', 'her', '.'])
(['that', 'sounds', 'great', '!'], ['that', 'sounds', 'great', '!'])
(['here', 'you', 'are', '.', 'would', 'anyone', 'like', 'coffee', '?'], ['here', 'you', 'are', '.', 'would', 'anyone', 'like', 'coffee', '?'])
(['say', '``', 'goodnight', "''", 'sandy', '.'], ['say', '``', 'goodnight', "''", 'to', 'sandy', '.'])
(['mrs.', 'williams', '?', 'thanks', 'for', 'the', 'cake', '.'], ['mrs.', 'williams', '?', 'thanks', 'for', 'the', 'cake', '.'])
(['yeah', '.'], ['yeah', '.'])
(['he', 'comes', 'over', 'to', 'study', '.'], ['he', 'comes', 'over', 'to', 'study', '.'])
(['yeah', '.', 'okay', '.'], ['yeah', '.', 'okay', '.'])
(['it', "'s", 'a', 'real', 'good', 

(['you', 'really', 'think', 'he', "'ll", 'remember', 'you', '?'], ['you', 'really', 'think', 'he', "'ll", 'remember', 'you', '?'])
(['no', '.', 'i', 'was', 'calm', '.'], ['no', '.', 'i', 'was', 'calm', '.'])
(['do', "n't", 'count', 'it', '.'], ['do', "n't", 'count', 'it', '.'])
(['how', 'much', 'is', 'there', '?'], ['how', 'much', 'is', 'there', '?'])
(['did', 'we', 'get', 'it', '?'], ['did', 'we', 'get', 'it', '?'])
(['that', "'s", 'good', '.', 'keep', 'it', 'at', 'forty', '.'], ['that', "'s", 'good', '.', 'keep', 'it', 'at', 'forty', '.'])
(['this', 'is', 'natural', '.'], ['this', 'is', 'natural', '.'])
(['what', 'happened', '?'], ['what', 'happened', '?'])
(['i', "'m", 'putting', 'piece', 'of', 'tape', 'on', 'my', 'nose', '.'], ['i', "'m", 'putting', 'a', 'piece', 'of', 'tape', 'on', 'my', 'nose', '.'])
(['what', 'are', 'you', 'doing', '?'], ['what', 'are', 'you', 'doing', '?'])
(['that', "'s", 'too', 'bad', '.'], ['that', "'s", 'too', 'bad', '.'])
(['yeah', '.', 'one', 'of', 'nices

(['stop', '!', 'take', 'me', 'back', '!'], ['stop', '!', 'take', 'me', 'back', '!'])
(['we', 'have', 'get', 'you', 'sunnyvale', '.', 'you', "'re", 'having', 'one', 'of', 'your', 'famous', 'hemorrhages', '.'], ['we', 'have', 'to', 'get', 'you', 'to', 'sunnyvale', '.', 'you', "'re", 'having', 'one', 'of', 'your', 'famous', 'hemorrhages', '.'])
(['where', 'are', 'we', 'going', '?'], ['where', 'are', 'we', 'going', '?'])
(['i', "'m", 'over', 'it', '.', 'whenever', 'i', 'try', 'to', 'help', 'anyone', 'it', 'all', 'turns', 'shit', '.'], ['i', "'m", 'over', 'it', '.', 'whenever', 'i', 'try', 'to', 'help', 'anyone', 'it', 'all', 'turns', 'to', 'shit', '.'])
(['do', "n't", 'think', 'i', "'ll", 'need', 'it', 'where', 'i', "'m", 'going', '.'], ['do', "n't", 'think', 'i', "'ll", 'need', 'it', 'where', 'i', "'m", 'going', '.'])
(['sure', 'you', 'do', "n't", 'want', 'your', 'wallet', '?'], ['sure', 'you', 'do', "n't", 'want', 'your', 'wallet', '?'])
(['there', 'is', 'no', 'fuckin', "'", 'way', 'on',

(['two', '.'], ['two', '.'])
(['how', 'many', 'months', 'are', 'you', '?'], ['how', 'many', 'months', 'are', 'you', '?'])
(['candy', '.', 'candy', 'kendall', '.'], ['candy', '.', 'candy', 'kendall', '.'])
(['no', '.', 'it', 'ai', "n't", 'muddy', '.', 'muddy', "'s", 'just', '...'], ['no', '.', 'it', 'ai', "n't", 'muddy', '.', 'muddy', "'s", 'just', '...'])
(['why', '?'], ['why', '?'])
(['i', 'ca', "n't", 'go', 'nowhere', '.'], ['i', 'ca', "n't", 'go', 'nowhere', '.'])
(['i', 'know', 'where', 'you', 'can', 'go', '.', 'homer', 'and', 'i', 'can', 'take', 'you', '...'], ['i', 'know', 'where', 'you', 'can', 'go', '.', 'homer', 'and', 'i', 'can', 'take', 'you', '...'])
(['i', 'got', 'some', 'more', 'clothes', 'for', 'you', '--', 'i', 'just', 'keep', 'forgetting', 'to', 'bring', 'them', 'with', 'me', '.'], ['i', "'ve", 'got', 'some', 'more', 'clothes', 'for', 'you', '--', 'i', 'just', 'keep', 'forgetting', 'to', 'bring', 'them', 'with', 'me', '.'])
(['hi', '...'], ['hi', '...'])
(['hi', '...']

(['it', 'a', 'rule', '.'], ['it', "'s", 'a', 'rule', '.'])
(['i', 'make', 'it', 'worth', 'your', 'while', '.'], ['i', "'ll", 'make', 'it', 'worth', 'your', 'while', '.'])
(['i', 'do', "n't", 'get', 'involved', 'with', 'guns', '.'], ['i', 'do', "n't", 'get', 'involved', 'with', 'guns', '.'])
(['i', 'should', "n't", 'do', 'this', '.', 'but', 'i', 'got', 'client', 'lookin', "'", 'to', 'score', 'some', 'fire', 'power', '.', 'maybe', 'you', 'keep', 'your', 'eyes', 'open', 'for', 'me', '.'], ['i', 'should', "n't", 'do', 'this', '.', 'but', 'i', 'got', 'a', 'client', 'lookin', "'", 'to', 'score', 'some', 'fire', 'power', '.', 'maybe', 'you', "'ll", 'keep', 'your', 'eyes', 'open', 'for', 'me', '.'])
(['later', '.'], ['later', '.'])
(['i', 'give', 'you', 'seventy-five', 'bucks', 'for', 'it', '.'], ['i', "'ll", 'give', 'you', 'seventy-five', 'bucks', 'for', 'it', '.'])
(['i', 'ai', "n't", 'mad', 'at', 'you', '...'], ['i', 'ai', "n't", 'mad', 'at', 'you', '...'])
(['grand', '.'], ['a', 'grand', '

(['will', 'somebody', 'please', 'tell', 'those', 'chicks', 'disco', 'is', 'dead', '.'], ['will', 'somebody', 'please', 'tell', 'those', 'chicks', 'disco', 'is', 'dead', '.'])
(['sure', '.'], ['sure', '.'])
(['how', 'would', 'you', 'like', 'nice', 'hawaiian', 'punch', '?'], ['how', 'would', 'you', 'like', 'a', 'nice', 'hawaiian', 'punch', '?'])
(['okay', '.', 'ahem', '!', 'you', '.', 'are', '.', 'a.', 'dick', '.', 'with', '.', 'ears', '.', 'and', '.', 'a.', 'really', '.', 'bad', '.', 'haircut', '.'], ['okay', '.', 'ahem', '!', 'you', '.', 'are', '.', 'a.', 'dick', '.', 'with', '.', 'ears', '.', 'and', '.', 'a.', 'really', '.', 'bad', '.', 'haircut', '.'])
(['are', 'you', 'gettin', "'", 'wise', 'with', 'me', '?'], ['are', 'you', 'gettin', "'", 'wise', 'with', 'me', '?'])
(['and', 'if', 'it', 'ai', "n't", 'cleaned', 'off', '?'], ['and', 'if', 'it', 'ai', "n't", 'cleaned', 'off', '?'])
(['could', 'be', '.'], ['could', 'be', '.'])
(['i', 'ca', "n't", 'just', 'walk', 'in', 'and', 'take', 'my

(['is', 'mom', 'okay', '?'], ['is', 'mom', 'okay', '?'])
(['oh-you-beccha', '.'], ['oh-you-beccha', '.'])
(['yah', '?'], ['yah', '?'])
(['your', 'parents', "'d", 'kill', 'you', '.'], ['your', 'parents', "'d", 'kill', 'you', '.'])
(['amber', '?', 'here', '.'], ['amber', '?', 'here', '.'])
(['i', 'ca', "n't", 'believe', 'this', 'is', 'happenin', "'", '.', 'i', 'ca', "n't", 'believe', 'she', 'said', 'you', 'could', "n't", '...'], ['i', 'ca', "n't", 'believe', 'this', 'is', 'happenin', "'", '.', 'i', 'ca', "n't", 'believe', 'she', 'said', 'you', 'could', "n't", '...'])
(['huh', '?'], ['huh', '?'])
(['mrs.', 'leeman', '?'], ['mrs.', 'leeman', '?'])
(['i', 'lucky', 'i', 'have', 'after-school', 'job', 'where', 'i', 'can', 'practice', 'my', 'talent', '.'], ['i', "'m", 'lucky', 'i', 'have', 'an', 'after-school', 'job', 'where', 'i', 'can', 'practice', 'my', 'talent', '.'])
(['no', '.', 'loretta', 'called', '.', 'there', "'s", 'been', 'a', '...', 'a', 'fire', '.'], ['no', '.', 'loretta', 'called

(['do', "n't", 'you', 'feel', 'it', '?'], ['do', "n't", 'you', 'feel', 'it', '?'])
(['it', "'s", 'not', 'magic', '!', 'it', "'s", 'just', 'trick', '!'], ['it', "'s", 'not', 'magic', '!', 'it', "'s", 'just', 'a', 'trick', '!'])
(['i', 'know', '.', 'i', 'know', '.'], ['i', 'know', '.', 'i', 'know', '.'])
(['he', 'wants', 'to', 'kill', 'us', '.'], ['he', 'wants', 'to', 'kill', 'us', '.'])
(['i', 'know', '!'], ['i', 'know', '!'])
(['halfdan', 'black', 'here', '!'], ['halfdan', 'the', 'black', "'s", 'here', '!'])
(['you', 'mean', 'if', 'just', 'one', 'person', 'gets', 'killed', '?'], ['you', 'mean', 'if', 'just', 'one', 'person', 'gets', 'killed', '?'])
(['that', "'s", 'terrible', '!'], ['that', "'s", 'terrible', '!'])
(['you', 'do', "n't", 'go', 'through', 'all', 'hardships', 'of', 'ocean', 'voyage', 'make', '``', 'friends', "''", '.'], ['you', 'do', "n't", 'go', 'through', 'all', 'the', 'hardships', 'of', 'an', 'ocean', 'voyage', 'to', 'make', '``', 'friends', "''", '.'])
(['what', 'wrong

(['like', 'what', '?'], ['like', 'what', '?'])
(['how', 'does', 'she', 'like', 'what', 'in', 'bed', '?'], ['how', 'does', 'she', 'like', 'what', 'in', 'bed', '?'])
(['countess', '.', 'i', "'m", 'really', 'in', 'a', 'dither', '.', 'she', 'turned', 'my', 'head', '.'], ['countess', '.', 'i', "'m", 'really', 'in', 'a', 'dither', '.', 'she', "'s", 'turned', 'my', 'head', '.'])
(['extraordinary', '!'], ['extraordinary', '!'])
(['why', '?', 'why', 'do', 'you', 'have', 'go', '?'], ['why', '?', 'why', 'do', 'you', 'have', 'to', 'go', '?'])
(['but', 'i', 'do', "n't", 'want', 'you', 'to', 'leave', '.', 'please', 'stay', '.', 'we', 'can', 'go', 'to', 'dinner', '.', 'i', 'owe', 'you', 'dinner', '.', 'and', 'tonight', '...', 'we', 'can', '...', 'really', 'make', 'love', '.'], ['but', 'i', 'do', "n't", 'want', 'you', 'to', 'leave', '.', 'please', 'stay', '.', 'we', 'can', 'go', 'to', 'dinner', '.', 'i', 'owe', 'you', 'a', 'dinner', '.', 'and', 'tonight', '...', 'we', 'can', '...', 'really', 'make', '

(['what', '?', 'come', 'on', '.', 'she', "'s", 'got', 'a', 'good', 'thing', 'with', 'victor', '--'], ['what', '?', 'come', 'on', '.', 'she', "'s", 'got', 'a', 'good', 'thing', 'with', 'victor', '--'])
(['reed', '.', 'even', 'you', 'ca', "n't", 'compute', 'every', 'little', 'thing', '.'], ['reed', '.', 'even', 'you', 'ca', "n't", 'compute', 'every', 'little', 'thing', '.'])
(['i', 'do', "n't", 'know', '.', 'i', 'just', 'keep', 'going', 'over', 'and', 'over', 'the', 'numbers', '.'], ['i', 'do', "n't", 'know', '.', 'i', 'just', 'keep', 'going', 'over', 'and', 'over', 'the', 'numbers', '.'])
(['solid', '.'], ['solid', '.'])
(['three', 'days', '.', 'i', 'was', 'worried', 'about', 'you', '.', 'how', 'are', 'you', 'feeling', '?'], ['three', 'days', '.', 'i', 'was', 'worried', 'about', 'you', '.', 'how', 'are', 'you', 'feeling', '?'])
(['how', 'long', 'was', 'i', 'out', '?'], ['how', 'long', 'was', 'i', 'out', '?'])
(['ben', '.', 'this', 'is', 'serious', '.', 'turn', 'around', '.'], ['ben', '.

KeyboardInterrupt: 

In [37]:
for i in range(1,len(data)):
    print(data[i,:])

TypeError: '(1, slice(None, None, None))' is an invalid key

In [38]:
data[1][0]

KeyError: 1

In [74]:
len(data)

197615

In [4]:
import csv
from tqdm import tqdm 
import pandas as pd

In [111]:
with open('data.csv','w') as f:
    csvwriter = csv.writer(f)  
    csvwriter.writerow(['source','target'])  
    for row in tqdm(pairs, total=197615):
        
        src = " ".join(row[0])
        trg = " ".join(row[1])
#         print(src)
#         print(trg)
        csvwriter.writerow([src,trg])
        

100%|██████████| 197615/197615 [00:34<00:00, 5763.00it/s]


In [6]:
dataNew = pd.read_csv("data.csv", error_bad_lines=False)

In [7]:
len(dataNew)

197615

In [115]:
dataNew

Unnamed: 0,source,target
0,they do to !,they do to !
1,i hope so .,i hope so .
2,she okay ?,she okay ?
3,let 's go .,let 's go .
4,wow,wow
5,okay -- you 're gon na need to learn how to lie .,okay -- you 're gon na need to learn how to lie .
6,no,no
7,i 'm kidding . you know how sometimes you just...,i 'm kidding . you know how sometimes you just...
8,like my fear of wearing pastels ?,like my fear of wearing pastels ?
9,the `` real you '' .,the `` real you '' .


In [8]:
from sklearn.model_selection import train_test_split

#Creating a train and validation sentence
train, val = train_test_split(dataNew, test_size=0.1)

In [118]:
train.to_csv("train.csv", index=False)
val.to_csv("val.csv",index=False)

In [1]:
import random
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import spacy
from utils import translate_sentence, bleu, save_checkpoint, load_checkpoint
from torch.utils.tensorboard import SummaryWriter  # to print to tensorboard
# from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator, TabularDataset
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.enabled = True
import GPUtil

"""
To install spacy languages do:
python -m spacy download en
python -m spacy download de
"""
# spacy_ger = spacy.load("de")
spacy_eng = spacy.load("en")


# def tokenize_ger(text):
#     return [tok.text for tok in spacy_ger.tokenizer(text)]


def tokenize_eng(text):
    return [tok.text for tok in spacy_eng.tokenizer(text)]


# german = Field(tokenize=tokenize_ger, lower=True, init_token="<sos>", eos_token="<eos>")

english = Field(
    tokenize=tokenize_eng, lower=True, init_token="<sos>", eos_token="<eos>"
)

# train_data, valid_data, test_data = Multi30k.splits(
#     exts=(".de", ".en"), fields=(german, english)
# )


In [2]:
print("Start: tokenizing the train and validation set")
data_fields = [('source', english), ('target', english)]

train, val = TabularDataset.splits(path='./', train='train.csv', validation='val.csv', format='csv', fields=data_fields)
print("train ",len(train))
print("val ",len(val))
print("Done with creation of DataSet Object")


Start: tokenizing the train and validation set
train  177854
val  19763
Done with creation of DataSet Object


In [3]:
english.build_vocab(train, max_size=10000, min_freq=2)

In [4]:
print(len(english.vocab))

10004


In [5]:
    DROPOUT_TOKENS = ["a", "an", "the", "'ll", "'s", "'m", "'ve", "to"]  # Add "to"

    REPLACEMENTS = ["their", "there", "then", "than"]

In [6]:
CORRECTIVE_TOKENS = DROPOUT_TOKENS + REPLACEMENTS

# Creation of Model

## Encoder

In [7]:
class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, bidirectional=True)

        self.fc_hidden = nn.Linear(hidden_size * 2, hidden_size)
        self.fc_cell = nn.Linear(hidden_size * 2, hidden_size)
        self.dropout = nn.Dropout(p)

    def forward(self, x):
        # x: (seq_length, N) where N is batch size

        embedding = self.dropout(self.embedding(x))
        # embedding shape: (seq_length, N, embedding_size)

        encoder_states, (hidden, cell) = self.rnn(embedding)
        # outputs shape: (seq_length, N, hidden_size)

        # Use forward, backward cells and hidden through a linear layer
        # so that it can be input to the decoder which is not bidirectional
        # Also using index slicing ([idx:idx+1]) to keep the dimension
        hidden = self.fc_hidden(torch.cat((hidden[0:1], hidden[1:2]), dim=2))
        cell = self.fc_cell(torch.cat((cell[0:1], cell[1:2]), dim=2))
        del embedding
        torch.cuda.empty_cache()
        return encoder_states, hidden, cell

## Decoder

In [8]:

class Decoder(nn.Module):
    def __init__(
        self, input_size, embedding_size, hidden_size, output_size, num_layers, p
    ):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(hidden_size * 2 + embedding_size, hidden_size, num_layers)

        self.energy = nn.Linear(hidden_size * 3, 1)
        self.fc = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(p)
        self.softmax = nn.Softmax(dim=0)
        self.relu = nn.ReLU()

    def forward(self, x, encoder_states, hidden, cell):
        x = x.unsqueeze(0)
        # x: (1, N) where N is the batch size

        embedding = self.dropout(self.embedding(x))
        # embedding shape: (1, N, embedding_size)

        sequence_length = encoder_states.shape[0]
        h_reshaped = hidden.repeat(sequence_length, 1, 1)
        # h_reshaped: (seq_length, N, hidden_size*2)

        energy = self.relu(self.energy(torch.cat((h_reshaped, encoder_states), dim=2)))
        # energy: (seq_length, N, 1)

        attention = self.softmax(energy)
        # attention: (seq_length, N, 1)

        # attention: (seq_length, N, 1), snk
        # encoder_states: (seq_length, N, hidden_size*2), snl
        # we want context_vector: (1, N, hidden_size*2), i.e knl
        context_vector = torch.einsum("snk,snl->knl", attention, encoder_states)

        rnn_input = torch.cat((context_vector, embedding), dim=2)
        # rnn_input: (1, N, hidden_size*2 + embedding_size)

        outputs, (hidden, cell) = self.rnn(rnn_input, (hidden, cell))
        # outputs shape: (1, N, hidden_size)

        predictions = self.fc(outputs).squeeze(0)
        # predictions: (N, hidden_size)
        
        del x, embedding,energy, attention,outputs
        torch.cuda.empty_cache()
        return predictions, hidden, cell


## Seq2seq

In [9]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_force_ratio=0.5):
        batch_size = source.shape[1]
        target_len = target.shape[0]
        target_vocab_size = len(english.vocab)

        outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(device)
        encoder_states, hidden, cell = self.encoder(source)

        # First input will be <SOS> token
        x = target[0]

        for t in range(1, target_len):
            # At every time step use encoder_states and update hidden, cell
            output, hidden, cell = self.decoder(x, encoder_states, hidden, cell)

            # Store prediction for current time step
            outputs[t] = output

            # Get the best word the Decoder predicted (index in the vocabulary)
            best_guess = output.argmax(1)

            # With probability of teacher_force_ratio we take the actual next word
            # otherwise we take the word that the Decoder predicted it to be.
            # Teacher Forcing is used so that the model gets used to seeing
            # similar inputs at training and testing time, if teacher forcing is 1
            # then inputs at test time might be completely different than what the
            # network is used to. This was a long comment.
            x = target[t] if random.random() < teacher_force_ratio else best_guess
        del x, encoder_states, hidden, cell,target_vocab_size
        torch.cuda.empty_cache()
        return outputs


## Model Initialisation

In [10]:
torch.cuda.is_available()

True

In [11]:
### We're ready to define everything we need for training our Seq2Seq model ###
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
device2 = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")
load_model = False
save_model = True

# Training hyperparameters
num_epochs = 100
learning_rate = 3e-4
batch_size = 32

# Model hyperparameters
input_size_encoder = len(english.vocab)
input_size_decoder = len(english.vocab)
output_size = len(english.vocab)
encoder_embedding_size = 300
decoder_embedding_size = 300
hidden_size = 1024
num_layers = 1
enc_dropout = 0.0
dec_dropout = 0.0

# Tensorboard to get nice loss plot
writer = SummaryWriter("runs/loss_plot")
step = 0
GPUtil.showUtilization()


| ID | GPU | MEM |
------------------
|  0 |  7% |  4% |
|  1 |  0% |  0% |
|  2 |  0% |  0% |
|  3 |  0% |  0% |


In [12]:
train_iterator = BucketIterator(
    train,
    batch_size=batch_size,
    sort_within_batch=True,
    sort_key=lambda x: len(x.source),
    device=device2
)
GPUtil.showUtilization()

| ID | GPU | MEM |
------------------
|  0 |  5% |  4% |
|  1 |  0% |  0% |
|  2 |  0% |  0% |
|  3 |  0% |  0% |


## creation of model, encoder and decoder

In [13]:
val[1].target

['wait', '.', 'i', 'think', 'we', 'understand', 'each', 'other', '.']

In [14]:
encoder_net = Encoder(
    input_size_encoder, encoder_embedding_size, hidden_size, num_layers, enc_dropout
).to(device)
GPUtil.showUtilization()
decoder_net = Decoder(
    input_size_decoder,
    decoder_embedding_size,
    hidden_size,
    output_size,
    num_layers,
    dec_dropout,
).to(device2)

model = Seq2Seq(encoder_net, decoder_net).to(device)
model = torch.nn.DataParallel(model, device_ids=[1,2]).to(device)
GPUtil.showUtilization()
# model = model.module()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
model = model.module
pad_idx = english.vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

if load_model:
    load_checkpoint(torch.load("my_checkpoint.pth.tar"), model, optimizer)

sentence = (
    "I don't want there food"
    "the Cardinals did better then the Cubs in the offseason"
)
GPUtil.showUtilization()

| ID | GPU | MEM |
------------------
|  0 |  5% |  4% |
|  1 |  8% |  9% |
|  2 |  0% |  0% |
|  3 |  0% |  0% |
| ID | GPU | MEM |
------------------
|  0 |  5% |  4% |
|  1 |  1% | 11% |
|  2 | 17% |  9% |
|  3 |  0% |  0% |
| ID | GPU | MEM |
------------------
|  0 |  6% |  4% |
|  1 |  0% | 11% |
|  2 |  0% |  9% |
|  3 |  0% |  0% |


In [15]:
for epoch in range(num_epochs):
    print("[Epoch {} / {}]".format(epoch,num_epochs))

    if save_model:
        checkpoint = {
            "state_dict": model.state_dict(),
            "optimizer": optimizer.state_dict(),
        }
        save_checkpoint(checkpoint)

    model.eval()

    translated_sentence = translate_sentence(
        model, sentence, english, english, device, max_length=50
    )

    print("Translated example sentence: \n {}".format(translated_sentence))

    model.train()
    GPUtil.showUtilization()

    for batch_idx, batch in enumerate(train_iterator):
        # Get input and targets and get to cuda
        inp_data = batch.source.to(device)
        target = batch.target.to(device)

        # Forward prop
        output = model(inp_data, target)
#         GPUtil.showUtilization()
        
        # Output is of shape (trg_len, batch_size, output_dim) but Cross Entropy Loss
        # doesn't take input in that form. For example if we have MNIST we want to have
        # output to be: (N, 10) and targets just (N). Here we can view it in a similar
        # way that we have output_words * batch_size that we want to send in into
        # our cost function, so we need to do some reshapin. While we're at it
        # Let's also remove the start token while we're at it
        output = output[1:].reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        optimizer.zero_grad()
#         GPUtil.showUtilization()
        loss = criterion(output, target)
        
        del output, inp_data, target
        torch.cuda.empty_cache()
        for i in range(10000000):
            continue
        # Back prop
        loss.mean().backward()

        # Clip to avoid exploding gradient issues, makes sure grads are
        # within a healthy range
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        # Gradient descent step
        optimizer.step()

        # Plot to tensorboard
        writer.add_scalar("Training loss", loss, global_step=step)
        step += 1
        del loss
        torch.cuda.empty_cache()
        for i in range(10000000):
            continue
# running on entire test data takes a while
score = bleu(val[1:100], model, english, english, device)
print("Bleu score {}".format(score * 100))


[Epoch 0 / 100]
=> Saving checkpoint
Translated example sentence: 
 ['local', 'fame', 'solve', 'undoubtedly', 'accomplice', 'ears', 'destroys', 'herbert', 'functional', 'scientific', 'scanner', 'selected', 'ties', 'laugh', 'foods', 'faced', 'increase', 'interest', 'script', 'spice', 'crashing', 'sue', 'horn', 'pig', 'found', 'eastern', 'halo', 'cramp', 'cramp', 'cramp', 'cramp', 'cramp', 'slippers', 'touched', 'de', 'parade', 'habits', 'sheets', 'offer', 'joints', 'samuel', 'band', 'dislike', 'copy', 'seem', 'herbert', 'agnes', 'chalk', 'cape', 'iii']
| ID | GPU | MEM |
------------------
|  0 |  2% | 11% |
|  1 | 45% | 10% |
|  2 |  0% |  7% |
|  3 |  0% |  0% |


RuntimeError: CUDA out of memory. Tried to allocate 56.00 MiB (GPU 1; 7.93 GiB total capacity; 7.16 GiB already allocated; 38.56 MiB free; 146.20 MiB cached)