In [48]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [49]:
# Report whether PyTorch's MPS (Metal) backend can be used at runtime and
# whether this PyTorch build was compiled with MPS support, one per line.
print(torch.backends.mps.is_available(), torch.backends.mps.is_built(), sep="\n")

True
True


In [50]:
class Bengio_et_al_word2vec(nn.Module):
    """Feed-forward next-word predictor over concatenated context embeddings.

    The input is the flattened embeddings of the ``n - 1`` preceding words
    (``(n - 1) * embedding_dim`` features). It passes through one ReLU hidden
    layer and is projected to ``vocab_size`` unnormalised scores (logits).

    :param n: n-gram order; ``n - 1`` context words are used as input.
    :param embedding_dim: Width of a single word embedding.
    :param hidden_dim: Number of units in the hidden layer.
    :param vocab_size: Number of output classes (one logit per word).
    """

    def __init__(self, n, embedding_dim, hidden_dim, vocab_size):
        super().__init__()
        context_features = (n - 1) * embedding_dim
        layers = [
            nn.Linear(context_features, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, vocab_size),
        ]
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Return the vocabulary logits for the flattened context ``x``."""
        return self.linear_relu_stack(x)

In [51]:
import spacy
nlp = spacy.load('en_core_web_md')

In [52]:
# Maps punctuation characters to placeholder tokens; an empty-string value
# presumably means "drop this character" -- confirm against the intended consumer.
# NOTE(review): no active code visible in this notebook reads this dict
# (clean_text removes punctuation outright), so verify it is still needed.
# NOTE(review): "..." is listed ahead of "." -- presumably so an in-order
# replacement would consume an ellipsis before single periods; confirm.
punctuation_dict = {
    "...": "||elipsis||",
    ".": "||period||",
    ",": "||comma||",
    "\"": "||quotation||",
    ";": "||semicolon||",
    "!": "||exclamation||",
    "?": "||question||",
    "(": "||lparathensis||",
    ")": "||rparathensis||",
    "[": "||lsquare||",
    "]": "||rsquare||",
    "-": "||dash||",
    "'": "",
    "\n": "",
    "$": "||dollar||"
}

In [53]:
def clean_text(text):
    """Normalise raw text into lowercase words separated by single spaces.

    Steps: lowercase, delete every ASCII punctuation character (apostrophes
    included, so "don't" becomes "dont"), collapse every run of whitespace
    (spaces, newlines, tabs, carriage returns) into one space, and strip
    leading/trailing whitespace.

    Stripping matters downstream: text that starts or ends with a space
    produces empty-string tokens when it is later split on " ".

    :param text: Raw input string.
    :return: The cleaned string; "" if nothing but punctuation/whitespace.
    """
    text = text.lower()
    # A single translate pass removes all punctuation, replacing one
    # str.replace call per punctuation character.
    text = text.translate(str.maketrans('', '', string.punctuation))
    # \s+ also covers tabs and "\r", which a spaces-only pattern would miss.
    return re.sub(r'\s+', ' ', text).strip()

In [54]:
import os
import re
import string

def read_script(filename, encoding="utf-8"):
    """Read one script file and return its text normalised by ``clean_text``.

    :param filename: Path of the text file to read.
    :param encoding: Text encoding used to decode the file. Defaults to UTF-8
        so the result does not depend on the platform's default encoding.
    :return: The cleaned file contents as a single string.
    """
    with open(filename, 'r', encoding=encoding) as f:
        raw_text = f.read()
    return clean_text(raw_text)

# Load and clean every entry of the data/ directory, visiting file names in
# sorted order so the resulting list is the same on every run.
scripts = list(map(read_script,
                   (os.path.join("data", name) for name in sorted(os.listdir("data/")))))

In [55]:
contents = ' '.join(scripts)

In [56]:
# split() with no separator splits on any run of whitespace and discards
# leading/trailing whitespace, so no empty-string tokens are produced.
# split(" ") would emit '' for every doubled or edge space in the corpus.
tokens = contents.split()

In [57]:
import numpy as np
from collections import Counter

def create_lookup_tables(text):
    """
    Build word<->id lookup tables, with ids assigned by descending frequency.

    The most frequent word gets id 0; words with equal counts keep the order
    in which they first appear in ``text``.

    :param text: The text of tv scripts split into words
    :return: A tuple of dicts (vocab_to_int, int_to_vocab)
    """
    frequency = Counter(text)
    # most_common() returns (word, count) pairs sorted by count, highest first.
    ranked_words = [word for word, _count in frequency.most_common()]

    vocab_to_int = {}
    int_to_vocab = {}
    for word_id, word in enumerate(ranked_words):
        int_to_vocab[word_id] = word
        vocab_to_int[word] = word_id

    return (vocab_to_int, int_to_vocab)

In [58]:
vocab_to_int, int_to_vocab = create_lookup_tables(tokens)

In [59]:
contents



In [60]:
tokens

['mr',
 'soprano',
 'yeah',
 'have',
 'a',
 'seat',
 'my',
 'understanding',
 'from',
 'dr',
 'cusamano',
 'your',
 'physician',
 'is',
 'that',
 'you',
 'collapsed',
 'possibly',
 'a',
 'panic',
 'attack',
 'you',
 'were',
 'unable',
 'to',
 'breathe',
 'they',
 'said',
 'it',
 'was',
 'a',
 'panic',
 'attack',
 'the',
 'blood',
 'and',
 'neurological',
 'work',
 'came',
 'back',
 'negative',
 'and',
 'they',
 'sent',
 'me',
 'here',
 'you',
 'dont',
 'agree',
 'that',
 'you',
 'had',
 'a',
 'panic',
 'attack',
 'how',
 'are',
 'you',
 'feeling',
 'now',
 'good',
 'fine',
 'back',
 'at',
 'work',
 'what',
 'line',
 'of',
 'work',
 'are',
 'you',
 'in',
 'waste',
 'management',
 'consultant',
 'its',
 'impossible',
 'for',
 'me',
 'to',
 'talk',
 'to',
 'a',
 'psychiatrist',
 'any',
 'thoughts',
 'at',
 'all',
 'on',
 'why',
 'you',
 'blacked',
 'out',
 'i',
 'dont',
 'know',
 'stress',
 'maybe',
 'about',
 'what',
 'i',
 'dont',
 'know',
 'the',
 'morning',
 'i',
 'got',
 'sick',
 'id

In [61]:
vocab_to_int

{'you': 0,
 'the': 1,
 'i': 2,
 'a': 3,
 'to': 4,
 'it': 5,
 'that': 6,
 'and': 7,
 'what': 8,
 'of': 9,
 'in': 10,
 'me': 11,
 'my': 12,
 'this': 13,
 'your': 14,
 'is': 15,
 'he': 16,
 'on': 17,
 'for': 18,
 'im': 19,
 'was': 20,
 'its': 21,
 'know': 22,
 'dont': 23,
 'with': 24,
 'all': 25,
 'do': 26,
 'we': 27,
 'not': 28,
 'about': 29,
 'no': 30,
 'have': 31,
 'be': 32,
 'fucking': 33,
 'him': 34,
 'are': 35,
 'get': 36,
 'so': 37,
 'here': 38,
 'right': 39,
 'got': 40,
 'like': 41,
 'up': 42,
 'out': 43,
 'but': 44,
 'youre': 45,
 'at': 46,
 'go': 47,
 'just': 48,
 'yeah': 49,
 'his': 50,
 'thats': 51,
 'they': 52,
 'hes': 53,
 'how': 54,
 'were': 55,
 'fuck': 56,
 'there': 57,
 'come': 58,
 'oh': 59,
 'gonna': 60,
 'want': 61,
 'tony': 62,
 'good': 63,
 'she': 64,
 'if': 65,
 'her': 66,
 'now': 67,
 'can': 68,
 'well': 69,
 'think': 70,
 'did': 71,
 'some': 72,
 'from': 73,
 'one': 74,
 'see': 75,
 'ill': 76,
 'when': 77,
 'say': 78,
 'had': 79,
 'shit': 80,
 'back': 81,
 'take'

In [62]:
int_to_vocab

{0: 'you',
 1: 'the',
 2: 'i',
 3: 'a',
 4: 'to',
 5: 'it',
 6: 'that',
 7: 'and',
 8: 'what',
 9: 'of',
 10: 'in',
 11: 'me',
 12: 'my',
 13: 'this',
 14: 'your',
 15: 'is',
 16: 'he',
 17: 'on',
 18: 'for',
 19: 'im',
 20: 'was',
 21: 'its',
 22: 'know',
 23: 'dont',
 24: 'with',
 25: 'all',
 26: 'do',
 27: 'we',
 28: 'not',
 29: 'about',
 30: 'no',
 31: 'have',
 32: 'be',
 33: 'fucking',
 34: 'him',
 35: 'are',
 36: 'get',
 37: 'so',
 38: 'here',
 39: 'right',
 40: 'got',
 41: 'like',
 42: 'up',
 43: 'out',
 44: 'but',
 45: 'youre',
 46: 'at',
 47: 'go',
 48: 'just',
 49: 'yeah',
 50: 'his',
 51: 'thats',
 52: 'they',
 53: 'hes',
 54: 'how',
 55: 'were',
 56: 'fuck',
 57: 'there',
 58: 'come',
 59: 'oh',
 60: 'gonna',
 61: 'want',
 62: 'tony',
 63: 'good',
 64: 'she',
 65: 'if',
 66: 'her',
 67: 'now',
 68: 'can',
 69: 'well',
 70: 'think',
 71: 'did',
 72: 'some',
 73: 'from',
 74: 'one',
 75: 'see',
 76: 'ill',
 77: 'when',
 78: 'say',
 79: 'had',
 80: 'shit',
 81: 'back',
 82: 't

In [63]:
# Only each token's word vector is read from these docs afterwards. Per spaCy's
# documentation, Doc.vector defaults to the average of the token vectors, which
# come from the model's static vector table -- so tokenising with make_doc is
# enough, and the full pipeline (with a hard-coded 6 worker processes) is not
# needed for ~400k one-word texts.
docs = [nlp.make_doc(token) for token in tokens]

In [64]:
# Pack the per-token vectors into one ndarray first: torch.tensor() on a Python
# list of ndarrays converts element by element and is very slow at this size.
# from_numpy then wraps the packed array without a second copy.
token_embeddings = torch.from_numpy(np.array([doc.vector for doc in docs]))

In [65]:
token_embeddings.shape

torch.Size([418232, 300])

In [66]:
tokens[:12]

['mr',
 'soprano',
 'yeah',
 'have',
 'a',
 'seat',
 'my',
 'understanding',
 'from',
 'dr',
 'cusamano',
 'your']

In [67]:
token_embeddings[:12,]

tensor([[ -0.6751,   0.2653,  -3.1446,  ...,   1.1676,  -0.6565,  -0.4082],
        [  2.0068,   0.1829,   0.8957,  ...,   1.9985,  -2.3157,   2.5255],
        [  0.8682,  -0.3262,  -0.3838,  ...,   1.9856,  -0.2028,   0.8627],
        ...,
        [  1.4872,   0.3268,  -1.2697,  ...,  -2.6396,  -1.8952,  -0.4490],
        [  0.0000,   0.0000,   0.0000,  ...,   0.0000,   0.0000,   0.0000],
        [  3.9709,   4.8703, -10.0390,  ...,  -1.7916, -10.0240,   0.5141]])

In [68]:
def create_dataset_for_Bengio(window_size, tokens, token_embeddings, vocab_to_int):
    """Build (context, target) training pairs for the n-gram model.

    For every position ``i >= window_size - 1`` the input row is the flattened
    embeddings of the ``window_size - 1`` tokens preceding position ``i``, and
    the label is the vocabulary id of ``tokens[i]``.

    :param window_size: n-gram order (context words + 1 target word).
    :param tokens: Sequence of word strings.
    :param token_embeddings: Tensor whose row ``i`` embeds ``tokens[i]``.
    :param vocab_to_int: Mapping from word to integer id.
    :return: ``(X, y)`` -- stacked context vectors and a tensor of target ids.
    """
    context_len = window_size - 1
    target_positions = range(context_len, len(tokens))
    # Rows [pos - context_len, pos) are the words immediately before the target.
    contexts = [token_embeddings[pos - context_len:pos,].flatten()
                for pos in target_positions]
    targets = [vocab_to_int[tokens[pos]] for pos in target_positions]
    return torch.stack(contexts), torch.tensor(targets)

In [69]:
# The literal 4 is the window size (3 context words + 1 target word). It must
# stay equal to the `window_size` assigned in a later cell and passed to the
# model as `n`, otherwise the model's input width will not match X.
X, y = create_dataset_for_Bengio(4, tokens, token_embeddings, vocab_to_int)

In [70]:
X.shape

torch.Size([418229, 900])

In [71]:
y.shape

torch.Size([418229])

In [72]:
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset
from torch.utils.data.dataloader import DataLoader

def split_dataset(x_data,
                  y_data,
                  train_size = 0.8,
                  batch_size = 64,
                  seed = 42):
    """Randomly split the samples into train/test sets and wrap each in a DataLoader.

    Sample indices are shuffled once by ``train_test_split`` (seeded with
    ``seed``); the loaders themselves iterate in that fixed order.

    :param x_data: Tensor of inputs, indexed along its first dimension.
    :param y_data: Tensor of targets aligned with ``x_data``.
    :param train_size: Fraction of samples assigned to the training set.
    :param batch_size: Batch size used by both loaders.
    :param seed: Random state for the index split.
    :return: ``(train_loader, test_loader)``.
    """
    sample_ids = range(len(y_data))
    train_ids, test_ids = train_test_split(sample_ids,
                                           test_size=(1-train_size),
                                           shuffle=True,
                                           random_state=seed)

    def _as_loader(ids):
        # Select the rows for this partition and serve them in fixed order.
        subset = TensorDataset(x_data[ids], y_data[ids])
        return DataLoader(dataset=subset,
                          batch_size=batch_size,
                          shuffle=False)

    return _as_loader(train_ids), _as_loader(test_ids)

In [73]:
train_loader, test_loader = split_dataset(X, y, 0.8, 64, 42)

In [74]:
# Choose the compute backend for training: CUDA if present, otherwise MPS,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

Using mps device


In [75]:
window_size = 4

In [76]:
# Instantiate the network: the input is (window_size - 1) embeddings of width
# token_embeddings.shape[1], the hidden layer has 500 units, and there is one
# output logit per vocabulary entry. The model is moved to the selected device.
model = Bengio_et_al_word2vec(n=window_size,
                              embedding_dim=token_embeddings.shape[1],
                              hidden_dim=500,
                              vocab_size=len(vocab_to_int)).to(device)
print(model)

Bengio_et_al_word2vec(
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=900, out_features=500, bias=True)
    (1): ReLU()
    (2): Linear(in_features=500, out_features=18866, bias=True)
  )
)


In [77]:
# Cross-entropy over the model's raw logits against integer word ids.
loss_fn = nn.CrossEntropyLoss()
# Plain SGD over all model parameters with learning rate 1e-3.
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

In [78]:
def train(dataloader, model, loss_fn, optimizer):
    """Run one optimisation pass over ``dataloader`` and log the loss periodically.

    Batches are moved to the device the model's parameters currently live on,
    rather than to a notebook-level ``device`` variable. That global can be out
    of date (for example after the model has been moved with ``model.to(...)``),
    which would cause a device-mismatch error.

    :param dataloader: Iterable of ``(inputs, targets)`` batches; its
        ``dataset`` length is used for progress reporting.
    :param model: Network to train; it is put into training mode.
    :param loss_fn: Callable ``loss_fn(pred, target)`` returning a scalar tensor.
    :param optimizer: Optimizer already bound to ``model``'s parameters.
    """
    size = len(dataloader.dataset)
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(run_device), y.to(run_device)

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Report every 100th batch (including the first).
        if batch % 100 == 0:
            loss_value, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss_value:>7f}  [{current:>5d}/{size:>5d}]")

In [79]:
def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [80]:
# Number of full passes over the training data. The earlier form,
# range(0, 100000, 25000), also ran exactly 4 passes but labelled them
# "Epoch 0", "Epoch 25000", ... which misreported how much training happened.
epochs = 4
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_loader, model, loss_fn, optimizer)
    test(test_loader, model, loss_fn)
print("Done!")

Epoch 0
-------------------------------
loss: 10.392384  [   64/334583]
loss: 9.322482  [ 6464/334583]
loss: 8.998567  [12864/334583]
loss: 9.140135  [19264/334583]
loss: 8.750331  [25664/334583]
loss: 8.706537  [32064/334583]
loss: 7.567351  [38464/334583]
loss: 8.267360  [44864/334583]
loss: 7.303654  [51264/334583]
loss: 8.006909  [57664/334583]
loss: 7.269784  [64064/334583]
loss: 7.692860  [70464/334583]
loss: 7.757287  [76864/334583]
loss: 7.396632  [83264/334583]
loss: 7.278845  [89664/334583]
loss: 7.220703  [96064/334583]
loss: 7.777912  [102464/334583]
loss: 7.378602  [108864/334583]
loss: 7.162069  [115264/334583]
loss: 7.098548  [121664/334583]
loss: 7.231112  [128064/334583]
loss: 7.956825  [134464/334583]
loss: 6.498912  [140864/334583]
loss: 6.515769  [147264/334583]
loss: 6.445539  [153664/334583]
loss: 7.591350  [160064/334583]
loss: 7.169521  [166464/334583]
loss: 6.367056  [172864/334583]
loss: 7.609105  [179264/334583]
loss: 6.814319  [185664/334583]
loss: 7.013813 

In [84]:
# Scratch check of np.random.choice: draw one value uniformly from 0..9.
# (The dangling `p=` keyword with no value was a SyntaxError.)
np.random.choice(list(range(10)))

5

In [117]:
import random 
def generate_text(input, model, steps, window_size, T=1):
    """Sample ``steps`` further words from ``model`` to continue ``input``.

    The prompt is normalised with ``clean_text`` and embedded with the
    notebook's spaCy pipeline ``nlp``; its last ``window_size - 1`` word
    vectors form the initial context. Each step samples a word id from the
    temperature-scaled distribution, maps it through ``int_to_vocab``, prints
    it, and slides the context forward by one word.

    Inference runs on whatever device the model already lives on. The model is
    not moved, because ``nn.Module.to`` works in place and would leave the
    caller's model on a different device than its training code expects.
    The model is left in eval mode.

    :param input: Raw prompt text.
    :param model: Network mapping a flattened context vector to vocabulary logits.
    :param steps: Number of words to generate.
    :param window_size: n-gram order; the context holds ``window_size - 1`` words.
    :param T: Sampling temperature (must be > 0); lower values sharpen the
        distribution, higher values flatten it.
    :return: The cleaned prompt followed by the generated words, space-separated.
    :raises ValueError: If ``T`` is not positive.
    """
    if T <= 0:
        raise ValueError("T (temperature) must be positive")

    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    text = clean_text(input)
    print(f"input: {text}")
    text_tokens = text.split(' ')

    # Left-pad short prompts with empty tokens for the context only, so the
    # padding never appears in the returned text.
    context_len = window_size - 1
    missing = context_len - len(text_tokens)
    context_tokens = ([''] * missing + text_tokens) if missing > 0 else text_tokens

    # Pack the vectors into one ndarray before building the tensor (avoids the
    # slow list-of-ndarrays conversion path).
    token_embeddings = torch.tensor(
        np.array([doc.vector for doc in nlp.pipe(context_tokens)]))
    emb_size = token_embeddings.shape[1]
    input_vec = token_embeddings[token_embeddings.shape[0] - context_len:].flatten().to(device)

    model.eval()
    with torch.no_grad():
        for _ in range(steps):
            logits = model(input_vec)
            # softmax(logits / T) equals softmax(logits) ** (1 / T) renormalised,
            # but avoids underflow from raising tiny probabilities to a power.
            prob = torch.softmax(logits / T, dim=0).cpu()
            word = int_to_vocab[torch.multinomial(prob, 1).item()]
            print(f"next word: {word}")
            text_tokens.append(word)
            # Drop the oldest word's embedding and append the new word's.
            new_word_embedding = torch.tensor(nlp(word).vector).to(device)
            input_vec = torch.cat((input_vec[emb_size:], new_word_embedding), 0)
    return ' '.join(text_tokens)

In [128]:
generate_text("tony the meat has", model=model, steps=50, window_size=window_size, T=0.5)

input: tony the meat has
next word: to
next word: be
next word: a
next word: fucking
next word: how
next word: are
next word: you
next word: gonna
next word: do
next word: this
next word: is
next word: it
next word: to
next word: do
next word: something
next word: i
next word: dont
next word: know
next word: what
next word: i
next word: dont
next word: want
next word: to
next word: talk
next word: to
next word: the
next word: fucking
next word: fucking
next word: its
next word: not
next word: a
next word: fucking
next word: i
next word: dont
next word: know
next word: what
next word: do
next word: you
next word: got
next word: a
next word: couple
next word: of
next word: the
next word: therapy
next word: he
next word: never
next word: got
next word: a
next word: lot
next word: of


'tony the meat has to be a fucking how are you gonna do this is it to do something i dont know what i dont want to talk to the fucking fucking its not a fucking i dont know what do you got a couple of the therapy he never got a lot of'