In [None]:
import torch
from torch import nn
import torch.optim as optim

import re
import random
import numpy as np
from collections import Counter

In [None]:
raw_text = """
If I'm a bad person, you don't like me
Well, I guess I'll make my own way
It's a circle, a mean cycle
I can't excite you anymore
Where's your gavel? Your jury?
What's my offense this time?
You're not a judge, but if you're gonna judge me
Well, sentence me to another life
Don't wanna hear your sad songs
I don't wanna feel your pain
When you swear it's all my fault
'Cause you know we're not the same
We're not the same
Oh, we're not the same
Yeah, the friends who stuck together
We wrote our names in blood
But I guess you can't accept that the change is good
It's good, it's good
Well, you treat me just like another stranger
Well, it's nice to meet you, sir
I guess I'll go
I'd best be on my way out
You treat me just like another stranger
Well, it's nice to meet you, sir
I guess I'll go
I'd best be on my way out
Ignorance is your new best friend
Ignorance is your new best friend
This is the best thing that could have happened
Any longer and I wouldn't have made it
It's not a war, no, it's not a rapture
I'm just a person, but you can't take it
The same tricks that, that once fooled me
They won't get you anywhere
I'm not the same kid from your memory
Well, now I can fend for myself
Don't wanna hear your sad songs
I don't wanna feel your pain
When you swear it's all my fault
'Cause you know we're not the same
We're not the same
Oh, we're not the same
Yeah, the friends who stuck together
We wrote our names in blood
But I guess you can't accept that the change is good
It's good, it's good
Well, you treat me just like another stranger
Well, it's nice to meet you, sir
I guess I'll go
I'd best be on my way out
You treat me just like another stranger
Well, it's nice to meet you, sir
I guess I'll go
I'd best be on my way out
Ignorance is your new best friend
Ignorance is your new best friend
Ignorance is your new best friend
Ignorance is your new best friend
Well, you treat me just like another stranger
Well, it's nice to meet you, sir
I guess I'll go
I'd best be on my way out
You treat me just like another stranger
Well, it's nice to meet you, sir
I guess I'll go
I'd best be on my way out

"""

In [None]:
def preprocess(text, min_count=2):
    """Lower-case ``text``, tokenize its punctuation and drop rare words.

    Each punctuation mark listed below is replaced by a space-padded
    placeholder such as ``<COMMA>`` so that it survives whitespace splitting
    as a token of its own. Apostrophes are not touched, so contractions like
    ``don't`` stay single words. Newlines get no token; ``str.split`` treats
    them as ordinary whitespace.

    :param text: Raw input string.
    :param min_count: Minimum number of occurrences a token needs in ``text``
        to be kept. The default of 2 drops tokens that appear only once.
    :return: List of the remaining tokens, in their original order.
    """
    # Placeholders are inserted after lower-casing, so they stay upper-case.
    punctuation_tokens = (
        ('.', ' <PERIOD> '),
        (',', ' <COMMA> '),
        ('"', ' <QUOTATION_MARK> '),
        (';', ' <SEMICOLON> '),
        ('!', ' <EXCLAMATION_MARK> '),
        ('?', ' <QUESTION_MARK> '),
        ('(', ' <LEFT_PAREN> '),
        (')', ' <RIGHT_PAREN> '),
        ('--', ' <HYPHENS> '),
        (':', ' <COLON> '),
    )

    text = text.lower()
    for mark, token in punctuation_tokens:
        text = text.replace(mark, token)
    words = text.split()

    # Remove every token that occurs fewer than `min_count` times
    word_counts = Counter(words)
    trimmed_words = [word for word in words if word_counts[word] >= min_count]

    return trimmed_words

In [None]:
# Tokenize the raw corpus with preprocess() and preview the first 30 tokens
words = preprocess(raw_text)
print(words[:30])

['if', "i'm", 'a', 'person', '<COMMA>', 'you', "don't", 'like', 'me', 'well', '<COMMA>', 'i', 'guess', "i'll", 'my', 'way', "it's", 'a', '<COMMA>', 'a', 'i', "can't", 'you', 'your', '<QUESTION_MARK>', 'your', '<QUESTION_MARK>', 'my', 'this', '<QUESTION_MARK>']


In [None]:
# Corpus statistics after preprocessing: total token count and vocabulary size
print("Total words in text: {}".format(len(words)))
print("Unique words: {}".format(len(set(words)))) # `set` removes any duplicate words

Total words in text: 420
Unique words: 75


In [None]:
def create_lookup_tables(words):
    """
    Build the two vocabulary mappings for a tokenized corpus.

    Ids are handed out by descending frequency, so id 0 belongs to the most
    common word. Words with equal counts keep their order of first appearance.

    :param words: Input list of words
    :return: A tuple ``(vocab_to_int, int_to_vocab)``: the first dict maps
             each word to its integer id, the second maps each id back to
             its word.
    """
    frequency = Counter(words)
    # stable sort on the count alone, most frequent first
    ranked = sorted(frequency.items(), key=lambda pair: pair[1], reverse=True)

    int_to_vocab = {}
    vocab_to_int = {}
    for index, (token, _count) in enumerate(ranked):
        int_to_vocab[index] = token
        vocab_to_int[token] = index

    return vocab_to_int, int_to_vocab

In [None]:
# Build the word <-> id mappings, then encode the corpus as a list of integer ids
vocab_to_int, int_to_vocab = create_lookup_tables(words)
int_words = [vocab_to_int[word] for word in words]

# preview the first 30 encoded tokens
print(int_words[:30])

[43, 41, 14, 44, 0, 1, 36, 15, 8, 7, 0, 3, 11, 16, 9, 17, 2, 14, 0, 14, 3, 38, 1, 4, 42, 4, 42, 9, 45, 42]


In [None]:
# Subsampling of frequent words: every occurrence of word w is dropped with
# probability p_drop[w] = 1 - sqrt(threshold / freq(w)).
# NOTE(review): the stats cell above reports only a few hundred tokens, so every
# relative frequency is far larger than 1e-5 and nearly all tokens get dropped
# (the preview printed below this cell is very short) -- confirm this threshold
# is intended for a corpus this small.
threshold = 1e-5
word_counts = Counter(int_words)

total_count = len(int_words)
# relative frequency of each word id in the encoded corpus
freqs = {word: count/total_count for word, count in word_counts.items()}
# per-word drop probability from the subsampling equation
p_drop = {word: 1 - np.sqrt(threshold/freqs[word]) for word in word_counts}

# discard some frequent words, according to the subsampling equation
# create a new list of words for training
# (each occurrence is kept with probability 1 - p_drop; no random seed is set
# in the visible cells, so the result changes from run to run)
train_words = [word for word in int_words if random.random() < (1 - p_drop[word])]

print(train_words[:30])

[55, 62, 11, 51, 59, 62, 63, 17, 11, 9, 20]


In [None]:
def get_target(words, idx, window_size=5):
    """Return the context words surrounding position ``idx``.

    A radius is drawn uniformly from ``1..window_size`` on every call. Up to
    that many words before ``idx`` and up to that many after it are returned
    (fewer near either end of ``words``); the word at ``idx`` is excluded.
    """
    radius = np.random.randint(1, window_size + 1)

    # never reach back past the beginning of the sequence
    left_edge = max(idx - radius, 0)
    before = words[left_edge:idx]
    after = words[idx + 1:idx + radius + 1]

    return list(before + after)

In [None]:
# Sanity check for get_target.
# Re-run this cell a few times: the window radius is random, so the printed
# targets should vary between runs.
int_text = list(range(10))
print('Input: ', int_text)

idx = 5  # word index of interest
target = get_target(int_text, idx=idx, window_size=2)
print('Target: ', target)  # you should get some indices around the idx

Input:  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
Target:  [4, 6]


In [None]:
def get_batches(words, batch_size, window_size=5):
    """Yield ``(inputs, targets)`` training pairs, one tuple per full batch.

    ``words`` is cut into consecutive chunks of ``batch_size`` items; a
    trailing partial chunk is discarded. For every word in a chunk its context
    is sampled with ``get_target`` from that chunk only, and the word is
    repeated once per context word so both lists have equal length.
    """
    full_batches = len(words) // batch_size
    usable = words[:full_batches * batch_size]  # drop the incomplete tail

    for offset in range(0, len(usable), batch_size):
        chunk = usable[offset:offset + batch_size]
        inputs, targets = [], []
        for position, center in enumerate(chunk):
            context = get_target(chunk, position, window_size)
            targets.extend(context)
            # one copy of the center word per sampled context word
            inputs.extend([center] * len(context))
        yield inputs, targets

In [None]:
# Pull the first batch from a toy sequence to inspect the (inputs, targets) pairs
int_text = list(range(20))
x, y = next(get_batches(int_text, batch_size=4, window_size=5))

print('x\n', x)
print('y\n', y)

x
 [0, 0, 1, 1, 2, 2, 3]
y
 [1, 2, 0, 2, 1, 3, 2]


In [None]:
def cosine_similarity(embedding, valid_size=16, valid_window=100, device='cpu'):
    """ Returns the cosine similarity of validation words with words in the embedding matrix.
        Here, embedding should be a PyTorch embedding module.

        Validation ids are sampled in two halves of ``valid_size // 2`` each:
        one from the lowest ids ``[0, valid_window)`` and one from
        ``[1000, 1000 + valid_window)``. (The notebook assigns lower ids to
        more frequent words.) Both ranges are clamped to the number of rows in
        ``embedding.weight``; when the vocabulary is too small for the second
        range, the last ``valid_window`` ids are used instead, so no sampled
        id is ever out of range.

        :param embedding: module exposing ``weight`` (n_vocab, n_embed) and
            callable on a LongTensor of ids.
        :param valid_size: total number of validation words to sample.
        :param valid_window: width of each id range to sample from.
        :param device: device the sampled id tensor is moved to.
        :return: ``(valid_examples, similarities)`` where ``valid_examples`` is
            a LongTensor of sampled ids and ``similarities`` has one row per
            sampled id and one column per vocabulary word.
    """

    # sim = (a . b) / |a||b|

    embed_vectors = embedding.weight
    n_vocab = embed_vectors.shape[0]
    half = valid_size // 2

    # magnitude of every embedding vector, |b|, shaped (1, n_vocab)
    magnitudes = embed_vectors.pow(2).sum(dim=1).sqrt().unsqueeze(0)

    # id ranges to sample from, clamped so they never exceed the vocabulary
    low_ids = range(min(valid_window, n_vocab))
    high_start = min(1000, max(n_vocab - valid_window, 0))
    high_ids = range(high_start, min(high_start + valid_window, n_vocab))

    # never ask random.sample for more ids than a range contains
    sampled = random.sample(low_ids, min(half, len(low_ids)))
    sampled += random.sample(high_ids, min(half, len(high_ids)))
    valid_examples = torch.tensor(sampled, dtype=torch.long, device=device)

    valid_vectors = embedding(valid_examples)
    # magnitude of each validation vector, |a|, shaped (n_valid, 1)
    valid_magnitudes = valid_vectors.pow(2).sum(dim=1).sqrt().unsqueeze(1)

    # normalise by both magnitudes so the values are true cosine similarities
    similarities = torch.mm(valid_vectors, embed_vectors.t()) / (valid_magnitudes * magnitudes)

    return valid_examples, similarities

In [None]:
class SkipGramNeg(nn.Module):
    """Skip-gram model with separate input and output embedding tables,
    intended for training with negative sampling."""

    def __init__(self, n_vocab, n_embed, noise_dist=None):
        """
        :param n_vocab: number of rows in each embedding table (vocabulary size).
        :param n_embed: embedding dimension.
        :param noise_dist: optional 1-D tensor of sampling weights over the
            vocabulary used to draw noise words; uniform when ``None``.
        """
        super().__init__()

        self.n_vocab = n_vocab
        self.n_embed = n_embed
        self.noise_dist = noise_dist

        # define embedding layers for input and output words
        self.in_embed = nn.Embedding(n_vocab, n_embed)
        self.out_embed = nn.Embedding(n_vocab, n_embed)

        # Initialize both embedding tables with uniform distribution
        self.in_embed.weight.data.uniform_(-1, 1)
        self.out_embed.weight.data.uniform_(-1, 1)

    def forward_input(self, input_words):
        """Return the input-table embeddings for a tensor of word ids."""
        return self.in_embed(input_words)

    def forward_output(self, output_words):
        """Return the output-table embeddings for a tensor of word ids."""
        return self.out_embed(output_words)

    def forward_noise(self, batch_size, n_samples):
        """ Generate noise vectors with shape (batch_size, n_samples, n_embed)"""
        if self.noise_dist is None:
            # Sample words uniformly
            noise_dist = torch.ones(self.n_vocab)
        else:
            noise_dist = self.noise_dist

        # Sample words from our noise distribution
        noise_words = torch.multinomial(noise_dist,
                                        batch_size * n_samples,
                                        replacement=True)

        # Move the sampled ids to wherever this module's own output table
        # lives. Reading the device from `self` (not from a global model
        # variable) keeps the class usable on its own and for any instance.
        noise_words = noise_words.to(self.out_embed.weight.device)

        # look up the noise embeddings in the output table and reshape them
        # so that they have dims (batch_size, n_samples, n_embed)
        noise_vectors = self.out_embed(noise_words)
        noise_vectors = noise_vectors.view(batch_size, n_samples, self.n_embed)

        return noise_vectors

In [None]:
class NegativeSamplingLoss(nn.Module):
    """Negative-sampling loss for skip-gram training.

    For each (input, output) pair the loss is
    ``-[log sigmoid(out . in) + sum_k log sigmoid(-noise_k . in)]``,
    averaged over the batch.
    """

    def __init__(self):
        super().__init__()

    def forward(self, input_vectors, output_vectors, noise_vectors):
        """
        :param input_vectors: tensor of shape (batch_size, embed_size).
        :param output_vectors: tensor viewable as (batch_size, 1, embed_size).
        :param noise_vectors: tensor of shape (batch_size, n_samples, embed_size).
        :return: scalar tensor, the average loss over the batch.
        """

        batch_size, embed_size = input_vectors.shape

        # Input vectors should be a batch of column vectors
        input_vectors = input_vectors.view(batch_size, embed_size, 1)

        # Output vectors should be a batch of row vectors
        output_vectors = output_vectors.view(batch_size, 1, embed_size)

        # bmm = batch matrix multiplication
        # correct log-sigmoid loss; logsigmoid stays finite where
        # sigmoid(...).log() would underflow to -inf for large negative scores
        out_loss = nn.functional.logsigmoid(torch.bmm(output_vectors, input_vectors))
        # (batch_size, 1, 1) -> (batch_size,); explicit so batch_size == 1 keeps its dim
        out_loss = out_loss.reshape(batch_size)

        # incorrect log-sigmoid loss
        noise_loss = nn.functional.logsigmoid(torch.bmm(noise_vectors.neg(), input_vectors))
        # drop only the trailing singleton dim, then sum over the noise samples;
        # a bare squeeze() would also drop a batch or sample dim of size 1
        noise_loss = noise_loss.squeeze(2).sum(1)

        # negate and sum correct and noisy log-sigmoid losses
        # return average batch loss
        return -(out_loss + noise_loss).mean()

In [None]:
# Display the relative frequency of every word id (the `freqs` dict built from `word_counts` above)
freqs

In [None]:
# Train on the GPU when one is available, otherwise on the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Get our noise distribution
# Using word frequencies calculated earlier in the notebook
# Sorting the frequencies in descending order lines position i up with word id i,
# because create_lookup_tables assigns ids from most to least frequent.
word_freqs = np.array(sorted(freqs.values(), reverse=True))
unigram_dist = word_freqs/word_freqs.sum()
# unigram distribution raised to the 3/4 power and renormalised to sum to 1
noise_dist = torch.from_numpy(unigram_dist**(0.75)/np.sum(unigram_dist**(0.75)))

# instantiating the model
embedding_dim = 300
model = SkipGramNeg(len(vocab_to_int), embedding_dim, noise_dist=noise_dist).to(device)

# using the loss that we defined
criterion = NegativeSamplingLoss()
optimizer = optim.Adam(model.parameters(), lr=0.003)

# report the loss every `print_every` optimisation steps
print_every = 10
steps = 0
epochs = 50

# train for some number of epochs
for e in range(epochs):

    # get our input, target batches
    # (second argument is the batch size: 2 words per batch)
    for input_words, target_words in get_batches(train_words, 2):
        steps += 1
        inputs, targets = torch.LongTensor(input_words), torch.LongTensor(target_words)
        inputs, targets = inputs.to(device), targets.to(device)



        # input, output, and noise vectors
        input_vectors = model.forward_input(inputs)
        output_vectors = model.forward_output(targets)
        # 5 noise samples are drawn per input word
        noise_vectors = model.forward_noise(inputs.shape[0], 5)


        # negative sampling loss
        loss = criterion(input_vectors, output_vectors, noise_vectors)


        # standard optimisation step: clear old gradients, backpropagate, update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # loss stats
        if steps % print_every == 0:
            print("Epoch: {}/{}".format(e+1, epochs))
            print("Loss: ", loss.item()) # avg batch loss at this point in training
            # NOTE(review): the nearest-neighbour validation below is disabled.
            # cosine_similarity samples ids from range(1000, 1000+valid_window), which
            # would be out of range for a vocabulary smaller than that -- presumably
            # why it is commented out; confirm before re-enabling.
            # valid_examples, valid_similarities = cosine_similarity(model.in_embed, device=device, valid_size=2)
            # _, closest_idxs = valid_similarities.topk(6)

            # valid_examples, closest_idxs = valid_examples.to('cpu'), closest_idxs.to('cpu')
            # for ii, valid_idx in enumerate(valid_examples):
            #     closest_words = [int_to_vocab[idx.item()] for idx in closest_idxs[ii]][1:]
            #     print(int_to_vocab[valid_idx.item()] + " | " + ', '.join(closest_words))
            print("...\n")

Epoch: 2/50
Loss:  19.8654727935791
...

Epoch: 4/50
Loss:  11.68275260925293
...

Epoch: 6/50
Loss:  6.8022141456604
...

Epoch: 8/50
Loss:  9.818048477172852
...

Epoch: 10/50
Loss:  11.939051628112793
...

Epoch: 12/50
Loss:  14.844532012939453
...

Epoch: 14/50
Loss:  8.164490699768066
...

Epoch: 16/50
Loss:  3.639408826828003
...

Epoch: 18/50
Loss:  5.670575141906738
...

Epoch: 20/50
Loss:  1.9219176769256592
...

Epoch: 22/50
Loss:  1.4990862607955933
...

Epoch: 24/50
Loss:  0.49791812896728516
...

Epoch: 26/50
Loss:  1.2909852266311646
...

Epoch: 28/50
Loss:  1.5197029113769531
...

Epoch: 30/50
Loss:  1.282673716545105
...

Epoch: 32/50
Loss:  1.9757285118103027
...

Epoch: 34/50
Loss:  1.1976169347763062
...

Epoch: 36/50
Loss:  3.9572222232818604
...

Epoch: 38/50
Loss:  4.885851860046387
...

Epoch: 40/50
Loss:  0.3559829592704773
...

Epoch: 42/50
Loss:  5.517121315002441
...

Epoch: 44/50
Loss:  1.7190409898757935
...

Epoch: 46/50
Loss:  5.399352550506592
...

Epoch