In [249]:
import tensorflow as tf


# If you are going to use a GPU, make sure the GPU is listed in the output below.
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [250]:
import os
import pickle
# In this IMDB review dataset, each entry is one sentence whose tokens are separated by spaces.
# NOTE(security): pickle.load can execute arbitrary code while deserializing;
# only load this file if it comes from a trusted source.
# The `with` block guarantees the file handle is closed once loading finishes.
with open(os.path.join('./Archive/Archive/a2-data', 'imdb_review.pickle'), 'rb') as review_file:
    sentences = pickle.load(review_file)
print(sentences[:10])

['watching time chasers it obvious that it was made by a bunch of friends', 'maybe they were sitting around one day in film school and said hey let s pool our money together and make a really bad movie or something like that', 'what ever they said they still ended up making a really bad movie dull story bad script lame acting poor cinematography bottom of the barrel stock music etc', 'all corners were cut except the one that would have prevented this film s release', 'life s like that', 'i saw this film about years ago and remember it as being particularly nasty', 'i believe it is based on a true incident a young man breaks into a nurses home and rapes tortures and kills various women it is in black and white but saves the colour for one shocking shot at the end the film seems to be trying to make some political statement but it just comes across as confused and obscene avoid', 'minor spoilersin new york joan barnard elvire audrey is informed that her husband the archeologist arthur ba

# Data Processing

## Tokenization

In [251]:
import operator
class Tokenization:
    """Whitespace tokenizer that maps tokens to integer ids by frequency rank.

    Ids 0, 1 and 2 are reserved for the special tokens "sos", "eos" and "unk".
    Every other token receives an id starting at 3, assigned in order of
    decreasing occurrence count (ties keep first-seen order).

    Sentences are split with ``str.split(" ")``, so consecutive spaces produce
    empty-string tokens that are counted like any other token.
    """

    def __init__(self):
        """Register the three special tokens with zero occurrences."""
        # token -> number of times it was seen by fit()
        self.token_occurences = {"sos":0,"eos":0,"unk":0}
        # token -> integer id
        self.token_map = {"sos":0,"eos":1,"unk":2}
        # integer id -> token
        self.index_map = {0:"sos",1:"eos",2:"unk"}

    def fit(self,data):
        """Count tokens in ``data`` and (re)assign ids by descending frequency.

        Args:
            data: iterable of sentences (strings).
        """
        counts = self.token_occurences
        for sentence in data:
            for word in sentence.split(" "):
                counts[word] = counts.get(word, 0) + 1

        # sorted() is stable, so equally frequent tokens stay in insertion order.
        ranked = sorted(counts.items(), key=operator.itemgetter(1), reverse=True)

        next_id = 3
        for word, _ in ranked:
            # The special tokens keep their reserved ids 0..2.
            if word in ("sos", "eos", "unk"):
                continue
            self.token_map[word] = next_id
            self.index_map[next_id] = word
            next_id += 1

    def encode(self,data):
        """Convert sentences to lists of token ids.

        Tokens missing from the vocabulary are mapped to the "unk" id.
        Sentences that yield two or fewer tokens are dropped from the result.

        Args:
            data: iterable of sentences (strings).

        Returns:
            A list with one list of token ids per retained sentence.
        """
        vocab = self.token_map
        encoded = []
        for sentence in data:
            ids = [vocab[word if word in vocab else "unk"] for word in sentence.split(" ")]
            if len(ids) > 2:
                encoded.append(ids)
        return encoded
                    
            

In [252]:
# Create the tokenizer; until fit() runs it only knows the special tokens sos/eos/unk.
tokenization = Tokenization()

In [253]:
# Count token occurrences over the corpus and assign ids in descending-frequency order.
tokenization.fit(sentences)

In [254]:
# tokenization.token_map

In [255]:
# Encode each sentence as a list of token ids; encode() drops sentences with two or fewer tokens.
sent_token_ids = tokenization.encode(sentences)

In [256]:
# Rank (token, count) pairs from most to least frequent and report how many
# distinct tokens were counted (the special tokens are included in that number).
token_ocur = list(tokenization.token_occurences.items())
token_ocur.sort(key=lambda pair: pair[1], reverse=True)
print(len(token_ocur))
# print(token_ocur[:10])

102297


## Training Data Generation

In [270]:
import math as math
def positive_samples(sent_tokens_ids, window_size=2):
    """Build (center, context) skip-gram pairs from encoded sentences.

    For every position in every sentence, each token within ``window_size``
    positions to the left or right (clipped at the sentence boundaries) is
    paired with the token at that position.

    Args:
        sent_tokens_ids: iterable of sentences, each a sequence of token ids.
        window_size: number of neighbours taken on each side of the center
            word. Defaults to 2.

    Returns:
        A tuple ``(positive_pairs, count)``. ``positive_pairs`` is a list of
        ``[center_id, context_id]`` lists. ``count`` is never incremented and
        is always 0; it is kept so existing callers that unpack two values
        continue to work.
    """
    count = 0
    positive_pairs = []
    # Iterate over the argument, not a same-looking global: the data passed in
    # by the caller is what must be used.
    for token_ids in sent_tokens_ids:
        length = len(token_ids)
        for i, center_word in enumerate(token_ids):
            start_idx = max(0, i - window_size)
            end_idx = min(length, i + window_size + 1)
            positive_pairs.extend(
                [center_word, token_ids[j]]
                for j in range(start_idx, end_idx)
                if j != i
            )
    return positive_pairs, count


In [271]:
# Inspect one encoded sentence as a quick sanity check.
sent_token_ids[3]

[32, 9814, 70, 590, 540, 3, 31, 13, 61, 29, 9364, 12, 21, 14, 773]

In [272]:
# Build the [center, context] training pairs from the encoded sentences.
positive_pairs,count = positive_samples(sent_token_ids)

In [274]:
import numpy as np
def get_negative_samples(vocab_size: int, batch_size: int, negative_sample_num: int,
                         num_special_tokens: int = 3) -> np.ndarray:
    """Uniformly sample negative word ids for a batch of center words.

    Args:
        vocab_size: number of tokens in the vocabulary.
        batch_size: number of samples (center words) in a batch.
        negative_sample_num: number of negative words sampled per center word.
        num_special_tokens: how many leading ids are reserved for special
            tokens and must never be sampled. Defaults to 3, matching the
            sos/eos/unk ids 0..2 assigned by this notebook's Tokenization.

    Returns:
        negative_words: integer array of shape
            (batch_size x negative_sample_num) with values in
            [num_special_tokens, vocab_size), drawn with replacement.

    Raises:
        ValueError: if no non-special token id is available to sample from.
    """
    if vocab_size <= num_special_tokens:
        raise ValueError(
            "vocab_size must be greater than num_special_tokens to sample negatives"
        )
    # Only ids at or above num_special_tokens are real vocabulary words.
    candidate_ids = np.arange(num_special_tokens, vocab_size)
    return np.random.choice(candidate_ids, size=(batch_size, negative_sample_num), replace=True)

def generate_training_data(positive_pairs,vocab_size,batch_size,negative_sample_num,negative_sampler=None):
    """Endlessly yield training batches for the skip-gram model.

    Each pass over ``positive_pairs`` yields full batches of ``batch_size``
    pairs followed by one smaller batch if pairs are left over. When the
    number of pairs is an exact multiple of ``batch_size`` no trailing batch
    is produced, so an empty batch is never yielded. After a pass finishes,
    iteration restarts from the first pair.

    Args:
        positive_pairs: re-iterable collection of pairs, where ``pair[0]`` is
            the center word id and ``pair[1]`` the context word id.
        vocab_size: vocabulary size forwarded to the negative sampler.
        batch_size: maximum number of pairs per batch; must be positive.
        negative_sample_num: number of negative words per center word.
        negative_sampler: optional callable
            ``(vocab_size, batch_size, negative_sample_num) -> negatives``.
            Defaults to ``get_negative_samples``.

    Yields:
        ``(center_batch, context_batch, negative_words)`` where the first two
        are lists of ids and the third is whatever the sampler returns for
        the actual number of pairs in the batch.

    Raises:
        ValueError: if ``batch_size`` is not positive, or if a full pass over
            ``positive_pairs`` produces no batch (both cases would otherwise
            loop forever without yielding).
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    if negative_sampler is None:
        negative_sampler = get_negative_samples

    while True:
        center_batch, context_batch = [], []
        yielded_this_pass = False
        for pair in positive_pairs:
            center_batch.append(pair[0])
            context_batch.append(pair[1])
            if len(center_batch) == batch_size:
                negative_words = negative_sampler(vocab_size, batch_size, negative_sample_num)
                yield center_batch, context_batch, negative_words
                yielded_this_pass = True
                center_batch, context_batch = [], []
        # Flush the remainder only when there is one; an empty batch would
        # make the mean-based loss undefined.
        if center_batch:
            negative_words = negative_sampler(vocab_size, len(center_batch), negative_sample_num)
            yield center_batch, context_batch, negative_words
            yielded_this_pass = True
        if not yielded_this_pass:
            raise ValueError("positive_pairs is empty; no training batches can be produced")
    

In [275]:
from tensorflow.keras import Model
from tensorflow.keras.losses import Loss
from tensorflow.keras.layers import Embedding
        
def negative_sampling_loss(center_embeddings, context_embeddings, negative_embeddings):
    """Calculate the skip-gram negative sampling loss, averaged over the batch.

    Per sample the loss is
        -log(sigmoid(u_o . v_c)) - sum_k log(sigmoid(-u_k . v_c))
    i.e. the negative term is SUMMED over the sampled negative words, so each
    negative carries the same weight as the positive pair regardless of how
    many negatives are drawn. The per-sample losses are then averaged.

    Args:
        center_embeddings: v_c, (batch_size x embedding_dim)
        context_embeddings: u_o, (batch_size x embedding_dim)
        negative_embeddings: u_k, (batch_size x negative_sample_num x embedding_dim)

    Returns:
        Scalar tensor with the mean loss over the batch.
    """
    # 1. Positive dot product u_o . v_c, one score per sample.
    positive_dot_product = tf.reduce_sum(center_embeddings * context_embeddings, axis=-1)

    # 2. Log-likelihood of the positive pairs.
    positive_log_likelihood = tf.math.log_sigmoid(positive_dot_product)

    # 3. Negative dot products u_k . v_c; the center embedding gets an extra
    #    axis so it broadcasts against every negative word of its sample.
    negative_dot_product = tf.reduce_sum(
        tf.expand_dims(center_embeddings, axis=1) * negative_embeddings, axis=-1
    )

    # 4. Log-likelihood of the negative words, summed over the negatives of
    #    each sample (not averaged, which would scale the term by 1/K).
    negative_log_likelihood = tf.reduce_sum(tf.math.log_sigmoid(-negative_dot_product), axis=-1)

    # Average the per-sample loss over the batch.
    return -tf.reduce_mean(positive_log_likelihood + negative_log_likelihood)


class SkipGram(Model):
    """Skip-gram model with separate center-word and context-word embedding tables."""

    def __init__(self, vocab_size: int, embedding_dim: int):
        """Create the two embedding layers.

        Args:
            vocab_size: number of rows in each embedding table.
            embedding_dim: length of every embedding vector.
        """
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim

        # Center words and context/negative words use independent tables.
        self.center_embeddings = Embedding(vocab_size, embedding_dim, name='center_embedding')
        self.context_embeddings = Embedding(vocab_size, embedding_dim, name='context_embedding')

    def call(self, center_words, context_words, negative_words):
        """Look up the embeddings needed by negative_sampling_loss.

        Args:
            center_words: tensor (batch_size, )
            context_words: tensor (batch_size, )
            negative_words: tensor (batch_size, negative_sample_num)

        Return:
            Tuple (center_embeddings, context_embeddings, negative_embeddings).
            Context and negative words are both looked up in the context table.
        """
        context_table = self.context_embeddings
        return (
            self.center_embeddings(center_words),
            context_table(context_words),
            context_table(negative_words),
        )

In [278]:
n_batch = int(np.ceil(len(positive_pairs) / batch_size))
embedding_dim = 64


In [279]:
model = SkipGram(vocab_size, embedding_dim)
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

In [211]:
vocab_size = len(tokenization.token_occurences)
batch_size = 1024
negative_sample_num = 5
train_gen = generate_training_data(positive_pairs,vocab_size,batch_size,negative_sample_num)
for i in range(0,1):
    for j in range(0,n_batch):
        batch = next(train_gen)
        batch = [tf.convert_to_tensor(d, tf.int64) for d in batch]
        with tf.GradientTape() as tape: #operations performed within this context are recorded as tape
            output = model(*batch)
            loss = negative_sampling_loss(*output)
        trainable_vars = model.trainable_variables #variables who is going to get optimized (weights and biases)
        gradients = tape.gradient(loss, trainable_vars) #calculate the gradients
        optimizer.apply_gradients(zip(gradients, trainable_vars))
        epoch_loss += loss * real_batch_size
        trainable_vars = model.trainable_variables #variables who is going to get optimized (weights and biases)
        gradients = tape.gradient(loss, trainable_vars) #calculate the gradients
        # Update weights
        optimizer.apply_gradients(zip(gradients, trainable_vars))
        epoch_loss += loss * len(batch)

NameError: name 'model' is not defined