In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import Counter
import string
import random
from typing import List, Tuple
import math

import log
import mynlputils as nu

In [2]:
logger = log.get_logger(__name__)

In [3]:
def load_data(raw_txt_train_path: str, raw_txt_test_path: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
    df_train = pd.read_csv(raw_txt_train_path, header=None, names=["label", "title", "description"])
    df_test = pd.read_csv(raw_txt_test_path, header=None, names=["label", "title", "description"])
    return df_train[["label", "description"]], df_test[["label", "description"]]


def create_validation_set(corpus: pd.DataFrame, valid_size: float) -> Tuple[pd.DataFrame, pd.DataFrame]:
    train_corpus, valid_corpus = train_test_split(corpus, test_size=valid_size, random_state=1)
    return train_corpus.reset_index(drop=True), valid_corpus.reset_index(drop=True)


def clean_text(docs: pd.DataFrame) -> pd.DataFrame:
    clean_docs = docs['description']
    clean_docs = clean_docs.str.replace("-", " ")
    clean_docs = clean_docs.str.replace("quot;", " ")
    clean_docs = clean_docs.str.replace("#39;s", "'")
    translation_table = str.maketrans('', '', string.punctuation)
    clean_docs = clean_docs.str.translate(translation_table)
    clean_docs = clean_docs.str.lower()
    clean_docs = clean_docs.str.replace(r'\d+', ' <NUM> ')
    clean_docs = clean_docs.str.replace(r'\s+', ' ')
    return clean_docs.to_frame()


def split_docs(docs: pd.DataFrame) -> pd.DataFrame:
    return docs['description'].str.split().to_list()


def tokenize(tokens: List[List[str]], min_freq: int = 5):
    word_freq = Counter([word for sentence in tokens for word in sentence])
    vocab = [word for word, freq in word_freq.items() if freq >= min_freq]
    vocab = ['<PAD>', '<UNK>'] + vocab
    word2idx = {word: idx for idx, word in enumerate(vocab)}
    idx_tokens = [[word2idx.get(word, 1) for word in sentence] for sentence in tokens]
    return vocab, idx_tokens, word2idx


def create_skipgrams(corpus, window_size, pad_idx):
    data = []
    for sentence in corpus:
        padded_sentence = [pad_idx] * window_size + sentence + [pad_idx] * window_size
        for word_index in range(window_size, len(padded_sentence) - window_size):
            contexts = padded_sentence[word_index - window_size : word_index] + padded_sentence[word_index + 1 : word_index + window_size + 1]
            data.append((contexts, padded_sentence[word_index]))
    return data


# def data_generator(skipgrams, word2idx, num_neg_samples=5):
#     words_list = list(word2idx.keys())
#     vocab_size = len(words_list)

#     for contexts, target in skipgrams:
#         negatives = [random.choice(range(vocab_size)) for _ in range(num_neg_samples)]
#         yield torch.LongTensor(contexts), torch.LongTensor([target]), torch.LongTensor(negatives)


def data_generator(skipgrams, word2idx, pad_idx, batch_size=32, num_neg_samples=5):
    words_list = list(word2idx.keys())
    vocab_size = len(words_list)
    n = len(skipgrams)

    # Shuffle skipgrams
    random.shuffle(skipgrams)

    for batch_start in range(0, n, batch_size):
        context_batch = []
        target_batch = []
        negative_batch = []
        
        # Create batches
        for contexts, target in skipgrams[batch_start:batch_start + batch_size]:
            negatives = [random.choice(range(vocab_size)) for _ in range(num_neg_samples)]
            context_batch.append(torch.LongTensor(contexts))
            target_batch.append(torch.LongTensor([target]))
            negative_batch.append(torch.LongTensor(negatives))

        # Pad context sequences in batch
        context_batch = pad_sequence(context_batch, batch_first=True, padding_value=pad_idx)

        # Convert target and negative batches to tensors
        target_batch = torch.stack(target_batch)
        negative_batch = torch.stack(negative_batch)
        
        yield context_batch, target_batch, negative_batch

# class CBOW_NS(nn.Module):
#     def __init__(self, vocab_size, embed_size):
#         super(CBOW_NS, self).__init__()
#         self.embeddings = nn.Embedding(vocab_size, embed_size)

#     def forward(self, context_words, target_word, negative_words):
#         # Get embeddings for context words
#         context_embeds = self.embeddings(context_words) # shape: (batch_size, window_size * 2, embed_size)
        
#         # Get embeddings for target word
#         target_embeds = self.embeddings(target_word) # shape: (batch_size, 1, embed_size)
#         target_embeds = torch.transpose(target_embeds, 1, 2)  # shape: (batch_size, embed_size, 1)
        
#         # Get embeddings for negative samples
#         negative_embeds = self.embeddings(negative_words).unsqueeze(3) # shape: (batch_size, num_neg_samples, embed_size, 1)
#         negative_embeds = negative_embeds.permute(0, 2, 1, 3).squeeze(-1) # shape: (batch_size, embed_size, num_neg_samples)

#         # Compute positive score
#         pos_score = torch.bmm(context_embeds, target_embeds) # shape: (batch_size, window_size * 2, 1)
#         pos_score = F.logsigmoid(pos_score).sum(1) # Sum scores across context words for each target word in the batch

#         # Compute negative score
#         neg_score = torch.bmm(context_embeds.unsqueeze(2), negative_embeds) # shape: (batch_size, window_size * 2, num_neg_samples)
#         neg_score = F.logsigmoid(-neg_score).sum(1) # Sum scores across context words and negative samples for each target word in the batch

#         # Return negative of total score
#         return -(pos_score + neg_score).mean() # Average across the batch


# def train(model, epochs, data_generator, lr=0.001):
#     optimizer = torch.optim.Adam(model.parameters(), lr=lr)
#     for epoch in range(epochs):
#         total_loss = 0
#         for context, target, negative in data_generator:
#             model.zero_grad()
#             loss = model(context, target, negative)
#             loss.backward()
#             optimizer.step()
#             total_loss += loss.item()
#         print(f'Loss at epoch {epoch}: {total_loss}')

# def train(model, epochs, data_generator, lr=0.001):
#     optimizer = torch.optim.Adam(model.parameters(), lr=lr)
#     for epoch in range(epochs):
#         total_loss = 0
#         for context_batch, target_batch, negative_batch in data_generator:
#             model.zero_grad()
#             loss = model(context_batch, target_batch, negative_batch)
#             loss.backward()
#             optimizer.step()
#             total_loss += loss.item()
#         print(f'Loss at epoch {epoch}: {total_loss}')

# def train(model, epochs, data_generator, batch_size, lr=0.001):
#     optimizer = torch.optim.Adam(model.parameters(), lr=lr)
#     data_size = len(data_generator)
#     num_batches = (data_size + batch_size - 1) // batch_size
#     for epoch in range(epochs):
#         total_loss = 0
#         for batch_idx in range(num_batches):
#             start_idx = batch_idx * batch_size
#             end_idx = min((batch_idx + 1) * batch_size, data_size)
#             batch = data_generator[start_idx:end_idx]
#             context, target, negative = zip(*batch)
#             context = torch.stack(context)
#             target = torch.stack(target)
#             negative = torch.stack(negative)
#             model.zero_grad()
#             loss = model(context, target, negative)
#             loss.backward()
#             optimizer.step()
#             total_loss += loss.item()
#         print(f'Loss at epoch {epoch}: {total_loss}')

In [4]:
conf = nu.load_config("a3")
df_train, df_test = load_data(conf.paths.raw_txt_train, conf.paths.raw_txt_test)
df_train, df_valid = create_validation_set(df_train, 0.1)
df_train_clean = clean_text(df_train)
df_valid_clean = clean_text(df_valid)
df_test_clean = clean_text(df_test)

train_tokens = split_docs(df_train_clean)
valid_tokens = split_docs(df_valid_clean)
test_tokens = split_docs(df_test_clean)

vocab, idx_train_tokens, word2idx = tokenize(train_tokens)
_, idx_valid_tokens, _ = tokenize(valid_tokens)
_, idx_test_tokens, _ = tokenize(test_tokens)

pad_idx = word2idx['<PAD>']
skipgrams_train = create_skipgrams(idx_train_tokens, window_size=2, pad_idx=pad_idx)
skipgrams_valid = create_skipgrams(idx_valid_tokens, window_size=2, pad_idx=pad_idx)
skipgrams_test = create_skipgrams(idx_test_tokens, window_size=2, pad_idx=pad_idx)

23-Jul-23 13:32:41 - INFO - Starting 'load_config'.
23-Jul-23 13:32:41 - INFO - Finished 'load_config' in 0.0071 secs.
  clean_docs = clean_docs.str.replace(r'\d+', ' <NUM> ')
  clean_docs = clean_docs.str.replace(r'\s+', ' ')


In [15]:
batch_size = 512

In [16]:
words_list = list(word2idx.keys())
vocab_size = len(words_list)
n = len(skipgrams_train)
vocab_size, n

(23042, 3358115)

In [18]:
num_batches = n // batch_size
num_batches

6558

In [11]:
# Shuffle skipgrams
random.shuffle(skipgrams_train)

In [12]:
context_batch = []
target_batch = []
negative_batch = []

In [14]:
for contexts, target in skipgrams_train[:10]:
    print(contexts, target)

[0, 30, 1958, 5569] 336
[9190, 1, 24, 30] 3176
[1529, 701, 7251, 5616] 30
[34, 2093, 43, 30] 88
[29, 24, 1, 0] 11015
[1143, 43, 2173, 11082] 30
[267, 29, 0, 0] 853
[1973, 244, 268, 1786] 276
[2092, 16081, 16, 4636] 2180
[235, 30, 8343, 348] 65


In [None]:
def get_negative_samples(vocab, target, num_samples):
    negative_samples = []
    while len(negative_samples) < num_samples:
        sample = random.choice(vocab)
        if sample != target:
            negative_samples.append(sample)
    return negative_samples

In [None]:
for batch in num_batches:
    start_idx = batch * batch_size
    end_idx = min((batch + 1) * batch_size, n)
    batch = skipgrams_train[start_idx:end_idx]
    context, target = zip(*batch)
    context_batch.append(context)
    target_batch.append(target)
    negative_batch.append(get_negative_samples(target, vocab_size, num_neg_samples=5))

In [None]:
words_list = list(word2idx.keys())
vocab_size = len(words_list)
n = len(skipgrams)

# Shuffle skipgrams
random.shuffle(skipgrams)

for batch_start in range(0, n, batch_size):
    context_batch = []
    target_batch = []
    negative_batch = []
    
    # Create batches
    for contexts, target in skipgrams[batch_start:batch_start + batch_size]:
        negatives = [random.choice(range(vocab_size)) for _ in range(num_neg_samples)]
        context_batch.append(torch.LongTensor(contexts))
        target_batch.append(torch.LongTensor([target]))
        negative_batch.append(torch.LongTensor(negatives))

    # Pad context sequences in batch
    context_batch = pad_sequence(context_batch, batch_first=True, padding_value=pad_idx)

    # Convert target and negative batches to tensors
    target_batch = torch.stack(target_batch)
    negative_batch = torch.stack(negative_batch)
    
    yield context_batch, target_batch, negative_batch

In [5]:
# train_gen = data_generator(skipgrams_train, word2idx)
# valid_gen = data_generator(skipgrams_valid, word2idx)
# test_gen = data_generator(skipgrams_test, word2idx)

train_data_gen = data_generator(skipgrams_train, word2idx, pad_idx=word2idx['<PAD>'], batch_size=512, num_neg_samples=5)
valid_data_gen = data_generator(skipgrams_valid, word2idx, pad_idx=word2idx['<PAD>'], batch_size=512, num_neg_samples=5)
test_data_gen = data_generator(skipgrams_test, word2idx, pad_idx=word2idx['<PAD>'], batch_size=512, num_neg_samples=5)

In [6]:
class CBOW_NS(nn.Module):
    def __init__(self, vocab_size, embed_size):
        super(CBOW_NS, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embed_size)

    def forward(self, context_words, target_word, negative_words):
        # Get embeddings for context words, target word and negative words
        context_embeds = self.embeddings(context_words)  # (batch_size, window_size*2, embed_size)
        target_embeds = self.embeddings(target_word)    # (batch_size, 1, embed_size)
        negative_embeds = self.embeddings(negative_words)# (batch_size, num_neg_samples, embed_size)

        # Sum the context word embeddings
        context_embeds_sum = torch.sum(context_embeds, dim=1, keepdim=True)  # (batch_size, 1, embed_size)

        # Compute positive score
        pos_score = torch.bmm(context_embeds_sum, target_embeds.transpose(1,2)) # (batch_size, 1, 1)
        pos_score = F.logsigmoid(pos_score)

        # Compute negative score
        neg_score = torch.bmm(context_embeds_sum, negative_embeds.transpose(1,2)) # (batch_size, 1, num_neg_samples)
        neg_score = F.logsigmoid(-neg_score)

        # Return negative of total score
        return -(torch.sum(pos_score) + torch.sum(neg_score))
    
    
def train(model, epochs, data_generator, lr=0.001):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    for epoch in range(epochs):
        total_loss = 0
        batch_count = 0
        for context_batch, target_batch, negative_batch in data_generator:
            model.zero_grad()
            loss = model(context_batch, target_batch, negative_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            batch_count += 1
        print(f'Epoch {epoch}, Loss: {total_loss / batch_count}')

In [7]:
vocab_size = len(vocab)
embed_size = 100
model = CBOW_NS(vocab_size, embed_size)
train(model, epochs=3, data_generator=train_data_gen)

Epoch 0, Loss: 10949.492521160095


ZeroDivisionError: division by zero

In [8]:
# Create a temporary variable to hold the batch count
temp_batch_count = 0

# Manually iterate through your data generator
for batch in train_data_gen:
    temp_batch_count += 1

# Check how many batches were created
print(f'Number of batches: {temp_batch_count}')

Number of batches: 0


In [None]:
len(vocab)

In [None]:
len(word2idx)

In [None]:
def find_percentile_length(list_of_lists, percentile):
    # Step 1: Calculate the lengths of all sublists
    lengths = [len(sublist) for sublist in list_of_lists]
    
    # Step 2: Sort the lengths in ascending order
    sorted_lengths = sorted(lengths)
    
    # Step 3: Find the index of the percentile in the sorted lengths
    index = (percentile / 100) * (len(sorted_lengths) - 1)
    
    # Step 4: Check if the index is an integer or not
    if index.is_integer():
        # If the index is an integer, return the corresponding value
        percentile_length = sorted_lengths[int(index)]
    else:
        # If the index is not an integer, interpolate between two values
        lower_index = math.floor(index)
        upper_index = math.ceil(index)
        lower_value = sorted_lengths[lower_index]
        upper_value = sorted_lengths[upper_index]
        percentile_length = np.interp(index, [lower_index, upper_index], [lower_value, upper_value])
    
    return percentile_length

In [None]:
percentile_75th = find_percentile_length(idx_train_tokens, 75)
percentile_90th = find_percentile_length(idx_train_tokens, 90)

print("75th percentile length of lists:", percentile_75th)
print("90th percentile length of lists:", percentile_90th)

In [None]:
train(model, epochs=3, data_generator=train_gen, batch_size=1000, lr=0.001)

In [None]:
# vocab_size = len(word2idx)
# embed_size = 100  # dimension of the embedding vectors
# cbow = CBOW(vocab_size, embed_size)
# data_generator = generate_batches(skipgrams, batch_size=64)