In [1]:
import string
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
import plotly.express as px
from collections import Counter
from typing import List, Tuple, Dict, Union
import log
import mynlputils as nu
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
# from torch.utils.data import Dataset, DataLoader

In [2]:
logger = log.get_logger(__name__)

In [3]:
@nu.timer
def load_data(raw_txt_train_path: str, raw_txt_test_path: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Loads the training and test data.
    Args:
        raw_txt_train_path (str): Path to the training data file.
        raw_txt_test_path (str): Path to the test data file.
    Returns:
        train_data (DataFrame): Training data.
        test_data (DataFrame): Test data.
    """
    df_train = pd.read_csv(raw_txt_train_path, header=None, names=["label", "title", "description"])
    df_test = pd.read_csv(raw_txt_test_path, header=None, names=["label", "title", "description"])
    logger.info(f"df_train.shape: {df_train.shape}")
    logger.info(f"df_train unique Titles: {df_train['title'].nunique()}")
    logger.info(f"df_train unique Labels: {df_train['label'].value_counts()}")
    logger.info(f"df_test.shape: {df_test.shape}")
    logger.info(f"df_test unique Titles: {df_test['title'].nunique()}")
    logger.info(f"df_test unique Labels: {df_test['label'].value_counts()}")
    return df_train[["label", "description"]], df_test[["label", "description"]]

@nu.timer
def create_validation_set(corpus: pd.DataFrame, valid_size: float) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Creates a validation set from a list of DataFrames each representing a sentence.

    Args:
    corpus (pd.DataFrame): List of DataFrames each representing a sentence.
    valid_size (float): Proportion of sentences to include in the validation set.

    Returns:
    Tuple[pd.DataFrame, pd.DataFrame]: Training and validation sets.
    """
    train_corpus, valid_corpus = train_test_split(corpus, test_size=valid_size, random_state=1)
    logger.info(f"train_corpus.shape: {train_corpus.shape}")
    logger.info(f"train_corpus unique Labels: {train_corpus['label'].value_counts()}")
    logger.info(f"valid_corpus.shape: {valid_corpus.shape}")
    logger.info(f"valid_corpus unique Labels: {valid_corpus['label'].value_counts()}")
    return train_corpus.reset_index(drop=True), valid_corpus.reset_index(drop=True)

@nu.timer
def clean_text(docs: pd.DataFrame) -> pd.DataFrame:
    clean_docs = docs['description']
    clean_docs = clean_docs.str.replace("-", " ") # Separate hyphenated words
    clean_docs = clean_docs.str.replace("quot;", " ") # Remove HTML encoding for "
    clean_docs = clean_docs.str.replace("#39;s", "'") # Remove HTML encoding for 's
    translation_table = str.maketrans('', '', string.punctuation)
    clean_docs = clean_docs.str.translate(translation_table)
    clean_docs = clean_docs.str.lower() # Lowercase the text
    clean_docs = clean_docs.str.replace(r'\d+', ' <NUM> ') # Replace digits with <NUM>
    clean_docs = clean_docs.str.replace(r'\s+', ' ') # Replace multiple spaces with a single space
    return clean_docs.to_frame()

@nu.timer
def split_docs(docs: pd.DataFrame) -> pd.DataFrame:
    return docs['description'].str.split().to_list()

@nu.timer
def tokenize(tokens: List[List[str]], min_freq: int = 5):
    word_freq = Counter([word for sentence in tokens for word in sentence])
    vocab = [word for word, freq in word_freq.items() if freq >= min_freq]
    # Add <PAD>, <UNK> in the vocab
    vocab = ['<PAD>', '<UNK>'] + vocab
    word2idx = {word: idx for idx, word in enumerate(vocab)}
    idx_tokens = [[word2idx.get(word, 1) for word in sentence] for sentence in tokens]  # 1 is the index of <UNK>
    logger.info(f"Vocab size: {len(vocab)}")
    return vocab, idx_tokens, word2idx


@nu.timer
def create_skipgrams(corpus, window_size, pad_idx):
    data = []
    for sentence in corpus:
        # Pad the sentence with window_size <PAD> tokens at the beginning and end
        padded_sentence = [pad_idx] * window_size + sentence + [pad_idx] * window_size
        for word_index in range(window_size, len(padded_sentence) - window_size):
            # Select the context words from the padded sentence, excluding the current word
            contexts = padded_sentence[word_index - window_size : word_index] + padded_sentence[word_index + 1 : word_index + window_size + 1]
            # Add context and target pairs to the data
            data.append((contexts, padded_sentence[word_index]))
    logger.info(f"Number of skipgrams: {len(data)}")
    return data


def data_generator(skipgrams, word2idx, num_neg_samples=5):
    words_list = list(word2idx.keys())
    vocab_size = len(words_list)

    for contexts, target in skipgrams:
        # Generate negative samples for each context-target pair
        negatives = [random.choice(range(vocab_size)) for _ in range(num_neg_samples)]
        
        # Ensure the contexts, target and negatives are lists of integers
        if not all(isinstance(c, int) for c in contexts):
            raise ValueError(f"Contexts should be a list of integers. Got {contexts}")
        if not isinstance(target, int):
            raise ValueError(f"Target should be an integer. Got {target}")
        if not all(isinstance(n, int) for n in negatives):
            raise ValueError(f"Negatives should be a list of integers. Got {negatives}")

        yield torch.LongTensor([contexts]), torch.LongTensor([target]), torch.LongTensor(negatives)


class CBOW_NS(nn.Module):
    def __init__(self, vocab_size, embed_size):
        super(CBOW_NS, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embed_size)
    
    def forward(self, target_word, context_words, negative_words):
        # Reshape tensors to match dimensions expected by bmm
        target_embeds = self.embeddings(target_word).unsqueeze(2)  # shape: (batch_size, embedding_dim, 1)
        context_embeds = self.embeddings(context_words).unsqueeze(1)  # shape: (batch_size, 1, embedding_dim)
        negative_embeds = self.embeddings(negative_words).unsqueeze(1)  # shape: (batch_size, 1, num_negative_samples)

        # Calculate the positive log likelihood
        pos_score = F.logsigmoid(torch.bmm(context_embeds, target_embeds).squeeze())

        # Calculate the negative log likelihood
        neg_score = F.logsigmoid(-1 * torch.bmm(negative_embeds, target_embeds.repeat(1, negative_embeds.size(2), 1)).squeeze())

        # Sum up and return negative of scores (as we want to minimize negative log likelihood)
        return -(torch.sum(pos_score) + torch.sum(neg_score))


def train(model, epochs, data_generator, lr=0.001):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    for epoch in range(epochs):
        total_loss = 0
        for target, context, negative in data_generator:
            target = torch.LongTensor([target])
            context = torch.LongTensor([context])
            negative = torch.LongTensor(negative)
            model.zero_grad()
            loss = model(target, context, negative)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        logger.info(f'Loss at epoch {epoch}: {total_loss/len(skipgrams)}')
    

# def train(model, epochs, data_generator, lr=0.001):
#     optimizer = torch.optim.Adam(model.parameters(), lr=lr)
#     for epoch in range(epochs):
#         total_loss = 0
#         for target, context, negative in data_generator:
#             model.zero_grad()
#             loss = model(target, context, negative)
#             loss.backward()
#             optimizer.step()
#             total_loss += loss.item()
#         logger.info(f'Loss at epoch {epoch}: {total_loss}')

In [4]:
conf = nu.load_config("a3")
df_train, df_test = load_data(conf.paths.raw_txt_train, conf.paths.raw_txt_test)
train_corpus, valid_corpus = create_validation_set(corpus = df_train, valid_size = 0.1)
clean_docs = clean_text(train_corpus)
raw_tokens = split_docs(clean_docs)
vocab, idx_tokens, word2idx = tokenize(raw_tokens)
skipgrams = create_skipgrams(idx_tokens, window_size=2, pad_idx=word2idx['<PAD>'])
window_size = 2  # Assume a suitable window size
num_negatives = 5  # Assume a suitable number of negative samples
data_gen = data_generator(skipgrams, word2idx)
# Create and train the model
vocab_size = len(vocab)
embed_size = 100
model = CBOW_NS(vocab_size, embed_size)

23-Jul-23 11:10:11 - INFO - Starting 'load_config'.
23-Jul-23 11:10:11 - INFO - Finished 'load_config' in 0.0070 secs.
23-Jul-23 11:10:11 - INFO - Starting 'load_data'.
23-Jul-23 11:10:11 - INFO - df_train.shape: (120000, 3)
23-Jul-23 11:10:11 - INFO - df_train unique Titles: 114364
23-Jul-23 11:10:11 - INFO - df_train unique Labels: 3    30000
4    30000
2    30000
1    30000
Name: label, dtype: int64
23-Jul-23 11:10:11 - INFO - df_test.shape: (7600, 3)
23-Jul-23 11:10:11 - INFO - df_test unique Titles: 7569
23-Jul-23 11:10:11 - INFO - df_test unique Labels: 3    1900
4    1900
2    1900
1    1900
Name: label, dtype: int64
23-Jul-23 11:10:11 - INFO - Finished 'load_data' in 0.2931 secs.
23-Jul-23 11:10:11 - INFO - Starting 'create_validation_set'.
23-Jul-23 11:10:11 - INFO - train_corpus.shape: (108000, 2)
23-Jul-23 11:10:11 - INFO - train_corpus unique Labels: 3    27024
4    27012
1    26982
2    26982
Name: label, dtype: int64
23-Jul-23 11:10:11 - INFO - valid_corpus.shape: (12000,

In [5]:
train(model, epochs=10, data_generator=data_gen)

RuntimeError: batch2 must be a 3D tensor

In [None]:
# vocab_size = len(word2idx)
# embed_size = 100  # dimension of the embedding vectors
# cbow = CBOW(vocab_size, embed_size)
# data_generator = generate_batches(skipgrams, batch_size=64)