In [14]:
import re
import string
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import plotly.express as px
from collections import defaultdict, Counter
from itertools import combinations
from unidecode import unidecode
from typing import List, Tuple, Dict, Union
import log
import mynlputils as nu
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [2]:
logger = log.get_logger(__name__)

In [3]:
def load_data(raw_txt_train_path: str, raw_txt_test_path: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Loads the training and test data.
    Args:
        raw_txt_train_path (str): Path to the training data file.
        raw_txt_test_path (str): Path to the test data file.
    Returns:
        train_data (DataFrame): Training data.
        test_data (DataFrame): Test data.
    """
    df_train = pd.read_csv(raw_txt_train_path, header=None, names=["label", "title", "description"])
    df_test = pd.read_csv(raw_txt_test_path, header=None, names=["label", "title", "description"])
    logger.info(f"df_train.shape: {df_train.shape}")
    logger.info(f"df_train unique Titles: {df_train['title'].nunique()}")
    logger.info(f"df_train unique Labels: {df_train['label'].value_counts()}")
    logger.info(f"df_test.shape: {df_test.shape}")
    logger.info(f"df_test unique Titles: {df_test['title'].nunique()}")
    logger.info(f"df_test unique Labels: {df_test['label'].value_counts()}")
    return df_train, df_test

In [4]:
conf = nu.load_config("a3")

22-Jul-23 19:14:29 - INFO - Starting 'load_config'.
22-Jul-23 19:14:29 - INFO - Finished 'load_config' in 0.0072 secs.


In [5]:
df_train, df_test = load_data(conf.paths.raw_txt_train, conf.paths.raw_txt_test)

22-Jul-23 19:14:29 - INFO - df_train.shape: (120000, 3)
22-Jul-23 19:14:29 - INFO - df_train unique Titles: 114364
22-Jul-23 19:14:29 - INFO - df_train unique Labels: 3    30000
4    30000
2    30000
1    30000
Name: label, dtype: int64
22-Jul-23 19:14:29 - INFO - df_test.shape: (7600, 3)
22-Jul-23 19:14:29 - INFO - df_test unique Titles: 7569
22-Jul-23 19:14:29 - INFO - df_test unique Labels: 3    1900
4    1900
2    1900
1    1900
Name: label, dtype: int64


In [6]:
def create_validation_set(corpus: pd.DataFrame, valid_size: float) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Creates a validation set from a list of DataFrames each representing a sentence.

    Args:
    corpus (pd.DataFrame): List of DataFrames each representing a sentence.
    valid_size (float): Proportion of sentences to include in the validation set.

    Returns:
    Tuple[pd.DataFrame, pd.DataFrame]: Training and validation sets.
    """
    train_corpus, valid_corpus = train_test_split(corpus, test_size=valid_size, random_state=1)
    return train_corpus, valid_corpus

In [7]:
train_corpus, valid_corpus = create_validation_set(corpus = df_train, valid_size = 0.063)

In [8]:
def preprocess_data(data: pd.DataFrame, min_freq: int = 5):
    """
    Preprocesses the data by tokenizing the text and replacing low-frequency words with <UNK>.
    Args:
        data (DataFrame): Input data.
        min_freq (int): Minimum frequency for a word to be kept. Default is 5.
    Returns:
        tokens (List[List[str]]): Tokenized text.
        word2idx (dict): A dictionary mapping words to their indices.
    """
    tokens = data['description'].str.lower().str.split().tolist()
    word_counts = Counter([word for sentence in tokens for word in sentence])
    words = [word if word_counts[word] >= min_freq else '<UNK>' for word in word_counts]
    word2idx = {word: idx for idx, word in enumerate(words)}
    tokens = [[word if word in word2idx else '<UNK>' for word in sentence] for sentence in tokens]
    return tokens, word2idx

In [9]:
tokens, word2idx = preprocess_data(train_corpus)

In [15]:
def create_co_occurrence_matrix(corpus: list) -> dict:
    """
    Create co-occurrence matrix for the given corpus.
    Args:
        corpus (list): The corpus of tokenized sentences.
    Returns:
        co_occurrence_matrix (dict of dicts): The co-occurrence matrix.
    """
    co_occurrence_matrix = Counter()
    for sentence in corpus:
        # use combinations window of size 2 to find adjacent words
        for word1, word2 in combinations(sentence, 2):
            co_occurrence_matrix[(word1, word2)] += 1

    return co_occurrence_matrix


class GloVeModel(nn.Module):
    def __init__(self, vocab_size: int, embed_size: int):
        super(GloVeModel, self).__init__()
        self.wi = nn.Embedding(vocab_size, embed_size)
        self.wj = nn.Embedding(vocab_size, embed_size)
        self.bi = nn.Embedding(vocab_size, 1)
        self.bj = nn.Embedding(vocab_size, 1)

    def forward(self, i_indices: torch.Tensor, j_indices: torch.Tensor):
        w_i = self.wi(i_indices)
        w_j = self.wj(j_indices)
        b_i = self.bi(i_indices).squeeze()
        b_j = self.bj(j_indices).squeeze()

        x = torch.sum(w_i * w_j, dim=1) + b_i + b_j

        return x


def glove_loss(x_hat: torch.Tensor, x: torch.Tensor):
    """
    Loss function for GloVe model.
    Args:
        x_hat (torch.Tensor): Predicted log of co-occurrence.
        x (torch.Tensor): True log of co-occurrence.
    Returns:
        loss (torch.Tensor): The computed loss.
    """
    return torch.mean((x_hat - x) ** 2)


def train_glove_model(corpus: list, word2idx: dict, co_occurrence_matrix: dict, embed_size: int = 100, epochs: int = 5):
    model = GloVeModel(len(word2idx), embed_size)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    for epoch in range(epochs):
        total_loss = 0
        for (word_i, word_j), n_ij in co_occurrence_matrix.items():
            i_idx = torch.tensor([word2idx[word_i]], dtype=torch.long)
            j_idx = torch.tensor([word2idx[word_j]], dtype=torch.long)
            n_ij_tensor = torch.tensor([n_ij], dtype=torch.float)

            optimizer.zero_grad()

            outputs = model(i_idx, j_idx)
            loss = glove_loss(outputs, torch.log(n_ij_tensor))
            total_loss += loss.item()

            loss.backward()
            optimizer.step()

        print(f'Epoch: {epoch+1}, Loss: {total_loss}')

    return model


In [17]:
co_occurrence_matrix = create_co_occurrence_matrix(tokens)

In [18]:
model = train_glove_model(tokens, word2idx, co_occurrence_matrix)

IndexError: index out of range in self

In [10]:
class SkipGramModel(torch.nn.Module):
    """
    Skip-gram model for training word embeddings.
    """
    def __init__(self, vocab_size: int, embed_size: int):
        super(SkipGramModel, self).__init__()
        self.in_embed = torch.nn.Embedding(vocab_size, embed_size)
        self.out_embed = torch.nn.Embedding(vocab_size, embed_size)

    def forward(self, target: torch.Tensor, context: torch.Tensor):
        in_embeds = self.in_embed(target)
        out_embeds = self.out_embed(context)
        scores = torch.matmul(in_embeds, out_embeds.t())
        return scores

In [11]:
def train_word_embeddings(data: list, word2idx: dict, embed_size: int = 100, epochs: int = 5):
    """
    Trains word embeddings using a Skip-gram model.
    Args:
        data (List[List[str]]): Tokenized text data.
        word2idx (dict): A dictionary mapping words to their indices.
        embed_size (int): Size of the word embeddings. Default is 100.
        epochs (int): Number of training epochs. Default is 5.
    Returns:
        model (SkipGramModel): Trained Skip-gram model.
    """
    model = SkipGramModel(len(word2idx), embed_size)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
    loss_fn = torch.nn.CrossEntropyLoss()

    for epoch in range(epochs):
        total_loss = 0
        for sentence in data:
            sentence_indices = [word2idx[word] for word in sentence]
            for i in range(1, len(sentence_indices) - 1):
                context = [sentence_indices[i-1], sentence_indices[i+1]]
                target = sentence_indices[i]
                target_tensor = torch.tensor([target], dtype=torch.long)
                
                for c in context:
                    context_tensor = torch.tensor([c], dtype=torch.long)
                    scores = model(target_tensor, context_tensor)
                    loss = loss_fn(scores, context_tensor)
                    total_loss += loss.item()

                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

        logger.info(f'Epoch {epoch + 1}, Loss: {total_loss / len(data)}')

    return model

In [12]:
model = train_word_embeddings(tokens, word2idx)

IndexError: Target 2 is out of bounds.

In [None]:
def visualize_embeddings(model, word2idx, words):
    """
    Visualizes word embeddings using t-SNE and Plotly.
    Args:
        model (SkipGramModel): Trained Skip-gram model.
        word2idx (dict): A dictionary mapping words to their indices.
        words (list): List of words to visualize.
    """
    word_embeds = model.in_embed.weight.data.numpy()
    words_idx = [word2idx[word] for word in words]
    words_embed = word_embeds[words_idx]
    
    tsne = TSNE(n_components=2, random_state=0)
    words_tsne = tsne.fit_transform(words_embed)

    df = pd.DataFrame(words_tsne, columns=['x', 'y'])
    df['word'] = words

    fig = px.scatter(df, x='x', y='y', text='word')
    fig.update_traces(textposition='top center')
    fig.show()