In [1]:
import string
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import plotly.express as px
from collections import Counter
from typing import List, Tuple, Dict, Union
import log
import mynlputils as nu
import torch
import torch.nn as nn
import html
# from torch.utils.data import Dataset, DataLoader

In [2]:
logger = log.get_logger(__name__)

In [3]:
def load_data(raw_txt_train_path: str, raw_txt_test_path: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Loads the training and test data.
    Args:
        raw_txt_train_path (str): Path to the training data file.
        raw_txt_test_path (str): Path to the test data file.
    Returns:
        train_data (DataFrame): Training data.
        test_data (DataFrame): Test data.
    """
    df_train = pd.read_csv(raw_txt_train_path, header=None, names=["label", "title", "description"])
    df_test = pd.read_csv(raw_txt_test_path, header=None, names=["label", "title", "description"])
    logger.info(f"df_train.shape: {df_train.shape}")
    logger.info(f"df_train unique Titles: {df_train['title'].nunique()}")
    logger.info(f"df_train unique Labels: {df_train['label'].value_counts()}")
    logger.info(f"df_test.shape: {df_test.shape}")
    logger.info(f"df_test unique Titles: {df_test['title'].nunique()}")
    logger.info(f"df_test unique Labels: {df_test['label'].value_counts()}")
    return df_train[["label", "description"]], df_test[["label", "description"]]


def create_validation_set(corpus: pd.DataFrame, valid_size: float) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Creates a validation set from a list of DataFrames each representing a sentence.

    Args:
    corpus (pd.DataFrame): List of DataFrames each representing a sentence.
    valid_size (float): Proportion of sentences to include in the validation set.

    Returns:
    Tuple[pd.DataFrame, pd.DataFrame]: Training and validation sets.
    """
    train_corpus, valid_corpus = train_test_split(corpus, test_size=valid_size, random_state=1)
    return train_corpus.reset_index(drop=True), valid_corpus.reset_index(drop=True)

In [4]:
conf = nu.load_config("a3")
df_train, df_test = load_data(conf.paths.raw_txt_train, conf.paths.raw_txt_test)
train_corpus, valid_corpus = create_validation_set(corpus = df_train, valid_size = 0.063)

22-Jul-23 21:53:00 - INFO - Starting 'load_config'.
22-Jul-23 21:53:00 - INFO - Finished 'load_config' in 0.0072 secs.
22-Jul-23 21:53:00 - INFO - df_train.shape: (120000, 3)
22-Jul-23 21:53:00 - INFO - df_train unique Titles: 114364
22-Jul-23 21:53:00 - INFO - df_train unique Labels: 3    30000
4    30000
2    30000
1    30000
Name: label, dtype: int64
22-Jul-23 21:53:00 - INFO - df_test.shape: (7600, 3)
22-Jul-23 21:53:00 - INFO - df_test unique Titles: 7569
22-Jul-23 21:53:00 - INFO - df_test unique Labels: 3    1900
4    1900
2    1900
1    1900
Name: label, dtype: int64


In [5]:
train_corpus.head()

Unnamed: 0,label,description
0,4,NASA #39;s robotic explorers are back at work ...
1,2,NEW YORK (Reuters) - Boston Red Sox can make ...
2,3,Brookstone met analyst estimates and has raise...
3,2,AP - The Japanese won the pregame home run der...
4,4,Also: Congress OKs private spaceflight bill. N...


In [48]:
def clean_text(docs: pd.DataFrame) -> pd.DataFrame:
    clean_docs = docs['description']
    clean_docs = clean_docs.str.replace("-", " ") # Separate hyphenated words
    clean_docs = clean_docs.str.replace("quot;", " ") # Remove HTML encoding for "
    clean_docs = clean_docs.str.replace("#39;s", "'") # Remove HTML encoding for 's
    # clean_docs = clean_docs.str.replace(r"http\S+", "url") # Remove URLs
    # clean_docs = html.unescape(clean_docs) # Remove HTML encoding
    translation_table = str.maketrans('', '', string.punctuation)
    clean_docs = clean_docs.str.translate(translation_table)
    clean_docs = clean_docs.str.lower() # Lowercase the text
    clean_docs = clean_docs.str.replace(r'\d+', ' <NUM> ') # Replace digits with <NUM>
    clean_docs = clean_docs.str.replace(r'\s+', ' ') # Replace multiple spaces with a single space
    return clean_docs.to_frame()


def split_docs(docs: pd.DataFrame) -> pd.DataFrame:
    return docs['description'].str.split().to_list()


def tokenize(tokens: List[List[str]], min_freq: int = 5):
    word_freq = Counter([word for sentence in tokens for word in sentence])
    vocab = [word if word_freq[word] >= min_freq else '<UNK>' for word in word_freq]
    word2idx = {word: idx for idx, word in enumerate(vocab)}
    tokens = [[word if word in word2idx else '<UNK>' for word in sentence] for sentence in tokens]
    return tokens, word2idx

In [18]:
clean_docs = clean_text(train_corpus)
# raw_tokens = split_docs(clean_docs)
# tokens, word2idx = tokenize(raw_tokens)

  clean_docs = clean_docs.str.replace(r'\d+', ' <NUM> ') # Replace digits with <NUM>
  clean_docs = clean_docs.str.replace(r'\s+', ' ') # Replace multiple spaces with a single space


In [19]:
clean_docs

Unnamed: 0,description
0,nasa robotic explorers are back at work on mar...
1,new york reuters boston red sox can make base...
2,brookstone met analyst estimates and has raise...
3,ap the japanese won the pregame home run derby...
4,also congress oks private spaceflight bill new...
...,...
112435,mountain view calif october <NUM> <NUM> olyfue...
112436,santiago afp asia pacific leaders overpowered ...
112437,ap vietnam veterans supporting john kerry for ...
112438,it is extremely important for muslims not only...


In [20]:
clean_docs['description'][10]

'seven islamic militants were charged here on friday with terrorist related activities in connection with the killing of the dutch filmmaker theo van gogh amsterdam public prosecutor said'

In [52]:
tokens

[['nasa',
  'robotic',
  'explorers',
  'are',
  'back',
  'at',
  'work',
  'on',
  'mars',
  'following',
  'an',
  'extended',
  '<UNK>',
  'that',
  'had',
  'left',
  'mission',
  'managers',
  'with',
  'only',
  'hope',
  'and',
  'optimism',
  'that',
  'the',
  'rovers'],
 ['new',
  'york',
  'reuters',
  'boston',
  'red',
  'sox',
  'can',
  'make',
  'baseball',
  'history',
  'on',
  'wednesday',
  'by',
  'becoming',
  'the',
  'first',
  'team',
  'to',
  'overturn',
  'a',
  'three',
  'game',
  'deficit',
  'in',
  'a',
  'best',
  'of',
  'seven',
  'series'],
 ['<UNK>',
  'met',
  'analyst',
  'estimates',
  'and',
  'has',
  'raised',
  'guidance',
  'for',
  'the',
  'all',
  'important',
  'christmas',
  'quarter'],
 ['ap',
  'the',
  'japanese',
  'won',
  'the',
  'pregame',
  'home',
  'run',
  'derby',
  'then',
  'the',
  'game',
  'started',
  'and',
  'the',
  'major',
  'league',
  'all',
  'stars',
  'put',
  'their',
  'bats',
  'to',
  'work',
  'back',

In [45]:
len(vocab)

23461

In [10]:
# counter = Counter()
#     for text in corpus:
#         counter.update(text)
#     vocab = {word: i for i, (word, freq) in enumerate(counter.items()) if freq >= min_freq}
#     # Add special tokens
#     vocab['<PAD>'] = len(vocab)
#     vocab['<UNK>'] = len(vocab)

In [11]:
# Build the vocabulary
def build_vocab(corpus):
    counter = Counter()
    for text in corpus:
        counter.update(text)
    vocab = {word: i for i, (word, freq) in enumerate(counter.items())}
    # Add special tokens
    vocab['<PAD>'] = len(vocab)
    vocab['<UNK>'] = len(vocab)
    return vocab

# Build the inverse vocabulary
def build_inverse_vocab(vocab):
    return {i: word for word, i in vocab.items()}

# Build the vocabularies
vocab = build_vocab(clean_docs['description'])
inverse_vocab = build_inverse_vocab(vocab)

len(vocab)

34

In [12]:
zero = train_corpus['description']

In [13]:
zero[112436]

'SANTIAGO (AFP) - Asia-Pacific leaders, overpowered by the US-led  quot;war on terror quot; agenda, closed an annual summit vowing to shore up anti-terrorist defenses and target weapons of mass destruction.'

In [None]:
punctuation_without_hyphen = string.punctuation.replace("-", "") 
translation_table = str.maketrans('', '', punctuation_without_hyphen)
clean_docs = clean_docs.str.translate(translation_table)

In [14]:
translation_table = str.maketrans('', '', string.punctuation)
one = zero.str.translate(translation_table)

0         NASA 39s robotic explorers are back at work on...
1          NEW YORK Reuters  Boston Red Sox can make bas...
2         Brookstone met analyst estimates and has raise...
3         AP  The Japanese won the pregame home run derb...
4         Also Congress OKs private spaceflight bill New...
                                ...                        
112435    MOUNTAIN VIEW CALIF  October 5 2004 olyFuel a ...
112436    SANTIAGO AFP  AsiaPacific leaders overpowered ...
112437    AP  Vietnam veterans supporting John Kerry for...
112438    It is extremely important for Muslims not only...
112439    A battalion of Indian Army soldiers has pulled...
Name: description, Length: 112440, dtype: object

In [15]:
one[112436]

'SANTIAGO AFP  AsiaPacific leaders overpowered by the USled  quotwar on terror quot agenda closed an annual summit vowing to shore up antiterrorist defenses and target weapons of mass destruction'

In [None]:
clean_docs = train_corpus['description']
translation_table = str.maketrans('', '', string.punctuation)
clean_docs = clean_docs.str.translate(translation_table)

In [29]:
clean_docs = clean_text(train_corpus)

  clean_docs = clean_docs.str.replace(r'\d+', '<NUM>') # Replace digits with <NUM>


In [34]:
clean_docs.iloc[-1]

'a battalion of indian army soldiers has pulled out of kashmir <NUM>s summer capital srinagar marking the first withdrawal from the urban hub of insurgency in the himalayan state'

In [None]:
decoded_text = html.unescape(text)

In [8]:
def preprocess_data(data: pd.DataFrame, min_freq: int = 5):
    """
    Preprocesses the data by tokenizing the text and replacing low-frequency words with <UNK>.
    Args:
        data (DataFrame): Input data.
        min_freq (int): Minimum frequency for a word to be kept. Default is 5.
    Returns:
        tokens (List[List[str]]): Tokenized text.
        word2idx (dict): A dictionary mapping words to their indices.
    """
    tokens = data['description'].str.lower().str.split().tolist()
    word_counts = Counter([word for sentence in tokens for word in sentence])
    words = [word if word_counts[word] >= min_freq else '<UNK>' for word in word_counts]
    word2idx = {word: idx for idx, word in enumerate(words)}
    tokens = [[word if word in word2idx else '<UNK>' for word in sentence] for sentence in tokens]
    return tokens, word2idx

In [6]:
tokens = train_corpus['description'].str.lower().str.split().tolist()

In [7]:
tokens[:10]

[['nasa',
  '#39;s',
  'robotic',
  'explorers',
  'are',
  'back',
  'at',
  'work',
  'on',
  'mars',
  'following',
  'an',
  'extended',
  'siesta',
  'that',
  'had',
  'left',
  'mission',
  'managers',
  'with',
  'only',
  'hope',
  'and',
  'optimism',
  'that',
  'the',
  'rovers'],
 ['new',
  'york',
  '(reuters)',
  '-',
  'boston',
  'red',
  'sox',
  'can',
  'make',
  'baseball',
  'history',
  'on',
  'wednesday',
  'by',
  'becoming',
  'the',
  'first',
  'team',
  'to',
  'overturn',
  'a',
  'three-game',
  'deficit',
  'in',
  'a',
  'best-of-seven',
  'series.'],
 ['brookstone',
  'met',
  'analyst',
  'estimates',
  'and',
  'has',
  'raised',
  'guidance',
  'for',
  'the',
  'all-important',
  'christmas',
  'quarter.'],
 ['ap',
  '-',
  'the',
  'japanese',
  'won',
  'the',
  'pregame',
  'home',
  'run',
  'derby.',
  'then',
  'the',
  'game',
  'started',
  'and',
  'the',
  'major',
  'league',
  'all-stars',
  'put',
  'their',
  'bats',
  'to',
  'work.

In [9]:
tokens, word2idx = preprocess_data(train_corpus)

In [15]:
def create_co_occurrence_matrix(corpus: list) -> dict:
    """
    Create co-occurrence matrix for the given corpus.
    Args:
        corpus (list): The corpus of tokenized sentences.
    Returns:
        co_occurrence_matrix (dict of dicts): The co-occurrence matrix.
    """
    co_occurrence_matrix = Counter()
    for sentence in corpus:
        # use combinations window of size 2 to find adjacent words
        for word1, word2 in combinations(sentence, 2):
            co_occurrence_matrix[(word1, word2)] += 1

    return co_occurrence_matrix


class GloVeModel(nn.Module):
    def __init__(self, vocab_size: int, embed_size: int):
        super(GloVeModel, self).__init__()
        self.wi = nn.Embedding(vocab_size, embed_size)
        self.wj = nn.Embedding(vocab_size, embed_size)
        self.bi = nn.Embedding(vocab_size, 1)
        self.bj = nn.Embedding(vocab_size, 1)

    def forward(self, i_indices: torch.Tensor, j_indices: torch.Tensor):
        w_i = self.wi(i_indices)
        w_j = self.wj(j_indices)
        b_i = self.bi(i_indices).squeeze()
        b_j = self.bj(j_indices).squeeze()

        x = torch.sum(w_i * w_j, dim=1) + b_i + b_j

        return x


def glove_loss(x_hat: torch.Tensor, x: torch.Tensor):
    """
    Loss function for GloVe model.
    Args:
        x_hat (torch.Tensor): Predicted log of co-occurrence.
        x (torch.Tensor): True log of co-occurrence.
    Returns:
        loss (torch.Tensor): The computed loss.
    """
    return torch.mean((x_hat - x) ** 2)


def train_glove_model(corpus: list, word2idx: dict, co_occurrence_matrix: dict, embed_size: int = 100, epochs: int = 5):
    model = GloVeModel(len(word2idx), embed_size)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    for epoch in range(epochs):
        total_loss = 0
        for (word_i, word_j), n_ij in co_occurrence_matrix.items():
            i_idx = torch.tensor([word2idx[word_i]], dtype=torch.long)
            j_idx = torch.tensor([word2idx[word_j]], dtype=torch.long)
            n_ij_tensor = torch.tensor([n_ij], dtype=torch.float)

            optimizer.zero_grad()

            outputs = model(i_idx, j_idx)
            loss = glove_loss(outputs, torch.log(n_ij_tensor))
            total_loss += loss.item()

            loss.backward()
            optimizer.step()

        print(f'Epoch: {epoch+1}, Loss: {total_loss}')

    return model


In [17]:
co_occurrence_matrix = create_co_occurrence_matrix(tokens)

In [18]:
model = train_glove_model(tokens, word2idx, co_occurrence_matrix)

IndexError: index out of range in self

In [10]:
class SkipGramModel(torch.nn.Module):
    """
    Skip-gram model for training word embeddings.
    """
    def __init__(self, vocab_size: int, embed_size: int):
        super(SkipGramModel, self).__init__()
        self.in_embed = torch.nn.Embedding(vocab_size, embed_size)
        self.out_embed = torch.nn.Embedding(vocab_size, embed_size)

    def forward(self, target: torch.Tensor, context: torch.Tensor):
        in_embeds = self.in_embed(target)
        out_embeds = self.out_embed(context)
        scores = torch.matmul(in_embeds, out_embeds.t())
        return scores

In [11]:
def train_word_embeddings(data: list, word2idx: dict, embed_size: int = 100, epochs: int = 5):
    """
    Trains word embeddings using a Skip-gram model.
    Args:
        data (List[List[str]]): Tokenized text data.
        word2idx (dict): A dictionary mapping words to their indices.
        embed_size (int): Size of the word embeddings. Default is 100.
        epochs (int): Number of training epochs. Default is 5.
    Returns:
        model (SkipGramModel): Trained Skip-gram model.
    """
    model = SkipGramModel(len(word2idx), embed_size)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
    loss_fn = torch.nn.CrossEntropyLoss()

    for epoch in range(epochs):
        total_loss = 0
        for sentence in data:
            sentence_indices = [word2idx[word] for word in sentence]
            for i in range(1, len(sentence_indices) - 1):
                context = [sentence_indices[i-1], sentence_indices[i+1]]
                target = sentence_indices[i]
                target_tensor = torch.tensor([target], dtype=torch.long)
                
                for c in context:
                    context_tensor = torch.tensor([c], dtype=torch.long)
                    scores = model(target_tensor, context_tensor)
                    loss = loss_fn(scores, context_tensor)
                    total_loss += loss.item()

                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

        logger.info(f'Epoch {epoch + 1}, Loss: {total_loss / len(data)}')

    return model

In [12]:
model = train_word_embeddings(tokens, word2idx)

IndexError: Target 2 is out of bounds.

In [None]:
def visualize_embeddings(model, word2idx, words):
    """
    Visualizes word embeddings using t-SNE and Plotly.
    Args:
        model (SkipGramModel): Trained Skip-gram model.
        word2idx (dict): A dictionary mapping words to their indices.
        words (list): List of words to visualize.
    """
    word_embeds = model.in_embed.weight.data.numpy()
    words_idx = [word2idx[word] for word in words]
    words_embed = word_embeds[words_idx]
    
    tsne = TSNE(n_components=2, random_state=0)
    words_tsne = tsne.fit_transform(words_embed)

    df = pd.DataFrame(words_tsne, columns=['x', 'y'])
    df['word'] = words

    fig = px.scatter(df, x='x', y='y', text='word')
    fig.update_traces(textposition='top center')
    fig.show()