In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import Counter
import string
import random
from typing import List, Tuple
import math
import plotly.express as px
import plotly.graph_objects as go
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity

import log
import mynlputils as nu

In [2]:
logger = log.get_logger(__name__)

In [3]:
@nu.timer
def load_data(raw_txt_train_path: str, raw_txt_test_path: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
    df_train = pd.read_csv(raw_txt_train_path, header=None, names=["label", "title", "description"])
    df_test = pd.read_csv(raw_txt_test_path, header=None, names=["label", "title", "description"])
    return df_train[["label", "description"]], df_test[["label", "description"]]

@nu.timer
def create_validation_set(corpus: pd.DataFrame, valid_size: float) -> Tuple[pd.DataFrame, pd.DataFrame]:
    train_corpus, valid_corpus = train_test_split(corpus, test_size=valid_size, random_state=1)
    return train_corpus.reset_index(drop=True), valid_corpus.reset_index(drop=True)

@nu.timer
def clean_text(docs: pd.DataFrame) -> pd.DataFrame:
    clean_docs = docs['description']
    clean_docs = clean_docs.str.replace("-", " ")
    clean_docs = clean_docs.str.replace("quot;", " ")
    clean_docs = clean_docs.str.replace("#39;s", "'")
    translation_table = str.maketrans('', '', string.punctuation)
    clean_docs = clean_docs.str.translate(translation_table)
    clean_docs = clean_docs.str.lower()
    clean_docs = clean_docs.str.replace(r'\d+', ' <NUM> ')
    clean_docs = clean_docs.str.replace(r'\s+', ' ')
    return clean_docs.to_frame()

@nu.timer
def split_docs(docs: pd.DataFrame) -> pd.DataFrame:
    return docs['description'].str.split().to_list()

@nu.timer
def tokenize(tokens: List[List[str]], min_freq: int = 5):
    word_freq = Counter([word for sentence in tokens for word in sentence])
    vocab = [word for word, freq in word_freq.items() if freq >= min_freq]
    vocab = ['<PAD>', '<UNK>'] + vocab
    word2idx = {word: idx for idx, word in enumerate(vocab)}
    idx_tokens = [[word2idx.get(word, 1) for word in sentence] for sentence in tokens]
    return vocab, idx_tokens, word2idx

@nu.timer
def create_skipgrams(corpus, window_size, pad_idx):
    data = []
    for sentence in corpus:
        padded_sentence = [pad_idx] * window_size + sentence + [pad_idx] * window_size
        for word_index in range(window_size, len(padded_sentence) - window_size):
            contexts = padded_sentence[word_index - window_size : word_index] + padded_sentence[word_index + 1 : word_index + window_size + 1]
            data.append((contexts, padded_sentence[word_index]))
    return data

@nu.timer
def create_batches(skipgrams, word2idx, pad_idx, batch_size=32, num_neg_samples=5):
    words_list = list(word2idx.keys())
    vocab_size = len(words_list)
    n = len(skipgrams)

    # Shuffle skipgrams
    random.shuffle(skipgrams)

    batches = []

    for batch_start in range(0, n, batch_size):
        context_batch = []
        target_batch = []
        negative_batch = []

        # Create batches
        for contexts, target in skipgrams[batch_start:batch_start + batch_size]:
            negatives = [random.choice(range(vocab_size)) for _ in range(num_neg_samples)]
            context_batch.append(torch.LongTensor(contexts))
            target_batch.append(torch.LongTensor([target]))
            negative_batch.append(torch.LongTensor(negatives))

        # If this is the last batch and it's not full, skip it
        if len(context_batch) < batch_size:
            continue

        # Pad context sequences in batch
        context_batch = pad_sequence(context_batch, batch_first=True, padding_value=pad_idx)

        # Convert target and negative batches to tensors
        target_batch = torch.stack(target_batch)
        negative_batch = torch.stack(negative_batch)

        batches.append((context_batch, target_batch, negative_batch))
        
    return batches


class CBOW_NS(nn.Module):
    def __init__(self, vocab_size, embed_size):
        super(CBOW_NS, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embed_size)

    def forward(self, context_words, target_word, negative_words):
        # Get embeddings for context words, target word and negative words
        context_embeds = self.embeddings(context_words)  # (batch_size, window_size*2, embed_size)
        target_embeds = self.embeddings(target_word)    # (batch_size, 1, embed_size)
        negative_embeds = self.embeddings(negative_words)# (batch_size, num_neg_samples, embed_size)

        # Sum the context word embeddings
        context_embeds_sum = torch.sum(context_embeds, dim=1, keepdim=True)  # (batch_size, 1, embed_size)

        # Compute positive score
        pos_score = torch.bmm(context_embeds_sum, target_embeds.transpose(1,2)) # (batch_size, 1, 1)
        pos_score = F.logsigmoid(pos_score)

        # Compute negative score
        neg_score = torch.bmm(context_embeds_sum, negative_embeds.transpose(1,2)) # (batch_size, 1, num_neg_samples)
        neg_score = F.logsigmoid(-neg_score)

        # Return negative of total score
        return -(torch.sum(pos_score) + torch.sum(neg_score))
    
@nu.timer
def train(model, epochs, train_batches, val_batches, lr):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    train_losses = []
    val_losses = []
    for epoch in range(epochs):
        total_loss = 0
        batch_count = 0
        for context_batch, target_batch, negative_batch in train_batches:
            model.zero_grad()
            loss = model(context_batch, target_batch, negative_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            batch_count += 1
        train_loss = total_loss / batch_count
        train_losses.append(train_loss)

        model.eval()  # set model to eval mode
        val_loss = evaluate(model, val_batches)
        val_losses.append(val_loss)

        logger.info(f'Epoch {epoch}, Train Loss: {train_loss}, Validation Loss: {val_loss}')
    return model, train_losses, val_losses


def evaluate(model, batches):
    total_loss = 0
    batch_count = 0
    with torch.no_grad():  # disable gradient computation to save memory
        for context_batch, target_batch, negative_batch in batches:
            loss = model(context_batch, target_batch, negative_batch)
            total_loss += loss.item()
            batch_count += 1
    return total_loss / batch_count


# def train(model, epochs, batches, lr=0.001):
#     optimizer = torch.optim.Adam(model.parameters(), lr=lr)
#     losses = []
#     for epoch in range(epochs):
#         total_loss = 0
#         batch_count = 0
#         for context_batch, target_batch, negative_batch in batches:
#             model.zero_grad()
#             loss = model(context_batch, target_batch, negative_batch)
#             loss.backward()
#             optimizer.step()
#             total_loss += loss.item()
#             batch_count += 1
#         avg_loss = total_loss / batch_count
#         losses.append(avg_loss)
#         logger.info(f'Epoch {epoch}, Loss: {avg_loss}')
#     return model, losses


# def plot_losses(losses, epochs):
#     # Plotting the losses
#     fig = px.line(x=list(range(epochs)), y=losses, labels={'x':'Epochs', 'y':'Loss'}, title='Loss over time')
#     fig.show()

def plot_losses(train_losses, val_losses, epochs):
    epochs_range = list(range(1, epochs + 1))
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=epochs_range, y=train_losses, mode='lines', name='Train Loss'))
    fig.add_trace(go.Scatter(x=epochs_range, y=val_losses, mode='lines', name='Validation Loss'))
    fig.update_layout(title='Loss over Epochs', xaxis=dict(title='Epoch'), yaxis=dict(title='Loss'))
    fig.show()

In [4]:
conf = nu.load_config("a3")
df_train, df_test = load_data(conf.paths.raw_txt_train, conf.paths.raw_txt_test)
df_train, df_valid = create_validation_set(df_train, 0.1)
df_train_clean = clean_text(df_train)
df_valid_clean = clean_text(df_valid)
df_test_clean = clean_text(df_test)

train_tokens = split_docs(df_train_clean)
valid_tokens = split_docs(df_valid_clean)
test_tokens = split_docs(df_test_clean)

vocab, idx_train_tokens, word2idx = tokenize(train_tokens)
_, idx_valid_tokens, _ = tokenize(valid_tokens)
_, idx_test_tokens, _ = tokenize(test_tokens)

pad_idx = word2idx['<PAD>']
skipgrams_train = create_skipgrams(idx_train_tokens, window_size=2, pad_idx=pad_idx)
skipgrams_valid = create_skipgrams(idx_valid_tokens, window_size=2, pad_idx=pad_idx)
skipgrams_test = create_skipgrams(idx_test_tokens, window_size=2, pad_idx=pad_idx)

train_batches = create_batches(skipgrams_train, word2idx, pad_idx, batch_size=512)
valid_batches = create_batches(skipgrams_valid, word2idx, pad_idx, batch_size=512)
test_batches = create_batches(skipgrams_test, word2idx, pad_idx, batch_size=512)

23-Jul-23 18:26:02 - INFO - Starting 'load_config'.
23-Jul-23 18:26:02 - INFO - Finished 'load_config' in 0.0083 secs.
23-Jul-23 18:26:02 - INFO - Starting 'load_data'.
23-Jul-23 18:26:02 - INFO - Finished 'load_data' in 0.2805 secs.
23-Jul-23 18:26:02 - INFO - Starting 'create_validation_set'.
23-Jul-23 18:26:02 - INFO - Finished 'create_validation_set' in 0.0071 secs.
23-Jul-23 18:26:02 - INFO - Starting 'clean_text'.
  clean_docs = clean_docs.str.replace(r'\d+', ' <NUM> ')
  clean_docs = clean_docs.str.replace(r'\s+', ' ')
23-Jul-23 18:26:03 - INFO - Finished 'clean_text' in 1.3954 secs.
23-Jul-23 18:26:03 - INFO - Starting 'clean_text'.
23-Jul-23 18:26:04 - INFO - Finished 'clean_text' in 0.1643 secs.
23-Jul-23 18:26:04 - INFO - Starting 'clean_text'.
23-Jul-23 18:26:04 - INFO - Finished 'clean_text' in 0.1041 secs.
23-Jul-23 18:26:04 - INFO - Starting 'split_docs'.
23-Jul-23 18:26:04 - INFO - Finished 'split_docs' in 0.2193 secs.
23-Jul-23 18:26:04 - INFO - Starting 'split_docs'.


In [5]:
vocab_size = len(vocab)
model = CBOW_NS(vocab_size, conf.model.embed_size)
trained_model, train_losses, val_losses = train(model, conf.model.epochs, train_batches, valid_batches, conf.model.lr)

23-Jul-23 18:26:44 - INFO - Starting 'train'.
23-Jul-23 18:27:44 - INFO - Epoch 0, Train Loss: 10978.197920315002, Validation Loss: 12168.784194443251
23-Jul-23 18:28:45 - INFO - Epoch 1, Train Loss: 4300.19151875274, Validation Loss: 9050.820541884874
23-Jul-23 18:29:45 - INFO - Epoch 2, Train Loss: 2719.8324911263558, Validation Loss: 6771.897746662517
23-Jul-23 18:30:45 - INFO - Epoch 3, Train Loss: 1855.5695792963388, Validation Loss: 5238.910003997467
23-Jul-23 18:31:45 - INFO - Epoch 4, Train Loss: 1384.6389267372626, Validation Loss: 4290.764213813531
23-Jul-23 18:32:45 - INFO - Epoch 5, Train Loss: 1127.5180811485018, Validation Loss: 3724.794151222313
23-Jul-23 18:33:45 - INFO - Epoch 6, Train Loss: 982.4532469307078, Validation Loss: 3390.2244909936253
23-Jul-23 18:34:46 - INFO - Epoch 7, Train Loss: 895.7740023590582, Validation Loss: 3195.1952346969438
23-Jul-23 18:35:46 - INFO - Epoch 8, Train Loss: 840.1194770256897, Validation Loss: 3085.270539336152
23-Jul-23 18:38:58 -

In [6]:
plot_losses(train_losses, val_losses, conf.model.epochs)

In [7]:
nu.save_pytorch_model(trained_model, file_path=f"{conf.paths.models}cbow_ns_{nu._get_current_dt()}.pt")

In [8]:
# We retrieve the embeddings from the model
embeddings = model.embeddings.weight.data.numpy()

In [13]:
# Define pairs of words
word_pairs = [('nasa', 'space'), ('car', 'bus'), ('cat', 'dog')]

# Calculate cosine similarities
for word1, word2 in word_pairs:
    idx1 = word2idx[word1]
    idx2 = word2idx[word2]
    sim = cosine_similarity([embeddings[idx1]], [embeddings[idx2]])
    print(f"Similarity between {word1} and {word2}: {sim[0][0]}")

Similarity between nasa and space: 0.4396308660507202
Similarity between car and bus: 0.18682055175304413
Similarity between cat and dog: 0.057477667927742004


In [15]:
# Perform t-SNE
tsne = TSNE(n_components=3, random_state=42)  # reduce to 3 components
embeddings_tsne = tsne.fit_transform(embeddings)


The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.



In [16]:
idx2word = {value: key for key, value in word2idx.items()}

In [17]:
# Convert to DataFrame
df = pd.DataFrame(embeddings_tsne, columns=['Dim1', 'Dim2', 'Dim3'])
df['word'] = idx2word.values()  # assuming idx2word is your index to word mapping

In [20]:
# Subset of words you're interested in
words = vocab  # replace with your words
df_subset = df[df['word'].isin(words)]

In [21]:
# Plot
fig = px.scatter_3d(df_subset, x='Dim1', y='Dim2', z='Dim3', text='word')
fig.show()

In [10]:
# Perform PCA
pca = PCA(n_components=3)  # reduce to 2 components
embeddings_pca = pca.fit_transform(embeddings)

In [None]:
words = vocab[:10]

In [None]:
# Plot some words
# words = ['word1', 'word2', 'word3']  # replace with your words
for word in words:
    idx = word2idx[word]
    plt.scatter(embeddings_pca[idx, 0], embeddings_pca[idx, 1])
    plt.annotate(word, (embeddings_pca[idx, 0], embeddings_pca[idx, 1]))
plt.show()