In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import mlflow

from torch.utils.tensorboard import SummaryWriter

import os

from pathlib import Path
import logging
from enum import StrEnum

from src.data_handling import process_training_data
from src.mlflow_setup import mlflow_setup

# Configure the root logger so INFO-level (and more severe) records are emitted.
logging.basicConfig(level=logging.INFO)

class ModelType(StrEnum):
    """Selects which model implementation the notebook imports and trains."""

    # Members are strings; the notebook passes the selected member directly as
    # the MLflow run description and as the TensorBoard writer comment.
    SPARSE = "sparse"
    DENSE = "dense"

model_type = ModelType.SPARSE

match model_type:
    case ModelType.SPARSE:
        from src.neural_analysis.sparse import ImdbDataSet, Model
    case ModelType.DENSE:
        from src.neural_analysis.dense import ImdbDataSet, Model
    case _:
        raise Exception

# Project helper imported from src.mlflow_setup; what it configures is not
# visible in this notebook.
mlflow_setup()
mlflow.set_experiment("Neural analysis")
# End whatever run is currently active (for example one left over from an
# earlier execution of this cell) before a new one is started below.
mlflow.end_run()
# model_type is a StrEnum member, i.e. a string, so it is passed as the text of
# the run description and of the TensorBoard comment.
mlflow_run = mlflow.start_run(description=model_type)
tb_writer = SummaryWriter(comment=model_type)

# All inputs and cached intermediates are read from / written to ./data.
data_path = Path("data")
source_data_path = data_path / "imdb.csv"
imdb_data = pd.read_csv(source_data_path)

# process_training_data is only invoked when the counts file is missing, so an
# existing file is reused as-is.
full_context_counts_path = data_path / "full_context_counts.csv"
if not full_context_counts_path.exists():
    process_training_data(
        imdb_data["review"],
        imdb_data["sentiment"],
        full_context_counts_path,
    )



In [None]:
# Vocabulary cut-off: keep only rows whose `total` column exceeds this value.
words_more_than = 19
mlflow.log_param("words_more_than", words_more_than)

context_counts = (
    pd.read_csv(full_context_counts_path)
    .loc[lambda counts: counts["total"] > words_more_than]
)

# Disabled experiment, kept for reference: additionally filter on the relative
# difference between the negative and positive counts.
# neg_pos_diff = (context_counts["negative"]-context_counts["positive"])/context_counts["total"]
# neg_pos_diff_more_than = 0.2
# mlflow.log_param("neg_pos_diff_more_than", neg_pos_diff_more_than)
# context_counts = context_counts[neg_pos_diff.abs() > neg_pos_diff_more_than]

# Quick check that no word appears twice in the filtered table.
print(context_counts["word"].is_unique)

# Record the filtered table as an input dataset of the active MLflow run.
context_counts_dataset = mlflow.data.from_pandas(
    context_counts,
    source=str(full_context_counts_path),
    name="context counts",
)
mlflow.log_input(context_counts_dataset)

context_counts


In [None]:
from src.data_handling import process_test_data

# process_test_data is only invoked when the stemmed file is missing; an
# existing file is reused as-is.
test_data_path = data_path / "stemmed_data.csv"

if not test_data_path.exists():
    process_test_data(
        imdb_data["review"],
        imdb_data["sentiment"],
        test_data_path,
    )

In [None]:
from src.data_handling import SparseNumericTestDataIO, StandardLengthTestData

import torch

# Rebuild the numeric test data from the current context_counts vocabulary
# ------------------------------------------------------------------------

test_data = pd.read_csv(test_data_path)
test_data_input = mlflow.data.from_pandas(
    test_data,
    source=str(test_data_path),
    name="test data",
)
mlflow.log_input(test_data_input)

used_words = context_counts.word
numeric_test_data_path = data_path / "sparse_numeric_stemmed_data.dat"

# When False, the write step is skipped and the file already on disk is read.
compute = True
if compute:
    numeric_writer = SparseNumericTestDataIO(
        test_data["context"],
        test_data["words"],
        used_words,
    )
    numeric_writer.write(numeric_test_data_path)

numeric_test_data = SparseNumericTestDataIO.read(numeric_test_data_path)
if model_type == ModelType.SPARSE:
    # Only the sparse model goes through the standard-length conversion.
    sentence_length = 0.99
    numeric_test_data = StandardLengthTestData.from_sparse(
        numeric_test_data,
        sentence_length=sentence_length,
    )
# =========================================================

# The numeric encoding adds an <unk> token (and possibly others), so the
# vocabulary is rebuilt from the encoded data rather than from context_counts.
used_words = pd.Series(numeric_test_data.train_words_dict.values())
used_words

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float32

train_size, eval_size, test_size = 40000, 5000, 5000

# Row boundaries of the three consecutive splits.
eval_start = train_size
test_start = eval_start + eval_size
test_stop = test_start + test_size

# Naming note: this is not really "test" data any more, because the model is
# trained on it. The earlier data is training data as well, since it is the
# basis used to process this "test" data.
mlflow.log_param("train set size", train_size)
train_dataset = ImdbDataSet(numeric_test_data, end_row=train_size, device=device)

mlflow.log_param("eval set size", eval_size)
eval_dataset = ImdbDataSet(
    numeric_test_data,
    start_row=eval_start,
    end_row=test_start,
    device=device,
)

test_dataset = ImdbDataSet(
    numeric_test_data,
    start_row=test_start,
    end_row=test_stop,
    device=device,
)
# Unlike the two sizes above, this logs the length of the dataset actually built.
mlflow.log_param("test set size", len(test_dataset))

In [None]:
print(len(train_dataset), len(test_dataset))

# The printed value is the fraction of non-zero entries in the training inputs,
# computed on a CPU copy.
words, context = train_dataset[:]
nonzero_mask = words.cpu().to(dtype=bool)
print("Data sparsity:", nonzero_mask.to(dtype=float).mean())

# Drop the full-dataset references again before moving on.
del words, context, nonzero_mask

train_dataset[:5]

In [None]:
# Vocabulary size handed to the model constructor.
num_used_words = len(used_words)

embedding_dim = 3
mlflow.log_param("embedding dimension", embedding_dim)

# Positional constructor arguments; the sparse model additionally receives the
# sentence length stored on the numeric test data.
args = [num_used_words, embedding_dim]
if model_type is ModelType.SPARSE:
    args += [numeric_test_data.sentence_length]

model = Model(*args, device=device)

# Save the model's textual description and attach it to the MLflow run.
artifacts_path = Path("artifacts")
artifacts_path.mkdir(exist_ok=True)
model_desc_path = artifacts_path / "model_description.txt"
model_desc_path.write_text(str(model) + "\n")
mlflow.log_artifact(model_desc_path)

print(model.embeddings)

In [None]:
from torch.utils.data import DataLoader
batch_size = 300
shuffle = True
mlflow.log_param("batch size", batch_size)
mlflow.log_param("shuffle during training", shuffle)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=shuffle)

# Peek at one batch to check its structure and the shape of its first element.
# It gets a real name rather than `_`: in IPython `_` is the last-output
# variable, and assigning to it by hand stops IPython from updating it.
sample_batch = next(iter(train_dataloader))
print(sample_batch)
print(sample_batch[0].shape)

In [None]:
# Smoke test: push the first five training samples through the untrained model.
words, context = train_dataset[:5]
float_words = words.to(dtype=torch.float32)
pred = model(float_words)

# Record the model graph in TensorBoard using the same example input.
tb_writer.add_graph(model, float_words)

print(pred)
print(context)
predicted_class = pred.argmax(dim=1)
print(f"{predicted_class=}")
nlll = torch.nn.NLLLoss()
print(nlll(pred, context))

# Hand-written counterpart of the loss printed above: the negated mean of the
# predicted value at each sample's true class.
row_index = range(len(context))
-pred[row_index, context].mean()

The idea here is to train the vectors for the words as the weight matrix
of the first layer. The matrix would get updated according to the loss,
so that a vector that matches a negative word should result in a more
negative guess, and similarly for positive words. Further, words that
appear together get updated similarly.

In [None]:
from copy import deepcopy

# Build a new model instance so that re-running this cell restarts training
# from a newly constructed model.
model = Model(*args, device=device)

# Save the loss function's textual description and attach it to the MLflow run.
loss_fn = torch.nn.NLLLoss()
loss_path = artifacts_path / "loss.txt"
with open(loss_path, "w") as f:
    print(loss_fn, file=f)
mlflow.log_artifact(loss_path)

optim = torch.optim.Adam(model.parameters(), lr = 1e-3)
optim_path = artifacts_path / "optim.txt"
with open(optim_path, "w") as f:
    print(optim, file=f)
# The optimiser description was written to disk but never attached to the run,
# unlike the model and loss descriptions; log it as well.
mlflow.log_artifact(optim_path)

def test(model, test_data):

    words, context = test_data[:]

    model.eval()
    with torch.no_grad():

        pred = model(words).argmax(dim=1)
        
        acc = 1-torch.abs(context-pred).mean(dtype=float)


    model.train()
    return acc.item()

def scaled_normalisation(embedding:torch.Tensor, has_padding = True):
    """Rescale each column of ``embedding`` in place and return it.

    Every column is shifted so its minimum becomes ``1e-6`` and is then divided
    by its maximum after the shift (floored at ``1e-6``).

    Args:
        embedding: 2-D tensor; it is modified in place.
        has_padding: When true, row 0 is left untouched and only rows 1
            onwards are rescaled.

    Returns:
        The same ``embedding`` object that was passed in.

    NOTE(review): because the tensor is modified in place, passing a parameter
    that requires grad presumably needs ``torch.no_grad()`` around the call.
    Confirm against the model implementation.
    """
    # A slice is a view, so the in-place operations below write into `embedding`.
    target = embedding[1:, :] if has_padding else embedding

    column_min = target.min(dim=0).values
    target -= column_min - 1e-6

    floor = torch.full_like(target, 1e-6)
    column_max = torch.maximum(target, floor).max(dim=0).values
    target /= column_max
    # new_embed = new_embed*2.0 - 1.0

    if has_padding:
        embedding[1:, :] = target
    return embedding

def unit_normalisation_with_constant(embedding:torch.Tensor):
    """Apply ``scaled_normalisation`` and then set every entry of row 0 to 1.0.

    ``scaled_normalisation`` is called with its default padding handling.
    Returns whatever tensor ``scaled_normalisation`` returned.
    """
    normalised = scaled_normalisation(embedding)
    normalised[0] = 1.0
    return normalised

def normalisation(embedding:torch.Tensor, has_padding = True):
    """Normalise ``embedding``; currently a thin wrapper around ``scaled_normalisation``.

    Args:
        embedding: Tensor forwarded to ``scaled_normalisation``.
        has_padding: Forwarded unchanged to ``scaled_normalisation``.

    Returns:
        The tensor returned by ``scaled_normalisation``.
    """
    # Disabled experiment, kept for reference:
    # embedding[0,0] = 0.1
    # embedding[1:,0] = -0.1
    return scaled_normalisation(embedding, has_padding)

total_epochs = 20
mlflow.log_param("epochs", total_epochs)


losses = []
train_accs = []
eval_accs = []
max_eval = 0.0
# Seed the "best so far" bookkeeping with the model as it is before training.
# Without this, best_state_dict/best_epoch stay undefined (NameError after the
# loop) when no epoch reaches an eval accuracy above 0.0 or total_epochs is 0.
best_state_dict = deepcopy(model.state_dict())
best_epoch = 0
for epoch in range(total_epochs):

    epoch_loss = []
    for words, context in train_dataloader:

        pred = model(words)
        # NOTE(review): NLLLoss is constructed with its defaults, which already
        # average over the batch, so this division scales the loss a second
        # time. Confirm that this is intended.
        loss = loss_fn(pred, context)/batch_size

        loss.backward()
        optim.step()

        # Only the dense model has its embeddings re-normalised after each step.
        if model_type == ModelType.DENSE:
            model.embeddings = normalisation(model.embeddings, False)

        optim.zero_grad()

        epoch_loss.append(loss.item())


    mean_epoch_loss = torch.tensor(epoch_loss).mean().item()

    tb_writer.add_scalar("mean_epoch_loss/train", mean_epoch_loss, global_step=epoch)
    mlflow.log_metric("mean epoch loss", mean_epoch_loss, step=epoch)
    losses.extend(epoch_loss)

    train_acc = test(model, train_dataset)
    mlflow.log_metric("train accuracy", train_acc, step=epoch)
    print(f"{train_acc=}")
    train_accs.append(train_acc)

    eval_acc = test(model, eval_dataset)
    mlflow.log_metric("eval accuracy", eval_acc, step=epoch)
    # Keep a copy of the weights from the epoch with the best eval accuracy.
    if eval_acc > max_eval:
        best_state_dict = deepcopy(model.state_dict())
        max_eval = eval_acc
        best_epoch = epoch

    tb_writer.add_scalars("accuracy", dict(
        evaluation=eval_acc,
        train=train_acc
    ), global_step=epoch)

    print(f"{eval_acc=}")
    eval_accs.append(eval_acc)

# Per-batch losses of all epochs as one tensor (used by the plotting cell).
losses = torch.tensor(losses)

# Restore the best weights before exporting the embeddings and the model.
model.load_state_dict(best_state_dict)
tb_writer.add_embedding(model.embeddings.cpu(), used_words, global_step=best_epoch)
# `words` is the last training batch; it serves as the example input for the
# logged model signature.
sig = mlflow.models.infer_signature(
    words.cpu().numpy(),
    model(words).detach().cpu().numpy()
)
mlflow.pytorch.log_model(model, "model", signature=sig)

In [None]:
%matplotlib inline

import matplotlib.pyplot as plt
import matplotlib.style as mplstyle

import itertools

mplstyle.use(["fast"])

print(losses.size())

window = 50
# Average the per-batch losses over consecutive, non-overlapping chunks of
# `window` values. The last chunk is shorter whenever the number of losses is
# not a multiple of `window`, so each chunk is divided by its own length;
# dividing by `window` would drag the final plotted point towards zero.
rolling_average_loss = [
    float(sum(chunk) / len(chunk))
    for chunk in itertools.batched(losses, window)
]
print(f"Mean: {losses.mean().item()}")
print(f"Var: {losses.var().item()}")
# This is the eval accuracy of the last epoch, not of the best one.
print(f"{eval_acc=}")
plt.close("all")
plt.plot(
    rolling_average_loss,
)
_ = plt.title(f"Rolling average loss, window: {window}\n epochs: {total_epochs} data per epoch: {len(train_dataset)} batch size: {batch_size}")
# plt.plot(sorted(preds))

In [None]:
# Show every vocabulary entry next to its position in used_words.
list(enumerate(used_words))

In [None]:
# get vector representations of the words from the model
word_vecs = model.embeddings.cpu().clone().detach()

# Vocabulary entry to inspect; swap in another one (e.g. "bad") to explore.
# The previous extra lookup of "bad" was overwritten immediately, so it only
# added a way for the cell to fail.
chosen_token = "excel"
chosen_idx = int(used_words[used_words == chosen_token].index[0])
print("Chosen idx: ", chosen_idx)
# vector representation of the word
chosen_word = word_vecs[chosen_idx]
# actual word itself
print("Chosen word:", used_words[chosen_idx])

# Distance from the chosen vector to every word vector. PairwiseDistance is a
# p-norm distance (Euclidean by default), not a cosine similarity, so a SMALLER
# value means a MORE similar word. One batched call replaces the per-row loop.
euclidean_distance = torch.nn.PairwiseDistance()
similarities = euclidean_distance(chosen_word.expand_as(word_vecs), word_vecs)
most_similar = torch.argsort(similarities, descending=False)
plt.plot(similarities.flatten().sort()[0])
similars_list = used_words[most_similar.numpy()].to_list()
# Write the full ranking to disk; the context manager closes the file, which
# the previous inline open() never did.
with open("temp.dat", "w") as ranking_file:
    print(*similars_list, sep="\n", file=ranking_file)
pd.DataFrame(dict(most_similar=similars_list[:20], least_similar=similars_list[-20:][::-1]))

In [None]:
# Only the last expression of a cell is displayed, so the <unk> vector is
# printed explicitly; as a bare expression its value was silently discarded.
print("<unk> vector:", word_vecs[int(used_words[used_words == "<unk>"].index[0])])
# Per-dimension minimum and maximum over all word vectors.
word_vecs.min(dim=0), word_vecs.max(dim=0)

In [None]:
%matplotlib widget

plt.close("all")
fig = plt.figure()
ax = fig.add_subplot(projection="3d")

# NOTE(review): the axes are 3-D but only the first two embedding dimensions
# are passed, so no z coordinate is supplied. Confirm whether the third
# dimension was meant to be plotted too.
xs, ys = word_vecs.T[[0, 1], :]
ax.scatter(xs, ys)

In [None]:
%matplotlib inline


words, context = test_dataset[:]
model.eval()
# Inference only: run the forward pass over the whole test split without
# autograd bookkeeping, instead of building a graph and detaching afterwards.
with torch.no_grad():
    pred: torch.Tensor = model(words)
print(pred)
# Presumably the model outputs log-probabilities (it is trained with NLLLoss),
# in which case exp() yields probabilities. Confirm against the model.
print(torch.exp(pred))
print(context)
pred_class = pred.argmax(dim=1)
# Fraction of exact matches, as a plain float for logging. For 0/1 labels this
# equals the earlier 1 - mean(|label - pred|), and it stays correct for more
# than two classes.
acc = (pred_class == context).to(dtype=float).mean().item()
mlflow.log_metric("test accuracy", acc)
print(f"{acc=}")
pred = torch.exp(pred)
# how far the predictions were from the true value
surprise = 1-torch.gather(pred, 1, context.reshape((-1,1)))
surprise = surprise.flatten().to(device="cpu")
print(surprise)

surprise_mean = surprise.mean(dtype=float)
print(f"Mean of surprises: {surprise_mean}")
plt.close("all")
plt.hist(surprise)
plt.title("Distribution of surprises")
# Only the histogram is attached to the MLflow run; the sorted plot is not.
mlflow.log_figure(plt.gcf(), "surprise.png")
plt.figure()
sorted_surprise = surprise.sort()
plt.plot(sorted_surprise.values)
plt.title("Sorted surprises")

In [None]:
# Display the sorted surprises (the result of Tensor.sort(): values and indices).
sorted_surprise

In [None]:
from collections import Counter
con = test_dataset.context.clone().cpu().numpy()
# Bring the predictions to the CPU as NumPy as well, so that both sides of the
# comparison below are NumPy scalars whatever device the model ran on.
pred_np = pred_class.detach().cpu().numpy()

# One label per sample: "<true|false>_<context word>", where the first part
# says whether the prediction matched. Building the list directly avoids the
# earlier pre-sized list, which kept None entries if the zip came up short.
ft_np = [
    f"{str(bool(context_num == pred_context_num)).lower()}_{context_word}"
    for context_word, context_num, pred_context_num
    in zip(test_dataset.numeric_context_to_word(), con, pred_np)
]

ft_counts = Counter(ft_np)
ft_counts

In [None]:
# End the active MLflow run and close the TensorBoard writer created in the
# first cell.
mlflow.end_run()
tb_writer.close()
    

In [None]:
# Ask PyTorch to release the cached GPU memory it is currently not using.
torch.cuda.empty_cache()