In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import mlflow

from torch.utils.tensorboard import SummaryWriter

import os

from pathlib import Path
import logging
logging.basicConfig(level=logging.INFO)

from src.data_handling import process_training_data
from src.mlflow_setup import mlflow_setup

# Project helper; presumably configures MLflow tracking (see src.mlflow_setup).
mlflow_setup()
mlflow.set_experiment("Neural analysis")

# TensorBoard writer shared by the later cells.
tb_writer = SummaryWriter()

# Raw reviews and their sentiment labels.
data_path = Path("data")
source_data_path = data_path / "imdb.csv"
imdb_data = pd.read_csv(source_data_path)

# The context-count table is only generated when the file is missing,
# so the file on disk acts as a cache between runs.
full_context_counts_path = data_path / "full_context_counts.csv"
context_counts_cached = os.path.exists(full_context_counts_path)
if not context_counts_cached:
    process_training_data(
        imdb_data["review"],
        imdb_data["sentiment"],
        full_context_counts_path,
    )



In [None]:
# Keep only the rows whose `total` column exceeds this threshold.
words_more_than = 99
all_context_counts = pd.read_csv(full_context_counts_path)
context_counts = all_context_counts.query(f"total > {words_more_than}")

# Shows whether every value of the `word` column occurs exactly once.
print(context_counts.word.is_unique)

# Record the filtered table as an input of the active MLflow run.
context_counts_dataset = mlflow.data.from_pandas(
    context_counts,
    source=str(full_context_counts_path),
    name="context counts",
)
mlflow.log_input(context_counts_dataset)

context_counts


In [None]:
from src.data_handling import process_test_data

test_data_path = data_path / "stemmed_data.csv"

# The processed file acts as a cache: it is only generated when missing.
test_data_cached = os.path.exists(test_data_path)
if not test_data_cached:
    process_test_data(
        imdb_data["review"],
        imdb_data["sentiment"],
        test_data_path,
    )

In [None]:
from src.data_handling import test_data_to_numeric

import torch

# Write new numeric test data based on context_counts
# ---------------------------------------------------

test_data = pd.read_csv(test_data_path)
mlflow.log_input(
    mlflow.data.from_pandas(
        test_data,
        source=str(test_data_path),
        name="test data"
    )
)

used_words = context_counts.word
numeric_test_data_path = data_path / "numeric_stemmed_data.parquet"
# Generate the numeric file whenever it is missing, so that a fresh run does
# not fail when the datasets below try to read it. This mirrors the existence
# checks guarding the other generated files.
# Set `compute = True` by hand to force regeneration; this is presumably needed
# after changing `words_more_than`, since `used_words` is an input here.
compute = not os.path.exists(numeric_test_data_path)
if compute:
    test_data_to_numeric(
        test_data.context,
        test_data.words,
        used_words,
        numeric_test_data_path
    )
# =========================================================

# Use the GPU when one is available, otherwise fall back to the CPU.
device = (
    "cuda"
    if torch.cuda.is_available()
    else "cpu"
)
dtype = torch.float32
class ImdbDataSet(torch.utils.data.TensorDataset):
    '''
    In-memory dataset built from a contiguous range of rows of a parquet file.

    The file must contain a "__context__" column, which is used as the label.
    Every column after the first one is used as a feature; this assumes that
    "__context__" is the first column of the file.

    Attributes:
        context:
            int64 tensor holding one label code per row
        words:
            float32 tensor holding one feature row per data row
        context_mapping:
            dict mapping each integer label code to its original label
        device:
            device both tensors are stored on
    '''

    def __init__(self, data_path, start_row=0, end_row = None, device = None):
        '''
        Arguments:
            data_path:
                path of the parquet file to read
            start_row:
                position of the first row to include
            end_row:
                position one past the last row to include, counted from the
                start of the file (not from start_row); None reads to the
                end of the file
            device:
                torch device to store the tensors on, "cpu" when omitted
        '''

        # Both bounds are absolute row positions, so consecutive calls such as
        # (0, a), (a, b), (b, None) produce non-overlapping datasets.
        data = pd.read_parquet(data_path).iloc[start_row:end_row]

        # create context mappings
        # NOTE: the codes are derived from the labels present in this slice
        # only, so two datasets agree on the encoding only when both slices
        # contain the same set of labels.
        context_categorical = pd.Categorical(data["__context__"])
        codes = context_categorical.codes
        # code -> original label (code i is position i in `categories`)
        self.context_mapping = dict(enumerate(context_categorical.categories))

        self.device = device or "cpu"
        # needs to be int64 for nll loss, seemingly
        # could also just directly calculate though, which would
        # allow using at least int32
        self.context = torch.tensor(codes, dtype=torch.int64, device=self.device)

        self.words = torch.tensor(data.iloc[:,1:].to_numpy(), dtype=torch.float32, device=self.device)

        # Initialise the base class so that inherited state (`tensors`) exists.
        super().__init__(self.words, self.context)

    def __len__(self):
        '''Number of rows in the dataset.'''

        return len(self.context)

    def __getitem__(self, idx):
        '''Return the (features, label code) pair(s) selected by idx.'''

        return self.words[idx], self.context[idx]

    def numeric_context_to_word(self):
        '''Return the original "__context__" label of every row, in row order.'''

        return [self.context_mapping[code] for code in self.context.tolist()]


# Despite its name, this is not really test data: it is what the
# model is trained on. The earlier "training" data is training data
# too, because it is used as the basis for processing this "test"
# data.
def _load_rows(start_row=0, end_row=None):
    """Build an ImdbDataSet over the numeric data file on the selected device."""
    return ImdbDataSet(
        numeric_test_data_path,
        start_row=start_row,
        end_row=end_row,
        device=device,
    )


# Training split: starts at the first row, with end_row set to train_size.
train_size = 40000
mlflow.log_param("train set size", train_size)
train_dataset = _load_rows(end_row=train_size)

# Evaluation split: starts where the training split was cut off.
eval_size = 5000
mlflow.log_param("eval set size", eval_size)
eval_dataset = _load_rows(start_row=train_size, end_row=train_size + eval_size)

# Test split: everything from the end of the evaluation range onwards.
test_dataset = _load_rows(start_row=train_size + eval_size)
mlflow.log_param("test set size", len(test_dataset))

In [None]:
print(len(train_dataset), len(test_dataset))

# Mean over the whole training feature matrix; the temporaries are dropped
# again straight away so they do not linger in the kernel namespace.
all_words, all_context = train_dataset[:]
print("Data sparsity:", all_words.to(dtype = torch.float32).mean())
del all_words, all_context

# Peek at the first five (features, label) rows.
train_dataset[:5]

In [None]:
# Width of the input layer: one input feature per word in used_words.
num_used_words = len(used_words)

def get_model():
    """
    Build a fresh, untrained classifier.

    Layers, in order: Linear(num_used_words -> 8, no bias), ReLU,
    Linear(8 -> 4, with bias), Linear(4 -> 2, no bias), LogSoftmax over dim 1.
    All layers are created on the global `device` with the global `dtype`.
    """
    placement = dict(device=device, dtype=dtype)

    embedding = torch.nn.Linear(num_used_words, 8, bias=False, **placement)
    # Constant initialisation: every weight of the first layer starts at
    # 1 / (number of weights in that layer).
    torch.nn.init.constant_(embedding.weight, 1/torch.numel(embedding.weight))

    hidden = torch.nn.Linear(8, 4, bias=True, **placement)
    output = torch.nn.Linear(4, 2, bias=False, **placement)

    layers = [
        embedding,
        torch.nn.ReLU(),
        hidden,
        output,
        torch.nn.LogSoftmax(dim=1),
    ]
    return torch.nn.Sequential(*layers)

model = get_model()

artifacts_path = Path("artifacts")
artifacts_path.mkdir(exist_ok=True)

# Store the textual description of the model and attach it to the MLflow run.
model_desc_path = artifacts_path / "model_description.txt"
with model_desc_path.open("w") as description_file:
    description_file.write(f"{model}\n")
mlflow.log_artifact(model_desc_path)

# The first layer has no bias, so its weight is its first (and only) parameter.
print(model[0].weight)

In [None]:
from torch.utils.data import DataLoader
batch_size = 20
shuffle = True
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=shuffle)
mlflow.log_param("batch size", batch_size)
mlflow.log_param("shuffle during training", shuffle)

# Pull one batch to inspect what the loader yields.
first_batch = next(iter(train_dataloader))
print(first_batch)
print(first_batch[0].shape)

In [None]:
# Run the untrained model on the first five training rows as a sanity check.
words, context = train_dataset[:5]
sample_input = words.to(dtype = torch.float32)
pred = model(sample_input)

# Record the model graph in TensorBoard using the same sample input.
tb_writer.add_graph(model, sample_input)

print(pred)
print(context)
predicted_class = pred.argmax(dim=1)
print(f"{predicted_class=}")

nll_loss = torch.nn.NLLLoss()
print(nll_loss(pred, context))

# Same quantity computed by hand: the negated mean of the model output
# selected at each row's label.
row_index = range(len(context))
-pred[row_index, context].mean()

The idea here is to learn the word vectors as the weight matrix of the
first layer, with one vector per word. The matrix is updated according
to the loss, so that the vector of a negative word should push the model
towards a more negative guess, and similarly for positive words.
Further, words that appear together get updated similarly.

Not sure whether these should be updated in batches or not.

In [None]:

# Start from a fresh model so that re-running this cell restarts training.
model = get_model()

# Loss function; its description is stored as a run artifact.
loss_fn = torch.nn.NLLLoss()
loss_path = artifacts_path / "loss.txt"
with open(loss_path, "w") as f:
    print(loss_fn, file=f)
mlflow.log_artifact(loss_path)

# Optimiser; its description (including hyperparameters) is stored as a
# run artifact as well, in the same way as the loss description.
optim = torch.optim.Adam(model.parameters(), lr = 1e-3)
optim_path = artifacts_path / "optim.txt"
with open(optim_path, "w") as f:
    print(optim, file=f)
mlflow.log_artifact(optim_path)


# Per-batch losses and per-epoch accuracies collected by the training loop.
losses = []
train_accs = []
eval_accs = []

def test(model, test_data):

    words, context = test_data[:]

    model.eval()
    with torch.no_grad():

        pred = model(words).argmax(dim=1)
        
        acc = 1-torch.abs(context-pred).mean(dtype=float)


    model.train()
    return acc.item()

total_epochs = 5
mlflow.log_param("epochs", total_epochs)

for epoch in range(total_epochs):

    # One optimiser step per batch; the loss of every batch is kept.
    epoch_loss = []
    for words, context in train_dataloader:
        loss = loss_fn(model(words), context)

        loss.backward()
        optim.step()
        optim.zero_grad()

        epoch_loss.append(loss.item())

    # Mean training loss of the epoch.
    mean_epoch_loss = torch.tensor(epoch_loss).mean().item()
    tb_writer.add_scalar("mean_epoch_loss/train", mean_epoch_loss, global_step=epoch)
    mlflow.log_metric("mean epoch loss", mean_epoch_loss, step=epoch)
    losses += epoch_loss

    # Accuracy on the training data after this epoch.
    train_acc = test(model, train_dataset)
    mlflow.log_metric("train accuracy", train_acc, step=epoch)
    print(f"{train_acc=}")
    train_accs.append(train_acc)

    # Accuracy on the evaluation data after this epoch.
    eval_acc = test(model, eval_dataset)
    mlflow.log_metric("eval accuracy", eval_acc, step=epoch)

    accuracy_by_split = {"evaluation": eval_acc, "train": train_acc}
    tb_writer.add_scalars("accuracy", accuracy_by_split, global_step=epoch)

    print(f"{eval_acc=}")
    eval_accs.append(eval_acc)

# Per-batch losses of all epochs as one tensor, for the analysis below.
losses = torch.tensor(losses)


# Infer the model signature from an example batch. `words` holds the last
# batch yielded by the training loop (when at least one epoch ran).
example_input = words
example_output = model(example_input).detach()
sig = mlflow.models.infer_signature(
    example_input.cpu().numpy(),
    example_output.cpu().numpy(),
)
mlflow.pytorch.log_model(model, "model", signature=sig)

In [None]:
import matplotlib.pyplot as plt

import itertools

print(losses.size())

window = 50
# Average the per-batch losses over consecutive, non-overlapping chunks of
# `window` batches. Each chunk is divided by its own length, so a shorter
# final chunk (when the number of batches is not a multiple of `window`)
# is still averaged correctly instead of being scaled down.
rolling_average_loss = [
    sum(chunk) / len(chunk)
    for chunk in itertools.batched(losses.tolist(), window)
]
print(f"Mean: {losses.mean().item()}")
print(f"Var: {losses.var().item()}")
print(f"{eval_acc=}")
plt.plot(
    rolling_average_loss,
)
plt.title(f"Rolling average loss, window: {window}\n epochs: {total_epochs} data per epoch: {len(train_dataset)} batch size: {batch_size}");

In [None]:
# Show every used word together with its position, to help pick an index below.
[(position, word) for position, word in enumerate(used_words)]

In [None]:
# Get vector representations of the words from the model: the first
# parameter is the weight of the first layer, transposed here so that there is
# one row per input feature. This assumes the input features are ordered like
# `used_words`.
word_vecs = next(iter(model.parameters())).T.clone().detach()
chosen_idx = 19
# vector representation of the word
chosen_word = word_vecs[chosen_idx]
# actual word itself
# `used_words` still carries the row labels of the unfiltered table, so the
# word has to be selected by position (.iloc) to match the row of word_vecs.
print("Chosen word:", used_words.iloc[chosen_idx])

# determine how similar each of the other words is to this one
# (computed for all rows at once; moved to the CPU for numpy and plotting)
cos = torch.nn.CosineSimilarity(dim=-1)
similarities = cos(chosen_word.unsqueeze(0), word_vecs).cpu()
most_similar = torch.argsort(similarities, descending=True)
plt.plot(similarities.flatten().sort()[0])

# Write the words from most to least similar, one per line; the context
# manager makes sure the file is flushed and closed.
with open("temp.dat", "w") as similar_words_file:
    print(
        *used_words.iloc[most_similar.numpy()].to_list(),
        sep="\n",
        file=similar_words_file,
    )

In [None]:
# Display the word-vector matrix extracted from the model.
word_vecs

In [None]:

words, context = test_dataset[:]
model.eval()
# The whole test set goes through the model in one pass; gradients are not
# needed here, so no autograd graph is built for it.
with torch.no_grad():
    pred = model(words)
print(pred)
print(torch.exp(pred))
print(context)
# Fraction of rows whose most likely class equals the label, as a plain float
# (the same quantity and type that `test` reports for the other splits).
acc = (pred.argmax(dim=1) == context).to(torch.float64).mean().item()
mlflow.log_metric("test accuracy", acc)
print(f"{acc=}")
# The model outputs log-probabilities; convert them to probabilities.
pred = torch.exp(pred)
# how far the predictions were from the true value:
# 1 - probability assigned to the true class of each row
surprise = 1-torch.gather(pred, 1, context.reshape((-1,1)))
surprise = surprise.flatten().to(device="cpu")
print(surprise)

surprise_mean = surprise.mean(dtype=float)
print(f"Mean of surprises: {surprise_mean}")
plt.hist(surprise)
plt.title("Distribution of surprises")
mlflow.log_figure(plt.gcf(), "surprise.png")
# Second figure: the same values in ascending order.
plt.figure()
plt.plot(surprise.sort().values)
plt.title("Sorted surprises")

In [None]:
# End the active MLflow run and close the TensorBoard writer.
mlflow.end_run()
tb_writer.close()
    