In [None]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from matplotlib import pyplot as plt

from utilities import get_nodeid2text

In [None]:
# Every model and every batch in this notebook is moved onto this device.
device_type = "cpu"
device = torch.device(device_type)

In [None]:
# Pickled table holding the text data and the labels (the "label" column is
# used further down to build the datasets).
nodeid2text = pd.read_pickle("data/nodeid2text_gensim.pkl")

# Train / validation / test splits; the first value returned by the helper is
# not needed here.
_, (train_idx, valid_idx, test_idx) = get_nodeid2text()

# Output of the LDA model: one saved array per number of topics, each converted
# to a torch tensor. all_gammas[k] corresponds to num_topics_list[k].
num_topics_list = [10, 20, 40, 80]
all_gammas = [
    torch.from_numpy(np.load(f"gammas/{n_topics}_topics.npy"))
    for n_topics in tqdm(num_topics_list)
]

In [None]:
# The labels depend only on the split, not on the number of topics, so each
# label tensor is built once and shared by all per-topic datasets. Selecting
# the column inside .loc avoids copying every column of the selected rows
# just to keep one of them.
train_labels = torch.from_numpy(nodeid2text.loc[train_idx, "label"].to_numpy())
valid_labels = torch.from_numpy(nodeid2text.loc[valid_idx, "label"].to_numpy())

# One (features, labels) dataset per entry of all_gammas, in the same order as
# num_topics_list.
train_ds = [TensorDataset(gammas[train_idx], train_labels) for gammas in all_gammas]
valid_ds = [TensorDataset(gammas[valid_idx], valid_labels) for gammas in all_gammas]

In [None]:
class DenseNet(nn.Module):
    """Fully connected classifier with a single hidden ReLU layer.

    Args:
        input_size: Number of input features per sample.
        n_hidden: Width of the hidden layer.
        n_classes: Number of output classes (size of the logit vector).
            Defaults to 40, the value that used to be hard-coded.
    """

    def __init__(self, input_size, n_hidden, n_classes=40):
        super().__init__()
        # Attribute name and layer order are kept so that state_dict keys
        # ("layers.0.*", "layers.2.*") stay unchanged.
        self.layers = nn.Sequential(
            nn.Linear(input_size, n_hidden),
            nn.ReLU(),
            nn.Linear(n_hidden, n_classes),
        )

    def forward(self, x):
        """Return the raw (un-normalised) class logits for ``x``."""
        logits = self.layers(x)
        return logits

def train_loop(dataloader, model, loss_fn, optimizer):
    """Run one optimisation pass over ``dataloader``.

    Each batch is moved to the module-level ``device``, the loss is computed
    with ``loss_fn`` and one ``optimizer`` step is taken.

    Args:
        dataloader: Iterable yielding ``(X, y)`` batches.
        model: The ``nn.Module`` being trained; updated in place.
        loss_fn: Callable ``loss_fn(pred, y)`` returning a scalar loss tensor.
        optimizer: Optimizer that holds the parameters of ``model``.

    Returns:
        None.
    """
    # Put the model in training mode explicitly so that mode-dependent layers
    # (dropout, batch-norm, ...) behave correctly even if a previous
    # evaluation left the model in eval mode.
    model.train()

    for X, y in dataloader:
        # Compute prediction and loss
        X, y = X.to(device), y.to(device)
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation: clear stale gradients before accumulating new ones
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

def test_loop(dataloader, model, loss_fn):
    """Evaluate ``model`` on every batch of ``dataloader`` without gradients.

    Args:
        dataloader: DataLoader yielding ``(X, y)`` batches; ``y`` holds the
            class indices that ``pred.argmax(1)`` is compared against.
        model: The ``nn.Module`` to evaluate.
        loss_fn: Callable ``loss_fn(pred, y)`` returning a scalar loss tensor.

    Returns:
        Tuple ``(accuracy, avg_loss)``: the fraction of correctly classified
        samples in ``dataloader.dataset`` and the mean of the per-batch losses.

    Raises:
        ZeroDivisionError: If the dataloader yields no batches or the dataset
            is empty.
    """
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    # Evaluate in eval mode so mode-dependent layers (dropout, batch-norm, ...)
    # do not distort the metrics, then restore whatever mode the caller had.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for X, y in dataloader:
                X, y = X.to(device), y.to(device)
                pred = model(X)
                test_loss += loss_fn(pred, y).item()
                correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    finally:
        model.train(was_training)

    test_loss /= num_batches
    correct /= size
    return correct, test_loss

In [None]:
# Seed the global RNG so that weight initialisation, and therefore the
# reported accuracies, are reproducible under "Restart Kernel -> Run All".
SEED = 0
torch.manual_seed(SEED)

# Pinned host memory only helps host-to-GPU transfers. When running on the CPU
# it brings no benefit and can make DataLoader emit a warning, so enable it
# only for CUDA devices.
pin_memory = device.type == "cuda"

# NOTE(review): the training batches are served in dataset order (no
# shuffle=True). Kept as-is so the training regime is unchanged; confirm
# whether that is intentional.
train_dl = [DataLoader(ds, batch_size=64, pin_memory=pin_memory) for ds in train_ds]
valid_dl = [DataLoader(ds, batch_size=64, pin_memory=pin_memory) for ds in valid_ds]

# Loop-invariant settings: the loss is stateless and the epoch budget is the
# same for every model.
loss_fn = nn.CrossEntropyLoss()
epochs = 10

# Validation accuracy after the last epoch, one entry per topic count.
valid_acc = list()
for i, n_topics in enumerate(num_topics_list):
    print(f"Num Topics: {n_topics} -----")
    # A fresh model and optimizer per topic count; the input width equals the
    # number of topics.
    model = DenseNet(input_size=n_topics, n_hidden=80).to(device)
    optimizer = torch.optim.Adam(model.parameters())

    for t in range(epochs):
        train_loop(train_dl[i], model, loss_fn, optimizer)
        correct, test_loss = test_loop(valid_dl[i], model, loss_fn)
        print(f"Epoch {t+1}: Valid Acc: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f}")
    valid_acc.append(correct)
    print("Done!")

In [None]:
# Validation accuracy after the final epoch, one entry per value in num_topics_list.
valid_acc

In [None]:
# Reference accuracies copied from the LDA_LogisticRegression.ipynb notebook;
# plotted below next to the DenseNet results (presumably one value per entry
# of num_topics_list, in the same order -- verify against that notebook).
lda_lr_valid_acc = [
    0.444478,
    0.44142421,
    0.53783684,
    0.56149535,
]

In [None]:
# Grouped bar chart: for every topic count, the DenseNet accuracy is drawn
# next to the logistic-regression accuracy.
ax = plt.gca()
ax.set_title("LDA Validation Acc. v. Num Topics")
ax.set_xlabel("Number of Topics")
ax.set_ylabel("Validation Accuracy")

# Each group is centred on an integer position; the two bars sit half a bar
# width to the left and right of it.
bar_width = 0.4
xaxis = np.arange(len(num_topics_list))
ax.bar(xaxis - bar_width / 2, valid_acc, width=bar_width, label="LDA+DenseNet")
ax.bar(xaxis + bar_width / 2, lda_lr_valid_acc, width=bar_width, label="LDA+LR")

ax.set_xticks(xaxis)
ax.set_xticklabels(list(map(str, num_topics_list)))
ax.legend()