In [1]:
import pandas as pd
import os
import torch
import numpy as np
import nltk
import string
from tqdm.notebook import tqdm

In [2]:
# Presumably the width of one embedding vector: it equals the last axis of the
# loaded `inputs` tensor (1024 in the recorded shape). Not read by the visible cells.
embed_size = 1024
# NOTE(review): not read by the visible cells either; train() is called with an
# explicit batch size of 1000 further down.
batch_size = 10

In [3]:
# Seed PyTorch's global RNG so weight initialisation and batch shuffling are repeatable.
torch.manual_seed(0)

<torch._C.Generator at 0x1b63c742bd0>

In [4]:
# Load the training embeddings tensor (presumably precomputed RoBERTa embeddings,
# judging by the file name).
# NOTE(review): torch.load unpickles the file, so only load files from a trusted source.
inputs = torch.load('train_embeds_roberta_augmented.pt')

In [5]:
# Insert a singleton axis at position 1: (N, seq, embed) -> (N, 1, seq, embed).
# Guarded on the rank so that re-running this cell cannot stack a second axis.
if inputs.dim() == 3:
    inputs = torch.unsqueeze(inputs, dim=1)

In [6]:
# Sanity check the layout the following cells rely on: exactly one singleton
# axis at position 1, i.e. (N, 1, seq, embed).
assert inputs.dim() == 4 and inputs.shape[1] == 1, f"unexpected input shape {tuple(inputs.shape)}"
inputs.shape

torch.Size([15716, 1, 19, 1024])

In [7]:
# Number of values per sample once everything after the batch axis is flattened.
# This is exactly what torch.nn.Flatten feeds into the first Linear layer, and it
# does not depend on which axis positions hold the sequence / embedding dims.
flattened_shape = inputs[0].numel()

In [8]:
# Load the training labels; the recorded shape further down is (15716,), i.e. one
# entry per row of `inputs`.
labels = np.load('labels_aug_roberta.npy')

In [9]:
# Coerce every stored label to a Python int, then pack them into an int64 array.
labels = np.asarray(list(map(int, labels)), dtype=np.int64)

In [10]:
# Expect one label per training example; the recorded output (15716,) matches
# the first axis of `inputs`.
labels.shape

(15716,)

In [11]:
# Peek at the label values. The recorded output shows ids such as 1 and 2, so the
# labels appear to be 1-based; train() subtracts 1 before computing the loss.
labels

array([2, 2, 2, ..., 1, 1, 1], dtype=int64)

In [12]:
# Convert to an int64 tensor (the dtype CrossEntropyLoss expects for class-index
# targets). torch.as_tensor with an explicit dtype replaces the legacy typed
# constructor and is a no-op if this cell is re-run on an existing LongTensor.
labels = torch.as_tensor(labels, dtype=torch.long)
labels.shape

torch.Size([15716])

In [13]:
# Pair each input tensor with its label.
# NOTE: train() constructs its own TensorDataset from the same `inputs`/`labels`
# globals, so this module-level `dataset` is not what the training loop reads.
dataset = torch.utils.data.TensorDataset(inputs, labels)

In [14]:
class RepEncoder(torch.nn.Module):
    """Two-layer feed-forward classifier over flattened input tensors.

    Every sample is flattened to a single vector, projected to a hidden layer
    with a ReLU activation, and then projected to one raw score (logit) per
    class. No softmax is applied.

    Args:
        in_features: Number of values per sample after flattening. When
            ``None`` (the default) the module-level ``flattened_shape`` is
            used, so ``RepEncoder()`` behaves as before.
        hidden_features: Width of the hidden layer (default 32).
        num_classes: Number of output logits (default 4).
    """

    def __init__(self, in_features=None, hidden_features=32, num_classes=4):
        super().__init__()
        if in_features is None:
            # Fall back to the notebook-level value computed from the training inputs.
            in_features = flattened_shape
        self.flatten = torch.nn.Flatten()
        self.relu = torch.nn.ReLU()
        # Layers are created in this order so parameter initialisation draws
        # from the RNG in the same sequence for a given seed.
        self.dense1 = torch.nn.Linear(in_features=in_features, out_features=hidden_features)
        self.dense2 = torch.nn.Linear(in_features=hidden_features, out_features=num_classes)

    def forward(self, inputs):
        """Return the raw class logits, shape ``(batch, num_classes)``."""
        hidden = self.relu(self.dense1(self.flatten(inputs)))
        return self.dense2(hidden)

In [15]:
def train(epochs, batch_size):
    """Train a fresh ``RepEncoder`` on the module-level ``inputs``/``labels`` and save it.

    Args:
        epochs: Number of full passes over the training set.
        batch_size: Number of examples per optimisation step.

    Side effects:
        Prints the mean training loss of every epoch and writes the final
        weights to ``saved_model.pt`` in the working directory.
    """
    model = RepEncoder()
    # Cast parameters to float64 (assumes the input embeddings are stored in
    # double precision; TODO confirm).
    model.double()
    loss_fn = torch.nn.CrossEntropyLoss()
    dataset = torch.utils.data.TensorDataset(inputs, labels)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
    # Batches are consumed on the CPU (nothing is moved to a device), so pinned
    # memory is not requested.
    loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
    for epoch in tqdm(range(epochs)):
        running_loss = 0.0
        seen = 0
        for reqs, tgts in loader:
            # Shift the labels down by one so the smallest class id becomes
            # index 0, as CrossEntropyLoss requires for class-index targets.
            tgts = tgts - 1
            preds = model(reqs)
            loss = loss_fn(preds, tgts)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # Weight by batch length so a short final batch does not skew the mean.
            running_loss += loss.item() * len(tgts)
            seen += len(tgts)
        epoch_loss = running_loss / seen if seen else float("nan")
        print(f"Epoch: {epoch}, loss: {epoch_loss}")
    torch.save(model.state_dict(), 'saved_model.pt')

In [None]:
# 30 passes over the training set, 1000 examples per optimisation step.
train(epochs=30, batch_size=1000)

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))

In [None]:
model = RepEncoder()
# train() saves float64 weights. Switch the fresh model to double *before*
# loading so the checkpoint is copied in at full precision rather than being
# rounded into float32 parameters first.
model.double()
model.load_state_dict(torch.load('saved_model.pt'))
model.eval()

In [None]:
# Load the dev-set embeddings tensor.
# NOTE(review): torch.load unpickles the file, so only load files from a trusted source.
dev_inputs = torch.load('dev_embeds_roberta.pt')

In [None]:
# Shape as loaded, before the singleton axis is inserted in the next cell.
dev_inputs.shape

In [None]:
# Insert a singleton axis at position 1, mirroring the training inputs.
# Assumes the dev embeddings are stored as a 3-D (N, seq, embed) tensor (TODO
# confirm). Guarded on the rank so re-running the cell cannot add a second axis.
if dev_inputs.dim() == 3:
    dev_inputs = torch.unsqueeze(dev_inputs, dim=1)
dev_inputs.shape

In [None]:
# Keep the model in float64 so its parameters match double-precision inputs.
model.double()
# Inference only: skip autograd bookkeeping so no graph is built or retained.
with torch.no_grad():
    preds = model(dev_inputs)

In [None]:
# Raw model outputs for the dev set: unnormalised scores from the final linear
# layer, one row per example.
preds

In [None]:
# Index of the highest-scoring class for every row of the dev predictions.
class_preds = preds.detach().numpy().argmax(axis=1)

In [None]:
# Map the 0-based argmax indices back to the label ids used in the data
# (train() subtracts 1 from every label before computing the loss).
# Recomputed from `preds` so that re-running this cell cannot shift the ids twice.
class_preds = np.argmax(preds.detach().numpy(), axis=1) + 1
class_preds

In [None]:
# Identifiers of the dev examples; a later cell pairs them index-by-index with
# the predicted classes.
dev_ids = np.load('topics_dev.npy')
dev_ids

In [None]:
# Pair every dev id with its predicted class. Fail loudly if the two sequences
# are misaligned instead of silently dropping trailing ids.
assert len(dev_ids) == len(class_preds), (
    f"{len(dev_ids)} dev ids but {len(class_preds)} predictions"
)
outs = list(zip(dev_ids, class_preds))
# Show only the first few pairs; a plain list is not truncated when displayed.
outs[:5]

In [None]:
# Pack the (id, prediction) pairs into a 2-column array for np.savetxt.
outs = np.asarray(outs)
outs

In [None]:
# One "<id> <predicted class>" line per dev example.
np.savetxt(fname='preds_dev_roberta_augmented.txt', X=outs, fmt="%s %s")

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Gold labels for the dev set, kept under their own name so the training
# `labels` tensor read by train() is not overwritten.
# NOTE(review): the training labels needed an explicit int conversion; confirm
# these load as integers so they compare equal to `class_preds`.
dev_labels = np.load('labels_dev_embeds.npy')

# Rows are true classes, columns are predicted classes.
cf_matrix = confusion_matrix(dev_labels, class_preds)
print(cf_matrix)

# fmt="d" prints whole counts; the default format switches to scientific
# notation once a cell reaches three digits.
ax = sns.heatmap(cf_matrix, annot=True, fmt="d")
ax.set(xlabel="Predicted label", ylabel="True label");