In [1]:
import numpy as np
import pandas as pd

import os

import transformers
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import LabelEncoder

from transformers import AutoTokenizer, BertModel, DistilBertModel
from transformers import AutoModel, BertForSequenceClassification, BertTokenizer

from datasets import Dataset, ClassLabel

import torch
from torch.utils.data import Dataset as TorchDataset, DataLoader
from torch.nn import TripletMarginLoss
from torch.optim import Adam
from torch.optim.lr_scheduler import StepLR
from tqdm import tqdm

from sklearn.model_selection import train_test_split

import neptune.new as neptune

In [2]:
# Read the embeddings table from disk; the split cell below stratifies on its
# 'labels' column, so that column must be present in the CSV.
dataset_df = pd.read_csv(filepath_or_buffer='dataset/embeddings.csv')

In [3]:
# Stratified train/test split on the 'labels' column, so both parts keep the
# class proportions of the full table (scikit-learn's default split sizes).
# random_state is pinned so that the split -- and therefore every train/test
# loss reported further down -- is reproducible across kernel restarts.
train_dataset, test_dataset = train_test_split(
    dataset_df,
    stratify=dataset_df['labels'],
    random_state=42,
)

In [4]:
class TweetDataset(TorchDataset):
    """Triplet dataset built from a table of precomputed embeddings.

    Every item is an ``(anchor, positive, negative)`` triple of 1-D embedding
    tensors: the anchor is the row at the requested position, the positive is
    a randomly drawn row with the same label, and the negative is a randomly
    drawn row with a different label.

    Args:
        dataset: Frame with a ``labels`` column. Every other column is used as
            an embedding feature, except an ``index`` column (as produced by
            ``DataFrame.reset_index()``), which is dropped when present.
    """

    def __init__(self, dataset: pd.DataFrame):
        # Convert through NumPy so the conversion is purely positional and does
        # not depend on the frame's index labels. float32 matches the dtype the
        # legacy ``torch.Tensor(...)`` constructor produces by default.
        self.labels = torch.tensor(dataset['labels'].to_numpy(), dtype=torch.float32)
        # errors='ignore' lets frames without an 'index' column be used too.
        features = dataset.drop(columns=['labels', 'index'], errors='ignore')
        self.embeddings = torch.tensor(features.to_numpy(), dtype=torch.float32)

    def __len__(self):
        """Return the number of rows (anchors) in the dataset."""
        return len(self.labels)

    def _sample_row(self, mask):
        """Return the embedding of one uniformly drawn row where ``mask`` is True.

        Raises:
            RuntimeError: if ``mask`` selects no rows.
        """
        candidates = mask.nonzero().flatten()
        if len(candidates) == 0:
            raise RuntimeError('no candidate rows to sample a triplet member from')
        chosen = candidates[torch.randint(high=len(candidates), size=(1, ))[0]]
        return self.embeddings[chosen]

    def __getitem__(self, item):
        """Build the triplet whose anchor is row ``item``.

        Returns:
            Tuple ``(anchor, positive, negative)`` of 1-D embedding tensors.

        Raises:
            RuntimeError: if no row has a label different from the anchor's.
        """
        anchor = self.embeddings[item]
        same_class = self.labels == self.labels[item]

        # Do not pair the anchor with itself: a zero-distance positive carries
        # no training signal. Fall back to the anchor only when it is the sole
        # member of its class.
        positive_mask = same_class.clone()
        positive_mask[item] = False
        if not positive_mask.any():
            positive_mask = same_class
        positive_example = self._sample_row(positive_mask)

        negative_example = self._sample_row(~same_class)

        return anchor, positive_example.flatten(), negative_example.flatten()

### Model

In [5]:
### PARAMS
MAX_SAMPLES = 10000  # not referenced by any cell visible in this notebook
BATCH_SIZE = 8       # batch size passed to both DataLoaders
LR = 0.001           # learning rate passed to the optimizer
EPOCHS = 50          # number of iterations of the training loop

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# (To force CPU for debugging, assign torch.device('cpu') instead.)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [6]:
# Wrap both splits as triplet datasets. reset_index() renumbers the rows from
# zero and moves the previous index into an 'index' column, which
# TweetDataset removes from the feature matrix.
train_ds, test_ds = (
    TweetDataset(split.reset_index())
    for split in (train_dataset, test_dataset)
)

# Both loaders draw shuffled mini-batches of BATCH_SIZE triplets.
train_dl = DataLoader(dataset=train_ds, batch_size=BATCH_SIZE, shuffle=True)
test_dl = DataLoader(dataset=test_ds, batch_size=BATCH_SIZE, shuffle=True)

In [7]:
class TweetBERTTail(torch.nn.Module):
    """Three-layer MLP head applied to 768-dimensional vectors.

    Layer widths are 768 -> 2048 -> 2048 -> 768, with a ReLU after each of the
    first two linear layers and no activation on the output.
    """

    def __init__(self):
        super().__init__()
        nn = torch.nn
        embedding_dim, hidden_dim = 768, 2048
        # Layers stay inside a Sequential stored as `pooler`, so parameter
        # names in a state dict remain `pooler.0.*`, `pooler.2.*`, `pooler.4.*`.
        stack = [
            nn.Linear(embedding_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, embedding_dim),
        ]
        self.pooler = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the MLP and return the transformed vectors."""
        transformed = self.pooler(x)
        return transformed

In [8]:
# Instantiate the MLP head and move its parameters to the selected device
# (Module.to returns the module itself, so the two steps can be chained).
model = TweetBERTTail().to(device=device)
model  # last expression of the cell: shows the module summary as output

TweetBERTTail(
  (pooler): Sequential(
    (0): Linear(in_features=768, out_features=2048, bias=True)
    (1): ReLU()
    (2): Linear(in_features=2048, out_features=2048, bias=True)
    (3): ReLU()
    (4): Linear(in_features=2048, out_features=768, bias=True)
  )
)

In [9]:
# Optimisation setup: triplet margin loss with its default settings, Adam over
# all model parameters, and a step schedule that multiplies the learning rate
# by 0.9 after every 5 calls to scheduler.step().
criterion = TripletMarginLoss()
optimizer = Adam(params=model.parameters(), lr=LR)
scheduler = StepLR(optimizer=optimizer, step_size=5, gamma=0.9)

In [11]:
# Credentials are read from the environment instead of being committed with the
# notebook: export NEPTUNE_API_TOKEN before starting the kernel. The token that
# was previously hard-coded in this cell has been exposed and should be revoked.
run = neptune.init(
    project="konradszewczyk/TweetBuble",
    api_token=os.environ.get("NEPTUNE_API_TOKEN"),
)


def _triplet_batch_loss(anchor, positive_ex, negative_ex):
    """Move one triplet batch to `device`, run it through `model`, return its loss."""
    anchor_output = model(anchor.to(device=device))
    positive_ex_output = model(positive_ex.to(device=device))
    negative_ex_output = model(negative_ex.to(device=device))
    return criterion(anchor_output, positive_ex_output, negative_ex_output)


# try/finally guarantees the Neptune run is closed even when training is
# interrupted (e.g. KeyboardInterrupt) or fails part-way through.
try:
    # The run id is constant for the whole run, so fetch it once.
    run_id = run['sys/id'].fetch()
    checkpoint_dir = os.path.join('models', run_id)
    # makedirs: also creates 'models' itself and tolerates re-running the cell.
    os.makedirs(checkpoint_dir, exist_ok=True)

    for epoch in range(EPOCHS):
        # ---- training pass ----
        model.train()
        train_loss_log = []
        for anchor, positive_ex, negative_ex in tqdm(train_dl):
            optimizer.zero_grad()
            train_loss = _triplet_batch_loss(anchor, positive_ex, negative_ex)
            train_loss.backward()
            optimizer.step()

            # .item() stores a plain float, so no tensors are kept in the log.
            train_loss_log.append(train_loss.item())

        train_loss = float(np.mean(train_loss_log))
        run['train_loss'].log(train_loss)
        print("Epoch {:02d} train: {:.5f}".format(epoch, train_loss))

        # Checkpoint the weights after every epoch.
        file_name = 'epoch-{:02d}.pt'.format(epoch)
        PATH = os.path.join(checkpoint_dir, file_name)
        torch.save(model.state_dict(), PATH)

        # ---- evaluation pass (no gradients) ----
        model.eval()
        test_loss_log = []
        with torch.no_grad():
            for anchor, positive_ex, negative_ex in tqdm(test_dl):
                test_loss = _triplet_batch_loss(anchor, positive_ex, negative_ex)
                test_loss_log.append(test_loss.item())

        test_loss = float(np.mean(test_loss_log))
        run['test_loss'].log(test_loss)
        print("Epoch {:02d} val: {:.5f}".format(epoch, test_loss))

        # Advance the learning-rate schedule once per epoch; without this call
        # the StepLR scheduler never changes the learning rate.
        scheduler.step()
finally:
    run.stop()

https://app.neptune.ai/konradszewczyk/TweetBuble/e/BUBL-53
Remember to stop your run once you’ve finished logging your metadata (https://docs.neptune.ai/api/run#stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.


 16%|█▌        | 538/3398 [00:06<00:36, 77.47it/s] 


KeyboardInterrupt: 