In [1]:
import numpy as np
import pandas as pd

import os

import transformers
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import LabelEncoder

from transformers import AutoTokenizer, BertModel, DistilBertModel
from transformers import AutoModel

from datasets import Dataset, ClassLabel

import torch
from torch.utils.data import Dataset as TorchDataset, DataLoader
from torch.nn import TripletMarginLoss
from torch.optim import Adam
from tqdm import tqdm

import neptune.new as neptune

In [2]:
### PARAMS
MAX_SAMPLES = 10000
BATCH_SIZE = 4
LR = 1e-3
EPOCHS = 20

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
#device = torch.device('cpu')

In [3]:
dataset_df = pd.read_csv('dataset/tweet_dataset.csv')
dataset_df.dropna(inplace=True)
dataset_df

Unnamed: 0,short_description,category
0,Health experts said it is too early to predict...,U.S. NEWS
1,He was subdued by passengers and crew when he ...,U.S. NEWS
2,"""Until you have a dog you don't understand wha...",COMEDY
3,"""Accidentally put grown-up toothpaste on my to...",PARENTING
4,Amy Cooper accused investment firm Franklin Te...,U.S. NEWS
...,...,...
209522,Verizon Wireless and AT&T are already promotin...,TECH
209523,"Afterward, Azarenka, more effusive with the pr...",SPORTS
209524,"Leading up to Super Bowl XLVI, the most talked...",SPORTS
209525,CORRECTION: An earlier version of this story i...,SPORTS


In [4]:
X, y = dataset_df[['short_description']], dataset_df[['category']]

### Undersampling

In [5]:
undersampler = RandomUnderSampler(random_state=42)
X_res, y_res = undersampler.fit_resample(X, y)

### One-hot encoding

In [6]:
oh_encoder = LabelEncoder()
y_enc = oh_encoder.fit_transform(y_res)

  y = column_or_1d(y, warn=True)


### Dataset creation

In [7]:
data_df = {"text": X_res["short_description"], "labels": y_enc.tolist()}
data_df = Dataset.from_dict(data_df)
data_df

Dataset({
    features: ['text', 'labels'],
    num_rows: 36246
})

### Tokenization

In [8]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
dataset_features = data_df.features.copy()

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

dataset = data_df.map(tokenize_function)
dataset.features['labels'] = ClassLabel(num_classes=42)

  0%|          | 0/36246 [00:00<?, ?ex/s]

In [9]:
dataset = dataset.remove_columns(["text"])
dataset.set_format("torch")
dataset

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 36246
})

In [10]:
dataset = dataset.shuffle().select(range(5000))

In [11]:
split_dataset = dataset.train_test_split(test_size=0.1, stratify_by_column="labels")

### Dataset definition

In [12]:
class TweetDataset(TorchDataset):
    def __init__(self, dataset: Dataset):
        self.input_ids = dataset['input_ids']
        self.attention_mask = dataset['attention_mask']
        self.dataset = dataset.remove_columns("labels")
        self.labels = dataset['labels']

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, item):
        #anchor = self.input_ids[item]
        anchor = self.dataset[item]
        anchor_class = self.labels[item]
        #anchor_attention = self.attention_mask[item]


        positive_indices = self.labels == anchor_class
        positive_indices = positive_indices.nonzero()
        positive_idx = positive_indices[torch.randint(high=len(positive_indices), size=(1, ))[0]]
        #positive_example = self.input_ids[positive_idx].flatten()
        #positive_attention = self.attention_mask[positive_idx]
        positive_example = self.dataset[positive_idx]

        negative_indices = self.labels != anchor_class
        negative_indices = negative_indices.nonzero()
        negative_idx = negative_indices[torch.randint(high=len(negative_indices), size=(1, ))[0]]
        #negative_example = self.input_ids[negative_idx].flatten()
        #negative_attention = self.attention_mask[negative_idx]
        negative_example = self.dataset[negative_idx]

        return anchor, positive_example, negative_example

In [13]:
train_ds = TweetDataset(split_dataset['train'])
test_ds = TweetDataset(split_dataset['test'])

train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
test_dl = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=True)

### Model Training

In [14]:
class TweetBERT(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained("distilbert-base-cased")
        self.pooler = torch.nn.Linear(768, 768)
        self.tahn = torch.nn.Tanh()

    def forward(self, x):
        x = self.bert(**x)
        x = self.pooler(x[0][:, 0])
        return self.tahn(x)

In [15]:
model = TweetBERT()

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [16]:
model = model.to(device=device)

In [17]:
optimizer = Adam(model.parameters(), lr=LR)
loss = TripletMarginLoss()

In [None]:
run = neptune.init(
    project="konradszewczyk/TweetBuble",
    api_token="eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiI0MWIyOTA1ZS03ODc3LTQ5MzQtYjk0OS05ZjNjYzdiMDFjMDcifQ==",
)

os.mkdir(os.path.join('models', run['sys/id'].fetch()))

for epoch in range(EPOCHS):
    model.train()
    train_loss_log = []
    for batch_idx, (anchor, positive_ex, negative_ex) in enumerate(tqdm(train_dl)):
        #anchor = anchor.to(device=device)
        anchor = {k: v.to(device) for k, v in anchor.items()}
        archor_output = model(anchor)

        #positive_ex = positive_ex.to(device=device)
        positive_ex = {k: v[0].to(device) for k, v in positive_ex.items()}
        positive_ex_output = model(positive_ex)

        #negative_ex = negative_ex.to(device=device)
        negative_ex = {k: v[0].to(device) for k, v in negative_ex.items()}
        negative_ex_output = model(negative_ex)

        optimizer.zero_grad()
        train_loss = loss(archor_output, positive_ex_output, negative_ex_output)
        train_loss.backward()

        optimizer.step()

        train_loss_log.append(train_loss.detach().cpu())

    train_loss = np.mean(train_loss_log)
    run['train_loss'].log(train_loss)
    print("Epoch {:02d} train: {:.5f}".format(epoch, train_loss))

    file_name = 'epoch-{:02d}.pt'.format(epoch)
    PATH = os.path.join('models', run['sys/id'].fetch(), file_name)
    torch.save(model.state_dict(), PATH)

    model.eval()
    test_loss_log = []
    with torch.no_grad():
        for batch_idx, (anchor, positive_ex, negative_ex) in enumerate(tqdm(test_dl)):
            #anchor = anchor.to(device=device)
            anchor = {k: v.to(device) for k, v in anchor.items()}
            archor_output = model(anchor)

            #positive_ex = positive_ex.to(device=device)
            positive_ex = {k: v[0].to(device) for k, v in positive_ex.items()}
            positive_ex_output = model(positive_ex)

            #negative_ex = negative_ex.to(device=device)
            negative_ex = {k: v[0].to(device) for k, v in negative_ex.items()}
            negative_ex_output = model(negative_ex)

            test_loss = loss(archor_output, positive_ex_output, negative_ex_output)

            test_loss_log.append(test_loss.cpu())

    test_loss = np.mean(test_loss_log)
    run['test_loss'].log(test_loss)
    print("Epoch {:02d} val: {:.5f}".format(epoch, test_loss))

run.stop()

  run = neptune.init(


https://app.neptune.ai/konradszewczyk/TweetBuble/e/BUBL-39
Remember to stop your run once you’ve finished logging your metadata (https://docs.neptune.ai/api/run#stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.


  0%|          | 0/1125 [00:00<?, ?it/s]

In [None]:
run.stop()