In [None]:
!pip install transformers
!pip install pytorch-lightning
!pip install pandas
!pip install numpy
!pip install torch
!pip install torchmetrics
!pip install ipywidgets
!pip install IProgress

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d5/43/cfe4ee779bbd6a678ac6a97c5a5cdeb03c35f9eaebbb9720b036680f9a2d/transformers-4.6.1-py3-none-any.whl (2.2MB)
[K     |████████████████████████████████| 2.3MB 9.7MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 37.1MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 43.0MB/s 
Collecting huggingface-hub==0.0.8
  Downloading https://files.pythonhosted.org/packages/a1/88/7b1e45720ecf59c6c6737ff332f41c955963090a18e72acbcbeac6b25e86/huggingface_hub-0.0.8-py3-none-any.whl
Installing collect

## Imports

In [34]:
# Python
import json
import os
from typing import Optional

# General 3rd Party
import pandas as pd
import numpy as np

# HuggingFace
from transformers import BertForSequenceClassification, BertTokenizerFast, BertConfig
from transformers import AdamW, get_linear_schedule_with_warmup

# PyTorch
import torch
from torch.functional import F
from torch.utils.data import random_split, DataLoader

# PyTorch Lightning
import pytorch_lightning as pl
from pytorch_lightning import Trainer, loggers, seed_everything
from pytorch_lightning.callbacks import ModelCheckpoint

import torchmetrics

In [35]:
# Fix the global random seed (and derive DataLoader worker seeds from it) so
# that runs of this notebook are repeatable.
seed_everything(seed=42, workers=True)

Global seed set to 42


42

In [36]:
# Root of the local project tree. Set the OFFENSIVE_PROJECT_ROOT environment
# variable to run the notebook on another machine; the default reproduces the
# original hard-coded locations exactly.
PROJECT_ROOT = os.environ.get('OFFENSIVE_PROJECT_ROOT', '/home/brian/Documents/kubernetes')

# Directory the training checkpoints are written to.
MODEL_DIRECTORY = os.path.join(PROJECT_ROOT, 'models', 'offensive')

directories_needed = [MODEL_DIRECTORY]

DATASET_DIRECTORY = os.path.join(PROJECT_ROOT, 'datasets', 'live', 'offensive')

# Descriptive names for the two data files.
labels_csv_path = os.path.join(DATASET_DIRECTORY, 'test_a_labels_all.csv')
tweets_tsv_path = os.path.join(DATASET_DIRECTORY, 'test_a_tweets_all.tsv')

# NOTE: the two names below are swapped relative to what they point at
# (`input_paths` is the labels CSV, `labels_path` is the tweets TSV). They are
# kept as aliases because the loading cell reads them under these names.
input_paths = labels_csv_path
labels_path = tweets_tsv_path


files_needed = [input_paths, labels_path]

def assertFilesAndDirectoriesExist(files, directories):
  """Verify that every required directory and file is present on disk.

  Args:
    files: iterable of paths that must exist as regular files.
    directories: iterable of paths that must exist as directories.

  Raises:
    AssertionError: if any path is missing or of the wrong kind. The message
      lists every offending path, not just the first one.
  """
  # Collect all problems first so one run reports everything that is missing.
  problems = [f"directory: {d}" for d in directories if not os.path.isdir(d)]
  # isfile (rather than exists) rejects a directory given where a file is expected.
  problems += [f"file: {f}" for f in files if not os.path.isfile(f)]

  if problems:
    # Raised explicitly so the check still runs when Python is started with -O,
    # which strips `assert` statements.
    raise AssertionError("Missing required paths:\n  " + "\n  ".join(problems))

  print("+ All files and directories accounted for!")

# Fail fast if the expected data files or the model directory are missing.
assertFilesAndDirectoriesExist(files=files_needed, directories=directories_needed)

+ All files and directories accounted for!


## Load Data from Disk

In [37]:
# NOTE: despite their names, `input_paths` is the labels CSV and `labels_path`
# is the tab-separated tweets file (see the path definitions above).
# The labels file is read as header-less: column names are supplied explicitly.
labels_frame = pd.read_csv(input_paths, header=None, names=['id', 'label'])
# The tweets file is tab-delimited and its first row is used as the header.
inputs_frame = pd.read_csv(labels_path, delimiter="\t")

In [38]:
# Preview the first rows of the tweets table.
inputs_frame.head(n=5)

Unnamed: 0,id,tweet
0,B0,Has @USER quit? I've not heard of any #knifecr...
1,B1,"In celebration of Emancipation Day, we urge yo..."
2,B2,@USER @USER It’d be a literal dream come true ...
3,B3,Brilliant news to read that Hoggy has signed a...
4,B4,@USER She speaks of the truth 😌


In [39]:
# Preview the first rows of the labels table (raw string labels at this point).
labels_frame.head(n=5)

Unnamed: 0,id,label
0,BC0,OFF
1,BC1,OFF
2,BC2,OFF
3,BC3,OFF
4,BC4,OFF


In [40]:
# Encode the label as an integer: 1 for offensive ('OFF'), 0 for anything else.
# An already-encoded 1 is also accepted, so re-running this cell leaves the
# column unchanged instead of silently turning every label into 0.
labels_frame['label'] = labels_frame['label'].apply(lambda x: 1 if x in ('OFF', 1) else 0)

In [41]:
# Preview the tweets table again before merging it with the labels.
inputs_frame.head(n=5)

Unnamed: 0,id,tweet
0,B0,Has @USER quit? I've not heard of any #knifecr...
1,B1,"In celebration of Emancipation Day, we urge yo..."
2,B2,@USER @USER It’d be a literal dream come true ...
3,B3,Brilliant news to read that Hoggy has signed a...
4,B4,@USER She speaks of the truth 😌


In [42]:
# Join tweets with their labels on the shared `id` column, keep only the two
# columns the model needs, and expose the tweet body under the name `text`.
full_frame = (
    inputs_frame
    .merge(labels_frame, left_on="id", right_on="id")
    .loc[:, ['tweet', 'label']]
    .rename(columns={'tweet': 'text'})
)

In [43]:
# Preview the merged text/label table.
full_frame.head(n=5)

Unnamed: 0,text,label
0,Has @USER quit? I've not heard of any #knifecr...,0
1,"In celebration of Emancipation Day, we urge yo...",0
2,@USER @USER It’d be a literal dream come true ...,0
3,Brilliant news to read that Hoggy has signed a...,0
4,@USER She speaks of the truth 😌,0


In [44]:
# Shuffle all rows: sampling 100% of the frame without replacement returns
# every row in random order (the original index labels are kept).
full_frame = full_frame.sample(frac=1.0)

In [45]:
# Preview the table after shuffling.
full_frame.head(n=5)

Unnamed: 0,text,label
2404,@USER One of my favorite songs. Reminds me now...,0
1916,@USER HOWWJDJJSKDKS. is it bc he gRew uP LMFOA...,0
2503,"im tired of not being able to fucking sleep , ...",1
2513,"I am legit sick of being into women, they’re T...",1
2024,@USER We live riding that. To bad it costs the...,0


In [47]:
# Pull the two columns out as plain NumPy arrays for the Dataset below.
inputs = full_frame['text'].to_numpy()
labels = full_frame['label'].to_numpy()

# Data Preparation

In [48]:
# Tokenizer for the same `bert-base-uncased` checkpoint name the model defaults to.
tokenizer = BertTokenizerFast.from_pretrained(
    pretrained_model_name_or_path='bert-base-uncased'
)

In [49]:
class TextLabelTokenizerDataset(torch.utils.data.Dataset):
  """Offensive Language Dataset.

  Pairs each raw text with its label and tokenizes the text lazily, one item
  at a time, when it is requested.

  Args:
    inputs: indexable sequence of raw text strings.
    labels: indexable sequence of numeric labels, aligned with ``inputs``.
    tokenizer: callable accepting ``(text, return_tensors=, truncation=,
      padding=, max_length=)`` and returning a mapping that contains
      ``"input_ids"`` and ``"attention_mask"`` tensors.
    max_length: length every sequence is padded / truncated to. Defaults to
      500, the value that was previously hard-coded.

  Raises:
    ValueError: if ``inputs`` and ``labels`` differ in length.
  """

  def __init__(self, inputs, labels, tokenizer, max_length: int = 500):
    super().__init__()

    # A length mismatch would otherwise only surface as an IndexError in the
    # middle of training.
    if len(inputs) != len(labels):
      raise ValueError(
          f"inputs and labels must have the same length, got {len(inputs)} and {len(labels)}"
      )

    self.inputs = inputs
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_length = max_length

  def __len__(self):
    """Number of (text, label) pairs."""
    return len(self.inputs)

  def __getitem__(self, idx):
    """Tokenize and return the item at ``idx``.

    Returns:
      dict with ``"input_ids"`` and ``"attention_mask"`` (the tokenizer's
      tensors cast to ``torch.long``, shape left as the tokenizer returned
      it) and ``"target"`` (the label as a ``torch.float`` scalar tensor).
    """
    # Samplers may hand over the index as a tensor; convert it to a plain value.
    if torch.is_tensor(idx):
      idx = idx.tolist()

    encoding = self.tokenizer(
        self.inputs[idx],
        return_tensors='pt',
        truncation=True,
        padding='max_length',
        max_length=self.max_length
    )

    return {
        "input_ids": encoding["input_ids"].type(torch.long),
        "attention_mask": encoding["attention_mask"].type(torch.long),
        # Float target, as required by the BCE-with-logits loss used by the model.
        "target": torch.tensor(self.labels[idx], dtype=torch.float)
    }

In [50]:
# Wrap the raw texts and labels so that tokenization happens per item on access.
text_label_tokenizer_dataset = TextLabelTokenizerDataset(
    inputs=inputs,
    labels=labels,
    tokenizer=tokenizer,
)

In [51]:
class CustomDataModule(pl.LightningDataModule):
  """Lightning data module that splits one dataset 60/20/20 into train/val/test.

  Args:
    dataset: map-style dataset to split (must support ``len``).
    batch_size: number of samples per batch for all three loaders.
    num_workers: worker processes used by each DataLoader. Defaults to 16,
      the value that was previously hard-coded.
    split_seed: seed of the private generator that draws the split.
  """

  def __init__(self, dataset, batch_size: int = 32, num_workers: int = 16, split_seed: int = 42):
    super().__init__()

    self.dataset = dataset
    self.batch_size = batch_size
    self.num_workers = num_workers
    self.split_seed = split_seed

  def prepare_data(self):
    # Nothing to download: the dataset is passed in already constructed.
    pass

  def setup(self, stage: Optional[str] = None):
    """Create the train/val/test subsets (the same partition on every call)."""
    print("DATA MODULE SETUP")
    total = len(self.dataset)
    train_size = int(0.6 * total)
    val_size = int(0.2 * total)
    # The remainder goes to the test split so the three sizes always sum to `total`.
    test_size = total - train_size - val_size

    # setup() can run again for a later stage (e.g. for `trainer.test` after
    # `trainer.fit`). Drawing from the global RNG would then yield a different
    # partition and let training samples leak into the test subset. A freshly
    # seeded private generator makes every call reproduce the same split.
    generator = torch.Generator().manual_seed(self.split_seed)

    self.train, self.val, self.test = random_split(
        self.dataset,
        [train_size, val_size, test_size],
        generator=generator
        )

  def _loader(self, subset, shuffle: bool = False):
    """Build a DataLoader over `subset` with the module-wide settings."""
    return DataLoader(
        subset,
        batch_size=self.batch_size,
        num_workers=self.num_workers,
        shuffle=shuffle
    )

  def train_dataloader(self):
      # Only the training loader reshuffles its samples.
      return self._loader(self.train, shuffle=True)

  def val_dataloader(self):
      return self._loader(self.val)

  def test_dataloader(self):
      return self._loader(self.test)

In [52]:
# Data module serving the tokenizing dataset in batches of 16.
custom_datamodule = CustomDataModule(
    dataset=text_label_tokenizer_dataset,
    batch_size=16,
)

# Model Preparation

In [54]:
class BertForSequenceClassificationLM(pl.LightningModule):
  """Binary text classifier: a BERT encoder with a single-logit head.

  The model is trained with binary cross-entropy on the raw logit; a
  prediction counts as positive when ``sigmoid(logit) > 0.5``.

  Args:
    pretrained: Hugging Face checkpoint name. Supplies the architecture
      config and, when ``bert_weights`` is not given, the encoder weights.
    bert_weights: optional path to a file saved with ``torch.save`` holding
      either a whole BERT encoder module or its ``state_dict``. When given it
      replaces / populates ``classifier.bert``.
    freeze: if true, the encoder parameters are excluded from training and
      only the classification head is optimised.
    lr: AdamW learning rate.
    eps: AdamW epsilon.
  """

  def __init__(self, pretrained='bert-base-uncased', bert_weights=None, freeze=True, lr=3e-5, eps=1e-5):
    super().__init__()

    self.lr = lr
    self.eps = eps

    if (bert_weights):
      # Architecture only; the encoder weights come from `bert_weights` below.
      config = BertConfig.from_pretrained(pretrained)
      config.num_labels = 1
      self.classifier = BertForSequenceClassification(config)

      # SECURITY: torch.load unpickles the file, which can execute arbitrary
      # code. Only load files from a trusted source.
      # map_location='cpu' lets the file load on machines without the device it
      # was saved from; Lightning moves the module to the training device later.
      loaded = torch.load(bert_weights, map_location='cpu')
      if isinstance(loaded, torch.nn.Module):
        self.classifier.bert = loaded
      else:
        # Anything that is not a module is treated as a state_dict for the encoder.
        self.classifier.bert.load_state_dict(loaded)
    else:
      # Without external weights, start from the pretrained checkpoint. Building
      # the model from the config alone would leave the encoder randomly
      # initialised (and, with freeze=True, frozen in that state).
      self.classifier = BertForSequenceClassification.from_pretrained(
          pretrained,
          num_labels=1
          )

    if (freeze):
      for param in self.classifier.bert.parameters():
        param.requires_grad = False

    # Per-epoch history, appended to by the *_epoch_end hooks.
    self.train_loss = []
    self.val_loss = []
    self.val_acc = []
    self.test_loss = []
    self.test_acc = []

    self.loss = F.binary_cross_entropy_with_logits

    # Kept for interface compatibility; accuracy is computed manually below.
    self.validation_accuracy = torchmetrics.Accuracy()
    self.test_accuracy = torchmetrics.Accuracy()

  def forward(self, input_ids, attention_mask):
    """Return the classifier logits for a batch of token ids / attention masks."""
    return self.classifier(
        input_ids=input_ids,
        attention_mask=attention_mask,
        return_dict=True
        )['logits']

  @staticmethod
  def _unpack_batch(batch):
    """Flatten ids/masks to (batch, -1) and give targets a trailing dim of 1."""
    input_ids = batch['input_ids'].view(batch['input_ids'].shape[0], -1)
    attention_mask = batch["attention_mask"].view(batch['attention_mask'].shape[0], -1)
    targets = batch["target"].unsqueeze(1)
    return input_ids, attention_mask, targets

  def _evaluate_batch(self, batch):
    """Return (loss, accuracy, batch size) for one validation / test batch."""
    input_ids, attention_mask, targets = self._unpack_batch(batch)

    logits = self(input_ids, attention_mask)
    preds = torch.sigmoid(logits)

    hits = (preds > 0.5).to(targets.dtype) == targets
    accuracy = torch.mean(hits.to(torch.float))
    loss = self.loss(logits, targets)

    return loss, accuracy, targets.shape[0]

  @staticmethod
  def _weighted_mean(outputs, key):
    """Average `key` over step outputs, weighting each batch by its size.

    A plain mean of per-batch means over-weights a smaller final batch.
    """
    values = torch.stack([out[key] for out in outputs])
    weights = torch.tensor(
        [out['batch_size'] for out in outputs],
        dtype=values.dtype,
        device=values.device
    )
    return (values * weights).sum() / weights.sum()

  def training_step(self, batch, batch_idx):
    input_ids, attention_mask, targets = self._unpack_batch(batch)

    logits = self(input_ids, attention_mask)

    loss = self.loss(logits, targets)

    self.log('train_loss', loss, on_step=True, on_epoch=True,
             prog_bar=True)

    return {'loss': loss}

  def training_epoch_end(self, outputs):
    avg_loss = torch.stack([x['loss'] for x in outputs]).mean()

    # The epoch number is part of the metric name, so each epoch logs under its own key.
    self.log(f'epoch_train_loss_{self.current_epoch}', avg_loss)
    self.train_loss.append(avg_loss)

  def validation_step(self, batch, batch_idx):
    val_loss, val_acc, batch_size = self._evaluate_batch(batch)

    self.log('val_loss', val_loss, on_step=True, on_epoch=True,
             prog_bar=True)

    return {'val_loss': val_loss, 'val_acc': val_acc, 'batch_size': batch_size}

  def validation_epoch_end(self, outputs):
    avg_loss = self._weighted_mean(outputs, 'val_loss')
    avg_acc = self._weighted_mean(outputs, 'val_acc')

    self.log('epoch_val_accuracy', avg_acc, on_epoch=True, prog_bar=True)
    self.log('epoch_val_loss', avg_loss, on_epoch=True, prog_bar=True)
    print("val_acc", avg_acc)
    print("val_loss", avg_loss)

    self.val_acc.append(avg_acc)
    self.val_loss.append(avg_loss)

  def test_step(self, batch, batch_idx):
    test_loss, test_acc, batch_size = self._evaluate_batch(batch)

    self.log('test_loss', test_loss, on_step=True, on_epoch=True,
             prog_bar=True)

    return {'test_loss': test_loss, 'test_acc': test_acc, 'batch_size': batch_size}

  def test_epoch_end(self, outputs):
    avg_loss = self._weighted_mean(outputs, 'test_loss')
    avg_acc = self._weighted_mean(outputs, 'test_acc')

    self.log('epoch_test_accuracy', avg_acc, on_epoch=True, prog_bar=True)
    self.log('epoch_test_loss', avg_loss, on_epoch=True, prog_bar=True)

    self.test_acc.append(avg_acc)
    self.test_loss.append(avg_loss)

  def configure_optimizers(self):
    """Single AdamW optimizer over all parameters, without an LR scheduler."""
    optimizer = AdamW(self.parameters(), lr=self.lr, eps=self.eps)

    return [optimizer]

# Training

In [None]:
# Keep only the single best checkpoint, where "best" means the highest
# `epoch_val_accuracy` logged by the model at the end of each validation epoch.
checkpoint_callback = ModelCheckpoint(
    dirpath=MODEL_DIRECTORY,
    filename='offensive-{epoch:02d}-{epoch_val_accuracy:.6f}',
    monitor='epoch_val_accuracy',
    mode='max',
    save_top_k=1,
)



In [None]:
EPOCHS = 4
# Register the ModelCheckpoint configured above. Without `callbacks=[...]` the
# trainer never sees it, so the best-`epoch_val_accuracy` checkpoint is never
# written to MODEL_DIRECTORY.
trainer = Trainer(gpus=1, max_epochs=EPOCHS, callbacks=[checkpoint_callback])

GPU available: True, used: True
TPU available: False, using: 0 TPU cores


In [None]:
# Build the classifier around the encoder stored in base.pt
# (all other constructor arguments keep their defaults).
model = BertForSequenceClassificationLM(
    bert_weights="/home/brian/Documents/kubernetes/models/base.pt",
)

In [None]:
# Switch the module to training mode (`train()` defaults to mode=True).
model.train()
print("setting training mode...")

setting training mode...


In [None]:
# Train and validate; the data module is passed by keyword, as in the test call.
trainer.fit(model, datamodule=custom_datamodule)

In [None]:
# Evaluate the in-memory model on the data module's test loader.
trainer.test(model=model, datamodule=custom_datamodule)

In [None]:
# torch.save(model, MODEL_DIRECTORY+"/offensive.pt")