In [None]:
!pip install transformers
!pip install pytorch-lightning
!pip install pandas
!pip install numpy
!pip install torch
!pip install torchmetrics
!pip install ipywidgets
!pip install IProgress

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


## Imports

In [None]:
# Python
import json
import os
from typing import Optional

# General 3rd Party
import pandas as pd
import numpy as np

# HuggingFace
from transformers import BertForSequenceClassification, BertTokenizerFast, BertConfig
from transformers import AdamW, get_linear_schedule_with_warmup

# PyTorch
import torch
from torch.functional import F
from torch.utils.data import random_split, DataLoader

# PyTorch Lightning
import pytorch_lightning as pl
from pytorch_lightning import Trainer, loggers, seed_everything
from pytorch_lightning.callbacks import ModelCheckpoint

import torchmetrics

In [None]:
# Fix the global seed to 42 so repeated runs start from the same RNG state.
# NOTE(review): workers=True presumably extends the seeding to dataloader workers — confirm in the Lightning docs.
seed_everything(42, workers=True)

Global seed set to 42


42

In [None]:
# Directory used as the checkpoint output location further down the notebook.
MODEL_DIRECTORY = '/home/brian/Documents/kubernetes/models/toxic'

# Directories that must already exist before the notebook can run.
directories_needed = [MODEL_DIRECTORY]

# CSV with the training data; the load cell reads its `id`, `comment_text` and `target` columns.
# NOTE(review): both paths are absolute and machine-specific — consider making them configurable.
dataset_path = '/home/brian/Documents/kubernetes/datasets/live/toxicity/train.csv'


# Files that must already exist before the notebook can run.
files_needed = [dataset_path]

def assertFilesAndDirectoriesExist(files, directories):
  """Verify that every required file and directory is present on disk.

  Args:
    files: Iterable of paths that must exist as regular files.
    directories: Iterable of paths that must exist as directories.

  Raises:
    AssertionError: If any path is missing or is of the wrong kind. The
      message lists every offending path, not just the first one.
  """
  missing_directories = [d for d in directories if not os.path.isdir(d)]
  # isfile (rather than exists) so a directory sitting at a file path is rejected
  missing_files = [f for f in files if not os.path.isfile(f)]

  problems = (
      [f"missing directory: {d}" for d in missing_directories]
      + [f"missing file: {f}" for f in missing_files]
  )
  # Explicit raise instead of a bare `assert` so the check also runs under `python -O`
  if problems:
    raise AssertionError("; ".join(problems))

  print("+ All files and directories accounted for!")

# Fail early, before any expensive work, if a required path is missing.
assertFilesAndDirectoriesExist(files_needed, directories_needed)

+ All files and directories accounted for!


## Load Data from Disk

In [None]:
# Load three columns from the CSV; `id` is read here but is not carried past the column selection below.
toxicity_frame = pd.read_csv(dataset_path, usecols=['id', 'comment_text','target'])

In [None]:
# Binarise the score: any strictly positive `target` becomes label 1, everything else
# (including missing values, which never compare greater than 0) becomes 0.
# The vectorised comparison replaces a per-row Python lambda and yields the same int64 column.
toxicity_frame['label'] = (toxicity_frame['target'] > 0).astype('int64')

In [None]:
# Keep just the model input and its binary label, renaming `comment_text` to `text`.
toxicity_frame = toxicity_frame[['comment_text', 'label']].rename(columns={'comment_text': 'text'})

In [None]:
# `full_frame` is a second name for the same DataFrame object; no copy is made here.
full_frame = toxicity_frame

In [None]:
# Preview the first rows to sanity-check the text/label columns.
full_frame.head()

Unnamed: 0,text,label
0,"This is so cool. It's like, 'would you want yo...",0
1,Thank you!! This would make my life a lot less...,0
2,This is such an urgent design problem; kudos t...,0
3,Is this something I'll be able to install on m...,0
4,haha you guys are a bunch of losers.,1


In [None]:
# Shuffle every row (frac=1). No random_state is passed, so the resulting order depends on
# the global RNG state — presumably the one fixed by seed_everything above; confirm if exact
# reproducibility of this permutation matters.
full_frame = full_frame.sample(frac=1)

In [None]:
# Preview the shuffled frame; the index column still shows each row's pre-shuffle label.
full_frame.head()

Unnamed: 0,text,label
286892,What a breathe of fresh air to have someone wh...,1
419218,Your jewish friends were the ones who told you...,1
1055330,Possible collusion by Trump and his affiliates...,0
1382764,Exactly. We need a % of GDP spending cap at t...,0
256049,"By your own comment, even if some of them vote...",0


In [None]:
# Pull the two columns out of the shuffled frame as parallel NumPy arrays:
# position i of `inputs` is the text whose label sits at position i of `labels`.
inputs, labels = (full_frame[column].to_numpy() for column in ('text', 'label'))

# Data Preparation

In [None]:
# Tokenizer loaded under the 'bert-base-uncased' name — the same name the model class below uses as its default.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [None]:
class TextLabelTokenizerDataset(torch.utils.data.Dataset):
  """Map-style dataset that tokenizes a text on access and pairs it with its label.

  Each item is a dict with three entries:
    "input_ids":      the tokenizer's `input_ids`, cast to torch.long
    "attention_mask": the tokenizer's `attention_mask`, cast to torch.long
    "target":         the label as a torch.float tensor

  The tokenizer is called with return_tensors='pt', so the encoded tensors keep
  whatever leading dimension the tokenizer produces for a single text
  (presumably 1 — consumers in this notebook flatten it with `.view`).

  Args:
    inputs: Indexable collection of raw texts.
    labels: Indexable collection of labels, aligned index-for-index with `inputs`.
    tokenizer: Callable invoked as
      tokenizer(text, return_tensors=..., truncation=..., padding=..., max_length=...)
      that returns a mapping with "input_ids" and "attention_mask".
    max_length: Length every text is truncated/padded to. Defaults to 500,
      the value that used to be hard-coded.

  Raises:
    ValueError: If `inputs` and `labels` differ in length.
  """

  def __init__(self, inputs, labels, tokenizer, max_length: int = 500):
    super().__init__()

    # A silent mismatch would pair texts with the wrong labels or fail mid-epoch
    if len(inputs) != len(labels):
      raise ValueError(
          f"inputs and labels must have the same length, got {len(inputs)} and {len(labels)}"
      )

    self.inputs = inputs
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_length = max_length

  def __len__(self):
    """Number of (text, label) pairs."""
    return len(self.inputs)

  def __getitem__(self, idx):
    """Tokenize the text at `idx` and return it together with its label."""
    # Samplers may hand over tensor indices; turn them into plain Python values
    if torch.is_tensor(idx):
      idx = idx.tolist()

    encoding = self.tokenizer(
        self.inputs[idx],
        return_tensors='pt',
        truncation=True,
        padding='max_length',
        max_length=self.max_length
    )

    return {
        "input_ids": encoding["input_ids"].type(torch.long),
        "attention_mask": encoding["attention_mask"].type(torch.long),
        # float target so it can be fed straight into a BCE-with-logits loss
        "target": torch.tensor(self.labels[idx], dtype=torch.float)
    }

In [None]:
# Wrap the raw texts and labels; tokenization happens per item when the dataset is indexed.
text_label_tokenizer_dataset = TextLabelTokenizerDataset(inputs, labels, tokenizer)

In [None]:
class CustomDataModule(pl.LightningDataModule):
  """Data module that splits a single dataset into train/val/test dataloaders.

  The split is 99% train, 0.9% validation and the remainder (about 0.1%) test.

  Args:
    dataset: Dataset to split; must support len() and indexing.
    batch_size: Batch size shared by all three dataloaders.
    num_workers: Worker processes per dataloader. Defaults to 16, the value
      that used to be hard-coded.
  """

  def __init__(self, dataset, batch_size: int = 32, num_workers: int = 16):
    super().__init__()

    self.dataset = dataset
    self.batch_size = batch_size
    self.num_workers = num_workers

    # Set once the random split has been made, so later setup() calls reuse it
    self._is_split = False

  def prepare_data(self):
    """Nothing to download; the dataset is handed in ready-made."""
    pass

  def setup(self, stage: Optional[str] = None):
    """Create the train/val/test subsets on first call and reuse them afterwards.

    setup() can be invoked more than once (e.g. once per stage). Re-running
    random_split each time would draw a different partition, so a later test
    run could be evaluated on examples the model was trained on.
    """
    print("DATA MODULE SETUP")
    if self._is_split:
      return

    train_size = int(0.99 * len(self.dataset))
    val_size = int(0.009 * len(self.dataset))
    # Whatever is left after the two truncating int() casts goes to the test split
    test_size = len(self.dataset) - train_size - val_size

    self.train, self.val, self.test = random_split(
        self.dataset,
        [train_size, val_size, test_size]
        )
    self._is_split = True

  def train_dataloader(self):
    """Shuffled loader over the training subset."""
    return DataLoader(self.train, batch_size=self.batch_size, num_workers=self.num_workers, shuffle=True)

  def val_dataloader(self):
    """Unshuffled loader over the validation subset."""
    return DataLoader(self.val, batch_size=self.batch_size, num_workers=self.num_workers)

  def test_dataloader(self):
    """Unshuffled loader over the test subset."""
    return DataLoader(self.test, batch_size=self.batch_size, num_workers=self.num_workers)

In [None]:
# Build the data module; batch_size=16 overrides the class default of 32.
custom_datamodule = CustomDataModule(text_label_tokenizer_dataset, batch_size=16)

# Model Preparation

In [None]:
class BertForSequenceClassificationLM(pl.LightningModule):
  """LightningModule wrapping a single-logit BERT sequence classifier for binary labels.

  The wrapped model is configured with num_labels=1, so it emits one logit per
  example. The loss is binary cross-entropy with logits; accuracy thresholds
  sigmoid(logit) at 0.5.

  Args:
    pretrained: Hugging Face model name. Always used for the config; also used
      to load the pretrained weights when `bert_weights` is not given.
    bert_weights: Optional path to a torch-saved object that replaces
      `classifier.bert` wholesale.
    freeze: When True, every parameter of `classifier.bert` gets
      requires_grad=False, leaving only the rest of the classifier trainable.
    lr: Learning rate passed to AdamW.
    eps: Epsilon passed to AdamW.
  """

  def __init__(self, pretrained='bert-base-uncased', bert_weights=None, freeze=True, lr=3e-5, eps=1e-5):
    super().__init__()

    self.lr = lr
    self.eps = eps

    config = BertConfig.from_pretrained(pretrained)
    config.num_labels = 1

    if (bert_weights):
      # The encoder is replaced right below, so a config-only (randomly
      # initialised) classifier is enough here.
      self.classifier = BertForSequenceClassification(
          config
          )
      # NOTE(security): torch.load unpickles arbitrary objects — only point this at trusted files.
      # map_location='cpu' makes loading independent of the device the file was saved from;
      # the Trainer is expected to move the whole module to the training device afterwards.
      weights = torch.load(bert_weights, map_location='cpu')
      self.classifier.bert = weights
    else:
      # Without an explicit weights file, load the pretrained checkpoint named by
      # `pretrained`. Building from the config alone would leave the encoder
      # randomly initialised (and, with freeze=True, permanently so).
      self.classifier = BertForSequenceClassification.from_pretrained(
          pretrained,
          config=config
          )

    if (freeze):
      for param in self.classifier.bert.parameters():
        param.requires_grad = False

    # Per-epoch history, appended to by the *_epoch_end hooks
    self.train_loss = []
    self.val_loss = []
    self.val_acc = []
    self.test_loss = []
    self.test_acc = []

    self.loss = F.binary_cross_entropy_with_logits

    # Not referenced by the step methods below (accuracy is computed by hand);
    # kept so the module's registered children stay unchanged.
    self.validation_accuracy = torchmetrics.Accuracy()
    self.test_accuracy = torchmetrics.Accuracy()

  def forward(self, input_ids, attention_mask):
    """Return the classifier's logits for the given token ids and attention mask."""
    return self.classifier(
        input_ids=input_ids,
        attention_mask=attention_mask,
        return_dict=True
        )['logits']

  def _run_batch(self, batch):
    """Flatten the batch tensors, run the classifier and return (logits, targets)."""
    # Collapse everything after the batch dimension into one axis
    input_ids = batch['input_ids'].view(batch['input_ids'].shape[0], -1)
    attention_mask = batch["attention_mask"].view(batch['attention_mask'].shape[0], -1)
    # Add a trailing axis to the targets before they are compared with the logits
    targets = batch["target"].unsqueeze(1)

    return self(input_ids, attention_mask), targets

  def _evaluate_batch(self, batch):
    """Return (loss, accuracy) for one batch, thresholding sigmoid(logits) at 0.5."""
    logits, targets = self._run_batch(batch)

    preds = torch.sigmoid(logits)

    accuracy = torch.mean(((preds > 0.5) == targets).to(torch.float))
    loss = self.loss(logits, targets)

    return loss, accuracy

  def training_step(self, batch, batch_idx):
    """Compute and log the training loss for one batch."""
    logits, targets = self._run_batch(batch)

    loss = self.loss(logits, targets)

    self.log('train_loss', loss, on_step=True, on_epoch=True,
             prog_bar=True)

    return {'loss': loss}

  def training_epoch_end(self, outputs):
    """Average the per-batch training losses and record the epoch value."""
    avg_loss = torch.stack([x['loss'] for x in outputs]).mean()

    # The metric key embeds the epoch number, so every epoch logs under its own name
    self.log(f'epoch_train_loss_{self.current_epoch}', avg_loss)
    self.train_loss.append(avg_loss)

  def validation_step(self, batch, batch_idx):
    """Compute validation loss and accuracy for one batch; only the loss is logged here."""
    val_loss, val_acc = self._evaluate_batch(batch)

    logs = {'val_loss': val_loss, 'val_acc': val_acc}

    self.log('val_loss', val_loss, on_step=True, on_epoch=True,
             prog_bar=True)

    return logs

  def validation_epoch_end(self, outputs):
    """Average validation loss/accuracy over batches, then log, print and store them.

    The averages are means of per-batch means, so a smaller final batch
    weighs as much as a full one.
    """
    avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
    avg_acc = torch.stack([x['val_acc'] for x in outputs]).mean()

    self.log('epoch_val_accuracy', avg_acc, on_epoch=True, prog_bar=True)
    self.log('epoch_val_loss', avg_loss, on_epoch=True, prog_bar=True)
    print("val_acc", avg_acc)
    print("val_loss", avg_loss)

    self.val_acc.append(avg_acc)
    self.val_loss.append(avg_loss)

  def test_step(self, batch, batch_idx):
    """Compute test loss and accuracy for one batch; only the loss is logged here."""
    test_loss, test_acc = self._evaluate_batch(batch)

    logs = {'test_loss': test_loss, 'test_acc': test_acc}

    self.log('test_loss', test_loss, on_step=True, on_epoch=True,
             prog_bar=True)

    return logs

  def test_epoch_end(self, outputs):
    """Average test loss/accuracy over batches (mean of batch means), then log and store them."""
    avg_loss = torch.stack([x['test_loss'] for x in outputs]).mean()
    avg_acc = torch.stack([x['test_acc'] for x in outputs]).mean()

    self.log('epoch_test_accuracy', avg_acc, on_epoch=True, prog_bar=True)
    self.log('epoch_test_loss', avg_loss, on_epoch=True, prog_bar=True)

    self.test_acc.append(avg_acc)
    self.test_loss.append(avg_loss)

  def configure_optimizers(self):
    """Return a single AdamW optimizer over all module parameters, with no LR scheduler."""
    optimizer = AdamW(self.parameters(), lr=self.lr, eps=self.eps)

    return [optimizer]

# Training

In [None]:
# Checkpointing: keep only the single best checkpoint (save_top_k=1), ranked by the highest
# 'epoch_val_accuracy' (the key logged in validation_epoch_end), written into MODEL_DIRECTORY.
# The filename template embeds the epoch number and that accuracy value.
checkpoint_callback = ModelCheckpoint(
    monitor='epoch_val_accuracy',
    dirpath=MODEL_DIRECTORY,
    filename='toxic-{epoch:02d}-{epoch_val_accuracy:.6f}',
    save_top_k=1,
    mode='max',
)



In [None]:
# Upper bound on the number of training epochs.
EPOCHS = 1

# Trainer requesting one GPU, with the checkpoint callback attached.
# NOTE(review): the `gpus=` argument is presumably specific to the installed Lightning release —
# confirm against its documentation before upgrading.
trainer = Trainer(gpus=1, max_epochs=EPOCHS, callbacks=[checkpoint_callback])

GPU available: True, used: True
TPU available: False, using: 0 TPU cores


In [None]:
# Build the model with its encoder replaced by the object stored in base.pt. `freeze` keeps its
# default (True), so the encoder's parameters are not trained.
# NOTE(review): this absolute path is not part of the file-existence check near the top of the notebook.
model = BertForSequenceClassificationLM(bert_weights="/home/brian/Documents/kubernetes/models/base.pt")

In [None]:
# Explicitly switch the module to training mode before fitting.
# NOTE(review): trainer.fit presumably manages train/eval mode itself, which would make this redundant — confirm.
model.train(mode=True)
print("setting training mode...")

setting training mode...


In [None]:
# Run training; the data module supplies the dataloaders and the checkpoint callback saves the best epoch.
trainer.fit(model,custom_datamodule)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


DATA MODULE SETUP



  | Name                | Type                          | Params
----------------------------------------------------------------------
0 | classifier          | BertForSequenceClassification | 109 M 
1 | validation_accuracy | Accuracy                      | 0     
2 | test_accuracy       | Accuracy                      | 0     
----------------------------------------------------------------------
769       Trainable params
109 M     Non-trainable params
109 M     Total params
437.932   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Global seed set to 42


val_acc tensor(0.5312, device='cuda:0')
val_loss tensor(0.6890, device='cuda:0')


Training: 0it [00:00, ?it/s]



In [None]:
# torch.save(model, MODEL_DIRECTORY+"/toxic.pt")