In [None]:
!pip install transformers
!pip install pytorch-lightning
!pip install pandas
!pip install numpy
!pip install torch
!pip install torchmetrics
!pip install ipywidgets
!pip install IProgress

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


## Imports

In [None]:
# Python
import json
import os
from typing import Optional

# General 3rd Party
import pandas as pd
import numpy as np

# HuggingFace
from transformers import BertForSequenceClassification, BertTokenizerFast, BertConfig
from transformers import AdamW, get_linear_schedule_with_warmup

# PyTorch
import torch
from torch.functional import F
from torch.utils.data import random_split, DataLoader

# PyTorch Lightning
import pytorch_lightning as pl
from pytorch_lightning import Trainer, loggers, seed_everything
from pytorch_lightning.callbacks import ModelCheckpoint

import torchmetrics

In [None]:
# Fix the global random seed through Lightning's helper so that runs are
# repeatable; `workers=True` is passed so dataloader workers are covered too.
seed_everything(42, workers=True)

Global seed set to 42


42

In [None]:
# Root directory that holds both the model output folder and the datasets.
# Set the HATESPEECH_BASE_DIR environment variable to run the notebook on
# another machine; the default reproduces the original absolute layout.
BASE_DIRECTORY = os.environ.get("HATESPEECH_BASE_DIR", "/home/brian/Documents/kubernetes")

# Directory the checkpoint callback writes into.
MODEL_DIRECTORY = os.path.join(BASE_DIRECTORY, "models", "hatespeech")

directories_needed = [MODEL_DIRECTORY]

# The three labelled source datasets (CSV files).
DATASET_DIRECTORY = os.path.join(BASE_DIRECTORY, "datasets", "live", "hatespeech")
twitter = os.path.join(DATASET_DIRECTORY, "labeled_data.csv")
gab = os.path.join(DATASET_DIRECTORY, "gab.csv")
reddit = os.path.join(DATASET_DIRECTORY, "reddit.csv")

files_needed = [twitter, gab, reddit]

def assertFilesAndDirectoriesExist(files, directories):
  """Verify that every required path is present before any work starts.

  Args:
    files: iterable of paths that must exist.
    directories: iterable of paths that must be existing directories.

  Raises:
    AssertionError: if any path is missing. The message lists every missing
      path, not just the first one. The error is raised explicitly rather
      than through `assert`, so the check still runs under `python -O`.
  """
  missing_directories = [d for d in directories if not os.path.isdir(d)]
  missing_files = [f for f in files if not os.path.exists(f)]

  if missing_directories or missing_files:
    problems = [f"missing directory: {d}" for d in missing_directories]
    problems += [f"missing file: {f}" for f in missing_files]
    raise AssertionError("Required paths not found:\n  " + "\n  ".join(problems))

  print("+ All files and directories accounted for!")

# Fail fast if the model directory or any of the dataset files is missing.
assertFilesAndDirectoriesExist(files_needed, directories_needed)

+ All files and directories accounted for!


## Load Data from Disk

In [None]:
# The Twitter CSV is read with all of its columns; the Gab and Reddit CSVs
# are restricted to the `text` and `hate_speech_idx` columns.
twitter_frame = pd.read_csv(twitter)
gab_frame = pd.read_csv(gab, usecols=["text", "hate_speech_idx"])
reddit_frame = pd.read_csv(reddit, usecols=["text", "hate_speech_idx"])

In [None]:
def expandDataset(row,row_accumulator,separator):
  """Split one multi-line row into per-line labelled records.

  The row's `text` is split on `separator`. Each non-blank segment becomes a
  `{'label', 'text'}` dict appended to `row_accumulator`. A segment is
  labelled 1 when its 1-based position in the split appears in the row's
  `hate_speech_idx` list, otherwise 0. Blank segments are skipped but still
  count towards the position, so indices stay aligned with the source.

  Args:
    row: mapping with a `text` entry and a `hate_speech_idx` entry. The
      latter is a JSON list of 1-based positions encoded as a string; any
      non-string value (e.g. NaN) means "no hateful segments".
    row_accumulator: list that receives the generated records (mutated).
    separator: delimiter used to split `text` into segments.

  Returns:
    None. Results are delivered through `row_accumulator`.
  """
  text = row['text']
  # A missing text cell is read by pandas as a float NaN; nothing to expand.
  if not isinstance(text, str):
    return

  raw_inds = row['hate_speech_idx']
  hate_speech_inds = set(json.loads(raw_inds)) if isinstance(raw_inds, str) else set()

  for position, segment in enumerate(text.split(separator), start=1):
    # Strip both classes identically so whitespace cannot correlate with the
    # label, and drop segments that are empty once stripped.
    segment = segment.strip()
    if not segment:
      continue
    row_accumulator.append({
        'label': 1 if position in hate_speech_inds else 0,
        'text': segment,
    })

In [None]:
# Expand every Reddit row into one record per line of its `text` field.
# expandDataset appends its records to `new_rows`; the value returned by
# `apply` itself is not used.
new_rows = []
reddit_frame.apply(expandDataset, axis=1,args=(new_rows, "\n"))
# Build the parsed frame from the collected records, keeping text then label.
reddit_frame_parsed = pd.DataFrame(new_rows)[['text', 'label']]

In [None]:
# Preview the first rows of the parsed Reddit frame.
reddit_frame_parsed.head()

Unnamed: 0,text,label
0,1. A subsection of retarded Hungarians? Ohh bo...,1
1,2. \tHiii. Just got off work. 444 is mainly th...,0
2,3. \t\twow i guess soyboys are the same in eve...,0
3,4. \t\t\tOwen Benjamin's soyboy song goes for ...,0
4,"1. > ""y'all hear sumn?"" by all means I live i...",0


In [None]:
# Expand every Gab row into one record per line of its `text` field.
# `new_rows` is rebound to a fresh list first, so it only holds Gab records.
new_rows = []
gab_frame.apply(expandDataset, axis=1,args=(new_rows, "\n"))
# Build the parsed frame from the collected records, keeping text then label.
gab_frame_parsed = pd.DataFrame(new_rows)[['text', 'label']]

In [None]:
# Preview the first rows of the parsed Gab frame.
gab_frame_parsed.head()

Unnamed: 0,text,label
0,1. i joined gab to remind myself how retarded ...,1
1,1. This is what the left is really scared of. ...,0
2,2. \tThat literally looks like a monkey. Why a...,0
3,3. \t\tDumb Cunt,1
4,1. It makes you an asshole.,0


In [None]:
# Binary label: 1 when the `hate_speech` column is positive, otherwise 0.
twitter_frame['label'] = twitter_frame['hate_speech'].gt(0).astype('int64')
# Keep only the tweet text and the label, with the text column renamed so the
# frame has the same schema as the parsed Reddit and Gab frames.
twitter_frame_parsed = twitter_frame.loc[:, ['tweet', 'label']].rename(columns={'tweet': 'text'})

In [None]:
# Preview the first rows of the parsed Twitter frame.
twitter_frame_parsed.head()

Unnamed: 0,text,label
0,!!! RT @mayasolovely: As a woman you shouldn't...,0
1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,0
2,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,0
3,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,0
4,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,0


In [None]:
# Stack the three parsed frames row-wise. `ignore_index` is not set, so each
# frame keeps its own index labels and labels can repeat in the result.
full_frame = pd.concat([reddit_frame_parsed, gab_frame_parsed, twitter_frame_parsed], axis=0)

In [None]:
# Shuffle all rows (frac=1 draws the whole frame). No random_state is passed
# here, and re-running this cell reshuffles the already shuffled frame.
full_frame = full_frame.sample(frac=1)

In [None]:
# Preview the first rows of the combined, shuffled frame.
full_frame.head()

Unnamed: 0,text,label
5763,1. @Patriotic1 Cunt cunt cunt cunt LOL!,1
19124,RT @iAmDaHarper: he's saying...he doesn't disc...,0
2741,@CallMeDaishaa ghetto ass Pocahontas .,1
27238,2. \tdon't let that commie nigger into the Guv...,1
3210,"@Fugazi3011 ""leave you beaner retard""",1


In [None]:
# Pull the text and label columns out as plain arrays; both come from the
# same frame, so position i in `inputs` matches position i in `labels`.
inputs = full_frame['text'].values
labels = full_frame['label'].values

# Data Preparation

In [None]:
# Load the tokenizer for the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [None]:
class TextLabelTokenizerDataset(torch.utils.data.Dataset):
  """Offensive Language Dataset

  Map-style dataset that tokenizes one text at a time, on access, and pairs
  it with its label.

  Args:
    inputs: indexable collection of texts.
    labels: indexable collection of labels, aligned with `inputs`.
    tokenizer: callable invoked as
      `tokenizer(text, return_tensors='pt', truncation=True,
      padding='max_length', max_length=max_length)`; its result must provide
      "input_ids" and "attention_mask" tensors.
    max_length: length every text is truncated / padded to. Defaults to 500,
      the value that was previously hard-coded.
  """

  def __init__(self, inputs, labels, tokenizer, max_length: int = 500):
    super().__init__()

    self.inputs = inputs
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_length = max_length

  def __len__(self):
    """Return the number of texts."""
    return len(self.inputs)

  def __getitem__(self, idx):
    """Tokenize item `idx` and return its tensors.

    Returns:
      dict with "input_ids" and "attention_mask" (as produced by the
      tokenizer, cast to long) and "target" (the label as a float scalar
      tensor).
    """
    if torch.is_tensor(idx):
      idx = idx.tolist()

    # Padding to a fixed length gives every item the same shape.
    encoding = self.tokenizer(
        self.inputs[idx],
        return_tensors='pt',
        truncation=True,
        padding='max_length',
        max_length=self.max_length
    )

    return {
        "input_ids": encoding["input_ids"].type(torch.long),
        "attention_mask": encoding["attention_mask"].type(torch.long),
        "target": torch.tensor(self.labels[idx], dtype=torch.float)
    }

In [None]:
# Wrap the text / label arrays and the tokenizer in the dataset defined above.
text_label_tokenizer_dataset = TextLabelTokenizerDataset(inputs, labels, tokenizer)

In [None]:
class CustomDataModule(pl.LightningDataModule):
  """Splits one dataset 60/20/20 into train / val / test and serves loaders.

  Args:
    dataset: the full dataset to split.
    batch_size: batch size used by all three dataloaders.
    num_workers: worker processes per dataloader (default 16, the value that
      was previously hard-coded).
    split_seed: seed of the generator that drives the random split, so the
      same rows land in the same split on every run.
  """

  def __init__(self, dataset, batch_size: int = 32, num_workers: int = 16, split_seed: int = 42):
    super().__init__()

    self.dataset = dataset
    self.batch_size = batch_size
    self.num_workers = num_workers
    self.split_seed = split_seed

    # Filled in by the first call to setup().
    self.train = None
    self.val = None
    self.test = None

  def prepare_data(self):
    """Nothing to download; the dataset is passed in already built."""
    pass

  def setup(self, stage: Optional[str] = None):
    """Create the train / val / test subsets exactly once.

    setup() can be invoked several times on the same instance (it receives a
    `stage` argument). Re-splitting on each call would draw a different
    partition, so rows trained on could end up in the test subset. The split
    is therefore made only on the first call, with a seeded generator.
    """
    print("DATA MODULE SETUP")
    if self.train is not None:
      return

    total = len(self.dataset)
    train_size = int(0.6 * total)
    val_size = int(0.2 * total)
    # The remainder goes to test so the three sizes always sum to `total`.
    test_size = total - train_size - val_size

    self.train, self.val, self.test = random_split(
        self.dataset,
        [train_size, val_size, test_size],
        generator=torch.Generator().manual_seed(self.split_seed)
        )

  def train_dataloader(self):
    """Shuffled loader over the training subset."""
    return DataLoader(self.train, batch_size=self.batch_size, num_workers=self.num_workers, shuffle=True)

  def val_dataloader(self):
    """Loader over the validation subset, in fixed order."""
    return DataLoader(self.val, batch_size=self.batch_size, num_workers=self.num_workers)

  def test_dataloader(self):
    """Loader over the test subset, in fixed order."""
    return DataLoader(self.test, batch_size=self.batch_size, num_workers=self.num_workers)

In [None]:
# Data module over the full tokenizing dataset, using batches of 16.
custom_datamodule = CustomDataModule(text_label_tokenizer_dataset, batch_size=16)

# Model Preparation

In [None]:
class BertForSequenceClassificationLM(pl.LightningModule):
  """Binary text classifier: a BERT sequence-classification model with one logit.

  The single output logit is trained with binary cross-entropy on logits, and
  a prediction counts as positive when its sigmoid exceeds 0.5.

  Args:
    pretrained: name or path used to load the BERT *configuration*.
    bert_weights: optional path to a file loaded with `torch.load`. It may
      hold either a whole encoder module, which replaces `classifier.bert`,
      or a state_dict, which is loaded into `classifier.bert`.
    freeze: when true, the encoder parameters (`classifier.bert`) have
      `requires_grad` switched off.
    lr: learning rate passed to AdamW.
    eps: epsilon passed to AdamW.

  Attributes:
    train_loss, val_loss, val_acc, test_loss, test_acc: lists that collect
      one averaged value per finished epoch of the corresponding loop.
  """

  def __init__(self, pretrained='bert-base-uncased', bert_weights=None, freeze=True, lr=3e-5, eps=1e-5):
    super().__init__()

    self.lr = lr
    self.eps = eps

    # Only the configuration is taken from `pretrained`; the network is built
    # from that config rather than through `from_pretrained`.
    # NOTE(review): this presumably leaves the weights randomly initialised
    # unless `bert_weights` is supplied -- confirm that is intended,
    # especially together with freeze=True.
    config = BertConfig.from_pretrained(pretrained)
    config.num_labels = 1  # a single logit for the binary target

    self.classifier = BertForSequenceClassification(config)

    if bert_weights:
      # NOTE(review): torch.load unpickles arbitrary objects, so only load
      # files from a trusted source. Newer PyTorch releases may default to
      # weights-only loading, which rejects pickled modules -- confirm
      # against the installed version.
      # map_location='cpu' lets a file saved on a GPU load on any machine.
      weights = torch.load(bert_weights, map_location='cpu')
      if isinstance(weights, dict):
        # A state_dict: copy the tensors into the existing encoder.
        self.classifier.bert.load_state_dict(weights)
      else:
        # A pickled module: swap the encoder wholesale.
        self.classifier.bert = weights

    if freeze:
      for param in self.classifier.bert.parameters():
        param.requires_grad = False

    self.train_loss = []
    self.val_loss = []
    self.val_acc = []
    self.test_loss = []
    self.test_acc = []

    self.loss = F.binary_cross_entropy_with_logits

    # These metric objects are created but no method of this class uses them;
    # accuracy is computed by hand in _loss_and_accuracy.
    self.validation_accuracy = torchmetrics.Accuracy()
    self.test_accuracy = torchmetrics.Accuracy()

  def forward(self, input_ids, attention_mask):
    """Run the classifier and return its logits."""
    return self.classifier(
        input_ids=input_ids,
        attention_mask=attention_mask,
        return_dict=True
        )['logits']

  def _forward_batch(self, batch):
    """Unpack a batch, run the model, and return `(logits, targets)`."""
    # Collapse everything after the batch dimension into one axis.
    input_ids = batch['input_ids'].view(batch['input_ids'].shape[0], -1)
    attention_mask = batch['attention_mask'].view(batch['attention_mask'].shape[0], -1)
    # Add a trailing axis to the targets to line them up with the logits.
    targets = batch['target'].unsqueeze(1)

    return self(input_ids, attention_mask), targets

  def _loss_and_accuracy(self, batch):
    """Return `(loss, accuracy)` for one batch, thresholding sigmoid at 0.5."""
    logits, targets = self._forward_batch(batch)

    preds = torch.sigmoid(logits)
    accuracy = torch.mean(((preds > 0.5) == targets).to(torch.float))
    loss = self.loss(logits, targets)

    return loss, accuracy

  def training_step(self, batch, batch_idx):
    """Compute and log the training loss for one batch."""
    logits, targets = self._forward_batch(batch)
    loss = self.loss(logits, targets)

    self.log('train_loss', loss, on_step=True, on_epoch=True,
             prog_bar=True)

    return {'loss': loss}

  def training_epoch_end(self, outputs):
    """Average the per-batch training losses, then log and record the mean."""
    avg_loss = torch.stack([x['loss'] for x in outputs]).mean()

    # The logged key embeds the epoch number, so each epoch gets its own key.
    self.log(f'epoch_train_loss_{self.current_epoch}', avg_loss)
    self.train_loss.append(avg_loss)

  def validation_step(self, batch, batch_idx):
    """Compute validation loss and accuracy for one batch; log the loss."""
    val_loss, val_acc = self._loss_and_accuracy(batch)

    self.log('val_loss', val_loss, on_step=True, on_epoch=True,
             prog_bar=True)

    return {'val_loss': val_loss, 'val_acc': val_acc}

  def validation_epoch_end(self, outputs):
    """Average the per-batch validation results; log, print and record them.

    The averages are unweighted means over batches.
    """
    avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
    avg_acc = torch.stack([x['val_acc'] for x in outputs]).mean()

    self.log('epoch_val_accuracy', avg_acc, on_epoch=True, prog_bar=True)
    self.log('epoch_val_loss', avg_loss, on_epoch=True, prog_bar=True)
    print("val_acc", avg_acc)
    print("val_loss", avg_loss)

    self.val_acc.append(avg_acc)
    self.val_loss.append(avg_loss)

  def test_step(self, batch, batch_idx):
    """Compute test loss and accuracy for one batch; log the loss."""
    test_loss, test_acc = self._loss_and_accuracy(batch)

    self.log('test_loss', test_loss, on_step=True, on_epoch=True,
             prog_bar=True)

    return {'test_loss': test_loss, 'test_acc': test_acc}

  def test_epoch_end(self, outputs):
    """Average the per-batch test results, then log and record them."""
    avg_loss = torch.stack([x['test_loss'] for x in outputs]).mean()
    avg_acc = torch.stack([x['test_acc'] for x in outputs]).mean()

    self.log('epoch_test_accuracy', avg_acc, on_epoch=True, prog_bar=True)
    self.log('epoch_test_loss', avg_loss, on_epoch=True, prog_bar=True)

    self.test_acc.append(avg_acc)
    self.test_loss.append(avg_loss)

  def configure_optimizers(self):
    """Return a single AdamW optimizer over all parameters of this module."""
    optimizer = AdamW(self.parameters(), lr=self.lr, eps=self.eps)

    return [optimizer]

# Training

In [None]:
# Keep only the single best checkpoint (save_top_k=1), where "best" means the
# highest value (mode='max') of the `epoch_val_accuracy` metric that the
# model logs in validation_epoch_end. Files are written to MODEL_DIRECTORY.
checkpoint_callback = ModelCheckpoint(
    monitor='epoch_val_accuracy',
    dirpath=MODEL_DIRECTORY,
    filename='hatespeech-{epoch:02d}-{epoch_val_accuracy:.6f}',
    save_top_k=1,
    mode='max',
)

In [None]:
# Upper bound on the number of training epochs.
EPOCHS = 4

# Trainer configured for one GPU, with the checkpoint callback attached.
trainer = Trainer(gpus=1, max_epochs=EPOCHS, callbacks=[checkpoint_callback])

GPU available: True, used: True
TPU available: False, using: 0 TPU cores


In [None]:
# Build the classifier with encoder weights loaded from a local file; every
# other constructor argument keeps its default (including freeze=True).
model = BertForSequenceClassificationLM(bert_weights="/home/brian/Documents/kubernetes/models/base.pt")

In [None]:
# Put the module into training mode before fitting.
model.train(mode=True)
print("setting training mode...")

setting training mode...


In [None]:
# Run training, with the data module supplying the dataloaders.
trainer.fit(model, custom_datamodule)