<a href="https://colab.research.google.com/github/ahmedovich19/Machine-Learning-Projects/blob/master/Fine_Tuning_BERT_with_HuggingFace_and_PyTorch_Lightning_for_Multilabel_Text_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
#!nvidia-smi

In [4]:
# NOTE(review): unpinned install. Later cells import `pytorch_lightning.metrics`
# and pass `gpus=` / `progress_bar_refresh_rate=` to `pl.Trainer`; newer
# pytorch-lightning releases are believed to have dropped these -- TODO pin the
# release this notebook was authored against.
!pip install --quiet pytorch-lightning


[K     |████████████████████████████████| 829kB 7.5MB/s 
[K     |████████████████████████████████| 276kB 15.9MB/s 
[K     |████████████████████████████████| 829kB 23.8MB/s 
[K     |████████████████████████████████| 112kB 42.2MB/s 
[K     |████████████████████████████████| 1.3MB 17.4MB/s 
[K     |████████████████████████████████| 296kB 52.3MB/s 
[K     |████████████████████████████████| 143kB 52.7MB/s 
[?25h  Building wheel for PyYAML (setup.py) ... [?25l[?25hdone
  Building wheel for future (setup.py) ... [?25l[?25hdone


In [5]:
# NOTE(review): unpinned install. The import cell pulls `AdamW` from
# `transformers`, which recent releases are believed to have deprecated or
# removed -- TODO pin the release this notebook was authored against.
!pip install --quiet transformers

[K     |████████████████████████████████| 1.9MB 7.9MB/s 
[K     |████████████████████████████████| 3.2MB 38.1MB/s 
[K     |████████████████████████████████| 890kB 54.6MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [6]:
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap
from torch import nn, optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from  pytorch_lightning.metrics.functional.classification import auroc
%matplotlib inline
%config InlineBackend.figure_format='retina'
# Global seaborn / matplotlib styling.
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
rcParams['figure.figsize'] = 12, 8
# Seed the global NumPy and PyTorch random number generators.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
# First CUDA device when one is available, otherwise the CPU.
# NOTE(review): `device` is not referenced by any of the visible cells.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [7]:
%cd drive/MyDrive

/content/drive/MyDrive


In [8]:
!unzip train.csv.zip

Archive:  train.csv.zip
  inflating: train.csv               


In [9]:
df = pd.read_csv('train.csv')

In [10]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [11]:
# Hold out 5% of the rows for validation. The seed is passed explicitly so the
# split does not depend on how much global NumPy randomness earlier cells used.
train_df, val_df = train_test_split(df, test_size=0.05, random_state=RANDOM_SEED)

In [12]:
train_df.shape, val_df.shape

((151592, 8), (7979, 8))

In [13]:
LABEL_COLUMNS = ['toxic', 'severe_toxic','obscene','threat','insult','identity_hate']

In [14]:
train_df[LABEL_COLUMNS].sum()

toxic            14546
severe_toxic      1515
obscene           8028
threat             465
insult            7467
identity_hate     1334
dtype: int64

In [15]:
train_df[LABEL_COLUMNS].sum().sum()

33355

In [16]:
train_df[LABEL_COLUMNS].sum(axis=1).head()

41414     0
141338    0
145906    0
113950    0
136413    0
dtype: int64

In [17]:
train_toxic = train_df[train_df[LABEL_COLUMNS].sum(axis=1)>0]

In [18]:
train_toxic.shape

(15427, 8)

In [19]:
train_clean = train_df[train_df[LABEL_COLUMNS].sum(axis=1) == 0]

In [20]:
train_toxic.shape,train_clean.shape

((15427, 8), (136165, 8))

In [21]:
# Rebalance the training set: keep every comment that carries at least one
# label and down-sample the unlabeled ones to 15,000 rows. The seed is explicit
# so re-running this cell alone draws the same sample.
train_df = pd.concat([
  train_toxic,
  train_clean.sample(15_000, random_state=RANDOM_SEED)
])
train_df.shape

(30427, 8)

In [22]:
train_df[LABEL_COLUMNS].sum()

toxic            14546
severe_toxic      1515
obscene           8028
threat             465
insult            7467
identity_hate     1334
dtype: int64

In [23]:
sample_row = df.iloc[16]
sample_comment = sample_row.comment_text
sample_labels = sample_row[LABEL_COLUMNS]
print(sample_comment)
print()
print(sample_labels.to_dict())

Bye! 

Don't look, come or think of comming back! Tosser.

{'toxic': 1, 'severe_toxic': 0, 'obscene': 0, 'threat': 0, 'insult': 0, 'identity_hate': 0}


In [24]:

BERT_MODEL_NAME = 'bert-base-cased'
tokenizer =BertTokenizer.from_pretrained(BERT_MODEL_NAME)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




In [25]:
# Tokenize the sample comment into fixed-length tensors.
# truncation=True matches ToxicCommentDataset and guarantees the output never
# exceeds max_length; padding alone only lengthens short inputs.
encoding = tokenizer.encode_plus(
    sample_comment,
    add_special_tokens=True,
    max_length=512,
    return_token_type_ids = False,
    padding = 'max_length',
    truncation = True,
    return_attention_mask=True,
    return_tensors = 'pt'
)

In [26]:
 encoding.keys()

dict_keys(['input_ids', 'attention_mask'])

In [27]:
encoding['input_ids'].shape,encoding['attention_mask'].shape

(torch.Size([1, 512]), torch.Size([1, 512]))

In [28]:
encoding['input_ids'].squeeze()[:20]

tensor([  101, 17774,   106,  1790,   112,   189,  1440,   117,  1435,  1137,
         1341,  1104,  3254,  5031,  1171,   106,  1706, 14607,   119,   102])

In [29]:
encoding['attention_mask'].squeeze()[:20]

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [30]:
print(tokenizer.convert_ids_to_tokens(encoding['input_ids'].squeeze())[:20])

['[CLS]', 'Bye', '!', 'Don', "'", 't', 'look', ',', 'come', 'or', 'think', 'of', 'com', '##ming', 'back', '!', 'To', '##sser', '.', '[SEP]']


In [31]:
class ToxicCommentDataset(Dataset):
  """Map-style dataset that tokenizes one comment per item on demand.

  Args:
    data: frame with a `comment_text` column and one column per entry of
      LABEL_COLUMNS.
    tokenizer: tokenizer whose `encode_plus` produces the model inputs.
    max_token_len: every item is padded or truncated to this many tokens.
  """

  def __init__(self,data:pd.DataFrame,tokenizer:BertTokenizer,max_token_len: int = 128):
    self.data = data
    self.tokenizer = tokenizer
    self.max_token_len = max_token_len

  def __len__(self):
    """Number of rows in the wrapped frame."""
    return len(self.data)

  def __getitem__(self, index: int):
    """Return the raw text, token ids, attention mask and label vector for a row.

    `index` is positional (`iloc`), so the frame's own index labels are ignored.
    """
    data_row = self.data.iloc[index]
    comment_text = data_row.comment_text
    # A row of a mixed-type frame is an object-dtype Series with a string
    # index. Convert it to a float32 array explicitly instead of letting
    # torch.FloatTensor index the Series by integer position.
    labels = data_row[LABEL_COLUMNS].to_numpy(dtype="float32")

    encoding = self.tokenizer.encode_plus(
      comment_text,
      add_special_tokens=True,
      max_length=self.max_token_len,
      return_token_type_ids=False,
      padding='max_length',
      truncation=True,
      return_attention_mask=True,
      return_tensors='pt'
    )

    return dict(
        comment_text=comment_text,
        # encode_plus returns tensors with a leading batch axis of 1; drop it so
        # the DataLoader can stack items into (batch, max_token_len).
        input_ids=encoding['input_ids'].flatten(),
        attention_mask=encoding['attention_mask'].flatten(),
        labels=torch.tensor(labels)
    )

In [32]:
train_dataset = ToxicCommentDataset(train_df,tokenizer)

In [33]:
sample_item = train_dataset[0]

In [34]:
sample_item.keys()

dict_keys(['comment_text', 'input_ids', 'attention_mask', 'labels'])

In [35]:
sample_item['comment_text']

'Hi, ya fucking idiot. ^_^'

In [36]:
sample_item['labels']

tensor([1., 0., 1., 0., 1., 0.])

In [37]:
sample_item['input_ids'].shape

torch.Size([128])

In [38]:
bert_model = BertModel.from_pretrained(BERT_MODEL_NAME,return_dict=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435779157.0, style=ProgressStyle(descri…




In [39]:
sample_item['input_ids'].unsqueeze(dim=0).shape,sample_item['attention_mask'].unsqueeze(dim=0).shape

(torch.Size([1, 128]), torch.Size([1, 128]))

In [40]:
prediction = bert_model(sample_item['input_ids'].unsqueeze(dim=0),sample_item['attention_mask'].unsqueeze(dim=0))

In [41]:
prediction

BaseModelOutputWithPoolingAndCrossAttentions([('last_hidden_state',
                                               tensor([[[ 0.0846,  0.4219,  0.2352,  ..., -0.0680,  0.0722, -0.0598],
                                                        [ 0.2486, -0.1283,  1.0093,  ..., -0.2350,  0.4228, -0.0814],
                                                        [ 0.3565,  0.3140,  0.8552,  ...,  0.2619, -0.3000, -0.5343],
                                                        ...,
                                                        [-0.0259,  0.2138,  0.0866,  ..., -0.0545, -0.1180,  0.4628],
                                                        [ 0.0199,  0.1895, -0.0152,  ...,  0.0256, -0.1167,  0.5580],
                                                        [-0.0069,  0.1778, -0.0506,  ..., -0.1385,  0.0760,  0.5762]]],
                                                      grad_fn=<NativeLayerNormBackward>)),
                                              ('pooler_output',
      

In [42]:
prediction.last_hidden_state.shape,prediction.pooler_output.shape

(torch.Size([1, 128, 768]), torch.Size([1, 768]))

In [43]:
class ToxicCommentDataModule(pl.LightningDataModule):
  """Lightning data module serving a training split and one held-out split.

  Args:
    train_df: rows used by the training loader.
    test_df: rows served by both the validation and the test loader.
    tokenizer: tokenizer handed to each ToxicCommentDataset.
    batch_size: batch size of the training loader.
    max_token_len: token length forwarded to each ToxicCommentDataset.
  """

  def __init__(self, train_df, test_df, tokenizer, batch_size=8,max_token_len=128):
    super().__init__()
    self.train_df = train_df
    self.test_df = test_df
    self.tokenizer = tokenizer
    self.batch_size = batch_size
    self.max_token_len = max_token_len

  def setup(self, stage=None):
    """Build the train and held-out datasets.

    `stage` is accepted because Lightning passes the stage name when it invokes
    this hook itself. It is ignored: both datasets only store references, so
    building them unconditionally is cheap. Calling `setup()` with no argument
    keeps working.
    """
    self.train_dataset = ToxicCommentDataset(
        self.train_df,
        self.tokenizer,
        self.max_token_len
    )
    self.test_dataset = ToxicCommentDataset(
      self.test_df,
      self.tokenizer,
      self.max_token_len
    )

  def train_dataloader(self):
    """Shuffled loader over the training dataset."""
    return DataLoader(
        self.train_dataset,
        batch_size = self.batch_size,
        shuffle = True,
        num_workers = 4
    )

  def val_dataloader(self):
    """Loader over the held-out dataset, one item per batch."""
    # NOTE(review): batch_size=1 makes validation slow on large splits;
    # consider self.batch_size if per-item batches are not required.
    return DataLoader(
        self.test_dataset,
        batch_size = 1,
        num_workers = 4
    )

  def test_dataloader(self):
    """Loader over the same held-out dataset as validation, one item per batch."""
    return DataLoader(
        self.test_dataset,
        batch_size = 1,
        num_workers = 4
    )

In [55]:
# Training hyperparameters. BATCH_SIZE replaces the data module's default
# training batch size of 8.
N_EPOCHS = 10
BATCH_SIZE = 32
# val_df backs both the validation and the test loader of the data module.
data_module = ToxicCommentDataModule(train_df,val_df,tokenizer,batch_size=BATCH_SIZE)
# Build the datasets now rather than waiting for the trainer to do it.
data_module.setup()


# Modeling

In [45]:
# Toy inputs for a manual BCELoss check: six hand-written raw scores (the next
# cells pass them through a sigmoid first) and the matching 0/1 targets.
# NOTE(review): `prediction` reuses the name that earlier held the BERT output.
criterion = nn.BCELoss()
prediction =torch.FloatTensor(
    [10.95873564,1.07321467,1.58524066,0.03839076,15.72987556,2.09513213]
)
labels = torch.FloatTensor(
    [1.,0.,0.,0.,1.,0.]
)

In [46]:
torch.sigmoid(prediction)

tensor([1.0000, 0.7452, 0.8299, 0.5096, 1.0000, 0.8904])

In [47]:
output = criterion(torch.sigmoid(prediction),labels)
output

tensor(1.0104)

In [78]:
class ToxicCommentTagger(pl.LightningModule):
  """BERT encoder with a linear head for multi-label comment tagging.

  Args:
    n_classes: number of independent labels predicted per comment.
    steps_per_epoch: optimizer steps in one training epoch. Needed only for
      training, where it sizes the learning-rate schedule.
    n_epochs: number of training epochs. Needed only for training.
  """

  def __init__(self,n_classes:int,steps_per_epoch=None,n_epochs=None):
    super().__init__()
    self.bert = BertModel.from_pretrained(BERT_MODEL_NAME, return_dict=True)
    self.classifier = nn.Linear(self.bert.config.hidden_size,n_classes)
    self.steps_per_epoch = steps_per_epoch
    self.n_epochs = n_epochs
    self.criterion = nn.BCELoss()

  def forward(self,input_ids, attention_mask, labels=None):
    """Return `(loss, probabilities)`.

    `probabilities` is the sigmoid of the linear head applied to BERT's pooled
    output. `loss` is the BCE loss against `labels`, or 0 when `labels` is None.
    """
    output = self.bert(input_ids, attention_mask=attention_mask)
    output = self.classifier(output.pooler_output)
    output = torch.sigmoid(output)
    loss = 0
    if labels is not None:
      loss = self.criterion(output, labels)
    return loss, output

  def _step(self, batch, log_name):
    """Run one batch through the model and log its loss under `log_name`."""
    labels = batch['labels']
    loss, outputs = self(batch['input_ids'], batch['attention_mask'], labels)
    self.log(log_name, loss, prog_bar=True, logger=True)
    return loss, outputs, labels

  def training_step(self, batch,batch_idx):
    """Return the loss plus the batch predictions and labels for epoch-end metrics."""
    loss, outputs, labels = self._step(batch, 'train_loss')
    # The key must be 'predictions': training_epoch_end reads it by that name.
    # Detached so the per-step outputs kept until epoch end hold no graph.
    return {'loss':loss, 'predictions': outputs.detach(),'labels':labels}

  def validation_step(self, batch,batch_idx):
    """Log and return the validation loss for one batch."""
    loss, _, _ = self._step(batch, 'val_loss')
    return loss

  def test_step(self, batch,batch_idx):
    """Log and return the test loss for one batch."""
    loss, _, _ = self._step(batch, 'test_loss')
    return loss

  def training_epoch_end(self, outputs):
    """Log one training ROC AUC scalar per label for the finished epoch."""
    # Concatenating the per-step batches along dim 0 yields the same
    # (n_samples, n_classes) matrices as stacking the individual rows.
    labels = torch.cat([step['labels'].detach().cpu() for step in outputs])
    predictions = torch.cat([step['predictions'].detach().cpu() for step in outputs])

    for i, name in enumerate(LABEL_COLUMNS):
      # Targets are floats for BCELoss; hand the metric integer class ids.
      roc_score = auroc(predictions[:,i],labels[:,i].int())
      self.logger.experiment.add_scalar(f"{name} roc_auc/Train",roc_score,self.current_epoch)

  def configure_optimizers(self):
    """AdamW with linear warmup then linear decay, advanced once per optimizer step.

    Raises:
      ValueError: if `steps_per_epoch` or `n_epochs` was not provided.
    """
    if self.steps_per_epoch is None or self.n_epochs is None:
      raise ValueError(
        "steps_per_epoch and n_epochs must be set to build the learning-rate schedule"
      )

    optimizer = AdamW(self.parameters(), lr=2e-5)
    warmup_steps = self.steps_per_epoch // 3
    # The scheduler's total already includes the warmup steps; subtracting them
    # would drive the learning rate to zero before training ends.
    total_steps = self.steps_per_epoch * self.n_epochs

    scheduler = get_linear_schedule_with_warmup(
      optimizer,
      num_warmup_steps=warmup_steps,
      num_training_steps=total_steps
    )

    # The schedule is sized in optimizer steps, so it must be stepped per batch.
    # Lightning's default interval is per epoch, which would leave the learning
    # rate stuck in warmup for the whole run.
    return dict(
      optimizer=optimizer,
      lr_scheduler=dict(scheduler=scheduler, interval='step')
    )

In [79]:
# One output per label column; the schedule is sized from the rebalanced
# training frame and the configured batch size.
model = ToxicCommentTagger(
    n_classes=len(LABEL_COLUMNS),
    steps_per_epoch = len(train_df) // BATCH_SIZE,
    n_epochs=N_EPOCHS
)

In [80]:
_, predictions = model(sample_item['input_ids'].unsqueeze(dim=0),sample_item['attention_mask'].unsqueeze(dim=0))

In [81]:
predictions

tensor([[0.5535, 0.4485, 0.4944, 0.3304, 0.4561, 0.5499]],
       grad_fn=<SigmoidBackward>)

In [82]:
trainer = pl.Trainer(max_epochs=N_EPOCHS,gpus=1,progress_bar_refresh_rate=30)

GPU available: True, used: True
TPU available: None, using: 0 TPU cores


In [None]:
trainer.fit(model,data_module)
# Write the checkpoint that the next cell loads; without this the notebook
# never creates 'toxic-comment-classifier.ckpt' itself.
trainer.save_checkpoint('toxic-comment-classifier.ckpt')

In [None]:
trained_model = ToxicCommentTagger.load_from_checkpoint(
    'toxic-comment-classifier.ckpt',
    n_classes=len(LABEL_COLUMNS)
)
# Inference only from here on: freeze() switches the module to eval mode
# (dropout off) and disables gradients for all parameters.
trained_model.freeze()

In [None]:
test_comment = ' I really love you, You are a complete winner. stay with me'

In [90]:
# Tokenize the test comment exactly as ToxicCommentDataset does during
# training: pad short inputs and truncate long ones to 128 tokens.
encoding = tokenizer.encode_plus(
    test_comment,
    add_special_tokens=True,
    max_length=128,
    return_token_type_ids = False,
    padding = 'max_length',
    truncation = True,
    return_attention_mask=True,
    return_tensors = 'pt'
)

In [None]:
# The tokenizer's keys are 'input_ids' and 'attention_mask'. No gradients are
# needed for inference.
with torch.no_grad():
  _, test_prediction = trained_model(encoding['input_ids'],encoding['attention_mask'])
test_prediction

In [None]:
# Drop the batch axis and convert to NumPy. detach() makes the conversion valid
# even if the tensor still carries autograd history.
test_prediction = test_prediction.detach().cpu().flatten().numpy()
test_prediction

In [None]:
# Keep the names of all labels whose predicted probability exceeds 0.5.
predicted_labels = [
  label_name
  for position, label_name in enumerate(LABEL_COLUMNS)
  if test_prediction[position] > 0.5
]
predicted_labels

In [None]:
def classify(comment_text, model, tokenizer, label_names, threshold=0.5, max_token_len=128):
  """Return the names of the labels the model assigns to `comment_text`.

  Args:
    comment_text: the text to classify.
    model: callable taking `(input_ids, attention_mask)` and returning
      `(loss, probabilities)`.
    tokenizer: object whose `encode_plus` returns a mapping with
      'input_ids' and 'attention_mask' tensors.
    label_names: label names, ordered like the model's outputs.
    threshold: a label is returned when its probability is strictly greater.
    max_token_len: the text is padded or truncated to this many tokens.

  Returns:
    List of label names, in `label_names` order.
  """
  encoding = tokenizer.encode_plus(
    comment_text,
    add_special_tokens=True,
    max_length=max_token_len,
    return_token_type_ids = False,
    padding = 'max_length',
    truncation = True,
    return_attention_mask=True,
    return_tensors = 'pt'
  )
  # Inference only: skip autograd bookkeeping so the result converts to NumPy.
  with torch.no_grad():
    _, prediction = model(encoding['input_ids'],encoding['attention_mask'])
  probabilities = prediction.detach().cpu().flatten().numpy()

  return [
    label_name
    for i, label_name in enumerate(label_names)
    if probabilities[i] > threshold
  ]

In [None]:
text = "You are such a loser! You'll regret everything you did to me"
classify(text, trained_model, tokenizer, LABEL_COLUMNS)

In [None]:
text = "Hi, I'm Meredith and I'm an alch.. good at supplier relations"
classify(text, trained_model, tokenizer, LABEL_COLUMNS)