In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 17 17:18:05 2022

@author: laurenwilkes
"""

import pandas as pd
import transformers
from transformers import XLNetTokenizer, XLNetModel, AdamW, get_linear_schedule_with_warmup
import torch

df = pd.read_csv('~/Downloads/long_tweets.csv')

df['tweet_text']

import os
import math

import torch
from torch.nn import BCEWithLogitsLoss
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW, XLNetTokenizer, XLNetModel, XLNetLMHeadModel, XLNetConfig
from tensorflow import keras 
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from tqdm import tqdm, trange
import matplotlib.pyplot as plt
%matplotlib inline

train, test= train_test_split(df, test_size=0.2)

tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased', do_lower_case=True)

train_text_list = train['tweet_text'].values
test_text_list = test['tweet_text'].values


def tokenize_inputs(text_list, tokenizer, num_embeddings=512):
    """
    Tokenizes the input text input into ids. Appends the appropriate special
    characters to the end of the text to denote end of sentence. Truncate or pad
    the appropriate sequence length.
    """
    # tokenize the text, then truncate sequence to the desired length minus 2 for
    # the 2 special characters
    tokenized_texts = list(map(lambda t: tokenizer.tokenize(t)[:num_embeddings-2], text_list))
    # convert tokenized text into numeric ids for the appropriate LM
    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
    # append special token "<s>" and </s> to end of sentence
    input_ids = [tokenizer.build_inputs_with_special_tokens(x) for x in input_ids]
    # pad sequences
    input_ids = pad_sequences(input_ids, maxlen=num_embeddings, dtype="long", truncating="post", padding="post")
    return input_ids

def create_attn_masks(input_ids):
    """
    Create attention masks to tell model whether attention should be applied to
    the input id tokens. Do not want to perform attention on padding tokens.
    """
    # Create attention masks
    attention_masks = []

    # Create a mask of 1s for each token followed by 0s for padding
    for seq in input_ids:
        seq_mask = [float(i>0) for i in seq]
        attention_masks.append(seq_mask)
    return attention_masks

train_input_ids = tokenize_inputs(train_text_list, tokenizer, num_embeddings=250)
test_input_ids = tokenize_inputs(test_text_list, tokenizer, num_embeddings=250)

train_attention_masks = create_attn_masks(train_input_ids)
test_attention_masks = create_attn_masks(test_input_ids)

train["features"] = train_input_ids.tolist()
train["masks"] = train_attention_masks

test["features"] = test_input_ids.tolist()
test["masks"] = test_attention_masks


#Adding the right shape to the data frame
lst1 = [0] * 548
lst2 = [0] * 548
lst3 = [0] * 548
lst4 = [0] * 548
lst5 = [0] * 548
lst6 = [0] * 548
i=0
for x in train['cyberbullying_type']:
    if x == 'not_cyberbullying':
        lst1[i] = 1
    if x == 'gender':
        lst2[i] = 1
    if x == 'religion':
        lst3[i] = 1
    if x == 'other_cyberbullying':
        lst4[i] = 1
    if x == 'age':
        lst5[i] = 1
    if x == 'ethnicity':
        lst6[i] = 1
    i+=1
    
train['not_cyberbullying'] = lst1
train['gender'] = lst2
train['religion'] = lst3
train['other_cyberbullying'] = lst4
train['age'] = lst5
train['ethnicity'] = lst6

train, valid = train_test_split(train, test_size=0.2, random_state=42)

X_train = train["features"].values.tolist()
X_valid = valid["features"].values.tolist()

train_masks = train["masks"].values.tolist()
valid_masks = valid["masks"].values.tolist()


label_cols = ['not_cyberbullying', 'gender', 'religion', 'other_cyberbullying','age', 'ethnicity']
Y_train = train[label_cols].values.tolist()
Y_valid = valid[label_cols].values.tolist()

# AM I GOING TO END UP NEEDING MASKS AND SHIT

X_train = torch.tensor(X_train)
X_valid = torch.tensor(X_valid)

Y_train = torch.tensor(Y_train, dtype=torch.float32)
Y_valid = torch.tensor(Y_valid, dtype=torch.float32)

train_masks = torch.tensor(train_masks, dtype=torch.long)
valid_masks = torch.tensor(valid_masks, dtype=torch.long)

# Select a batch size for training
batch_size = 16

# Create an iterator of our data with torch DataLoader. This helps save on 
# memory during training because, unlike a for loop, 
# with an iterator the entire dataset does not need to be loaded into memory

train_data = TensorDataset(X_train, train_masks, Y_train)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data,\
                              sampler=train_sampler,\
                              batch_size=batch_size)

validation_data = TensorDataset(X_valid, valid_masks, Y_valid)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data,\
                                   sampler=validation_sampler,\
                                   batch_size=batch_size)




def train(model, num_epochs,\
          optimizer,\
          train_dataloader, valid_dataloader,\
          model_save_path,\
          train_loss_set=[], valid_loss_set = [],\
          lowest_eval_loss=None, start_epoch=0,\
          device="cpu"
          ):
  """
  Train the model and save the model with the lowest validation loss
  """

  model.to(device)

  # trange is a tqdm wrapper around the normal python range
  for i in trange(num_epochs, desc="Epoch"):
    # if continue training from saved model
    actual_epoch = start_epoch + i

    # Training

    # Set our model to training mode (as opposed to evaluation mode)
    model.train()

    # Tracking variables
    tr_loss = 0
    num_train_samples = 0

    # Train the data for one epoch
    for step, batch in enumerate(train_dataloader):
      # Add batch to GPU
      batch = tuple(t.to(device) for t in batch)
      # Unpack the inputs from our dataloader
      b_input_ids, b_input_mask, b_labels = batch
      # Clear out the gradients (by default they accumulate)
      optimizer.zero_grad()
      # Forward pass
      loss = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
      # store train loss
      tr_loss += loss.item()
      num_train_samples += b_labels.size(0)
      # Backward pass
      loss.backward()
      # Update parameters and take a step using the computed gradient
      optimizer.step()
      #scheduler.step()

    # Update tracking variables
    epoch_train_loss = tr_loss/num_train_samples
    train_loss_set.append(epoch_train_loss)

    print("Train loss: {}".format(epoch_train_loss))

    # Validation

    # Put model in evaluation mode to evaluate loss on the validation set
    model.eval()

    # Tracking variables 
    eval_loss = 0
    num_eval_samples = 0

    # Evaluate data for one epoch
    for batch in valid_dataloader:
      # Add batch to GPU
      batch = tuple(t.to(device) for t in batch)
      # Unpack the inputs from our dataloader
      b_input_ids, b_input_mask, b_labels = batch
      # Telling the model not to compute or store gradients,
      # saving memory and speeding up validation
      with torch.no_grad():
        # Forward pass, calculate validation loss
        loss = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        # store valid loss
        eval_loss += loss.item()
        num_eval_samples += b_labels.size(0)

    epoch_eval_loss = eval_loss/num_eval_samples
    valid_loss_set.append(epoch_eval_loss)

    print("Valid loss: {}".format(epoch_eval_loss))

    if lowest_eval_loss == None:
      lowest_eval_loss = epoch_eval_loss
      # save model
      save_model(model, model_save_path, actual_epoch,\
                 lowest_eval_loss, train_loss_set, valid_loss_set)
    else:
      if epoch_eval_loss < lowest_eval_loss:
        lowest_eval_loss = epoch_eval_loss
        # save model
        save_model(model, model_save_path, actual_epoch,\
                   lowest_eval_loss, train_loss_set, valid_loss_set)
    print("\n")

  return model, train_loss_set, valid_loss_set




def save_model(model, save_path, epochs, lowest_eval_loss, train_loss_hist, valid_loss_hist):
  """
  Save the model to the path directory provided
  """
  model_to_save = model.module if hasattr(model, 'module') else model
  checkpoint = {'epochs': epochs, \
                'lowest_eval_loss': lowest_eval_loss,\
                'state_dict': model_to_save.state_dict(),\
                'train_loss_hist': train_loss_hist,\
                'valid_loss_hist': valid_loss_hist
               }
  torch.save(checkpoint, save_path)
  print("Saving model at epoch {} with validation loss of {}".format(epochs,\
                                                                     lowest_eval_loss))
  return
  
def load_model(save_path):
  """
  Load the model from the path directory provided
  """
  checkpoint = torch.load(save_path)
  model_state_dict = checkpoint['state_dict']
  model = XLNetForMultiLabelSequenceClassification(num_labels=model_state_dict["classifier.weight"].size()[0])
  model.load_state_dict(model_state_dict)

  epochs = checkpoint["epochs"]
  lowest_eval_loss = checkpoint["lowest_eval_loss"]
  train_loss_hist = checkpoint["train_loss_hist"]
  valid_loss_hist = checkpoint["valid_loss_hist"]
  
  return model, epochs, lowest_eval_loss, train_loss_hist, valid_loss_hist

torch.cuda.empty_cache()


#config = XLNetConfig()
        
class XLNetForMultiLabelSequenceClassification(torch.nn.Module):
  
  def __init__(self, num_labels=2):
    super(XLNetForMultiLabelSequenceClassification, self).__init__()
    self.num_labels = num_labels
    self.xlnet = XLNetModel.from_pretrained('xlnet-base-cased')
    self.classifier = torch.nn.Linear(768, num_labels)

    torch.nn.init.xavier_normal_(self.classifier.weight)

  def forward(self, input_ids, token_type_ids=None,\
              attention_mask=None, labels=None):
    # last hidden layer
    last_hidden_state = self.xlnet(input_ids=input_ids,\
                                   attention_mask=attention_mask,\
                                   token_type_ids=token_type_ids)
    # pool the outputs into a mean vector
    mean_last_hidden_state = self.pool_hidden_state(last_hidden_state)
    logits = self.classifier(mean_last_hidden_state)
        
    if labels is not None:
      loss_fct = BCEWithLogitsLoss()
      loss = loss_fct(logits.view(-1, self.num_labels),\
                      labels.view(-1, self.num_labels))
      return loss
    else:
      return logits
    
  def freeze_xlnet_decoder(self):
    """
    Freeze XLNet weight parameters. They will not be updated during training.
    """
    for param in self.xlnet.parameters():
      param.requires_grad = False
    
  def unfreeze_xlnet_decoder(self):
    """
    Unfreeze XLNet weight parameters. They will be updated during training.
    """
    for param in self.xlnet.parameters():
      param.requires_grad = True
    
  def pool_hidden_state(self, last_hidden_state):
    """
    Pool the output vectors into a single mean vector 
    """
    last_hidden_state = last_hidden_state[0]
    mean_last_hidden_state = torch.mean(last_hidden_state, 1)
    return mean_last_hidden_state
    
model = XLNetForMultiLabelSequenceClassification(num_labels=len(Y_train[0]))
#model = torch.nn.DataParallel(model)
#model.cuda()


optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01, correct_bias=False)
#scheduler = WarmupLinearSchedule(optimizer, warmup_steps=num_warmup_steps, t_total=num_total_steps)  # PyTorch scheduler

num_epochs=3








A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train["features"] = train_input_ids.tolist()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train["masks"] = train_attention_masks
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test["features"] = test_input_ids.tolist()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .

In [11]:
#conda install -c pytorch torchvision cudatoolkit=9.0 pytorch
#file = open("Downloads/xlnet_toxic.bin", "wb")


In [12]:
model_save_path = output_model_file = os.path.join("Downloads/xlnet_toxic.bin")
model, train_loss_set, valid_loss_set = train(model=model,\
                                              num_epochs=num_epochs,\
                                              optimizer=optimizer,\
                                              train_dataloader=train_dataloader,\
                                              valid_dataloader=validation_dataloader,\
                                              model_save_path=model_save_path)









Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Train loss: 0.0033424626057754914
Valid loss: 0.0008652640591290864


Epoch:  33%|███▎      | 1/3 [28:05<56:10, 1685.23s/it]

Saving model at epoch 0 with validation loss of 0.0008652640591290864


Train loss: 0.0013551976716201157


Epoch:  67%|██████▋   | 2/3 [51:46<26:46, 1606.04s/it]

Valid loss: 0.0010873947710603137


Train loss: 0.0007083566851257963


Epoch: 100%|██████████| 3/3 [1:13:57<00:00, 1479.03s/it]

Valid loss: 0.0011569286604009737







In [14]:
model_save_path = output_model_file = os.path.join("Downloads/xlnet_toxic.bin")
model, start_epoch, lowest_eval_loss, train_loss_hist, valid_loss_hist = load_model(model_save_path)

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetModel: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [15]:
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01, correct_bias=False)

In [16]:
num_epochs=3
model, train_loss_set, valid_loss_set = train(model=model,\
                                              num_epochs=num_epochs,\
                                              optimizer=optimizer,\
                                              train_dataloader=train_dataloader,\
                                              valid_dataloader=validation_dataloader,\
                                              model_save_path=model_save_path,\
                                              train_loss_set=train_loss_hist,\
                                              valid_loss_set=valid_loss_hist,\
                                              lowest_eval_loss=lowest_eval_loss,\
                                              start_epoch=start_epoch)

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Train loss: 0.0048184811247121455


Epoch:  33%|███▎      | 1/3 [24:58<49:56, 1498.03s/it]

Valid loss: 0.001531270204577595


Train loss: 0.002941035337655244


Epoch:  67%|██████▋   | 2/3 [47:38<24:16, 1456.76s/it]

Valid loss: 0.0034160106019540267


Train loss: 0.0017285100293354255
Valid loss: 0.0002916694465304979


Epoch: 100%|██████████| 3/3 [1:11:31<00:00, 1430.56s/it]

Saving model at epoch 2 with validation loss of 0.0002916694465304979







In [17]:
def generate_predictions(model, df, num_labels, device="cpu", batch_size=32):
  num_iter = math.ceil(df.shape[0]/batch_size)
  
  pred_probs = np.array([]).reshape(0, num_labels)
  
  model.to(device)
  model.eval()
  
  for i in range(num_iter):
    df_subset = df.iloc[i*batch_size:(i+1)*batch_size,:]
    X = df_subset["features"].values.tolist()
    masks = df_subset["masks"].values.tolist()
    X = torch.tensor(X)
    masks = torch.tensor(masks, dtype=torch.long)
    X = X.to(device)
    masks = masks.to(device)
    with torch.no_grad():
      logits = model(input_ids=X, attention_mask=masks)
      logits = logits.sigmoid().detach().cpu().numpy()
      pred_probs = np.vstack([pred_probs, logits])
  
  return pred_probs

In [19]:
num_labels = len(label_cols)
pred_probs = generate_predictions(model, test, num_labels, batch_size=32)
pred_probs
rounded_pred_probs = np.round(pred_probs)

  masks = torch.tensor(masks, dtype=torch.long)


In [20]:
label_cols = ['not_cyberbullying', 'gender', 'religion', 'other_cyberbullying','age', 'ethnicity']

test['not_cyberbullying'] = rounded_pred_probs[:,0]
test['gender'] = rounded_pred_probs[:,1]
test['religion'] = rounded_pred_probs[:,2]
test['other_cyberbullying'] = rounded_pred_probs[:,3]
test['age'] = rounded_pred_probs[:,4]
test['ethnicity'] = rounded_pred_probs[:,5]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['not_cyberbullying'] = rounded_pred_probs[:,0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['gender'] = rounded_pred_probs[:,1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['religion'] = rounded_pred_probs[:,2]
A value is trying to be set on a copy of a slice from a DataFrame.
Tr

In [21]:
y_true = test['cyberbullying_type']
y_pred = []

In [23]:
test.head()

Unnamed: 0.1,Unnamed: 0,tweet_text,cyberbullying_type,lang,score,y,tweet_length,features,masks,not_cyberbullying,gender,religion,other_cyberbullying,age,ethnicity
233,17365,"If you're a Christian, &amp;you're ok w/voting...",religion,en,0.714285,2,288,"[108, 44, 26, 88, 24, 31747, 19, 1123, 1701, 9...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",0.0,0.0,1.0,0.0,0.0,0.0
678,43635,You thought this was a read....however the boo...,ethnicity,en,0.999995,5,321,"[44, 449, 52, 30, 24, 828, 9, 9, 9, 9, 7336, 3...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",0.0,0.0,0.0,0.0,0.0,1.0
661,42528,“all this hatred” are u kidding... white ppl h...,ethnicity,en,0.85714,5,288,"[221, 2225, 52, 13184, 407, 41, 17, 660, 19880...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",0.0,0.0,0.0,0.0,0.0,1.0
135,14935,You are proud to hindu but you are not even sa...,religion,en,0.999997,2,286,"[44, 41, 4298, 22, 20187, 660, 57, 44, 41, 50,...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",0.0,0.0,1.0,0.0,0.0,0.0
8,6999,You can either choose to host a fascist who is...,gender,en,0.999997,1,284,"[44, 64, 725, 1573, 22, 2057, 24, 17, 23476, 6...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",0.0,1.0,0.0,0.0,0.0,0.0


In [32]:
for row in test.iterrows():
    if row[1]['not_cyberbullying'] == 1.0:
        y_pred.append('not_cyberbullying')
    if row[1]['gender'] == 1.0:
        y_pred.append('gender')
    if row[1]['religion'] == 1.0:
        y_pred.append('religion')
    if row[1]['other_cyberbullying'] == 1.0:
        y_pred.append('other_cyberbullying')
    if row[1]['age'] == 1.0:
        y_pred.append('age')
    if row[1]['ethnicity'] == 1.0:
        y_pred.append('ethnicity')
    

In [46]:
y_true = y_true.to_list()

In [49]:
len(y_pred)
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [51]:
print(classification_report(y_true, y_pred, labels=['not_cyberbullying', 'gender', 'religion', 'other_cyberbullying','age', 'ethnicity']))

                     precision    recall  f1-score   support

  not_cyberbullying       0.00      0.00      0.00         0
             gender       0.95      1.00      0.97        19
           religion       1.00      1.00      1.00        65
other_cyberbullying       1.00      1.00      1.00         1
                age       1.00      1.00      1.00        36
          ethnicity       1.00      0.94      0.97        16

          micro avg       0.99      0.99      0.99       137
          macro avg       0.83      0.82      0.82       137
       weighted avg       0.99      0.99      0.99       137



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
