In [1]:
import torch
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, RobertaForSequenceClassification, XLNetModel, XLNetTokenizer, RobertaTokenizer, RobertaModel, AdamW, get_linear_schedule_with_warmup, DataCollatorWithPadding
from torch.nn import BCEWithLogitsLoss
from sklearn.metrics import f1_score, accuracy_score
from imblearn.under_sampling import RandomUnderSampler
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
#!pip install evaluate

In [2]:
#df = pd.read_csv("/kaggle/input/essays1/essays.csv", encoding='latin1')
df = pd.read_csv("essays.csv", encoding='latin1')

df.describe()

Unnamed: 0,#AUTHID,TEXT,cEXT,cNEU,cAGR,cCON,cOPN
count,2468,2468,2468,2468,2468,2468,2468
unique,2468,2468,2,2,2,2,2
top,1997_504851.txt,"Well, right now I just woke up from a mid-day ...",y,n,y,y,y
freq,1,1,1277,1235,1310,1254,1272


In [3]:
encoder = LabelEncoder()
df['cEXT'] = encoder.fit_transform(df['cEXT'])
df['cNEU'] = encoder.fit_transform(df['cNEU'])
df['cAGR'] = encoder.fit_transform(df['cAGR'])
df['cCON'] = encoder.fit_transform(df['cCON'])
df['cOPN'] = encoder.fit_transform(df['cOPN'])

In [4]:
df.head()

Unnamed: 0,#AUTHID,TEXT,cEXT,cNEU,cAGR,cCON,cOPN
0,1997_504851.txt,"Well, right now I just woke up from a mid-day ...",0,1,1,0,1
1,1997_605191.txt,"Well, here we go with the stream of consciousn...",0,0,1,0,0
2,1997_687252.txt,An open keyboard and buttons to push. The thin...,0,1,0,1,1
3,1997_568848.txt,I can't believe it! It's really happening! M...,1,0,1,1,0
4,1997_688160.txt,"Well, here I go with the good old stream of co...",1,0,1,0,1


In [5]:
cols = df.columns
label_cols = list(cols[2:])
num_labels = len(label_cols)
print('Label columns: ', label_cols)

Label columns:  ['cEXT', 'cNEU', 'cAGR', 'cCON', 'cOPN']


In [6]:
df['one_hot_labels'] = list(df[label_cols].values)
df.head()

Unnamed: 0,#AUTHID,TEXT,cEXT,cNEU,cAGR,cCON,cOPN,one_hot_labels
0,1997_504851.txt,"Well, right now I just woke up from a mid-day ...",0,1,1,0,1,"[0, 1, 1, 0, 1]"
1,1997_605191.txt,"Well, here we go with the stream of consciousn...",0,0,1,0,0,"[0, 0, 1, 0, 0]"
2,1997_687252.txt,An open keyboard and buttons to push. The thin...,0,1,0,1,1,"[0, 1, 0, 1, 1]"
3,1997_568848.txt,I can't believe it! It's really happening! M...,1,0,1,1,0,"[1, 0, 1, 1, 0]"
4,1997_688160.txt,"Well, here I go with the good old stream of co...",1,0,1,0,1,"[1, 0, 1, 0, 1]"


In [7]:
labels = list(df.one_hot_labels.values)
text = list(df.TEXT.values)

In [None]:
import re
import string
from bs4 import BeautifulSoup
import nltk
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import spacy

def clean_html(html):

    # parse html content
    soup = BeautifulSoup(html, "html.parser")

    for data in soup(['style', 'script', 'code', 'a']):
        # Remove tags
        data.decompose()

    # return data by retrieving the tag content
    return ' '.join(soup.stripped_strings)

# Load spacy
nlp = spacy.load('en_core_web_sm')

def clean_string(text, stem="None"):

    final_string = ""

    # Make lower
    text = text.lower()

    text = re.sub(r'\n', '', text)

    # Remove punctuation
    translator = str.maketrans('', '', string.punctuation)
    text = text.translate(translator)

    # Remove stop words
    text = text.split()
    useless_words = nltk.corpus.stopwords.words("english")
    useless_words = useless_words + ['hi', 'im']

    text_filtered = [word for word in text if not word in useless_words]

    # Remove numbers
    text_filtered = [re.sub(r'\w*\d\w*', '', w) for w in text_filtered]

    # Stem or Lemmatize
    if stem == 'Stem':
        stemmer = PorterStemmer() 
        text_stemmed = [stemmer.stem(y) for y in text_filtered]
    elif stem == 'Lem':
        lem = WordNetLemmatizer()
        text_stemmed = [lem.lemmatize(y) for y in text_filtered]
    elif stem == 'Spacy':
        text_filtered = nlp(' '.join(text_filtered))
        text_stemmed = [y.lemma_ for y in text_filtered]
    else:
        text_stemmed = text_filtered

    final_string = ' '.join(text_stemmed)

    return final_string

In [None]:
print("started cleaning text...")
text = [clean_string(t, stem='Spacy') for t in text]
print("finished cleaning text")

In [8]:
train_text, test_text, train_labels, test_labels= train_test_split(text, labels, random_state=42, test_size=0.30)
test_text, val_text, test_labels, val_labels = train_test_split(test_text, test_labels, random_state=42, test_size=0.50)
print(len(train_text), len(val_text), len(test_text))

1727 371 370


In [24]:
class TextClassificationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, tokenizer2):
            self.texts = texts
            self.labels = labels
            self.tokenizer = tokenizer
            self.tokenizer2 = tokenizer2
    def __len__(self):
        return len(self.texts)
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(text, return_tensors='pt', truncation=True, max_length = 512, padding='max_length')
        encoding2 = self.tokenizer2(text, return_tensors='pt', truncation=True, max_length = 512, padding='max_length')
        
        return {'input_ids': encoding['input_ids'].flatten(), 'attention_mask': encoding['attention_mask'].flatten(), 'input_ids2' : encoding2['input_ids'].flatten() ,'labels': torch.tensor(label)}

In [25]:
batch_size = 4
checkpoint = "FacebookAI/roberta-base"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer2 = AutoTokenizer.from_pretrained("xlnet-base-cased")

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

train_dataset = TextClassificationDataset(train_text, train_labels, tokenizer, tokenizer2)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=data_collator)

validation_dataset = TextClassificationDataset(val_text, val_labels, tokenizer, tokenizer2)
validation_dataloader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=True, collate_fn=data_collator)

test_dataset = TextClassificationDataset(test_text, test_labels, tokenizer, tokenizer2)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, collate_fn=data_collator)



In [11]:
batch = next(iter(train_dataloader))
print({k: v.shape for k, v in batch.items()})

{'input_ids': torch.Size([4, 512]), 'attention_mask': torch.Size([4, 512]), 'input_ids2': torch.Size([4, 512]), 'labels': torch.Size([4, 5])}


In [14]:
import torch.nn as nn

class Roberta_XLNetForSequenceRegression(nn.Module):
  def __init__(self, output_size):
    super(Roberta_XLNetForSequenceRegression, self).__init__()
    self.roberta = RobertaModel.from_pretrained("FacebookAI/roberta-base",
                                          output_attentions = False,
                                          output_hidden_states = False)
    self.xlnet = XLNetModel.from_pretrained("xlnet-base-cased")
    self.roberta_tokenizer = RobertaTokenizer.from_pretrained("FacebookAI/roberta-base")
    self.xlnet_tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
    self.out = nn.Linear(self.xlnet.config.hidden_size, output_size)
    
  def forward(self, input_ids, attention_mask, input_ids2):
    mid_hidden_states = self.roberta(input_ids=input_ids, attention_mask=attention_mask).pooler_output
        
    last_hidden_states = self.xlnet(input_ids=input_ids2, attention_mask=attention_mask, encoder_hidden_states=mid_hidden_states).last_hidden_state
    last_hidden_states = torch.mean(last_hidden_states, 1)
    
    output = self.out(last_hidden_states)
    return output

  


model = Roberta_XLNetForSequenceRegression(output_size=5)

Some weights of RobertaModel were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], input_ids2=batch['input_ids2'])

tensor([[ 1.6253, -2.6561,  1.2477,  3.5390, -2.5981],
        [ 2.5704, -2.1949,  1.5665,  4.2422, -2.6152],
        [ 2.9960, -2.3802,  1.8910,  4.4281, -3.2821],
        [ 2.5288, -2.4503,  1.8736,  4.4232, -2.9246]],
       grad_fn=<AddmmBackward0>)

In [16]:
torch.cuda.is_available()

True

In [17]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Roberta_XLNetForSequenceRegression(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
           

In [18]:
epochs = 5
num_labels = 5

def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)
optimizer = AdamW(model.parameters(), lr = 1e-4)    
total_steps = len(train_dataloader) * epochs
lr_scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0.1 * total_steps, num_training_steps = total_steps)



In [None]:
from tqdm.auto import tqdm
from tqdm import trange
import evaluate


train_loss_set = []
val_loss_set = []

progress_bar = tqdm(range(total_steps))

accuracy_metric = evaluate.load("accuracy")
best_val_loss = float('inf')
early_stopping_patience = 2


for epoch_i in trange(epochs, desc="Epoch"):

  # Training
  
  model.train()

  tr_loss = 0 
  nb_tr_examples, nb_tr_steps = 0, 0
  step = 0
  
  for batch in train_dataloader:

    if step % 40 == 0 and not step == 0:            
      print('  Batch {:>5,}  of  {:>5,}. in epoch {:>5,}'.format(step, len(train_dataloader), epoch_i))
    step+=1

    batch = {k: v.to(device) for k, v in batch.items()}
    b_input_ids = batch['input_ids']
    b_input_mask = batch['attention_mask']
    b_labels = batch['labels']
    optimizer.zero_grad()

    # Forward pass for multilabel classification
    outputs = model(b_input_ids, attention_mask=b_input_mask)
    logits = outputs
    loss = loss_fn(logits.view(-1,num_labels), b_labels.type_as(logits).view(-1,num_labels)) #convert labels to float for calculation
    train_loss_set.append(loss.item())    

    
    loss.backward()
    optimizer.step()
    lr_scheduler.step()

    progress_bar.update(1)
    
    tr_loss += loss.item()
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1

  print("Train loss: {}".format(tr_loss/nb_tr_steps))

###############################################################################

  # Validation

  model.eval()

  logit_preds,true_labels,pred_labels,tokenized_texts = [],[],[],[]
  val_loss = 0
  nb_val_steps, nb_val_examples = 0, 0
  eval_steps = 0

  # Predict
  for batch in validation_dataloader:

    if eval_steps % 40 == 0 and not eval_steps == 0:
      print(' Eval Batch {:>5,}  of  {:>5,}.'.format(eval_steps, len(validation_dataloader)))
    eval_steps +=1

    batch = {k: v.to(device) for k, v in batch.items()}
    b_input_ids = batch['input_ids']
    b_input_mask = batch['attention_mask']
    b_labels = batch['labels']
    
    with torch.no_grad():
      # Forward pass
      outs = model(b_input_ids, attention_mask=b_input_mask)
      b_logit_pred = outs
      pred_label = torch.sigmoid(b_logit_pred)

      loss = loss_fn(outs.view(-1,num_labels), b_labels.type_as(outs).view(-1,num_labels)) #convert labels to float for calculation
      val_loss_set.append(loss.item())

      b_logit_pred = outs
      pred_label = torch.sigmoid(b_logit_pred)
      predictions = (pred_label >= 0.5).int().reshape(-1)
    
      accuracy_metric.add_batch(predictions=predictions, references=b_labels.int().reshape(-1))

      # b_logit_pred = b_logit_pred.detach().cpu().numpy()
      # pred_label = pred_label.to('cpu').numpy()
      # b_labels = b_labels.to('cpu').numpy() 

    val_loss += loss.item()
    nb_val_examples += b_input_ids.size(0)
    nb_val_steps += 1

    tokenized_texts.append(b_input_ids)
    logit_preds.append(b_logit_pred)
    true_labels.append(b_labels)
    pred_labels.append(pred_label)
  
  print("Validation loss: {}".format(val_loss/nb_val_steps))

  # # Flatten outputs
  # pred_labels = [item for sublist in pred_labels for item in sublist]
  # true_labels = [item for sublist in true_labels for item in sublist]

  # # Calculate Accuracy
  # threshold = 0.50
  # pred_bools = [pl>threshold for pl in pred_labels]
  # true_bools = [tl==1 for tl in true_labels]
  # val_f1_accuracy = f1_score(true_bools,pred_bools,average='micro')*100
  # val_flat_accuracy = accuracy_score(true_bools, pred_bools)*100
  avg_val_loss = val_loss/nb_val_steps
  if avg_val_loss < best_val_loss:
    best_val_loss = avg_val_loss
    epochs_without_improvement = 0
    # Save the best model
    torch.save(model.state_dict(), "/kaggle/working/big-reddit-roberta-model.pth")
  else:
    epochs_without_improvement += 1
    if epochs_without_improvement >= early_stopping_patience:
      print(f'Early stopping triggered after {epoch_i} epochs without improvement.')
      break

  val_flat_accuracy = accuracy_metric.compute()

  # print('F1 Validation Accuracy: ', val_f1_accuracy)
  print('Validation Accuracy: ', val_flat_accuracy)

In [26]:
import evaluate

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

accuracy_metric = evaluate.load("accuracy")
model.eval()
step = 0
for batch in test_dataloader:
    
    if step % 40 == 0 and not step == 0:
        print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(test_dataloader)))
    step+=1
    
    batch = {k: v.to(device) for k, v in batch.items()}
    b_input_ids = batch['input_ids']
    b_input_mask = batch['attention_mask']
    b_labels = batch['labels']
    with torch.no_grad():
        outs = model(b_input_ids, attention_mask=b_input_mask, input_ids2=batch['input_ids2'])
    b_logit_pred = outs
    pred_label = torch.sigmoid(b_logit_pred)
    predictions = (pred_label >= 0.5).int().reshape(-1)
    
    accuracy_metric.add_batch(predictions=predictions, references=b_labels.int().reshape(-1))

accuracy_metric.compute()

  Batch    40  of     93.
  Batch    80  of     93.


{'accuracy': 0.5097297297297297}