In [None]:
# Mount Google Drive into the Colab filesystem at /content/gdrive.
# A later cell writes the model checkpoint under '/content/gdrive/My Drive/'.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
! pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification

modelName = "roberta-base"
# Load RoBERTa together with a 3-way classification head. The plain `AutoModel`
# returns only the encoder: it has no head, silently ignores `num_labels`, does
# not accept `labels=` and its output has no `.logits` / `.loss`, all of which
# the validation cell below relies on.
model = AutoModelForSequenceClassification.from_pretrained(
    modelName,
    num_labels=3,  # stance classes 0, 1, 2
)
tokenizer = AutoTokenizer.from_pretrained(modelName)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Example of how the tokenizer works: two strings are passed to `tokenize`, and
# the displayed result below is one flat token list covering both of them.
tokens = tokenizer.tokenize("Hello bert model stance","Hello apple")
# Map each token string to its integer vocabulary id.
tokenIds = tokenizer.convert_tokens_to_ids(tokens)

In [None]:
# Display the token strings produced by the tokenizer example.
tokens

['Hello', 'Ġb', 'ert', 'Ġmodel', 'Ġstance', 'Hello', 'Ġapple']

In [None]:
import pandas as pd
import numpy as np
import torch 

In [None]:
# Load the stance dataset and shuffle its rows. The shuffle is seeded so that a
# "Restart & Run All" reproduces the same row order (and the same preview).
df = pd.read_csv('./stance_all_1_4.csv')
df = df.sample(frac = 1, random_state = 42)
df.head(10)

Unnamed: 0.1,Unnamed: 0,Body ID,articleBody,Headline,Stance
7025,3004,176,Hunky mensch who took down violent bully looks...,'Banksy' Reacts To Paris Attack With Poignant ...,2
3652,22555,1303,Description: Fake news / Satire\nCirculating s...,"L. Jinny? Abdel-Majed Abdel Bary, UK Rapper, S...",2
7995,21044,1245,YouTuber Josh Paler Lin is normally a prankste...,Eyewitness Says Viral Video of Homeless Man Wa...,0
9383,3274,195,A video posted by ISIL terrorists in Iraq purp...,Attorney: New audio reveals pause in gunfire w...,2
2368,9744,592,Warning: graphic image below\n\nA 22-year-old ...,Report: Taliban Detainee Swapped for Bowe Berg...,2
4805,40554,2130,"Vice founder Shane Smith, with something that ...",Obama: murder of James Foley 'shocks the consc...,2
9639,7134,444,Hospital authorities are carrying out an inves...,Rumor: Gold Apple Watch Edition priced up to $...,2
7954,31024,1728,A Twitter account associating itself with Fox ...,The @FoxNewsPress Account Tweeting Lawsuit Thr...,1
7090,27458,1545,Macaulay Culkin has once again died — at least...,Apple 'working on 12-inch MacBook Air',2
1289,28812,1610,"Ahmed Abdi Godane — the leader of al Shabab, t...",Pentagon: Airstrike killed terror leader in So...,1


In [None]:
# Distinct label values in the Stance column (the output shows 2, 0 and 1).
df['Stance'].unique()

array([2, 0, 1])

In [None]:
# Number of rows loaded from the CSV.
len(df)

10000

In [None]:
# Pull the text columns and the label column out of the frame as NumPy arrays.
articleBodies, headlines, stances = (
    df[column].values for column in ("articleBody", "Headline", "Stance")
)

In [None]:
# Maximum number of tokens per (headline, article body) pair after truncation.
MAX_SEQ_LEN = 256

# Encode every (headline, article body) pair in one batched call.
# The two texts are passed explicitly as `text` / `text_pair`; handing the
# tokenizer a 2-element list in the `text` slot is documented as "pre-tokenized
# words", not as a sentence pair. `padding='max_length'` replaces the
# deprecated `pad_to_max_length=True`.
encoded = tokenizer(
    headlines.tolist(),
    articleBodies.tolist(),
    add_special_tokens=True,      # add the model's special tokens
    max_length=MAX_SEQ_LEN,
    truncation=True,              # cut pairs longer than max_length
    padding='max_length',         # pad shorter pairs up to max_length
    return_attention_mask=True,   # 1 = real token, 0 = padding
    return_tensors='pt',          # return PyTorch tensors
)

# Both tensors have one row per example and MAX_SEQ_LEN columns.
input_ids = encoded['input_ids']
attention_masks = encoded['attention_mask']
stances = torch.tensor(stances)

assert input_ids.shape == (len(headlines), MAX_SEQ_LEN)
assert attention_masks.shape == input_ids.shape
assert len(stances) == len(headlines)

# Print sentence 0, now as a list of IDs.
print('Headline: ', headlines[0])
print('Article Body: ',articleBodies[0])
print('Token IDs:', input_ids[0])



Headline:  'Banksy' Reacts To Paris Attack With Poignant Drawing
Article Body:  Hunky mensch who took down violent bully looks like Paul Rudd, is not Paul Rudd

Sorry, amateur celebrity sleuths — the actor Paul Rudd was not involved in the tackling of a man shouting homophobic slurs in a Dallas airport last week.

A man who looked almost exactly like the This Is 40 star was caught on video helping to tackle a man who yelled homophobic slurs and kicked another man waiting in Dallas-Fort Worth International Airport last week.

After the man became violent, a group of people swarmed him and brought him to the ground, with the help of a wavy-haired hunk in a checkered shirt who looks a lot like Paul Rudd. The video of the incident quickly went viral, amid rumors that the Hollywood funnyman was involved.

But Paul Rudd’s rep confirmed to TIME on Monday afternoon that the man with a noble heart, quick reflexes, and a strong sense of social outrage is not Paul Rudd.

Ladies: That means there 

In [None]:
from torch.utils.data import TensorDataset, random_split
# Combine the training inputs into a TensorDataset.
# Each item is the triple (input_ids row, attention_mask row, stance label),
# in that order; the training and validation loops index batches as 0, 1, 2.
dataset = TensorDataset(input_ids, attention_masks, stances)

In [None]:
# 70% of the examples go to training, the remaining 30% to testing.
train_size = int(0.7*len(dataset))
test_size = len(dataset)-train_size

# Split with a seeded generator so the train/test membership is identical on
# every run; without it each kernel restart evaluates on a different test set.
train_dataset, test_dataset = random_split(
    dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(42),
)
assert len(train_dataset) + len(test_dataset) == len(dataset)

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

BATCH_SIZE = 16

# Training batches are drawn in random order.
train_dataloader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),
    batch_size=BATCH_SIZE,
)
# Evaluation does not benefit from shuffling; a sequential sampler keeps the
# per-batch output stable between runs.
test_dataloader = DataLoader(
    test_dataset,
    sampler=SequentialSampler(test_dataset),
    batch_size=BATCH_SIZE,
)


In [None]:
# Peek at one training batch: [input_ids, attention_masks, stance labels].
itr = iter(train_dataloader)
# Use the built-in next(); iterators are not guaranteed to expose a `.next()`
# method in Python 3.
data = next(itr)
print(data)

[tensor([[    0, 37589, 16371,  ...,  3533,    21,     2],
        [    0,  5532,  3091,  ..., 50118,   133,     2],
        [    0, 37142,    18,  ..., 10652, 17918,     2],
        ...,
        [    0, 34052,   611,  ...,    49,  1420,     2],
        [    0,   104, 20115,  ...,  9072,    41,     2],
        [    0,  7629,    29,  ..., 14631,  1464,     2]]), tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), tensor([2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2])]


In [None]:
# optimizer
# Use PyTorch's AdamW: the copy that used to live in `transformers` is
# deprecated and no longer importable from recent releases.
# weight_decay=0.0 keeps the default of the transformers implementation
# (torch's own default is 0.01).
from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8, weight_decay=0.0)
epochs = 2



In [None]:
def flat_accuracy(preds, labels):
    """Return the fraction of rows in `preds` whose arg-max equals the label.

    `preds` is indexed as (example, class) because the arg-max is taken along
    axis 1; `labels` is flattened before the comparison.
    """
    predicted = np.argmax(preds, axis=1).flatten()
    truth = labels.flatten()
    hits = (predicted == truth).sum()
    return hits / truth.size

In [None]:
# Number of examples in the training split.
print(len(train_dataset))
# Device that every batch is moved to in the training and validation loops.
device = torch.device('cpu')

7000


In [None]:
# Class balance of the labels; the output below shows that class 2 dominates.
df.Stance.value_counts()

2    8863
1     914
0     223
Name: Stance, dtype: int64

In [None]:
# Number of batches in one pass over the training data.
len(train_dataloader)

438

In [None]:
import torch.nn as nn
import torch.nn.functional as F

In [None]:
# Stance head, loss and fine-tuning loop (kept in one cell so the head, the
# loss and the loop that uses them can never get out of sync).

NUM_STANCES = 3          # labels are 0, 1 and 2
MAX_STEPS_PER_EPOCH = 3  # smoke-test cap; set to None to run full epochs

# Classification head applied to the mean-pooled encoder output. It is built
# ONCE here: creating layers inside the training step would give them fresh
# random weights on every batch, so they could never learn anything.
classifier = nn.Sequential(
    nn.Dropout(),
    nn.Linear(model.config.hidden_size, model.config.hidden_size),
    nn.ReLU(),
    nn.Dropout(),
    nn.Linear(model.config.hidden_size, NUM_STANCES),
).to(device)

# Multi-class targets need cross-entropy over NUM_STANCES logits; a single
# logit with a binary loss cannot represent three classes.
loss_func = nn.CrossEntropyLoss(reduction='mean')

# The optimizer was created from model.parameters() only, so register the head.
optimizer.add_param_group({'params': list(classifier.parameters())})
trainable_params = list(model.parameters()) + list(classifier.parameters())

for epoch_i in range(epochs):
    model.train()
    classifier.train()
    print(epoch_i)
    epoch_loss = 0.0  # plain float: accumulating tensors would retain every graph
    steps_run = 0

    for step, batch in enumerate(train_dataloader):
        if MAX_STEPS_PER_EPOCH is not None and step >= MAX_STEPS_PER_EPOCH:
            break
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        optimizer.zero_grad()
        output = model(input_ids=b_input_ids, attention_mask=b_input_mask)

        if hasattr(output, 'logits'):
            # `model` already carries its own classification head.
            logits = output.logits
        else:
            # Bare encoder: average the token vectors, counting only real
            # (non-padding) tokens, then apply the head defined above.
            hidden = output.last_hidden_state
            mask = b_input_mask.unsqueeze(-1).to(hidden.dtype)
            pooled_output = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
            logits = classifier(pooled_output)

        # calculate loss: logits are (batch, NUM_STANCES), labels are class ids
        loss = loss_func(logits, b_labels)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(trainable_params, 1.0)
        optimizer.step()

        step_loss = loss.item()
        epoch_loss += step_loss
        steps_run += 1
        print("epoch =", epoch_i, "step =", step, "loss =", step_loss)

    print("Total loss in epoch =", epoch_loss)
    # Average over the steps that actually ran, not over the full loader length.
    print("avg loss in epoch", epoch_i, " =", epoch_loss / max(steps_run, 1))


0
epoch = 0 step = 0 tensor([[ 9.5538e-02],
        [ 4.1897e-01],
        [-1.0487e-03],
        [ 2.4889e-02],
        [ 1.1356e-01],
        [-3.5705e-01],
        [-3.1098e-04],
        [ 1.5651e-02],
        [ 2.5249e-02],
        [ 1.7481e-01],
        [-4.0313e-01],
        [ 1.6982e-01],
        [-1.9781e-02],
        [-1.1296e-01],
        [ 3.3837e-02],
        [-3.7252e-02]], grad_fn=<AddmmBackward0>)
<class 'torch.Tensor'>


AttributeError: ignored

In [None]:
# Write the weights held in `model` (its state_dict only) to the mounted Drive.
# NOTE(review): the file name says "bert" although modelName is "roberta-base",
# and the separately defined `classifier` layer is not part of
# model.state_dict() - confirm both are intended.
model_save_name="bert-stance-model.pt"
path = f'/content/gdrive/My Drive/{model_save_name}'
torch.save(model.state_dict(),path)

In [None]:
# Also store the tokenizer and the model via save_pretrained, both into the
# relative directory 'saved'.
save_directory = 'saved'
tokenizer.save_pretrained(save_directory)
model.save_pretrained(save_directory)

In [None]:
#Define a helper function for calculating accuracy.
import numpy as np

def validation_accuracy(orig, pred):
    """Compare true labels `orig` with predicted labels `pred`.

    Label ids are read as 1 = agree, 0 = disagree, 2 = unrelated.

    Returns a 7-item list, in this order:
        [overall accuracy,
         agree accuracy, disagree accuracy,
         number of agree examples, number of disagree examples,
         unrelated accuracy, number of unrelated examples]

    A class with no examples in `orig` reports an accuracy of 1.
    Raises ZeroDivisionError when `orig` is empty.
    """
    sample_count = len(orig)
    overall_hits = 0
    # Per-class tallies, positioned by class id (0, 1, 2).
    class_hits = [0, 0, 0]
    class_seen = [0, 0, 0]

    for idx in range(sample_count):
        truth, guess = orig[idx], pred[idx]
        if truth == guess:
            overall_hits += 1
        for class_id in (0, 1, 2):
            if truth == class_id:
                class_seen[class_id] += 1
                if guess == class_id:
                    class_hits[class_id] += 1

    def class_accuracy(class_id):
        # An absent class counts as fully correct.
        if class_seen[class_id] == 0:
            return 1
        return class_hits[class_id] / class_seen[class_id]

    return [
        overall_hits / sample_count,
        class_accuracy(1),
        class_accuracy(0),
        class_seen[1],
        class_seen[0],
        class_accuracy(2),
        class_seen[2],
    ]


In [None]:
import time
import datetime

def format_time(elapsed):
    """Render a duration given in seconds as a timedelta string (h:mm:ss).

    The value is rounded to whole seconds before formatting.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
## validation ##

import torch.nn.functional as F


def _ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float("nan")


t0 = time.time()
model.eval()

# Running totals over the WHOLE test set. Counts are accumulated (not per-batch
# ratios), so every final figure is "correct predictions / examples".
n_examples = 0
n_correct = 0
agree_correct = 0
disagree_correct = 0
unrelated_correct = 0
total_agree = 0
total_disagree = 0
total_unrelated = 0
total_eval_loss = 0.0
n_batches = 0

# Evaluate data for one epoch
for step, batch in enumerate(test_dataloader):

    # Unpack this batch from our dataloader.
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)

    with torch.no_grad():
        print('step =', step)
        # Passing `labels=` and reading `.logits` / `.loss` requires a model
        # that has a sequence-classification head.
        output = model(b_input_ids,
                       attention_mask=b_input_mask,
                       labels=b_labels)
        predictions = F.softmax(output.logits, dim=1)
        labels = torch.argmax(predictions, dim=1)
        print("original labels =", b_labels)
        print("labels =", labels)

    # validation_accuracy returns
    # [acc, agree_acc, disagree_acc, n_agree, n_disagree, unrelated_acc, n_unrelated].
    # Multiplying an accuracy by its example count recovers the number of
    # correct predictions (an absent class has count 0, so it contributes 0).
    batch_len = len(b_labels)
    acc = validation_accuracy(b_labels.tolist(), labels.tolist())
    print("Accuracy for batch", step+1, " =", acc[0])

    n_examples += batch_len
    n_correct += round(acc[0] * batch_len)
    agree_correct += round(acc[1] * acc[3])
    disagree_correct += round(acc[2] * acc[4])
    unrelated_correct += round(acc[5] * acc[6])
    total_agree += acc[3]
    total_disagree += acc[4]
    total_unrelated += acc[6]

    # Accumulate the validation loss.
    l = output.loss.item()
    print("loss =", l)
    total_eval_loss += l
    n_batches += 1

    print()
    print()


# Report the final accuracy for this validation run.
avg_val_accuracy = _ratio(n_correct, n_examples)
print("Accuracy: {0:.2f}".format(avg_val_accuracy))

# Report the final agree accuracy for this validation run.
avg_agree_accuracy = _ratio(agree_correct, total_agree)
print("Average Agree Accuracy: {0:.2f}".format(avg_agree_accuracy))

# Report the final disagree accuracy for this validation run.
avg_disagree_accuracy = _ratio(disagree_correct, total_disagree)
print("Average Disagree Accuracy: {0:.2f}".format(avg_disagree_accuracy))

# Report the final unrelated accuracy for this validation run.
avg_unrelated_accuracy = _ratio(unrelated_correct, total_unrelated)
print("Average Unrelated Accuracy: {0:.2f}".format(avg_unrelated_accuracy))

# Calculate the average loss over all of the batches.
avg_val_loss = _ratio(total_eval_loss, n_batches)
print("Average validation loss =", avg_val_loss)

# Measure how long the validation run took.
validation_time = format_time(time.time() - t0)
print("Validation took: {:}".format(validation_time))
