In [1]:
# Install the third-party packages imported in the next cell (transformers, emoji).
# No versions are pinned, so a fresh run picks up whatever release is current.
! pip install transformers
! pip install emoji



In [2]:
import json
import os
import pandas as pd
import numpy as np
import re
import emoji
from pandas.core.frame import DataFrame
import torch
from transformers import AutoModel, AutoTokenizer
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

In [3]:
def getData(data_path, label_path, data_obejct_path):
  """Build a labelled DataFrame from an id file, a label file and per-id JSON files.

  Every line of ``data_path`` is a comma-separated list of ids. For each id the
  file ``data_obejct_path + str(id) + '.json'`` is parsed and its ``'text'``
  field collected; the texts belonging to one line are joined with single
  spaces into one document.

  Every line of ``label_path`` is turned into a *string* label:
  ``"rumour"`` -> ``'1'`` and ``"nonrumour"`` -> ``'0'``. Lines holding anything
  else are skipped.

  Args:
    data_path: path of the text file listing ids, one group per line.
    label_path: path of the text file listing one label per line.
    data_obejct_path: prefix (directory path including the trailing separator)
      that is concatenated with ``<id>.json`` to locate each JSON file.

  Returns:
    DataFrame with the columns ``"text"`` and ``"label"``.
  """
  data_texts = []
  # Context managers guarantee that every file handle is closed, including
  # the per-id JSON files that are opened once per id.
  with open(data_path, 'r') as data_ids:
    for line in data_ids:
      ids = line.strip('\n').split(',')
      texts = []
      for tweet_id in ids:
        with open(data_obejct_path + str(tweet_id) + '.json', 'r') as f:
          texts.append(json.load(f)['text'])
      data_texts.append(' '.join(texts))

  # Labels are stored as the strings '1' / '0', not as integers.
  label_codes = {"rumour": '1', "nonrumour": '0'}
  data_num_labels = []
  with open(label_path, 'r') as data_labels:
    for line in data_labels:
      label = line.strip('\n')
      if label in label_codes:
        data_num_labels.append(label_codes[label])

  return DataFrame({"text":data_texts, "label":data_num_labels})


def getDataForTest(data_path, data_obejct_path):
  """Build an unlabelled DataFrame from an id file and per-id JSON files.

  Every line of ``data_path`` is a comma-separated list of ids. For each id the
  file ``data_obejct_path + str(id) + '.json'`` is parsed and its ``'text'``
  field collected; the texts belonging to one line are joined with single
  spaces into one document.

  Args:
    data_path: path of the text file listing ids, one group per line.
    data_obejct_path: prefix (directory path including the trailing separator)
      that is concatenated with ``<id>.json`` to locate each JSON file.

  Returns:
    DataFrame with the single column ``"text"``, one row per input line.
  """
  data_texts = []
  # Context managers guarantee that the id list and every JSON file are closed.
  with open(data_path, 'r') as data_ids:
    for line in data_ids:
      ids = line.strip('\n').split(',')
      texts = []
      for tweet_id in ids:
        with open(data_obejct_path + str(tweet_id) + '.json', 'r') as f:
          texts.append(json.load(f)['text'])
      data_texts.append(' '.join(texts))

  return DataFrame({"text":data_texts})


def covert_to_label(probs):
  """Threshold a sequence of probabilities into 0/1 labels.

  Each element of ``probs`` must be indexable; only its first entry is looked
  at. An entry strictly greater than 0.5 gives label 1, anything else gives 0.

  Returns:
    A list of ints (0 or 1), one per element of ``probs``.
  """
  return [1 if entry[0] > 0.5 else 0 for entry in probs]


def cleanData(data):
    """Replace every "<digits>." sequence in ``data['text']`` with a single space.

    The ``'text'`` column of ``data`` is overwritten in place; nothing is
    returned. The pattern matches any run of digits directly followed by a
    dot, wherever it occurs in the string.
    """
    digits_then_dot = re.compile(r'[0-9]+\.')

    def _blank_out(text):
        return digits_then_dot.sub(' ', text)

    data['text'] = data['text'].apply(_blank_out)

In [4]:
# Read the train / dev / test splits from CSV files.
# NOTE(review): the absolute /content/drive/... paths look like a Colab Google Drive
# mount — confirm the drive is mounted before running this cell.
train_data = pd.read_csv('/content/drive/MyDrive/nlp/train.csv')
dev_data = pd.read_csv('/content/drive/MyDrive/nlp/dev.csv')
test_data = pd.read_csv('/content/drive/MyDrive/nlp/test.csv')

# cleanData overwrites each frame's 'text' column in place and returns nothing.
cleanData(train_data)
cleanData(dev_data)
cleanData(test_data)

# Print the first cleaned training document as a quick sanity check.
print(train_data['text'][0])

  Can regularly rinsing your nose with saline help prevent infection with the new coronavirus? https://t.co/ccMjhhD7BK   Can eating garlic help prevent infection with the new coronavirus? #COVID19Malaysia https://t.co/q133xXBiwl   Do vaccines against pneumonia protect you against the new coronavirus? https://t.co/wL0mlEqU95   Can spraying alcohol or chlorine all over your body kill the new coronavirus? #Chamber https://t.co/zunVR7Ht0V   How effective are thermal scanners in detecting people infected with the new coronavirus? https://t.co/nyLOyKAb1H   Can an ultraviolet disinfection lamp kill the new coronavirus? https://t.co/ZrlllbkIjm   Are hand dryers effective in killing the new coronavirus? https://t.co/cSDKXO1bGr   The new coronavirus CANNOT be transmitted through mosquito bites. https://t.co/ZRL8bjRkpl   Taking a hot bath does not prevent the new coronavirus disease https://t.co/bICOqSTOuD   Cold weather and snow CANNOT kill the new coronavirus. https://t.co/7yeQQ6gLNo   COVID-19

In [5]:
def get_bert_tokens(data, tokenizer=None, max_length=128):
  """Tokenise ``data['text']`` into padded id and attention-mask tensors.

  Args:
    data: object whose ``data['text'].values`` yields the texts to encode
      (a pandas DataFrame in this notebook).
    tokenizer: optional callable tokenizer to reuse across calls. When omitted,
      the "vinai/bertweet-large" tokenizer is loaded with ``normalization=True``,
      which is what this function always did before the parameter existed.
    max_length: length every sequence is padded / truncated to (default 128,
      the previously hard-coded value).

  Returns:
    Dict with the keys ``'input_ids'`` and ``'attention_masks'``, each holding
    the per-text tensors concatenated along dimension 0. For empty input both
    entries are empty ``(0, max_length)`` long tensors.
  """
  if tokenizer is None:
    tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-large", normalization=True)
  input_ids = []
  attention_masks = []
  for tweet in data['text'].values:
    # Calling the tokenizer directly is the documented replacement for the
    # legacy encode_plus method and accepts the same keyword arguments.
    encoding = tokenizer(
               tweet,
               max_length = max_length,
               add_special_tokens = True,
               padding = 'max_length',
               return_attention_mask = True,
               truncation = True,
               return_tensors = 'pt',
               )
    input_ids.append(encoding['input_ids'])
    attention_masks.append(encoding['attention_mask'])
  if not input_ids:
    # torch.cat rejects an empty list, so return well-formed empty tensors.
    return {'input_ids': torch.empty((0, max_length), dtype=torch.long),
            'attention_masks': torch.empty((0, max_length), dtype=torch.long)}
  input_ids = torch.cat(input_ids, dim=0)
  attention_masks = torch.cat(attention_masks, dim=0)
  data_map = {'input_ids': input_ids, 'attention_masks':attention_masks}
  return data_map

In [6]:
# Tokenise each split; every map holds 'input_ids' and 'attention_masks' tensors.
# Each call below asks get_bert_tokens to load the tokenizer again.
train_map = get_bert_tokens(train_data)
dev_map = get_bert_tokens(dev_data)
test_map = get_bert_tokens(test_data)

In [7]:
def create_data_loader_for_train(data ,data_map, batch_size):
  """Wrap token tensors and labels in a randomly sampled DataLoader.

  Args:
    data: mapping / DataFrame whose ``'label'`` entry holds one numeric label
      per row of the token tensors.
    data_map: dict with ``'input_ids'`` and ``'attention_masks'`` tensors.
    batch_size: number of examples per batch.

  Returns:
    DataLoader yielding ``(input_ids, attention_masks, labels)`` batches in
    random order (``RandomSampler``).
  """
  # Go through a NumPy array so the labels are read positionally; this avoids
  # depending on how torch.tensor walks a pandas Series, whose integer
  # indexing is label-based and breaks once the index is not 0..n-1.
  labels = torch.tensor(np.asarray(data['label']))
  dataset = TensorDataset(data_map['input_ids'], data_map['attention_masks'], labels)
  dataloader = DataLoader(
               dataset,
               sampler = RandomSampler(dataset),
               batch_size = batch_size
               )
  return dataloader


def create_data_loader_for_dev(data ,data_map, batch_size):
  """Wrap token tensors and labels in a sequentially sampled DataLoader.

  Args:
    data: mapping / DataFrame whose ``'label'`` entry holds one numeric label
      per row of the token tensors.
    data_map: dict with ``'input_ids'`` and ``'attention_masks'`` tensors.
    batch_size: number of examples per batch.

  Returns:
    DataLoader yielding ``(input_ids, attention_masks, labels)`` batches in
    the original row order (``SequentialSampler``).
  """
  # Go through a NumPy array so the labels are read positionally; this avoids
  # depending on how torch.tensor walks a pandas Series, whose integer
  # indexing is label-based and breaks once the index is not 0..n-1.
  labels = torch.tensor(np.asarray(data['label']))
  dataset = TensorDataset(data_map['input_ids'], data_map['attention_masks'], labels)
  dataloader = DataLoader(
               dataset,
               sampler = SequentialSampler(dataset),
               batch_size = batch_size
               )
  return dataloader


def create_data_loader_for_test(data_map, batch_size):
  """Wrap unlabelled token tensors in a sequentially sampled DataLoader.

  Args:
    data_map: dict with ``'input_ids'`` and ``'attention_masks'`` tensors.
    batch_size: number of examples per batch.

  Returns:
    DataLoader yielding ``(input_ids, attention_masks)`` batches in the
    original row order.
  """
  token_ids = data_map['input_ids']
  masks = data_map['attention_masks']
  unlabelled = TensorDataset(token_ids, masks)
  in_order = SequentialSampler(unlabelled)
  return DataLoader(unlabelled, sampler=in_order, batch_size=batch_size)

In [8]:
# Build one loader per split with a batch size of 16.
# Train batches are randomly sampled; dev and test batches keep row order.
train_dataloader = create_data_loader_for_train(train_data, train_map, 16)
dev_dataloader = create_data_loader_for_dev(dev_data, dev_map, 16)
test_dataloader = create_data_loader_for_test(test_map, 16)

In [9]:
class RunmorClassifier(nn.Module):
  """Binary classifier: pretrained encoder + dropout + one-logit linear head.

  The encoder is loaded from the "vinai/bertweet-large" checkpoint. Its
  ``pooler_output`` passes through dropout (p=0.5) and a linear layer that
  maps ``hidden_size`` features to a single raw logit per example.
  """

  def __init__(self):
    """Load the pretrained encoder and create the dropout and linear head."""
    super().__init__()
    # Attribute names are part of the state_dict keys, so they stay as they are.
    self.bert = AutoModel.from_pretrained("vinai/bertweet-large")
    self.drop_layer = nn.Dropout(p=0.5)
    hidden_width = self.bert.config.hidden_size
    self.cls_layer = nn.Linear(hidden_width, 1)

  def forward(self, seq, attention_mask):
    """Return raw (pre-sigmoid) logits for a batch.

    Args:
      seq: token-id tensor handed to the encoder as its first argument.
      attention_mask: mask tensor forwarded to the encoder.

    Returns:
      Output of the linear head applied to the dropped-out pooled encoding.
    """
    encoded = self.bert(seq, attention_mask = attention_mask)
    pooled = encoded['pooler_output']
    return self.cls_layer(self.drop_layer(pooled))

In [10]:
def get_accuracy_from_logits(logits, labels):
    """Return the fraction of examples whose thresholded prediction equals the label.

    A prediction is 1 when ``sigmoid(logit) > 0.5`` and 0 otherwise. Both the
    predictions and the labels are flattened before they are compared, so
    ``(batch,)`` and ``(batch, 1)`` layouts give the same result instead of
    silently broadcasting into a batch-by-batch comparison.

    Args:
        logits: tensor of raw scores, one per example.
        labels: tensor of 0/1 labels with the same number of elements.

    Returns:
        0-dimensional float tensor holding the accuracy.
    """
    probs = torch.sigmoid(logits)
    predicted = (probs > 0.5).long().reshape(-1)
    acc = (predicted == labels.reshape(-1)).float().mean()
    return acc

In [11]:
def evaluate(model, criterion, dev_dataloader, gpu):
    """Score ``model`` on a labelled loader without tracking gradients.

    Args:
        model: module called as ``model(input_ids, attention_masks)``.
        criterion: loss called with ``logits.squeeze(-1)`` and float labels.
        dev_dataloader: iterable of ``(input_ids, attention_masks, labels)``.
        gpu: CUDA device passed to every ``.cuda(...)`` call.

    Returns:
        Tuple ``(accuracy, loss, f1, precision, recall)``. Accuracy and loss
        are plain averages of the per-batch values (each batch counts equally,
        whatever its size); f1 / precision / recall are computed once over all
        collected predictions.
    """
    model.eval()

    acc_sum = 0
    loss_sum = 0
    n_batches = 0

    gold = []
    scores = []
    with torch.no_grad():
        for batch_ids, batch_masks, batch_labels in dev_dataloader:
            ids_on_gpu = batch_ids.cuda(gpu)
            masks_on_gpu = batch_masks.cuda(gpu)
            labels_on_gpu = batch_labels.cuda(gpu)

            logits = model(ids_on_gpu, masks_on_gpu)

            batch_loss = criterion(logits.squeeze(-1), labels_on_gpu.float())
            loss_sum += batch_loss.item()
            acc_sum += get_accuracy_from_logits(logits, labels_on_gpu)
            n_batches += 1

            # The extra trailing axis is kept so every stored score has the
            # same per-example shape as before.
            batch_scores = torch.sigmoid(logits.unsqueeze(-1)).detach().cpu().numpy()
            scores.extend(batch_scores)
            gold.extend(batch_labels.detach().cpu().numpy())

    predictions = covert_to_label(scores)
    f1 = f1_score(gold, predictions)
    precision = precision_score(gold, predictions)
    recall = recall_score(gold, predictions)

    return acc_sum / n_batches, loss_sum / n_batches, f1, precision, recall

In [12]:
def predict(model, dataloader, gpu):
  """Run ``model`` over ``dataloader`` and collect per-example probabilities.

  Only the first two elements of each batch (input ids and attention masks)
  are used, so loaders with labels (3-element batches) and loaders without
  labels (2-element batches) are both accepted.

  Args:
    model: module called as ``model(input_ids, attention_masks)``.
    dataloader: iterable of batches whose first two items are the input ids
      and the attention masks.
    gpu: CUDA device passed to every ``.cuda(...)`` call.

  Returns:
    List with one NumPy array per example, taken along the first axis of
    ``sigmoid(logits.unsqueeze(-1))``.
  """
  model.eval()
  res = []
  with torch.no_grad():
    for batch in dataloader:
      # Index rather than unpack so an optional trailing labels entry is ignored.
      input_ids, attention_masks = batch[0], batch[1]
      cuda_input_ids = input_ids.cuda(gpu)
      cuda_attention_masks = attention_masks.cuda(gpu)
      logits = model(cuda_input_ids, cuda_attention_masks)
      # The trailing axis is kept so returned elements keep their original shape.
      probs = torch.sigmoid(logits.unsqueeze(-1))
      probs = probs.detach().cpu().numpy()
      res.extend(probs)
  return res

In [13]:
def train(model, optimizer, criterion, train_dataloader, dev_dataloader, epochs, gpu):
    """Fine-tune ``model`` for ``epochs`` passes, reporting dev metrics after each.

    Every batch is moved to the CUDA device ``gpu``, gradients are cleared,
    the loss between ``logits.squeeze(-1)`` and the float labels is
    back-propagated and the optimizer takes one step. A progress line is
    printed per batch; after each epoch the mean training loss and the dev
    loss / accuracy / f1 / precision / recall from ``evaluate`` are printed.

    Args:
        model: module called as ``model(input_ids, attention_masks)``.
        optimizer: optimizer stepped once per batch.
        criterion: loss function applied to the logits and float labels.
        train_dataloader: iterable of ``(input_ids, attention_masks, labels)``.
        dev_dataloader: loader handed to ``evaluate`` after every epoch.
        epochs: number of passes over ``train_dataloader``.
        gpu: CUDA device passed to every ``.cuda(...)`` call.
    """
    for epoch_idx in range(epochs):
        epoch_no = epoch_idx + 1
        model.train()
        print("Epoch " + str(epoch_no))

        running_loss = 0
        for step, (batch_ids, batch_masks, batch_labels) in enumerate(train_dataloader):
            model.zero_grad()

            ids_on_gpu = batch_ids.cuda(gpu)
            masks_on_gpu = batch_masks.cuda(gpu)
            labels_on_gpu = batch_labels.cuda(gpu)

            batch_logits = model(ids_on_gpu, masks_on_gpu)
            batch_loss = criterion(batch_logits.squeeze(-1), labels_on_gpu.float())
            running_loss += batch_loss.item()

            batch_loss.backward()
            optimizer.step()

            print("Batch {} of epoch {} complete.".format(step, epoch_no))

        print("Train loss: {0:.2f}".format(running_loss/len(train_dataloader)))

        # evaluate returns accuracy first; the report below starts with the loss.
        dev_acc, dev_loss, dev_f1, dev_precision, dev_recall = evaluate(model, criterion, dev_dataloader, gpu)
        report = (
            ("Dev loss: {0:.2f}", dev_loss),
            ("Dev acc: {0:.2f}", dev_acc),
            ("Dev f1: {0:.2f}", dev_f1),
            ("Dev precision: {0:.2f}", dev_precision),
            ("Dev recall: {0:.2f}", dev_recall),
        )
        for template, value in report:
            print(template.format(value))

In [14]:
# Training configuration: CUDA device 0, Adam with lr 2e-5, 7 epochs.
gpu = 0
model = RunmorClassifier()
model.cuda(gpu)
optimizer = optim.Adam(model.parameters(), lr = 2e-5)
# BCEWithLogitsLoss takes raw logits, matching the classifier's un-activated output.
criterion = nn.BCEWithLogitsLoss()
epochs = 7

Some weights of the model checkpoint at vinai/bertweet-large were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at vinai/bertweet-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use 

In [15]:
# Fine-tune the model; train() prints a line per batch and dev metrics per epoch.
train(model, optimizer, criterion, train_dataloader, dev_dataloader, epochs, gpu)

Epoch 1
Batch 0 of epoch 1 complete.
Batch 1 of epoch 1 complete.
Batch 2 of epoch 1 complete.
Batch 3 of epoch 1 complete.
Batch 4 of epoch 1 complete.
Batch 5 of epoch 1 complete.
Batch 6 of epoch 1 complete.
Batch 7 of epoch 1 complete.
Batch 8 of epoch 1 complete.
Batch 9 of epoch 1 complete.
Batch 10 of epoch 1 complete.
Batch 11 of epoch 1 complete.
Batch 12 of epoch 1 complete.
Batch 13 of epoch 1 complete.
Batch 14 of epoch 1 complete.
Batch 15 of epoch 1 complete.
Batch 16 of epoch 1 complete.
Batch 17 of epoch 1 complete.
Batch 18 of epoch 1 complete.
Batch 19 of epoch 1 complete.
Batch 20 of epoch 1 complete.
Batch 21 of epoch 1 complete.
Batch 22 of epoch 1 complete.
Batch 23 of epoch 1 complete.
Batch 24 of epoch 1 complete.
Batch 25 of epoch 1 complete.
Batch 26 of epoch 1 complete.
Batch 27 of epoch 1 complete.
Batch 28 of epoch 1 complete.
Batch 29 of epoch 1 complete.
Batch 30 of epoch 1 complete.
Batch 31 of epoch 1 complete.
Batch 32 of epoch 1 complete.
Batch 33 of 

In [16]:
# Score the dev split again with the trained model and threshold at 0.5.
# dev_dataloader samples sequentially, so predictions line up with dev_data rows.
dev_probs = predict(model, dev_dataloader, gpu)
dev_predictions = covert_to_label(dev_probs)

# Report the final dev metrics against the labels stored in dev_data.
print("Accuracy: " + str(accuracy_score(dev_data['label'], dev_predictions)))
print("F1: " + str(f1_score(dev_data['label'], dev_predictions)))
print("Precision: "+ str(precision_score(dev_data['label'], dev_predictions)))
print("Recall: " + str(recall_score(dev_data['label'], dev_predictions)))

Accuracy: 0.9662288930581614
F1: 0.9203539823008849
Precision: 0.9285714285714286
Recall: 0.9122807017543859


In [17]:
def predict_for_test(model, dataloader, gpu):
  """Collect per-example probabilities from a loader that carries no labels.

  Args:
    model: module called as ``model(input_ids, attention_masks)``.
    dataloader: iterable of ``(input_ids, attention_masks)`` batches.
    gpu: CUDA device passed to every ``.cuda(...)`` call.

  Returns:
    List with one NumPy array per example, taken along the first axis of
    ``sigmoid(logits.unsqueeze(-1))``.
  """
  model.eval()
  collected = []
  with torch.no_grad():
    for batch_ids, batch_masks in dataloader:
      batch_logits = model(batch_ids.cuda(gpu), batch_masks.cuda(gpu))
      batch_probs = torch.sigmoid(batch_logits.unsqueeze(-1)).detach().cpu().numpy()
      collected.extend(batch_probs)
  return collected

In [18]:
# Predict the unlabelled test split and threshold the probabilities at 0.5.
test_probs = predict_for_test(model, test_dataloader, gpu)
test_predictions = covert_to_label(test_probs)
# Print the full list of 0/1 predictions and how many there are.
print(test_predictions)
print(len(test_predictions))

[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 

In [20]:
# Write the predictions to CSV with a 0-based positional "Id" column.
index = range(len(test_predictions))
res_map = {"Id":index, "Predicted":test_predictions}
df = DataFrame(res_map)
# index=False keeps pandas' own row index out of the file.
df.to_csv('/content/drive/MyDrive/nlp/bertweet_v4_res.csv', index=False)