In [None]:
pip install simpletransformers

In [40]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import pandas as pd
import logging
import jsonlines
import matplotlib.pyplot as plt
import numpy as np
import json
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
import re
import itertools
import emoji
import sklearn
from simpletransformers.language_representation import RepresentationModel
from sklearn.feature_extraction.text import CountVectorizer

In [41]:
# get texts that are related to the source tweet
def get_valid_text(tweet):
    all_id = []
    for item in tweet:
        all_id.append(item['id'])
    invalid_id = []
    for item in tweet:
        if item['in_reply_to_status_id'] not in all_id:
            invalid_id.append(item['id'])
    
    text = tweet[0]['text']
    for i in range(1, len(tweet)):
        if tweet[i]['in_reply_to_status_id'] in all_id and tweet[i]['in_reply_to_status_id'] not in invalid_id:
            text = text + ' ' + tweet[i]['text']

    return text

In [42]:
# load data from each json file
train_list = []
with open('train.data.jsonl', 'r') as file:
    for item in jsonlines.Reader(file):
        train_list.append(item)
        
with open ('train.label.json', 'r') as file1:
    train_labels = json.loads(file1.read())
    
train_data_list = []

for tweet in train_list:
    tweetId = tweet[0]['id_str']
    text = get_valid_text(tweet)
    text = text.replace('\n', '').replace('\r', '')
    if train_labels[tweetId] == 'non-rumour':
        train_data_list.append('1\t' + text + '\n')
    elif train_labels[tweetId] == 'rumour':
        train_data_list.append('0' + '\t' + text + '\n')

dev_list = []
with open('dev.data.jsonl', 'r') as file:
    for item in jsonlines.Reader(file):
        dev_list.append(item)
        
with open ('dev.label.json', 'r') as file1:
    dev_labels = json.loads(file1.read())
    
dev_data_list = []
for tweet in dev_list:
    tweetId = tweet[0]['id_str']
    text = get_valid_text(tweet)
    text = text.replace('\n', '').replace('\r', '')
    if dev_labels[tweetId] == 'non-rumour':
        dev_data_list.append('1\t' + text + '\n')
    elif dev_labels[tweetId] == 'rumour':
        dev_data_list.append('0' + '\t' + text + '\n')



In [23]:
test_list = []
with open('test.data.jsonl', 'r') as file:
    for item in jsonlines.Reader(file):
        test_list.append(item)

test_data_list = []
for tweet in test_list:
    tweetId = tweet[0]['id_str']
    text = get_valid_text(tweet)
    text = text.replace('\n', '').replace('\r', '')
    test_data_list.append(text + '\n')

In [49]:
covid_list = []
with open('covid.data.jsonl', 'r') as file:
    for item in jsonlines.Reader(file):
        covid_list.append(item)

covid_data_list = []
for tweet in covid_list:
    tweetId = tweet[0]['id_str']
    text = get_valid_text(tweet)
    text = text.replace('\n', '').replace('\r', '')
    covid_data_list.append(text + '\n')

In [24]:
# create data frames
train_df = []
for item in train_data_list:
    item = item.split('\t')
    train_df.append([item[1], int(item[0])])
train_df = pd.DataFrame(train_df, columns=['text', 'label'])

dev_df = []
for item in dev_data_list:
    item = item.split('\t')
    dev_df.append([item[1], int(item[0])])
dev_df = pd.DataFrame(dev_df, columns=['text', 'label'])

In [25]:
test_df = []
for item in test_data_list:
    test_df.append(item)
test_df = pd.DataFrame(test_df, columns=['text'])

In [50]:
covid_df = []
for item in covid_data_list:
    covid_df.append(item)
covid_df = pd.DataFrame(covid_df, columns=['text'])

In [45]:
# tokenisation and normalisation
def preprocess(text):
        stopw = set(stopwords.words('english'))
        text = text.lower()
        tt = TweetTokenizer()
        text = tt.tokenize(text)
        for i in range(len(text) - 1, -1, -1):
            containAlphabets = False
            for j in range(len(text[i])):
                if ord(text[i][j]) >= 97 and ord(text[i][j]) <= 122:
                    containAlphabets = True
                    break
            if not containAlphabets:
                text.pop(i)

        for word in text:
            if word in stopw:
                text.remove(word)
        
        return ' '.join(text)

In [27]:
train_df['text'] = train_df['text'].apply(preprocess)
dev_df['text'] = dev_df['text'].apply(preprocess)

In [28]:
test_df['text'] = test_df['text'].apply(preprocess)

In [51]:
covid_df['text'] = covid_df['text'].apply(preprocess)

In [29]:
# over-sampling of the training data
train_non_rumour = train_df.loc[train_df['label'] == 1]
train_rumour = train_df.loc[train_df['label'] == 0]
train_df = train_df.append(train_rumour)

In [54]:
# the hyperparameters of BERT
train_args = {
    'evaluate_during_training': True,
    'logging_steps': 100,
    'num_train_epochs': 3,
    'evaluate_during_training_steps': 100,
    'save_eval_checkpoints': False,
    'train_batch_size': 32,
    'eval_batch_size': 64,
    'overwrite_output_dir': True,
    'fp16': False,
    'wandb_project': "visualization-demo"
}

In [55]:
# create the BERT model
bert_model = ClassificationModel('bert', 'bert-base-cased', num_labels=2, use_cuda=False, args=train_args)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [56]:
# train the BERT model
bert_model.train_model(train_df, eval_df=dev_df)



HBox(children=(FloatProgress(value=0.0, max=6224.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=3.0, style=ProgressStyle(description_width='i…

VBox(children=(Label(value=' 0.03MB of 0.03MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Training loss,0.09813
lr,0.0
global_step,780.0
_runtime,21911.0
_timestamp,1620750426.0
_step,17.0
tp,361.0
tn,151.0
fp,36.0
fn,32.0


0,1
Training loss,▇█▅▆▂▁▃
lr,█▇▆▅▃▂▁
global_step,▁▁▂▂▂▃▃▄▄▄▅▅▆▆▆▇▇█
_runtime,▁▁▂▂▂▃▃▄▄▄▅▅▆▆▆▇▇█
_timestamp,▁▁▂▂▂▃▃▄▄▄▅▅▆▆▆▇▇█
_step,▁▁▂▂▃▃▃▄▄▅▅▆▆▆▇▇██
tp,▁█▃█▂▃█▇▇▇▇
tn,▅▁▆▂█▇▂▄▃▂▃
fp,▄█▃▇▁▂▇▅▆▇▆
fn,█▁▆▁▇▆▁▂▂▂▂


wandb: wandb version 0.10.30 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade


HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 3', max=195.0, style=ProgressStyle(des…



HBox(children=(FloatProgress(value=0.0, max=580.0), HTML(value='')))







HBox(children=(FloatProgress(value=0.0, max=580.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 3', max=195.0, style=ProgressStyle(des…



HBox(children=(FloatProgress(value=0.0, max=580.0), HTML(value='')))






HBox(children=(FloatProgress(value=0.0, max=580.0), HTML(value='')))







HBox(children=(FloatProgress(value=0.0, max=580.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 3', max=195.0, style=ProgressStyle(des…



HBox(children=(FloatProgress(value=0.0, max=580.0), HTML(value='')))






HBox(children=(FloatProgress(value=0.0, max=580.0), HTML(value='')))







HBox(children=(FloatProgress(value=0.0, max=580.0), HTML(value='')))





(585,
 {'global_step': [100, 195, 200, 300, 390, 400, 500, 585],
  'tp': [292, 329, 314, 356, 322, 354, 353, 356],
  'tn': [171, 157, 165, 147, 168, 150, 153, 155],
  'fp': [16, 30, 22, 40, 19, 37, 34, 32],
  'fn': [101, 64, 79, 37, 71, 39, 40, 37],
  'mcc': [0.6157637010270793,
   0.6513047087292281,
   0.6450796003628926,
   0.6949194310225264,
   0.681560708378925,
   0.7009593312749574,
   0.7106224298001573,
   0.7297566026540231],
  'train_loss': [0.6055215001106262,
   0.270935982465744,
   0.19663278758525848,
   0.3372179865837097,
   0.37326779961586,
   0.170146644115448,
   0.028444252908229828,
   0.10164628177881241],
  'eval_loss': [0.44366034865379333,
   0.37111039757728576,
   0.39359167516231536,
   0.37432430386543275,
   0.4336180448532104,
   0.4553589552640915,
   0.5162048220634461,
   0.5401531487703324],
  'auroc': [0.9050223836932412,
   0.9227252316610197,
   0.9239090500877658,
   0.9407682573376333,
   0.9389857261433373,
   0.9295560000544285,
   0.938890

In [85]:
# evaluate using dev_data
result, model_outputs, wrong_predictions = bert_model.eval_model(dev_df, acc=sklearn.metrics.accuracy_score)

HBox(children=(FloatProgress(value=0.0, max=580.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=10.0, style=ProgressStyle(descri…




VBox(children=(Label(value=' 0.03MB of 0.03MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Training loss,0.14776
lr,1e-05
global_step,292.0
_runtime,7853.0
_timestamp,1620459955.0
_step,5.0
tp,360.0
tn,135.0
fp,52.0
fn,33.0


0,1
Training loss,█▁
lr,█▁
global_step,▁▁▃▅▅█
_runtime,▁▁▃▅▅█
_timestamp,▁▁▃▅▅█
_step,▁▂▄▅▇█
tp,▁█▄▆
tn,▁▄▇█
fp,█▅▂▁
fn,█▁▅▃


wandb: wandb version 0.10.30 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade


In [37]:
# predict labels of test data
test_sentences = []
for text in test_df['text']:
    test_sentences.append(text)
predictions, raw_outputs = bert_model.predict(test_sentences)

HBox(children=(FloatProgress(value=0.0, max=581.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




In [34]:
# predict labels of dev data
dev_sentences = []
for text in dev_df['text']:
    dev_sentences.append(text)
dev_predictions, dev_raw_outputs = bert_model.predict(dev_sentences)

HBox(children=(FloatProgress(value=0.0, max=580.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




In [57]:
# predict labels of task2 COVID-19 data
covid_sentences = []
for text in covid_df['text']:
    covid_sentences.append(text)
covid_predictions, covid_raw_outputs = bert_model.predict(covid_sentences)

HBox(children=(FloatProgress(value=0.0, max=17458.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=273.0), HTML(value='')))




In [38]:
# get output and convert them to json files
output = {}
for i in range(len(test_list)):
    if predictions[i] == 0:
        output[test_list[i][0]['id_str']] = 'rumour'
    else:
        output[test_list[i][0]['id_str']] = 'non-rumour'

In [35]:
dev_output = {}
for i in range(len(dev_list)):
    if dev_predictions[i] == 0:
        dev_output[dev_list[i][0]['id_str']] = 'rumour'
    else:
        dev_output[dev_list[i][0]['id_str']] = 'non-rumour'

In [39]:
with open('test-output.json', 'w') as f:
        json.dump(output, f)

In [36]:
with open('dev-output.json', 'w') as f:
    json.dump(dev_output, f)

In [64]:
covid_rumours = []
covid_nonrumours = []
for i in range(len(covid_sentences)):
    if covid_predictions[i] == 0:
        covid_rumours.append(covid_sentences[i])
    else:
        covid_nonrumours.append(covid_sentences[i])

In [68]:
with open('covid_rumours.txt', 'w', encoding='utf-8') as f:
    for sentence in covid_rumours:
        f.write(sentence + '\n')

with open('covid_nonrumours.txt', 'w', encoding='utf-8') as f:
    for sentence in covid_nonrumours:
        f.write(sentence + '\n')