In [None]:
import json
import csv
import re
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer
import pandas as pd
from torch.utils.data import DataLoader
import torch.nn as nn
from transformers import BertModel
import torch.optim as optim
import time
import operator

In [None]:
# Locations of the raw data and of the TSV files generated from it.
# Forward slashes are used because they resolve on Windows as well as on
# Linux/macOS, whereas backslash-separated paths only work on Windows.
train_file = "data/train.data.jsonl"
train_label_file = "data/train.label.json"
train_tsv = "./train.tsv"

dev_file = "data/dev.data.jsonl"
dev_label_file = "data/dev.label.json"
dev_tsv = "./dev.tsv"

# Data Transfer to TSV File Section
#### If the TSV files have already been generated, this section does not need to be run.

In [None]:
# Extract the source tweet and its replies; the only text processing applied
# is stripping URLs, @mentions and #hashtags.
def transfer_to_tsv(tweet_file, label_file, transferred_file):
    """Convert a JSONL tweet file and its JSON label file into a labelled TSV.

    Each non-blank line of ``tweet_file`` is parsed as one event: a JSON list
    of tweet objects. The ``text`` of every tweet in the event is concatenated
    (a full stop is appended to texts that do not already end in ``.``, ``?``
    or ``!`` -- the same rule ``test_to_tsv`` uses, so training and inference
    inputs are built identically), then URLs, @mentions and #hashtags are
    removed and line breaks are replaced by spaces.

    Args:
        tweet_file: Path to the JSONL file with one event per line.
        label_file: Path to a JSON object mapping the ``id_str`` of the first
            tweet of each event to ``"non-rumour"`` or ``"rumour"``.
        transferred_file: Path of the TSV file to write. It gets a
            ``content``/``label`` header and one row per event, where the
            label is 0 for non-rumour and 1 for rumour.

    Raises:
        KeyError: If an event's first tweet id has no entry in the label file.
        ValueError: If a label is neither ``"non-rumour"`` nor ``"rumour"``.
    """
    classes = ["non-rumour", "rumour"]
    sentence_endings = (".", "?", "!")

    # JSON is UTF-8 by specification; do not rely on the platform default
    # encoding (which is not UTF-8 on Windows).
    with open(label_file, encoding="utf-8") as label_obj:
        unprocessed_labels = json.load(label_obj)

    rows = []
    with open(tweet_file, 'r', encoding="utf-8") as tweet_obj:
        for json_str in tweet_obj:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            if not json_str.strip():
                continue

            # get one event
            event = json.loads(json_str)

            # Concatenate the tweets of the event. str.endswith() is used
            # instead of text[-1] so an empty tweet text does not raise.
            pieces = []
            for tweet in event:
                text = tweet["text"]
                pieces.append(text + (' ' if text.endswith(sentence_endings) else '. '))
            content = "".join(pieces)

            # remove URLs, @mentions and #hashtags
            content = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', content, flags=re.MULTILINE)
            content = re.sub(r'@(\w+)?', '', content, flags=re.MULTILINE)
            content = re.sub(r'#(\w+)?', '', content, flags=re.MULTILINE)

            # One event per TSV row: flatten embedded line breaks
            content = content.replace("\n", " ").replace("\r", " ")
            label = classes.index(unprocessed_labels[event[0]['id_str']])
            rows.append([content, label])

    # newline='' lets the csv module control line endings itself, so the file
    # is written with '\n' on every platform.
    with open(transferred_file, 'w', encoding="utf-8", newline='') as f:
        tsv_w = csv.writer(f, delimiter='\t', lineterminator='\n')
        tsv_w.writerow(['content', 'label'])
        tsv_w.writerows(rows)

In [None]:
# Convert each labelled split (JSONL tweets + JSON labels) into its TSV file,
# training split first, then development split.
for _tweets_path, _labels_path, _tsv_path in (
    (train_file, train_label_file, train_tsv),
    (dev_file, dev_label_file, dev_tsv),
):
    transfer_to_tsv(_tweets_path, _labels_path, _tsv_path)

# Transferring TSV to Training Form Section
#### If you only need predictions, skip ahead to the Building Model section.

In [None]:
class TrainDataset(Dataset):
    """Labelled tweet dataset that yields fixed-length BERT inputs.

    Reads a tab-separated file with ``content`` and ``label`` columns and, for
    each row, returns the token ids, the attention mask and the label.
    """

    def __init__(self, filename, input_size):
        """Load the TSV file and the tokenizer.

        Args:
            filename: Path to the TSV file with ``content`` and ``label`` columns.
            input_size: Length of every returned sequence, including the
                [CLS] and [SEP] tokens. Values above 512 are clamped to 512.

        Raises:
            ValueError: If ``input_size`` is smaller than 2, because a
                sequence must at least hold [CLS] and [SEP].
        """
        # Store the contents of the file in a pandas dataframe.
        # dtype/keep_default_na keep every tweet a string: without them pandas
        # turns empty fields or texts such as "NA"/"null" into float NaN,
        # which the tokenizer cannot process.
        self.df = pd.read_csv(filename, delimiter = '\t', dtype = {'content': str}, keep_default_na = False)

        #Initialize the BERT tokenizer
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        if input_size < 2:
            raise ValueError("input_size must be at least 2 to hold [CLS] and [SEP]")

        # the input length for BERT model. Max length is 512
        self.input_size = min(input_size, 512)

    def __len__(self):
        """Return the number of rows in the TSV file."""
        return len(self.df)

    def __getitem__(self, index):
        """Return ``(token_ids, attention_mask, label)`` for the row at ``index``.

        ``token_ids`` and ``attention_mask`` are 1-D tensors of length
        ``input_size``; the mask is 1 for real tokens and 0 for padding.
        """
        #Selecting the content and label at the specified index in the data frame
        tweet = self.df.loc[index, 'content']
        label = self.df.loc[index, 'label']

        # Tokenize the tweet and wrap it in [CLS] ... [SEP]
        tokens = ['[CLS]'] + self.tokenizer.tokenize(tweet) + ['[SEP]']

        missing = self.input_size - len(tokens)
        if missing > 0:
            # Too short: pad up to input_size
            tokens = tokens + ['[PAD]'] * missing
        else:
            # Too long (or exact): keep the first input_size-1 tokens and close with [SEP]
            tokens = tokens[:self.input_size - 1] + ['[SEP]']

        #Converting the tokens to a pytorch ID tensor
        tokens_ids_tensor = torch.tensor(self.tokenizer.convert_tokens_to_ids(tokens))

        # Attention mask: ask the tokenizer for the padding id instead of
        # assuming it is 0.
        attn_mask = (tokens_ids_tensor != self.tokenizer.pad_token_id).long()

        return tokens_ids_tensor, attn_mask, label

In [None]:
#Creating instances of training and development set
train_set = TrainDataset(filename = train_tsv, input_size = 512)
dev_set = TrainDataset(filename = dev_tsv, input_size = 512)

#Creating instances of training and development dataloaders
#batch_size = 8 for 512 input size (GTX-1080Ti 11G)
#batch_size = 4 for 512 input size (RTX-2080 8G)
# shuffle=True: DataLoader iterates in file order by default, so without it
# every epoch would see the training examples in exactly the same order.
# The development loader stays unshuffled so evaluation is repeatable.
train_loader = DataLoader(train_set, batch_size = 4, shuffle = True, num_workers = 0)
dev_loader = DataLoader(dev_set, batch_size = 4, num_workers = 0)

print("Done preprocessing training and development data.")

# Building Model Section

In [None]:
class RumorClassifier(nn.Module):
    """Pretrained BERT encoder followed by a linear layer producing one logit."""

    def __init__(self):
        super().__init__()
        # Pretrained encoder; return_dict=False makes its forward pass return a tuple
        self.bert_layer = BertModel.from_pretrained('bert-base-uncased', return_dict=False)

        # Classification head: 768-dimensional contextual representation -> single logit
        self.cls_layer = nn.Linear(768, 1)

    def forward(self, seq, attn_masks):
        """Return one logit per input sequence.

        Args:
            seq: Token id tensor passed to the encoder.
            attn_masks: Attention mask tensor passed to the encoder.
        """
        # The encoder yields a pair; only its first element (the per-token
        # contextual representations) is used.
        token_reps, _unused = self.bert_layer(seq, attention_mask = attn_masks)

        # Representation of the first token, i.e. the [CLS] position
        first_token_rep = token_reps[:, 0]

        # Project the [CLS] representation to a logit
        return self.cls_layer(first_token_rep)

In [None]:
# Index of the CUDA device used for the model and for every batch
gpu = 0 #gpu ID

# Build the classifier; its constructor loads the pretrained 'bert-base-uncased' weights
print("Creating the Rumor classifier, initialised with pretrained BERT-BASE parameters...")
net = RumorClassifier()

In [None]:
# Move all model parameters onto the selected CUDA device
net.to(torch.device("cuda", gpu))
print("Done creating the Rumor classifier.")

In [None]:
# Adam over every parameter of the network (encoder and head) with a small
# fine-tuning learning rate
opti = optim.Adam(params = net.parameters(), lr = 2e-5)
# Binary cross-entropy computed directly on the raw logits
criterion = nn.BCEWithLogitsLoss()

In [None]:
def get_accuracy_from_logits(logits, labels):
    """Return the fraction of correct binary predictions as a 0-dim tensor.

    A prediction is positive when ``sigmoid(logit) > 0.5``. Both tensors are
    flattened before they are compared, so ``logits`` of shape (B, 1) can be
    scored against ``labels`` of shape (B,) or (B, 1). The previous
    unsqueeze/squeeze combination silently broadcast to a (B, B) comparison
    when the shapes differed.

    Args:
        logits: Raw model outputs, one per example.
        labels: Ground-truth 0/1 labels with the same number of elements.
    """
    probs = torch.sigmoid(logits.reshape(-1))
    predicted = (probs > 0.5).long()
    acc = (predicted == labels.reshape(-1)).float().mean()
    return acc

def evaluate(net, criterion, dataloader, gpu):
    """Compute mean accuracy and mean loss of ``net`` over ``dataloader``.

    The network is switched to eval mode and left in that mode on return.

    Args:
        net: Model called as ``net(seq, attn_masks)`` and returning logits.
        criterion: Loss applied to ``(logits.squeeze(-1), labels.float())``.
        dataloader: Iterable of ``(seq, attn_masks, labels)`` batches that
            also supports ``len()``.
        gpu: CUDA device index the batches are moved to.

    Returns:
        ``(mean_accuracy, mean_loss)``, each averaged over the number of batches.
    """
    net.eval()

    mean_acc, mean_loss = 0, 0

    with torch.no_grad():
        # Iterate the loader that was passed in; the previous version read the
        # global dev_loader and ignored this argument.
        for seq, attn_masks, labels in dataloader:
            seq, attn_masks, labels = seq.cuda(gpu), attn_masks.cuda(gpu), labels.cuda(gpu)
            logits = net(seq, attn_masks)
            mean_loss += criterion(logits.squeeze(-1), labels.float()).item()
            mean_acc += get_accuracy_from_logits(logits, labels)

    num_batches = len(dataloader)
    return mean_acc / num_batches, mean_loss / num_batches

# Training Section
#### If you only need predictions, skip ahead to the Testing section.

In [None]:
def train(net, criterion, opti, train_loader, dev_loader, max_eps, gpu):
    """Fine-tune ``net`` and save a checkpoint whenever dev accuracy improves.

    Args:
        net: Model called as ``net(seq, attn_masks)`` and returning logits.
        criterion: Loss applied to ``(logits.squeeze(-1), labels.float())``.
        opti: Optimizer over the parameters of ``net``.
        train_loader: Iterable of ``(seq, attn_masks, labels)`` training batches.
        dev_loader: Development batches, passed to ``evaluate`` after every epoch.
        max_eps: Number of epochs to run.
        gpu: CUDA device index the batches are moved to.

    Side effects:
        Prints progress every 100 iterations and after every epoch, and writes
        ``sstcls_<epoch>.dat`` (the model state dict) each time the
        development accuracy exceeds the best value seen so far.
    """
    best_acc = 0
    st = time.time()
    for ep in range(max_eps):

        # Put the network (back) into training mode at the start of every
        # epoch: evaluate() switches it to eval mode, and Hugging Face models
        # are loaded in eval mode, so without this call dropout would never
        # be active during fine-tuning.
        net.train()

        for it, (seq, attn_masks, labels) in enumerate(train_loader):
            #Clear gradients
            opti.zero_grad()
            #Converting these to cuda tensors
            seq, attn_masks, labels = seq.cuda(gpu), attn_masks.cuda(gpu), labels.cuda(gpu)

            #Obtaining the logits from the model
            logits = net(seq, attn_masks)

            #Computing loss
            loss = criterion(logits.squeeze(-1), labels.float())

            #Backpropagating the gradients
            loss.backward()

            #Optimization step
            opti.step()

            if it % 100 == 0:
                # Accuracy of the current batch only (logits from before the step)
                acc = get_accuracy_from_logits(logits, labels)
                print("Iteration {} of epoch {} complete. Loss: {}; Accuracy: {}; Time taken (s): {}".format(it, ep, loss.item(), acc, (time.time()-st)))
                st = time.time()

        # evaluate for each epoch
        dev_acc, dev_loss = evaluate(net, criterion, dev_loader, gpu)
        print("Epoch {} complete! Development Accuracy: {}; Development Loss: {}".format(ep, dev_acc, dev_loss))
        if dev_acc > best_acc:
            print("Best development accuracy improved from {} to {}, saving model...".format(best_acc, dev_acc))
            best_acc = dev_acc
            torch.save(net.state_dict(), 'sstcls_{}.dat'.format(ep))

In [None]:
# Number of passes over the training set; a checkpoint is written by train()
# whenever the development accuracy improves.
num_epoch = 16

#fine-tune the model
train(net, criterion, opti, train_loader, dev_loader, num_epoch, gpu)

# Testing Section

In [None]:
def test_to_tsv(tweet_file, transferred_file):
    contents = []
    tweet_ID = []
    
    #get tweets 
    with open(tweet_file, 'r') as tweet_obj:
        json_list = list(tweet_obj)
    
    # extract the content we want and append to list
    for json_str in json_list:
        # get one event
        event = json.loads(json_str)
        
        # concat. original tweet and replies
        # Insering the CLS and SEP token in the beginning of original tweet and end of the each tweet and reply
        content = ""
        for i in range(len(event)):
            if event[i]["text"][-1] == "." or event[i]["text"][-1] == "?" or event[i]["text"][-1] == "!":
                content = content + event[i]["text"] + ' '
            else:
                content = content + event[i]["text"] + '. '
        
        content = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', content, flags=re.MULTILINE)
        content = re.sub(r'@(\w+)?', '', content, flags=re.MULTILINE)
        content = re.sub(r'#(\w+)?', '', content, flags=re.MULTILINE)
        
        contents.append(content.replace("\n", " ").replace("\r", " "))
        tweet_ID.append(event[0]['id_str'])
        
    # write into tsv
    with open(transferred_file, 'w', encoding="utf-8") as f:
        tsv_w = csv.writer(f, lineterminator='\n')
        tsv_w.writerow(['content'])
        for num in range(len(contents)):
            tsv_w.writerow([contents[num].replace("\n", " ")])
            
    return tweet_ID

In [None]:
# Forward slashes are used so the paths resolve on Windows, Linux and macOS
# alike (backslash-separated paths only work on Windows).
test_file = "data/test.data.jsonl"
test_tsv = "./test.tsv"
dev_for_evaluate = "./dev_result.tsv"

# test_order used to construct dict. with prediction
#test_order = test_to_tsv(test_file, test_tsv)

# produce json prediction for dev in order to apply eval.py
test_order = test_to_tsv(dev_file, dev_for_evaluate)

In [None]:
class TestDataset(Dataset):
    """Unlabelled tweet dataset that yields fixed-length BERT inputs.

    Reads a tab-separated file with a ``content`` column and, for each row,
    returns the token ids and the attention mask.
    """

    def __init__(self, filename, input_size):
        """Load the TSV file and the tokenizer.

        Args:
            filename: Path to the TSV file with a ``content`` column.
            input_size: Length of every returned sequence, including the
                [CLS] and [SEP] tokens. Values above 512 are clamped to 512.

        Raises:
            ValueError: If ``input_size`` is smaller than 2, because a
                sequence must at least hold [CLS] and [SEP].
        """
        # Store the contents of the file in a pandas dataframe.
        # dtype/keep_default_na keep every tweet a string: without them pandas
        # turns empty fields or texts such as "NA"/"null" into float NaN,
        # which the tokenizer cannot process.
        self.df = pd.read_csv(filename, delimiter = '\t', dtype = {'content': str}, keep_default_na = False)

        #Initialize the BERT tokenizer
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        if input_size < 2:
            raise ValueError("input_size must be at least 2 to hold [CLS] and [SEP]")

        # the input length for BERT model. Max length is 512
        self.input_size = min(input_size, 512)

    def __len__(self):
        """Return the number of rows in the TSV file."""
        return len(self.df)

    def __getitem__(self, index):
        """Return ``(token_ids, attention_mask)`` for the row at ``index``.

        Both are 1-D tensors of length ``input_size``; the mask is 1 for real
        tokens and 0 for padding.
        """
        #Selecting the content at the specified index in the data frame
        tweet = self.df.loc[index, 'content']

        # Tokenize the tweet and wrap it in [CLS] ... [SEP]
        tokens = ['[CLS]'] + self.tokenizer.tokenize(tweet) + ['[SEP]']

        missing = self.input_size - len(tokens)
        if missing > 0:
            # Too short: pad up to input_size
            tokens = tokens + ['[PAD]'] * missing
        else:
            # Too long (or exact): keep the first input_size-1 tokens and close with [SEP]
            tokens = tokens[:self.input_size - 1] + ['[SEP]']

        #Converting the tokens to a pytorch ID tensor
        tokens_ids_tensor = torch.tensor(self.tokenizer.convert_tokens_to_ids(tokens))

        # Attention mask: ask the tokenizer for the padding id instead of
        # assuming it is 0.
        attn_mask = (tokens_ids_tensor != self.tokenizer.pad_token_id).long()

        return tokens_ids_tensor, attn_mask

In [None]:
# test for dev set
test_set = TestDataset(filename = dev_for_evaluate, input_size = 512)

# test for testing set (TestDataset takes input_size; it has no maxlen argument)
#test_set = TestDataset(filename = test_tsv, input_size = 512)

# One example per batch, in file order, so predictions line up with test_order
test_loader = DataLoader(test_set, batch_size = 1, num_workers = 0)

In [None]:
def predict(net, test_order, test_loader, weight_file, predict_file):
    """Load weights, classify every example and write the result as JSON.

    Batches are moved to the CUDA device given by the module-level ``gpu``
    variable.

    Args:
        net: Model called as ``net(seq, attn_masks)`` and returning logits.
        test_order: Identifiers of the examples, in the order the loader
            yields them.
        test_loader: Iterable of ``(seq, attn_masks)`` batches (any batch size).
        weight_file: Path to a state dict saved with ``torch.save``.
        predict_file: Path of the JSON file to write, mapping each identifier
            to ``"non-rumour"`` or ``"rumour"``.

    Returns:
        The identifier -> predicted class name dictionary that was written.

    Raises:
        ValueError: If the number of predictions differs from
            ``len(test_order)``; ``zip`` would otherwise silently drop entries.
    """
    # load weight
    net.load_state_dict(torch.load(weight_file))
    # Inference must run in eval mode so that dropout is disabled
    net.eval()

    predictions = []
    classes = ["non-rumour", "rumour"]

    # Predict process
    with torch.no_grad():
        for seq, attn_masks in test_loader:
            seq, attn_masks = seq.cuda(gpu), attn_masks.cuda(gpu)
            logits = net(seq, attn_masks)
            # Flatten to one probability per example so any batch size works
            probs = torch.sigmoid(logits.reshape(-1))
            class_ids = (probs > 0.5).long().cpu().tolist()
            predictions.extend(classes[class_id] for class_id in class_ids)

    if len(predictions) != len(test_order):
        raise ValueError(
            "Got {} predictions for {} identifiers".format(len(predictions), len(test_order))
        )

    # Write into json file
    dictionary = dict(zip(test_order, predictions))

    with open(predict_file, 'w') as result_file:
        json.dump(dictionary, result_file)

    return dictionary

In [None]:
# Checkpoint to load and JSON file to write. A forward slash is used so the
# path resolves on Windows, Linux and macOS alike.
weight_file = "./OR_remove@URL#.dat"
predict_file = "test-output.json"
prediction = predict(net, test_order, test_loader, weight_file, predict_file)

# Other Processing Method for Convert to tsv function

In [None]:
# Strip, in this order: URLs, @username mentions, #hashtags
for _pattern in (
    r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b',  # remove url
    r'@(\w+)?',  # remove @username
    r'#(\w+)?',  # remove hashtag
):
    content = re.sub(_pattern, '', content, flags=re.MULTILINE)

##### Concatenate source and reply tweets

In [None]:
# Join the text of every tweet in the event, appending a full stop to texts
# that do not already end in '.', '?' or '!'.
content = ""
for tweet in event:
    text = tweet["text"]
    # str.endswith() is used instead of text[-1] so an empty text does not raise
    content = content + text + (' ' if text.endswith(('.', '?', '!')) else '. ')

##### Concatenate source and last 10 reply tweets

In [None]:
content = ""
# get the original tweet first
# (str.endswith() is used instead of text[-1] so an empty text does not raise)
source_text = event[0]["text"]
content = content + source_text + (' ' if source_text.endswith(('.', '?', '!')) else '. ')

#get the last 10 replies
if len(event) > 10:
    # process tweet create time in order to sort tweet by time
    for e in event:
        e["processed_time"] = time.strftime('%Y-%m-%d %H:%M:%S',time.strptime(e['created_at'],'%a %b %d %H:%M:%S +0000 %Y'))
    event.sort(key = operator.itemgetter("processed_time"))
    replies = event[-10:]
else:
    # get all replies if there are not enough replies
    replies = event[1:]

# Append each selected reply exactly once (the previous version appended every
# reply a second time in the short-thread branch).
for reply in replies:
    reply_text = reply["text"]
    content = content + reply_text + (' ' if reply_text.endswith(('.', '?', '!')) else '. ')

# Other Processing Method for Convert to tensor function

##### Remove tokens that contain no letters or sentence punctuation (`.`, `?`, `!`)

In [None]:
# Keep only tokens containing at least one ASCII letter or one of '.', '?', '!'
new_tokens = [token for token in tokens if re.search('[a-zA-Z.?!]', token)]

new_tokens = ['[CLS]'] + new_tokens + ['[SEP]']
# The dataset classes store the sequence length as self.input_size; they have
# no self.maxlen attribute, so the old name would raise AttributeError.
if len(new_tokens) < self.input_size:
    new_tokens = new_tokens + ['[PAD]' for _ in range(self.input_size - len(new_tokens))] #Padding token
else:
    new_tokens = new_tokens[:self.input_size-1] + ['[SEP]'] #Pruning the list to be of specified max length

tokens_ids = self.tokenizer.convert_tokens_to_ids(new_tokens) #Obtaining the indices of the tokens in the BERT Vocabulary