In [1]:
import json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from pandas import DataFrame
import torch
from sklearn.linear_model import LogisticRegression

# Models
import torch.nn as nn
from transformers import BertModel, BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup, AutoTokenizer, AutoModelForSequenceClassification, AutoModel

# Training
import torch.optim as optim

# Evaluation
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')


In [2]:
# Task 1
# Training Data
train_data_jsonl = open('train.data.jsonl', 'r')
train_label = open('train.label.json', 'r')

# Dev Data
dev_data_jsonl = open('dev.data.jsonl', 'r')
dev_label = open('dev.label.json', 'r')

# Test Data
test_data_jsonl = open('test.data.jsonl', 'r')


# Task 2 Covid Data
covid_data_jsonl = open('covid.data.jsonl', 'r')

In [3]:
# Tweet ID (Key)
# USer ID
# Follower Count
# Text
# Time Posted
# Parent Tweets ID (If Any)
# Child tweets ID (If any)


# Find BERT tokenize max size
bert_token_list = []

tweet_dict = {}

train_label_json = json.load(train_label)
dev_label_json = json.load(dev_label)


# Make label 1 or 0
# 0 for non-rumour
def binary_label(label):
    if label == 'non-rumour':
        return 0
    else:
        return 1

def jsonl_to_list(jsonl, labels):
    out_list = []
    
    for line in jsonl:
        data = json.loads(line)

        for tweet_data in data:
            
            tweet_id = tweet_data['id']
            user_id = tweet_data['user']['id']
            follower_count = tweet_data['user']['followers_count']
            
            
            text = tweet_data['text']            
            time = tweet_data['created_at']
            parent = tweet_data['in_reply_to_status_id']


            tweet_dict[tweet_id] = {'user_id' : user_id,
                                   'follower_count' : follower_count,
                                   'text' : text,
                                   'time': time,
                                   'parent': parent}
            
            
            # get label and conver to 0 or 1
            
            try:
                label = binary_label(labels[str(tweet_id)])
                out_list.append([tweet_id, text, time, parent, label, follower_count, user_id])
            except KeyError:
                continue
        
    return out_list



# test data
def no_label_json(jsonl):
    output = []
    for line in jsonl:

        data = json.loads(line)
        
        for tweet_data in data:

            tweet_id = tweet_data['id']
            user_id = tweet_data['user']['id']
            follower_count = tweet_data['user']['followers_count']
            text = tweet_data['text']
            time = tweet_data['created_at']
            parent = tweet_data['in_reply_to_status_id']


            tweet_dict[tweet_id] = {'user_id' : user_id,
                                   'follower_count' : follower_count,
                                   'text' : text,
                                   'time': time,
                                   'parent': parent}

            output.append([tweet_id, text, time, parent, follower_count, user_id])
    return output
    
train_list = jsonl_to_list(train_data_jsonl, train_label_json)
dev_list = jsonl_to_list(dev_data_jsonl, dev_label_json)



test_list = no_label_json(test_data_jsonl)
covid_list = no_label_json(covid_data_jsonl)
    


In [4]:
# Make Dataframes

train_df = DataFrame(train_list, columns=['tweet_id', 'text', 'time', 'parent', 'label', 'follower_count', 'user_id'])
dev_df = DataFrame(dev_list, columns=['tweet_id', 'text', 'time', 'parent', 'label', 'follower_count', 'user_id'])

test_df = DataFrame(test_list, columns=['tweet_id', 'text', 'time', 'parent', 'follower_count', 'user_id'])

# Task 2
covid_df = DataFrame(covid_list, columns=['tweet_id', 'text', 'time', 'parent', 'follower_count', 'user_id'])


# Concat train and dev df for final test submission
merged_df = pd.concat([train_df, dev_df])



In [5]:
# ax = sns.countplot(train_df.label)
# plt.xlabel('Tweet Type')
# ax.set_xticklabels(['non-rumour', 'rumour'])

In [6]:
# ax = sns.countplot(dev_df.label)
# plt.xlabel('Tweet Type')
# ax.set_xticklabels(['non-rumour', 'rumour'])

## Adjust Hyperparameters Below

In [7]:
# 79 Used as max length for tokenised input, tested over train dev test.
MAX_LEN = 79
BATCH_SIZE = 16
EPOCHS = 10

#     Batch size: 16
#     Learning rate (Adam): 3e-5, 2e-5
#     Number of epochs: 10

WEIGHT_DECAY = 0.0
LEARNING_RATE = 1e-5
WARMUP_STEPS = 0


# Load Tokenizer and BERT Model

# MODEL_NAME = 'bert-base-cased'
MODEL_NAME = "bert-base-cased-finetuned-mrpc"

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
bert_model = BertModel.from_pretrained(MODEL_NAME)





In [8]:
from torch.utils.data import Dataset, DataLoader

# Create new dataset

# reviews = text
# Targets = labels

class rumour_dataset(Dataset):
    def __init__(self, tweet_id, texts, labels, tokenizer, max_len):
        self.tweet_id = tweet_id
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, item):
        tweet_id = self.tweet_id[item]
        text = str(self.texts[item])
        label = self.labels[item]
        
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'tweet_id' : tweet_id,
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

class test_rumour_dataset(Dataset):
    def __init__(self, tweet_id, texts, tokenizer, max_len):
        self.tweet_id = tweet_id
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, item):
        tweet_id = self.tweet_id[item]
        text = str(self.texts[item])
        
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'tweet_id' : tweet_id,
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
        }
    
# Create data loader for BERT

def create_data_loader(df, tokenizer, max_len, batch_size):
    ds = rumour_dataset(
        tweet_id = df.tweet_id.to_numpy(),
        texts=df.text.to_numpy(),
        labels=df.label.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
        )

    return DataLoader(
        ds,
        batch_size=batch_size,
        # Set to 0 as jupyter + windows has a bug
        num_workers=0
  )

def create_test_data_loader(df, tokenizer, max_len, batch_size):
    ds = test_rumour_dataset(
        tweet_id = df.tweet_id.to_numpy(),
        texts = df.text.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len)
    
    return DataLoader(
        ds,
        batch_size,
        num_workers=0)

In [9]:
# Load Data
train_data_loader = create_data_loader(train_df, tokenizer, MAX_LEN, BATCH_SIZE)
dev_data_loader = create_data_loader(dev_df, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_test_data_loader(test_df, tokenizer, MAX_LEN, BATCH_SIZE)
covid_data_loader = create_test_data_loader(covid_df, tokenizer, MAX_LEN, BATCH_SIZE)

merged_data_loader = create_data_loader(merged_df, tokenizer, MAX_LEN, BATCH_SIZE)

In [10]:
class SentimentClassifier(nn.Module):

    def __init__(self, n_classes):
        super(SentimentClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(MODEL_NAME)
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
  
    def forward(self, input_ids, attention_mask):
        _, pooled_output = self.bert(input_ids=input_ids, attention_mask=attention_mask
        )
        output = self.drop(pooled_output)
        return self.out(output)

#   def forward(self, input_ids, attention_mask):
#     _, outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
# #     cont_reps = outputs.last_hidden_state
# #     cls_rep = outputs[:, 0]
    
#     return self.out(outputs)

In [11]:
model = SentimentClassifier(2)
model = model.to(device)

In [12]:
total_steps = len(train_data_loader) * EPOCHS
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY,correct_bias=False)


scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=WARMUP_STEPS,
    num_training_steps=total_steps
    )

loss_fn = nn.CrossEntropyLoss().to(device)

In [13]:
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    model = model.train()

    losses = []
    correct_predictions = 0

    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        labels = d["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, labels)

        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    return correct_predictions.double() / n_examples, np.mean(losses)

In [14]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
    model = model.eval()

    losses = []
    correct_predictions = 0

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            labels = d["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)

            loss = loss_fn(outputs, labels)

            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())

        return correct_predictions.double() / n_examples, np.mean(losses)

In [15]:
from collections import defaultdict
history = defaultdict(list)
best_accuracy = 0

for epoch in range(EPOCHS):

    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

#     train_acc, train_loss = train_epoch(model,train_data_loader, loss_fn, optimizer, device, scheduler, len(train_df))
    train_acc, train_loss = train_epoch(model,merged_data_loader, loss_fn, optimizer, device, scheduler, len(merged_df))

    print(f'Train loss {train_loss} accuracy {train_acc}')

    val_acc, val_loss = eval_model(model,dev_data_loader,loss_fn, device, len(dev_df))

    print(f'Val   loss {val_loss} accuracy {val_acc}')
    print()

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)

    if val_acc > best_accuracy:
        torch.save(model.state_dict(), 'best_model_state.bin')
        best_accuracy = val_acc

Epoch 1/10
----------
Train loss 0.4112663662761723 accuracy 0.8151695077571346
Val   loss 0.20364184284935127 accuracy 0.939655172413793

Epoch 2/10
----------
Train loss 0.22225354955352228 accuracy 0.9182149013598927
Val   loss 0.13379080258813258 accuracy 0.9603448275862069

Epoch 3/10
----------
Train loss 0.14250720494045144 accuracy 0.9549894656196131
Val   loss 0.0729424045735819 accuracy 0.9775862068965517

Epoch 4/10
----------
Train loss 0.08498059794152246 accuracy 0.9772074315265274
Val   loss 0.024006012040567962 accuracy 0.996551724137931

Epoch 5/10
----------
Train loss 0.050257073061249505 accuracy 0.986784140969163
Val   loss 0.017028390695781423 accuracy 0.996551724137931

Epoch 6/10
----------
Train loss 0.03625068647065342 accuracy 0.9902317563685118
Val   loss 0.011513537841351904 accuracy 0.996551724137931

Epoch 7/10
----------
Train loss 0.025433737720364266 accuracy 0.9936793717678606
Val   loss 0.004185058232365377 accuracy 0.9982758620689655

Epoch 8/10
---

In [16]:
# plt.plot(history['train_acc'], label='train accuracy')

# plt.plot(history['val_acc'], label='validation accuracy')

# plt.title('Training history')

# plt.ylabel('Accuracy')

# plt.xlabel('Epoch')

# plt.legend()

# plt.ylim([0, 1]);

In [17]:
# Prediction Function

def get_predictions(model, data_loader):
    model = model.eval()
    tweet_ids = []
    out_texts = []
    predictions = []
    prediction_probs = []

    with torch.no_grad():
        for d in data_loader:
            tweet_id = d["tweet_id"]
            texts = d["text"]
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
                )

            _, preds = torch.max(outputs, dim=1)

            tweet_ids.extend(tweet_id)
            out_texts.extend(texts)
            predictions.extend(preds)
            prediction_probs.extend(outputs)

#     predictions = torch.stack(predictions).cpu()
#     prediction_probs = torch.stack(prediction_probs).cpu()
    predictions = torch.stack(predictions).to(device)
    prediction_probs = torch.stack(prediction_probs).to(device)

    return tweet_ids, texts, predictions, prediction_probs

In [18]:
# Dev Predictions

# y_id, y_texts, y_pred, y_pred_probs = get_predictions(
#   model,
#   dev_data_loader
# )

# Test Prediction

y_id, y_texts, y_pred, y_pred_probs = get_predictions(
  model,
  test_data_loader
)



# Covid Prediction

# y_id, y_texts, y_pred, y_pred_probs = get_predictions(
#   model,
#   covid_data_loader
# )

In [19]:
def class_label(num):
    if num == 0:
        return 'non-rumour'
    else:
        return 'rumour'

    
y_pred_list = y_pred.tolist()


output_dict = {}
for i, tensor_id in enumerate(y_id):
    tweet_id = str(tensor_id.item())
    label = class_label(y_pred_list[i])
    
    output_dict[tweet_id] = label


In [20]:
output_dict

{'544382249178001408': 'rumour',
 '544383045436846080': 'non-rumour',
 '544389328773054464': 'non-rumour',
 '544395918842531840': 'non-rumour',
 '544410984647892992': 'rumour',
 '544383661106143232': 'non-rumour',
 '544383791612305408': 'non-rumour',
 '544382614296346624': 'non-rumour',
 '544450382945325056': 'rumour',
 '544382568674516992': 'non-rumour',
 '544383159706451968': 'rumour',
 '544382309256790016': 'rumour',
 '544384184756625408': 'non-rumour',
 '544390989566066689': 'non-rumour',
 '544382586613555201': 'non-rumour',
 '544388862177312768': 'non-rumour',
 '544392164482883584': 'non-rumour',
 '525027317551079424': 'rumour',
 '525049051692429313': 'non-rumour',
 '525049750295281666': 'non-rumour',
 '525029610682335232': 'non-rumour',
 '525047905946963968': 'rumour',
 '525048587970170880': 'rumour',
 '525028037709291520': 'non-rumour',
 '525032398967570432': 'non-rumour',
 '525161942550577152': 'non-rumour',
 '525034200991551488': 'non-rumour',
 '525028710223982593': 'non-rumou

In [21]:
# # Save results
# with open("test-output.json", "w") as outfile:
#     json.dump(output_dict, outfile)

In [22]:


# For Dev set evaluation

# dev_pred_list = []
# dev_true_list = []

# for key in output_dict.keys():
#     true_label = int(dev_df[dev_df['tweet_id'] == int(key)]['label'])
#     pred_label = output_dict[key]
    
#     dev_pred_list.append(binary_label(pred_label))
#     dev_true_list.append(true_label)
    
    
# from sklearn.metrics import f1_score, precision_score, recall_score

# def print_metrics(true, pred):
#     print("precision:", precision_score(true, pred))
#     print("recall:", recall_score(true, pred))
#     print("f1:", f1_score(true, pred))
          
# print_metrics(dev_true_list, dev_pred_list)
    

In [23]:
# Export Covid Predictions to csv

# covid_pred_df = pd.DataFrame.from_dict(output_dict.items(), orient='index', columns=['tweet_id', 'label'])
# covid_pred_df = pd.DataFrame(output_dict.items(), columns=['tweet_id', 'label'])
# covid_pred_df_2 = covid_pred_df.copy()
# covid_pred_df_2['tweet_id'] = covid_pred_df['tweet_id'].astype('int64')
# covid_pred_df_2
# covid = pd.merge(covid_df, covid_pred_df_2, on='tweet_id')
# covid.to_csv('covid_predictions_dataframe.csv', index = None, header=True) 