In [159]:
from transformers import BertTokenizer, BertForSequenceClassification, BertConfig
from torch.utils.data import Dataset, DataLoader, TensorDataset
from sklearn.metrics import precision_recall_fscore_support
import torch.optim as optim
import torch
import pandas as pd
import chardet
import numpy as np
import torch.nn as nn

In [160]:
def detect_encoding(file_path: str) -> str:
    """
    Detects the encoding of a file
    """
    with open(file_path, 'rb') as f:
        result = chardet.detect(f.read())

    return result['encoding']

def preprocess_data(data: pd.DataFrame, target: str) -> pd.DataFrame:
    """
    Preprocesses the data

    Args:
        data: The dataframe to preprocess
        target: The target to extract from the dataset

    Returns:
        The preprocessed dataframe
    """
    # Deep copy the dataframe
    data = data.copy()

    # Remove targets we do not care about
    data = data.loc[data['Target'] == target]

    # Remove '#SemSt' from the 'Tweet' column
    data['Tweet'] = data['Tweet'].str.replace('#SemST', '')

    # Make tweets lowercase
    data['Tweet'] = data['Tweet'].str.lower()

    return data

def print_stance_statistics(data: pd.DataFrame) -> None:
    """
    Prints the number of tweets in each stance
    """
    stance_counts = data['Stance'].value_counts()
    print(stance_counts)

def load_sem_eval_data(target: str) -> (pd.DataFrame, pd.DataFrame):
    """
    Loads the SemEval 2016 dataset and extracts rows that contain the target

    Args:
        target: The target (e.g., "Climate Change is a Real Concern") to extract from the dataset

    Returns:
        Two pandas dataframes containing the training and test datasets
    """

    # Load training dataset
    train_data_path = '/content/drive/MyDrive/semeval2016-task6-trainingdata.txt'
    train_data = pd.read_csv(train_data_path, sep='\t', encoding=detect_encoding(train_data_path))

    # Load test dataset
    test_data_path = '/content/drive/MyDrive/SemEval2016-Task6-subtaskA-testdata-gold.txt'
    test_data = pd.read_csv(test_data_path, sep='\t', encoding=detect_encoding(test_data_path))

    # Preprocess training and test data
    train_data = preprocess_data(train_data, target)
    test_data = preprocess_data(test_data, target)

    return train_data, test_data

def split_data(data: pd.DataFrame) -> (pd.DataFrame, pd.DataFrame):
    """
    Splits the SemEval-2016 data into training and test data

    Args:
        data: The dataframe with SemEval-2016 data

    Returns:
        Two pandas dataframes containing the training and test datasets
    """
    return (data['Tweet'], data['Stance'])


In [161]:
def split_reddit(data: pd.DataFrame) -> (pd.DataFrame, pd.DataFrame):
    """
    Splits the SemEval-2016 data into training and test data

    Args:
        data: The dataframe with SemEval-2016 data

    Returns:
        Two pandas dataframes containing the training and test datasets
    """
    return (data['Text'], data['Stance'])

In [162]:
from sklearn.metrics import f1_score

def calculate_score(y_test, y_pred) -> float:
    """
    Calculates the evaluation metric for the SemEval 2016 Task 6, Subtask A
    which is the macro-average of the f1-score for "FAVOR" and the
    f1-score for "AGAINST", ignoring the "NONE" class.

    Args:
        y_test: The true labels
        y_pred: The predicted labels

    Returns:
        The calculated F1 score
    """
    mask = (y_test != 'NONE') # Remove "NONE" class tweets
    y_test_filtered = y_test[mask]
    y_pred_filtered = y_pred[mask]

    f1 = f1_score(y_test_filtered, y_pred_filtered, average='macro')
    return f1

In [163]:
def get_optimizer(net, lr, weight_decay):
    """
    FROM HOMEWORK 3
    Return the optimizer (Adam) you will use to train the model.

    Input:
        - net: model
        - lr: initial learning_rate
        - weight_decay: weight_decay in optimizer
    """
    return optim.Adam(params=net.parameters(), lr=lr, weight_decay=weight_decay)

In [164]:
def get_label_tensor(labels):
    """
    Convert a list of labels to a tensor

    Input:
        - labels: list of labels
        - label_mapping: dictionary mapping label to index

    Output:
        - label_tensor: tensor of labels
    """
    label_mapping = {'FAVOR': 2, 'NONE':1, 'AGAINST': 0}

    label_nums = labels.map(label_mapping)

    return torch.tensor(label_nums.values)

https://medium.com/gumgum-tech/handling-class-imbalance-by-introducing-sample-weighting-in-the-loss-function-3bdebd8203b4

In [165]:
def define_loss_function(weights):
    """
    Return loss fuction. Use class weights to fix lopsided training data
    """
    class_weights = torch.tensor(weights, dtype=torch.float32).to(device)
    return nn.CrossEntropyLoss(weight = class_weights)

In [166]:
def get_device():
    """
    Return the device you will use for training/testing.
    """
    return torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [198]:
def create_loader(X: pd.DataFrame, y: pd.DataFrame, tokenizer, batch_size: int, max_length=512) -> DataLoader:
    """
    Creates the data loader for SemEval data

    Args:
        X: The input data, a list of tweets
        y: The labels, a list of stances
        batch_size: The batch size
    """
    X = X.tolist()
    tokenized_inputs = tokenizer(
        X,
        padding="max_length",
        max_length=max_length,
        truncation=True,
        return_tensors="pt"
    )
    data = TensorDataset(
        tokenized_inputs["input_ids"],
        tokenized_inputs["attention_mask"],
        get_label_tensor(y)
    )

    # X_ids = tokenizer.batch_encode_plus(X,
    #                                     padding=True,
    #                                     return_tensors='pt')

    # y_tensors = get_label_tensor(y)

    # data = TensorDataset(X_ids['input_ids'], X_ids['attention_mask'], y_tensors)

    return DataLoader(data, batch_size=batch_size)

In [199]:
target = 'Climate Change is a Real Concern'

train_data, test_data = load_sem_eval_data(target)



In [200]:
def train_model(model, train_loader, optimizer, device, loss_function, num_epochs):
    num_itr = 0

    model.to(device)
    model.train()

    for epoch in range(num_epochs):
        model.train()
        for batch in train_loader:
            num_itr += 1
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            labels = batch[2].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = loss_function(outputs.logits, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print('Epoch No. {0}--Iteration No. {1}-- batch loss = {2:.4f}'.format(
            epoch + 1,
            num_itr,
            loss.item()
            ))

    return model

In [201]:
# Split data into X and y
X_train, y_train = split_data(train_data)
X_test, y_test = split_data(test_data)

# Tokenize to BERT format
model_name = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=True, padding="max_length")

max_length = 512
# Create data loaders
train_loader = create_loader(X_train, y_train, tokenizer, batch_size=16, max_length=max_length)
test_loader = create_loader(X_test, y_test, tokenizer, batch_size=16, max_length=max_length)

# Create BERT model
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=3)

# Fine Tune BERT model
optimizer = get_optimizer(model, lr=5e-5, weight_decay=0)
device = get_device()

weights = [5000, 1, 1]
loss_function = define_loss_function(weights)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [202]:

model = train_model(model, train_loader, optimizer, device, loss_function, num_epochs=15)

Epoch No. 1--Iteration No. 25-- batch loss = 1.0300
Epoch No. 2--Iteration No. 50-- batch loss = 0.9110
Epoch No. 3--Iteration No. 75-- batch loss = 0.7470
Epoch No. 4--Iteration No. 100-- batch loss = 0.9460
Epoch No. 5--Iteration No. 125-- batch loss = 0.9921
Epoch No. 6--Iteration No. 150-- batch loss = 0.7886
Epoch No. 7--Iteration No. 175-- batch loss = 0.6003
Epoch No. 8--Iteration No. 200-- batch loss = 0.0750
Epoch No. 9--Iteration No. 225-- batch loss = 0.2361
Epoch No. 10--Iteration No. 250-- batch loss = 0.0327
Epoch No. 11--Iteration No. 275-- batch loss = 0.0211
Epoch No. 12--Iteration No. 300-- batch loss = 0.0134
Epoch No. 13--Iteration No. 325-- batch loss = 0.0100
Epoch No. 14--Iteration No. 350-- batch loss = 0.0077
Epoch No. 15--Iteration No. 375-- batch loss = 0.0084


In [203]:
def evaluate_model(model, test_loader, device):
    model.eval()
    all_preds = []
    all_labels = []
    correct_predictions = 0
    total_predictions = 0

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            labels = batch[2].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            # Get the predicted labels
            _, preds = torch.max(outputs.logits, dim=1)

            # Count the number of correct predictions, ignore the "NONE" class
            #mask = (labels != 1)
            correct_predictions += torch.sum(preds == labels)
            total_predictions += torch.sum(preds)
            preds = preds
            labels = labels

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Calculate precision, recall, and F1 score for each class
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average=None)

    # Calculate average F1 score
    avg_f1 = sum(f1) / len(f1)

    # Print or return the results
    print('Precision (favor): {:.4f}'.format(precision[0]))
    print('Recall (favor): {:.4f}'.format(recall[0]))
    print('F1 Score (favor): {:.4f}'.format(f1[0]))

    print('Precision (neutral): {:.4f}'.format(precision[1]))
    print('Recall (neutral): {:.4f}'.format(recall[1]))
    print('F1 Score (neutral): {:.4f}'.format(f1[1]))

    print('Precision (against): {:.4f}'.format(precision[2]))
    print('Recall (against): {:.4f}'.format(recall[2]))
    print('F1 Score (against): {:.4f}'.format(f1[2]))

    print('Average F1 Score: {:.4f}'.format(avg_f1))

    # Calculate the accuracy
    print(correct_predictions, total_predictions)
    accuracy = correct_predictions.double() / total_predictions.double()

    print('Test Accuracy: {:.4f}'.format(accuracy))

In [197]:
print('Twitter predictions (Control)')
evaluate_model(model, test_loader, device)

Precision (favor): 1.0000
Recall (favor): 0.0909
F1 Score (favor): 0.1667
Precision (neutral): 0.7353
Recall (neutral): 0.7143
F1 Score (neutral): 0.7246
Precision (against): 0.8582
Recall (against): 0.9350
F1 Score (against): 0.8949
Average F1 Score: 0.5954
tensor(141, device='cuda:0') tensor(302, device='cuda:0')
Test Accuracy: 0.4669


In [234]:
def load_reddit_data(tokenizer):
    reddit_body_path = '/content/drive/MyDrive/reddit_body_data.txt'
    body_test = pd.read_csv(reddit_body_path, sep='\t', encoding=detect_encoding(reddit_body_path))

    # Load test dataset
    reddit_title_path = '/content/drive/MyDrive/reddit_title_data.txt'
    title_test = pd.read_csv(reddit_title_path, sep='\t', encoding=detect_encoding(reddit_title_path))

    # Preprocess training and test data
    body_data = body_test.copy()
    title_data = title_test.copy()

    body_data['Text'] = body_data['Text'].str.lower()
    title_data['Text'] = title_data['Text'].str.lower()
    body_data['Text'] = body_data['Text'].fillna('')
    title_data['Text'] = title_data['Text'].fillna('')
    body_data['Text'] = body_data['Text'].apply(lambda x: ' '.join(tokenizer.tokenize(x)[:512]))
    title_data['Text'] = title_data['Text'].apply(lambda x: ' '.join(tokenizer.tokenize(x)[:512]))

    label_mapping = {0: 'AGAINST', 1: 'NONE', 2: 'FAVOR'}

    # Replace values in the specified column
    body_data['Stance'] = body_data['Stance'].replace(label_mapping)
    title_data['Stance'] = title_data['Stance'].replace(label_mapping)


    return body_data, title_data


In [240]:
def evaluate_reddit(model, loader, device):

    model.eval()
    all_preds = []
    all_labels = []
    correct_predictions = 0
    total_predictions = 0

    with torch.no_grad():
        for batch in loader:
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            labels = batch[2].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            # Get the predicted labels
            _, preds = torch.max(outputs.logits, dim=1)

            # i think don't do this -> Count the number of correct predictions, ignore the "NONE" class
            #mask = (labels != 1)
            correct_predictions += torch.sum(preds == labels)
            total_predictions += torch.sum(preds)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    print(all_preds)
    # Calculate precision, recall, and F1 score for each class
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average=None)

    # Calculate average F1 score
    avg_f1 = sum(f1) / len(f1)

    # Print or return the results
    print('Precision (favor): {:.4f}'.format(precision[0]))
    print('Recall (favor): {:.4f}'.format(recall[0]))
    print('F1 Score (favor): {:.4f}'.format(f1[0]))

    print('Precision (neutral): {:.4f}'.format(precision[1]))
    print('Recall (neutral): {:.4f}'.format(recall[1]))
    print('F1 Score (neutral): {:.4f}'.format(f1[1]))

    print('Precision (against): {:.4f}'.format(precision[2]))
    print('Recall (against): {:.4f}'.format(recall[2]))
    print('F1 Score (against): {:.4f}'.format(f1[2]))

    print('Average F1 Score: {:.4f}'.format(avg_f1))

    # Calculate the accuracy
    print(correct_predictions, total_predictions)
    accuracy = correct_predictions.double() / total_predictions.double()

    print('Test Accuracy: {:.4f}'.format(accuracy))


In [241]:
body_data, title_data = load_reddit_data(tokenizer)
x_body, y_body = split_reddit(body_data)
x_title, y_title = split_reddit(title_data)
body_loader = create_loader(x_body, y_body, tokenizer, batch_size=16, max_length=max_length)
title_loader = create_loader(x_title, y_title, tokenizer, batch_size=16, max_length=max_length)

In [242]:
print('Reddit predictions (Body Only)')
evaluate_reddit(model, body_loader, device)

[2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 0, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2]
Precision (favor): 0.0000
Recall (favor): 0.0000
F1 Score (favor): 0.0000
Precision (neutral): 0.0588
Recall (neutral): 0.0769
F1 Sc

In [238]:
print('Reddit predictions (Title Only)')
evaluate_reddit(model, title_loader, device)

Precision (favor): 0.1875
Recall (favor): 0.2903
F1 Score (favor): 0.2278
Precision (neutral): 0.0870
Recall (neutral): 0.1538
F1 Score (neutral): 0.1111
Precision (against): 0.8761
Recall (against): 0.7796
F1 Score (against): 0.8251
Average F1 Score: 0.3880
tensor(202, device='cuda:0') tensor(459, device='cuda:0')
Test Accuracy: 0.4401


In [239]:
both_data_text = title_data['Text'] + ' ' + body_data['Text']

# Create a new DataFrame 'both_data'
both_data = pd.DataFrame({'Text': both_data_text, 'Stance': title_data['Stance']})
x_both, y_both = split_reddit(both_data)
both_loader = create_loader(x_both, y_both, tokenizer, batch_size=16, max_length=max_length)
print('Reddit predictions (Title and Body)')
evaluate_reddit(model, both_loader, device)

Precision (favor): 0.0000
Recall (favor): 0.0000
F1 Score (favor): 0.0000
Precision (neutral): 1.0000
Recall (neutral): 0.0769
F1 Score (neutral): 0.1429
Precision (against): 0.8497
Recall (against): 0.9918
F1 Score (against): 0.9153
Average F1 Score: 0.3527
tensor(244, device='cuda:0') tensor(573, device='cuda:0')
Test Accuracy: 0.4258
