In [1]:
import numpy as np
import copy
from tqdm import tqdm
import pandas as pd
import re
import gensim
from score import report_score
from sklearn.metrics import accuracy_score



In [2]:
# Notebook-wide settings.
datadir = "fnc-1"       # directory that holds the dataset CSV files
manual_seed = 47        # handed to the model args as 'manual_seed'
num_train_epochs = 3    # handed to the model args as 'num_train_epochs'

In [3]:
# Read the training and competition-test CSV files from `datadir`.
raw_train_bodies = pd.read_csv(f"{datadir}/train_bodies.csv")
raw_train_stances = pd.read_csv(f"{datadir}/train_stances.csv")
raw_test_bodies = pd.read_csv(f"{datadir}/competition_test_bodies.csv")
raw_test_stances = pd.read_csv(f"{datadir}/competition_test_stances.csv")

# Snapshot of the gold test stance strings used for scoring. Taken as an
# explicit copy so that the later integer encoding written back into
# raw_test_stances['Stance'] can never change these values.
true_test = raw_test_stances['Stance'].copy()

In [4]:
# Integer class id assigned to each stance label.
stance_to_int = {
    "agree": 0,
    "discuss": 1,
    "disagree": 2,
    "unrelated": 3,
}
# Reverse lookup (class id -> stance label), derived from the forward mapping.
int_to_stance = {class_id: stance for stance, class_id in stance_to_int.items()}

In [5]:
# Keep a handle on the test stance column before it is replaced below.
actual_test_stances = raw_test_stances['Stance']

# Replace the stance strings in both frames with their integer class ids.
# A stance that is not a key of stance_to_int raises KeyError.
for stance_frame in (raw_train_stances, raw_test_stances):
    stance_frame['Stance'] = stance_frame['Stance'].apply(stance_to_int.__getitem__)


In [6]:
def _attach_bodies(stances, bodies):
    """Join the columns of `bodies` onto `stances`, matching rows on 'Body ID'."""
    bodies_by_id = bodies.set_index('Body ID')
    return stances.join(bodies_by_id, on='Body ID')


# Each stance row gains the columns of the body it references.
train_df = _attach_bodies(raw_train_stances, raw_train_bodies)
test_df = _attach_bodies(raw_test_stances, raw_test_bodies)

In [7]:
# Duplicate the encoded stance under a 'labels' column
# (presumably the column name the classifier's training API reads; confirm).
train_df = train_df.assign(labels=train_df['Stance'])


In [8]:
# Remove the id and stance columns from both frames.
_dropped_columns = ['Body ID', 'Stance']
train_df = train_df.drop(columns=_dropped_columns)
test_df = test_df.drop(columns=_dropped_columns)


In [9]:
def clean(s):
    """Normalise a string to lowercase word tokens separated by single spaces.

    Every maximal run of word characters (regex ``\\w+``: letters, digits and
    underscore) is kept; everything between such runs is collapsed into one
    space. The joined result is lowercased last.
    """
    word_runs = (match.group(0) for match in re.finditer(r'\w+', s, flags=re.UNICODE))
    joined = " ".join(word_runs)
    return joined.lower()

# Run clean() over both text columns of the train and test frames.
for frame in (train_df, test_df):
    for text_column in ('Headline', 'articleBody'):
        frame[text_column] = frame[text_column].apply(clean)

In [10]:
# Rename the headline/body columns to text_a / text_b in both frames.
_pair_column_names = {'Headline': 'text_a', 'articleBody': 'text_b'}
train_df = train_df.rename(columns=_pair_column_names)
test_df = test_df.rename(columns=_pair_column_names)

In [11]:
# Prediction input: one [text_a, text_b] pair per test row, in row order.
# Both columns are selected at once and converted positionally, so this does
# not rely on test_df having a default 0..n-1 index (a per-row
# `test_df[col][i]` lookup is label-based and would) and avoids a Python-level
# loop over every row.
test_dl = test_df[['text_a', 'text_b']].values.tolist()

In [12]:
from simpletransformers.classification import ClassificationModel

In [13]:
# Options handed to the classifier; seed and epoch count come from the
# settings defined earlier in the notebook.
# NOTE(review): the 'early_stopping' key may not be a recognised option
# (the library appears to document 'use_early_stopping'); confirm it has any effect.
bert_args = {
    'num_train_epochs': num_train_epochs,
    'manual_seed': manual_seed,
    'max_seq_length': 256,
    'output_dir': "outputs/bert",
    'overwrite_output_dir': True,
    'save_steps': -1,
    'early_stopping': True,
}

# 4-label BERT classifier initialised from 'bert-base-cased', with CUDA requested.
model = ClassificationModel(
    'bert',
    'bert-base-cased',
    num_labels=4,
    args=bert_args,
    use_cuda=True,
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [14]:
# Training is disabled here; uncomment the next line to train on train_df.
#model.train_model(train_df)

# Replace `model` with one loaded from a saved checkpoint directory.
checkpoint_dir = "outputs/bert/checkpoint-18741-epoch-3"
model = ClassificationModel("bert", checkpoint_dir)

In [15]:
# predict() returns a pair; only its first element (the per-row predictions)
# is kept, the second is discarded.
preds = model.predict(test_dl)[0]

  0%|          | 0/25413 [00:00<?, ?it/s]

  0%|          | 0/3177 [00:00<?, ?it/s]

In [16]:
# Translate each predicted class id back into its stance label.
outputs = []
for class_id in preds:
    outputs.append(int_to_stance[int(class_id)])

In [17]:
# Score the predicted stance labels against the gold test stances.
# As the cell output shows, this prints a confusion matrix and a
# "Score: ... out of ..." line; the returned value (displayed as the cell
# result) matches the percentage on that line.
report_score(true_test, outputs)

-------------------------------------------------------------
|           |   agree   | disagree  |  discuss  | unrelated |
-------------------------------------------------------------
|   agree   |   1351    |    113    |    356    |    83     |
-------------------------------------------------------------
| disagree  |    231    |    268    |    119    |    79     |
-------------------------------------------------------------
|  discuss  |    807    |    208    |   3250    |    199    |
-------------------------------------------------------------
| unrelated |    34     |    10     |    118    |   18187   |
-------------------------------------------------------------
Score: 9874.25 out of 11651.25	(84.74841755176483%)


84.74841755176483

In [36]:
# Find Relatedness Accuracy and Opinion Accuracy
#   Relatedness: gold and prediction agree on unrelated vs. not unrelated.
#   Opinion: exact class match, counted only over rows whose gold stance is
#   not "unrelated".
# stance_to_int = {"agree":0, "discuss": 1, "disagree": 2, "unrelated": 3}
true_test_labels = true_test.apply(lambda x: stance_to_int[x])

# Compare as plain arrays so rows are matched by position rather than by
# index label (a `true_test_labels[i]` lookup depends on a 0..n-1 index).
gold_ids = np.asarray(true_test_labels)
pred_ids = np.asarray(preds)
if len(gold_ids) != len(pred_ids):
    # A Python loop over the gold labels would silently ignore extra predictions.
    raise ValueError(
        f"Expected {len(gold_ids)} predictions, got {len(pred_ids)}"
    )

unrelated_id = stance_to_int["unrelated"]
gold_related = gold_ids != unrelated_id
pred_related = pred_ids != unrelated_id

relatedness_correct = int(np.sum(gold_related == pred_related))
opinion_count = int(np.sum(gold_related))
opinion_correct = int(np.sum(gold_related & (gold_ids == pred_ids)))

# Report nan rather than raising ZeroDivisionError when a denominator is empty.
relatedness_accuracy = (
    relatedness_correct / len(gold_ids) if len(gold_ids) else float("nan")
)
opinion_accuracy = opinion_correct / opinion_count if opinion_count else float("nan")

print(f"Relatedness Accuracy is {relatedness_accuracy}")
print(f"Opinion Accuracy is {opinion_accuracy}")


Relatedness Accuracy is 0.979419981899028
Opinion Accuracy is 0.6892695356738392
