# Bias Model for FEVER

In [1]:
import json
from random import random
import os

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

import sys
sys.path.append("../")
from my_package.models.traditional.classifier import Classifier
from my_package.utils.handcrafted_features.counter import count_negations
from my_package.utils.handcrafted_features.overlap import get_lexical_overlap, get_entities_overlap



## Configurations

In [2]:
# Input files (JSON lines, one example per line).
TRAIN_DATA_FILE = "../data/fact_verification/fever.train.jsonl"
DEV_DATA_FILE = "../data/fact_verification/fever.dev.jsonl"
TEST_DATA_FILE = "../data/fact_verification/fever_symmetric_generated.jsonl"

# Key under which the computed weight is stored in each output record,
# and the file that receives the re-weighted training examples.
WEIGHT_KEY = "sample_weight"
OUTPUT_TRAIN_DATA_FILE = "../data/fact_verification/weighted_fever.train.jsonl"

# Default JSON keys for the two sentences and the label in the train/dev files.
DOC1_KEY = "claim"
DOC2_KEY = "evidence"
LABEL_KEY = "gold_label"

# Label set handed to the classifier, and the class whose predicted
# probability drives the sample weights.
POSSIBLE_LABELS = ("SUPPORTS", "REFUTES", "NOT ENOUGH INFO")
BIAS_CLASS = "REFUTES"

MAX_SAMPLE = -1 # cap on examples kept per read_data() call; -1 disables the cap (e.g. 2000 for a quick run)
# Per-line drop threshold passed to read_data() for the training load.
DROP_RATE = 0.0
# Fraction of the loaded training data held out by train_test_split.
TEST_FRAC = 0.2

# Cap on the number of training records written to the weighted output file; -1 disables the cap.
MAX_TEST_SAMPLE = -1

## Dataset

In [3]:
def read_data(
    file: str = TRAIN_DATA_FILE,
    sent1_key: str = DOC1_KEY,
    sent2_key: str = DOC2_KEY,
    label_key: str = LABEL_KEY,
    drop_rate: float = 0.0
):
    """Load sentence pairs and their labels from a JSON-lines file.

    Each non-blank line is parsed as one JSON object. A line is randomly
    skipped with probability ``drop_rate``; reading stops early once
    ``MAX_SAMPLE`` examples have been kept (``MAX_SAMPLE == -1`` disables
    the cap).

    Args:
        file: Path to the JSON-lines file.
        sent1_key: Key of the first sentence in each record.
        sent2_key: Key of the second sentence in each record.
        label_key: Key of the label in each record.
        drop_rate: Probability in [0, 1] of skipping a line. ``0.0`` keeps
            every line.

    Returns:
        A tuple ``(docs, labels)`` where ``docs`` is a list of
        ``[sentence1, sentence2]`` pairs and ``labels`` the matching labels.

    Raises:
        KeyError: If a record lacks one of the requested keys.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    docs = []
    labels = []

    # Explicit UTF-8 so decoding does not depend on the platform default.
    with open(file, 'r', encoding="utf-8") as fh:
        for line in fh:
            # Blank lines (e.g. a trailing newline at the end of the file)
            # are not records and would otherwise crash json.loads.
            if not line.strip():
                continue
            # random() is in [0, 1), so `< drop_rate` never drops a line
            # when drop_rate is 0.0 and always drops when it is 1.0.
            if random() < drop_rate:
                continue

            datapoint = json.loads(line)
            docs.append([datapoint[sent1_key], datapoint[sent2_key]])
            labels.append(datapoint[label_key])

            if MAX_SAMPLE != -1 and len(docs) == MAX_SAMPLE:
                break
    print("# samples: ", len(docs))
    return docs, labels

In [4]:
# Load sentence pairs and labels from the default (training) file, dropping lines at DROP_RATE.
docs, labels = read_data(drop_rate=DROP_RATE)

# samples:  2000


In [5]:
# Peek at the first two [sentence1, sentence2] pairs.
docs[:2]

[['Roman Atwood is a content creator .',
  'He is best known for his vlogs , where he posts updates about his life on a daily basis .'],
 ['Roman Atwood is a content creator .',
  "He also has another YouTube channel called `` RomanAtwood '' , where he posts pranks ."]]

In [6]:
# Peek at the labels of the first two pairs.
labels[:2]

['SUPPORTS', 'SUPPORTS']

In [7]:
# Stratified hold-out split of the loaded data; the fixed random_state keeps
# the partition identical across runs.
docs_train, docs_test, labels_train, labels_test = train_test_split(
    docs,
    labels,
    test_size=TEST_FRAC,
    random_state=42,
    stratify=labels,
)

## Bias Classifier

In [8]:
# Hand-crafted feature functions handed to the Classifier.
# The two lambdas take a sentence pair (s1, s2) and count negations in one
# side only; presumably the overlap helpers accept the same (s1, s2) pair —
# TODO confirm against my_package.
feature_extractors = [
    lambda s1, s2: count_negations(s1),
    lambda s1, s2: count_negations(s2),
    get_lexical_overlap,
    get_entities_overlap
]

In [9]:
# Options passed to the Classifier as `config`.
# NOTE(review): n_grams and top_ks look like parallel lists (n-gram order and
# how many n-grams to keep for it) — confirm against Classifier.
config = {
    "n_grams": [1, 2],
    "top_ks": [50, 50], # select by LMI
    "verbose": True,
}

In [10]:
# Build the bias classifier from the label set, the hand-crafted feature
# functions and the n-gram options defined above.
classifier = Classifier(
    possible_labels=POSSIBLE_LABELS,
    feature_extractors=feature_extractors,
    config=config
)

## Training

In [11]:
# Fit on the training portion only; docs_test / labels_test stay held out for evaluation.
classifier.fit(docs_train, labels_train)

------ Top N-grams for sentence 1 ------
1-gram LMI:  {'SUPPORTS': {'the': 0.002830599443997493, 'person': 0.0015601595335923452, 'an': 0.0011449873408493345, 'film': 0.0010957417258488456, 'and': 0.0009517330449717213, 'for': 0.0008765933806790766, 'american': 0.0008510747454199162, 'rock': 0.0007985769012638708, 'band': 0.0007640830874644544, 'artist': 0.0007346907491627611, 'world': 0.0006868356463097582, 'there': 0.0006388615210110966, 'music': 0.000611965430575564, 'with': 0.0005300581654427411, 'song': 0.0004933233861207459, 'series': 0.0004910580555191361, 'work': 0.00047893169501172595, 'worked': 0.00046679380339012647, 'united': 0.00046679380339012647, 'bill': 0.00044720306470776765, 'award': 0.0004252788059016547, 'that': 0.00041586057053421107, 'states': 0.00041586057053421107, 'won': 0.00041586057053421107, 'medium': 0.0004152599886572128, 'selena': 0.0004152599886572128, 'recorded': 0.0004152599886572128, 'lewis': 0.0004152599886572128, 'hamilton': 0.0004152599886572128, '

2-gram LMI:  {'SUPPORTS': {'award_for': 0.0002667443779287267, 'is_the': 0.0002326692006241626, 'for_best': 0.00021126990945009025, 'for_the': 0.00013681812824263117, 'is_an': 0.00013541344796845595, 'the_film': 0.0001223467118798094, 'the_song': 0.00012079239872388943, 'rock_band': 0.00011943955589972616, 'was_released': 0.00011738259900265232, 'an_american': 0.00011718344301520055, 'has_also': 0.0001115006757451287, 'and_is': 0.00011137823024682903, 'the_second': 0.00010281989681706361, 'the_largest': 0.00010220895276636799, 'drama_series': 0.00010220895276636799, 'formula_one': 0.00010220895276636799, 'the_united': 0.00010178511999887702, 'series_of': 0.00010115764554798141, 'including_the': 0.00010115764554798141, 'role_as': 9.729910929432666e-05, 'produced_by': 9.671639695115291e-05, 'and_produced': 9.65045038517474e-05, 'she_won': 9.65045038517474e-05, 'united_states': 9.400283003037886e-05, 'globe_award': 9.204444891904282e-05, 'emmy_award': 9.204444891904282e-05, 'which_she': 9

In [12]:
# Inferential Examples
x = [['Roman Atwood is a content creator .',
  'He is best known for his vlogs , where he posts updates about his life on a daily basis .'],
 ['Roman Atwood is a content creator .',
  "He also has another YouTube channel called `` RomanAtwood '' , where he posts pranks ."]] 
classifier.inference(x)

[{'SUPPORTS': 0.7678772565201838, 'REFUTES': 0.23212274347981623},
 {'SUPPORTS': 0.6953323195198748, 'REFUTES': 0.3046676804801252}]

## Evaluation

In [13]:
# Score the classifier on the TEST_FRAC portion that train_test_split held out
# from the training file. The classifier was not fitted on these examples, so
# the label must not call this "train set" accuracy.
y_preds = classifier.predict(docs_test)
print("Accuracy on held-out split of train set: %.3f"% accuracy_score(labels_test, y_preds))

Accuracy on train set: 0.802


### On original test set

In [14]:
# Load the dev file with the default sentence/label keys and no random dropping.
eval_docs, eval_labels = read_data(DEV_DATA_FILE)

# samples:  2000


In [15]:
# Predict on the examples loaded from DEV_DATA_FILE and report accuracy.
y_preds = classifier.predict(eval_docs)
dev_accuracy = accuracy_score(eval_labels, y_preds)
print("Accuracy on original test set: %.3f" % dev_accuracy)

Accuracy on original test set: 0.589


### On symmetric (challenge) test set

In [16]:
# The symmetric set is read with different keys for the second sentence and the
# label than the train/dev defaults, hence the overrides.
eval_docs, eval_labels = read_data(TEST_DATA_FILE, sent2_key="evidence_sentence", label_key="label")

# samples:  717


In [17]:
# Predict on the examples loaded from TEST_DATA_FILE and report accuracy.
y_preds = classifier.predict(eval_docs)
challenge_accuracy = accuracy_score(eval_labels, y_preds)
print("Accuracy on challenge test set: %.3f" % challenge_accuracy)

Accuracy on challenge test set: 0.462


## Write predicted probability to the training set

In [18]:
def get_weight(
    prob_score_bias_class: float,
    ground_truth_label: str,
    bias_label: str = BIAS_CLASS,
    eps: float = 1e-8,
) -> float:
    """Inverse-probability weight for one training example.

    If the example's gold label is the bias class, the weight is
    ``1 / p``; otherwise it is ``1 / (1 - p)``, where ``p`` is the bias
    model's predicted probability of the bias class.

    Args:
        prob_score_bias_class: Predicted probability of ``bias_label``.
        ground_truth_label: Gold label of the example.
        bias_label: Label treated as the bias class.
        eps: ``p`` is clamped to ``[eps, 1 - eps]`` before inversion so a
            saturated prediction (exactly 0 or 1) yields a large finite
            weight instead of a ZeroDivisionError / infinity.

    Returns:
        The (finite) sample weight.
    """
    # Clamp away from 0 and 1; values already inside the range are unchanged.
    prob = min(max(prob_score_bias_class, eps), 1 - eps)
    if ground_truth_label == bias_label:
        return 1 / prob
    return 1 / (1 - prob)

In [19]:
# test cases
def test_bias_label():
    """Gold label equals the bias label: the weight is 1 / p."""
    expected = 1 / 0.2
    observed = get_weight(0.2, "A", bias_label="A")
    np.testing.assert_almost_equal(expected, observed, 5)


def test_not_bias_label():
    """Gold label differs from the bias label: the weight is 1 / (1 - p)."""
    expected = 1 / 0.8
    observed = get_weight(0.2, "B", bias_label="A")
    np.testing.assert_almost_equal(expected, observed, 5)


# Run both checks immediately so a broken get_weight fails this cell.
for check in (test_bias_label, test_not_bias_label):
    check()

In [20]:
# Open the weighted-training-set output for writing ('w' truncates an existing file).
# The handle is left open here; the following cell writes to it and closes it.
f_output = open(OUTPUT_TRAIN_DATA_FILE, 'w')

In [21]:
# Re-read the full training file, score every example with the bias classifier
# and write it back out with its sample weight and bias-class probability.
N_SAMPLE = 0

# Entering `f_output` as a context manager guarantees the output handle is
# closed even if parsing or inference raises part-way through the file.
with f_output, open(TRAIN_DATA_FILE, 'r', encoding="utf-8") as fh:
    for line in fh:
        # Blank lines (e.g. a trailing newline) are not records.
        if not line.strip():
            continue

        datapoint = json.loads(line)
        x = [[datapoint[DOC1_KEY], datapoint[DOC2_KEY]]]

        # float() keeps the value JSON-serialisable if inference returns a numpy scalar.
        prob = float(classifier.inference(x)[0][BIAS_CLASS])
        weight = get_weight(
            prob_score_bias_class=prob,
            ground_truth_label=datapoint[LABEL_KEY],
            bias_label=BIAS_CLASS
        )
        # Drop a pre-existing non-null "weight" field so it is not carried into the output.
        if datapoint.get("weight") is not None:
            del datapoint["weight"] # only for fever
        f_output.write("%s\n"%json.dumps({**datapoint, WEIGHT_KEY: weight, "prob": prob}))

        # MAX_TEST_SAMPLE caps the number of records written; -1 disables the cap.
        N_SAMPLE += 1
        if MAX_TEST_SAMPLE != -1 and N_SAMPLE == MAX_TEST_SAMPLE:
            break