# Bias Model for FEVER

In [1]:
import json
from random import random
import os
from typing import Dict, List

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

import sys
sys.path.append("../")
from my_package.models.traditional.classifier import Classifier
from my_package.utils.handcrafted_features.counter import count_negations
from my_package.utils.handcrafted_features.overlap import get_lexical_overlap, get_entities_overlap

## Configurations

In [2]:
# --- Input data -------------------------------------------------------------
# File-name prefix: "sample_" selects the small example files, "" the real ones.
DUMMY_PREFIX = ""

TRAIN_DATA_FILE = "../data/fact_verification/%sfever.train.jsonl" % DUMMY_PREFIX
VAL_DATA_FILE = "../data/fact_verification/%sfever.val.jsonl" % DUMMY_PREFIX
DEV_DATA_FILE = "../data/fact_verification/%sfever.dev.jsonl" % DUMMY_PREFIX
# The symmetric challenge set has no sample variant, so no prefix is applied.
TEST_DATA_FILE = "../data/fact_verification/fever_symmetric_v0.1.test.jsonl"

# --- Outputs ----------------------------------------------------------------
# Name of the JSON field that receives the per-sample weight.
WEIGHT_KEY = "sample_weight"
OUTPUT_TRAIN_DATA_FILE = "../data/fact_verification/%sweighted_fever.train.jsonl" % DUMMY_PREFIX
OUTPUT_VAL_DATA_FILE = "../data/fact_verification/%sweighted_fever.val.jsonl" % DUMMY_PREFIX
SAVED_MODEL_PATH = "../results/fever/bias_model"

# --- Record schema ----------------------------------------------------------
# JSON fields holding the two sentences of a pair and the gold label.
DOC1_KEY = "claim"
DOC2_KEY = "evidence"
LABEL_KEY = "gold_label"

POSSIBLE_LABELS = ("SUPPORTS", "NOT ENOUGH INFO", "REFUTES")
BIAS_CLASS = "REFUTES"

# --- Sampling ---------------------------------------------------------------
# Cap on the number of records read/written; -1 disables the cap
# (use a finite number, e.g. 2000, for quick experiments).
MAX_SAMPLE = -1
# Value passed as `drop_rate` when loading the training data.
DROP_RATE = 0.0
# Fraction of the loaded data held out by train_test_split.
TEST_FRAC = 0.2

# NOTE(review): not referenced by any cell visible in this notebook.
MAX_TEST_SAMPLE = -1

In [3]:
def inference_prob_to_index(x: Dict[str, float]) -> List[float]:
    """Flatten a label -> probability mapping into a fixed-order list.

    Args:
        x: Mapping for a single sample that contains the keys "SUPPORTS",
            "NOT ENOUGH INFO" and "REFUTES".

    Returns:
        The three values in the order [SUPPORTS, NOT ENOUGH INFO, REFUTES].

    Raises:
        KeyError: If any of the three labels is missing from ``x``.
    """
    # The order is spelled out explicitly so the list layout never depends on
    # the insertion order of the incoming dict.
    return [
        x["SUPPORTS"],
        x["NOT ENOUGH INFO"],
        x["REFUTES"]
    ]

## Dataset

In [4]:
def read_data(
    file: str = TRAIN_DATA_FILE,
    sent1_key: str = DOC1_KEY,
    sent2_key: str = DOC2_KEY,
    label_key: str = LABEL_KEY,
    drop_rate: float = 0.0
):
    """Load sentence pairs and their labels from a JSON-lines file.

    Args:
        file: Path of the JSON-lines file to read (one JSON object per line).
        sent1_key: Field holding the first sentence of each pair.
        sent2_key: Field holding the second sentence of each pair.
        label_key: Field holding the label.
        drop_rate: Probability of randomly skipping a record; 0.0 keeps
            every record.

    Returns:
        A tuple ``(docs, labels)`` where ``docs`` is a list of
        ``[sentence1, sentence2]`` pairs and ``labels`` the parallel list of
        label values.

    Notes:
        Reading stops once ``MAX_SAMPLE`` records have been kept, unless
        ``MAX_SAMPLE`` is -1. The number of kept records is printed.
    """
    docs = []
    labels = []

    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(file, 'r', encoding="utf-8") as fh:
        for line in fh:
            # Tolerate blank lines (e.g. a trailing newline at end of file),
            # which json.loads would otherwise reject.
            if not line.strip():
                continue
            # random() is in [0.0, 1.0), so `<` guarantees that drop_rate=0.0
            # never drops a record and drop_rate=1.0 drops all of them.
            if random() < drop_rate:
                continue

            datapoint = json.loads(line)
            docs.append([datapoint[sent1_key], datapoint[sent2_key]])
            labels.append(datapoint[label_key])

            if MAX_SAMPLE != -1 and len(docs) == MAX_SAMPLE:
                break

    print("# samples: ", len(docs))
    return docs, labels

In [5]:
docs, labels = read_data(drop_rate=DROP_RATE)

# samples:  2400


In [6]:
docs[:2]

[['Margaret of Valois criticized memoirs .',
  'She was famous for her beauty and sense of style , notorious for a licentious lifestyle , and also proved a competent memoirist .'],
 ['Legion is a background character in Legion .',
  "Hawley wanted to show Haller as an `` unreliable narrator `` , including mixing 1960s design with modern-day elements , and filming the series through the title character 's distorted view of reality ."]]

In [7]:
labels[:2]

['NOT ENOUGH INFO', 'NOT ENOUGH INFO']

In [8]:
# Hold out TEST_FRAC of the loaded pairs for evaluation. Stratifying on the
# labels keeps the class proportions equal in both parts, and the fixed
# random_state makes the split reproducible across runs.
docs_train, docs_test, labels_train, labels_test = train_test_split(
    docs,
    labels,
    test_size=TEST_FRAC,
    stratify=labels,
    random_state=42,
)

## Bias Classifier

In [9]:
# Hand-crafted features handed to the bias classifier. Every entry is a
# callable; the lambdas adapt the single-sentence `count_negations` helper to
# a two-argument (s1, s2) form.
# NOTE(review): presumably s1/s2 are the claim and evidence of each pair and
# the two overlap helpers accept the same two arguments — confirm in Classifier.
feature_extractors = [
    lambda s1, s2: count_negations(s1),  # negation count of the first sentence
    lambda s1, s2: count_negations(s2),  # negation count of the second sentence
    get_lexical_overlap,
    get_entities_overlap
]

In [10]:
# Options passed to Classifier via its `config` argument.
# NOTE(review): presumably `top_ks[i]` is the number of n-grams kept for
# `n_grams[i]` — confirm against the Classifier implementation.
config = {
    "n_grams": [1, 2],
    "top_ks": [50, 50], # select by LMI
    "verbose": True,
}

In [11]:
classifier = Classifier(
    possible_labels=POSSIBLE_LABELS,
    feature_extractors=feature_extractors,
    config=config
)

## Training

In [12]:
classifier.fit(docs_train, labels_train)
classifier.save(SAVED_MODEL_PATH)

------ Top N-grams for sentence 1 ------
1-gram LMI:  {'REFUTES': {'not': 0.004052109313911515, 'only': 0.0033080988060226585, 'is': 0.0023721037460285354, 'does': 0.0009856206289849108, 'incapable': 0.0009351021493641956, 'solely': 0.0008624180503617971, 'by': 0.000805776430798953, 'died': 0.0007669511884319268, 'being': 0.0007443825566710869, 'avoided': 0.0007392154717386832, 'ever': 0.0006948976123873521, '2000': 0.0006771580321946983, 'entire': 0.0006160128931155693, 'be': 0.0005690781133917829, 'born': 0.0005090486533598655, 'country': 0.0005044617725619931, 'has': 0.0004819323752911275, 'acting': 0.0004741497994029085, 'did': 0.000460170713059156, 'never': 0.000460170713059156, 'emily': 0.00045833588384096665, 'a': 0.00044488855358588145, 'his': 0.0004426249705423559, 'iceland': 0.00043292208279245277, "n't": 0.00043292208279245277, 'mel': 0.00043292208279245277, 'zero': 0.00043292208279245277, 'april': 0.00041882198299005973, 'anything': 0.00038398983459225023, 'high': 0.0003839

1-gram LMI:  {'REFUTES': {'including': 0.00029491745203119623, 'was': 0.0002426623628366875, '2005': 0.00023967289174014187, 'show': 0.00023225591090679704, 'american': 0.00022163134572462495, 'by': 0.00022027717326859948, 'thiokol': 0.00021999809941685163, 'studios': 0.00021410622594786945, 'analyze': 0.0002137147956497665, 'neighbors': 0.0002137147956497665, '2': 0.00021277147666103895, 'records': 0.00021145049836798955, 'its': 0.00019647056503720409, 'on': 0.0001949040883713631, '2001': 0.0001917383133921135, '2002': 0.00019075926062585898, 'band': 0.00018958979059148327, 'written': 0.00018958979059148327, '1993': 0.00018600723046533513, 'developed': 0.00018590661582892925, 'making': 0.00018425080456095676, 'playstation': 0.0001791500867164755, 'map': 0.00017809566304147206, 'most': 0.00017516881188064606, 'other': 0.0001728217926454688, 'february': 0.00017133251585027906, 'their': 0.00017133251585027906, 'people': 0.00017080995411553203, '1998': 0.00017080995411553203, 'are': 0.000

In [13]:
# Inferential Examples
x = [['Roman Atwood is a content creator .',
  'He is best known for his vlogs , where he posts updates about his life on a daily basis .'],
 ['Roman Atwood is a content creator .',
  "He also has another YouTube channel called `` RomanAtwood '' , where he posts pranks ."]] 
classifier.inference(x)

[{'SUPPORTS': 0.41271204346951557,
  'NOT ENOUGH INFO': 0.5186246285822429,
  'REFUTES': 0.06866332794824151},
 {'SUPPORTS': 0.4184884596171883,
  'NOT ENOUGH INFO': 0.4370285284611557,
  'REFUTES': 0.144483011921656}]

## Evaluation

In [14]:
# Score the fitted classifier on the held-out part of the train/test split
# (docs_test / labels_test), i.e. on data that was not passed to fit().
y_preds = classifier.predict(docs_test)
print("Accuracy on held-out split: %.3f"% accuracy_score(labels_test, y_preds))

Accuracy on train set: 0.592


In [15]:
# Validate the save/load round trip: build a fresh classifier, restore the
# saved state and score it on the same held-out split. The printed accuracy
# should match the one reported for the in-memory classifier.
test_classifier = Classifier(
    possible_labels=POSSIBLE_LABELS,
    feature_extractors=feature_extractors,
    config=config
)
test_classifier.load(SAVED_MODEL_PATH)

y_preds = test_classifier.predict(docs_test)
print("Accuracy on held-out split (reloaded model): %.3f"% accuracy_score(labels_test, y_preds))

Accuracy on train set: 0.592


### On original test set

In [16]:
eval_docs, eval_labels = read_data(DEV_DATA_FILE)

# samples:  100


In [17]:
y_preds = classifier.predict(eval_docs)
print("Accuracy on original test set: %.3f"% accuracy_score(eval_labels, y_preds))

Accuracy on original test set: 0.440


### On symmetric (challenge) test set

In [18]:
eval_docs, eval_labels = read_data(TEST_DATA_FILE, sent2_key="evidence_sentence", label_key="label")

# samples:  717


In [19]:
y_preds = classifier.predict(eval_docs)
print("Accuracy on challenge test set: %.3f"% accuracy_score(eval_labels, y_preds))

Accuracy on challenge test set: 0.308


## Write bias-model probabilities and sample weights to the training and validation sets

In [20]:
def get_weight(prob_score_ground_truth_class: float) -> float:
    """Return the inverse-probability weight of a sample.

    Args:
        prob_score_ground_truth_class: Probability assigned to the sample's
            ground-truth class; must be strictly positive.

    Returns:
        ``1 / prob_score_ground_truth_class``.

    Raises:
        ValueError: If the probability is zero, negative or NaN, for which
            no meaningful weight exists.
    """
    # `not (p > 0)` also rejects NaN, which compares False against everything.
    if not prob_score_ground_truth_class > 0:
        raise ValueError(
            "probability of the ground-truth class must be > 0, got %r"
            % (prob_score_ground_truth_class,)
        )
    return 1/prob_score_ground_truth_class

In [21]:
def write_weight_to_file(
    DATA_FILE: str,
    OUTPUT_DATA_FILE: str,
    _classifier
) -> None:
    """Copy a JSON-lines dataset, adding bias-model outputs to every record.

    For each record the classifier is queried with the
    ``[DOC1_KEY, DOC2_KEY]`` sentence pair, and the record is written out
    with three extra fields:

    * ``WEIGHT_KEY``: ``get_weight`` of the probability given to the gold label,
    * ``"bias_probs"``: the probabilities as ordered by
      ``inference_prob_to_index``,
    * ``"bias_prob"``: the probability given to the gold label.

    An existing non-null ``"weight"`` field is removed from the record.
    Writing stops after ``MAX_SAMPLE`` records unless ``MAX_SAMPLE`` is -1.

    Args:
        DATA_FILE: Path of the input JSON-lines file.
        OUTPUT_DATA_FILE: Path of the JSON-lines file to create/overwrite.
        _classifier: Object whose ``inference(pairs)`` returns, per pair, a
            mapping from label to probability.
    """
    n_written = 0

    # Both handles are managed by `with`, so they are closed even when
    # parsing or inference raises. The input is opened first so that a
    # missing input file does not leave an empty output file behind.
    with open(DATA_FILE, 'r', encoding="utf-8") as fh:
        with open(OUTPUT_DATA_FILE, 'w', encoding="utf-8") as f_output:
            for line in fh:
                # Tolerate blank lines such as a trailing newline at EOF.
                if not line.strip():
                    continue

                datapoint = json.loads(line)
                ground_truth_label = datapoint[LABEL_KEY]
                x = [[datapoint[DOC1_KEY], datapoint[DOC2_KEY]]]

                # One pair is submitted, so take the single result back out.
                probs = _classifier.inference(x)[0]
                prob = probs[ground_truth_label]
                weight = get_weight(prob_score_ground_truth_class=prob)
                if datapoint.get("weight", None) is not None:
                    del datapoint["weight"] # only for fever
                f_output.write("%s\n"%json.dumps({
                    **datapoint,
                    WEIGHT_KEY: weight,
                    "bias_probs": inference_prob_to_index(probs),
                    "bias_prob": prob
                }))

                n_written += 1
                if MAX_SAMPLE != -1 and n_written == MAX_SAMPLE:
                    break

### Train

In [22]:
write_weight_to_file(
    DATA_FILE = TRAIN_DATA_FILE,
    OUTPUT_DATA_FILE = OUTPUT_TRAIN_DATA_FILE,
    _classifier = classifier
)

### Val

In [27]:
write_weight_to_file(
    DATA_FILE = VAL_DATA_FILE,
    OUTPUT_DATA_FILE = OUTPUT_VAL_DATA_FILE,
    _classifier = classifier
)