# Bias Model for FEVER

In [1]:
import json
from random import random
import os
from typing import Dict, List

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

import sys
sys.path.append("../")
from my_package.models.traditional.classifier import Classifier
from my_package.utils.handcrafted_features.counter import count_negations
from my_package.utils.handcrafted_features.overlap import get_lexical_overlap, get_entities_overlap



## Configurations

In [2]:
# Filename prefix spliced into the train/dev/output paths below:
# "sample_" selects the small sample files, "" selects the real dataset.
DUMMY_PREFIX = ""

TRAIN_DATA_FILE = "../data/fact_verification/%sfever.train.jsonl"%DUMMY_PREFIX
DEV_DATA_FILE = "../data/fact_verification/%sfever.dev.jsonl"%DUMMY_PREFIX
# The symmetric (challenge) test file is not affected by DUMMY_PREFIX.
TEST_DATA_FILE = "../data/fact_verification/fever_symmetric_v0.1.test.jsonl"

# Key under which the per-example weight is written to OUTPUT_TRAIN_DATA_FILE.
WEIGHT_KEY = "sample_weight"
OUTPUT_TRAIN_DATA_FILE = "../data/fact_verification/%sweighted_fever.train.jsonl"%DUMMY_PREFIX
SAVED_MODEL_PATH = "../results/fever/bias_model"

# JSON field names read from each line of the train/dev files.
DOC1_KEY = "claim"
DOC2_KEY = "evidence"
LABEL_KEY = "gold_label"

POSSIBLE_LABELS = ("SUPPORTS", "REFUTES", "NOT ENOUGH INFO")
# Class whose predicted probability is turned into the sample weight by get_weight.
BIAS_CLASS = "REFUTES"

# Cap on the number of examples kept by read_data; -1 disables the cap.
# NOTE: read_data reads this global on every call, so a finite value also
# truncates the dev and challenge-test reads, not only the training read.
MAX_SAMPLE = -1
# Drop rate passed to read_data for the training file (0.0 keeps every line).
DROP_RATE = 0.0
# Fraction of the loaded training data held out by train_test_split.
TEST_FRAC = 0.2

# Cap on the number of rows written by the weighted-training-set export loop;
# -1 disables the cap. Despite the name, it limits the export of TRAIN_DATA_FILE.
MAX_TEST_SAMPLE = -1

In [3]:
def inference_prob_to_index(x: Dict[str, float]) -> List[float]:
    """Flatten one label->probability mapping into a fixed-order list.

    Args:
        x: Probabilities for a single example, keyed by label name. It must
            contain the keys "SUPPORTS", "NOT ENOUGH INFO" and "REFUTES".

    Returns:
        ``[p(SUPPORTS), p(NOT ENOUGH INFO), p(REFUTES)]``. Note that this
        order is *not* the order of ``POSSIBLE_LABELS``; it is kept as-is
        because consumers of the exported file index into it by position.

    Raises:
        KeyError: if any of the three labels is missing from ``x``.
    """
    return [
        x["SUPPORTS"],
        x["NOT ENOUGH INFO"],
        x["REFUTES"]
    ]

## Dataset

In [4]:
def read_data(
    file: str = TRAIN_DATA_FILE,
    sent1_key: str = DOC1_KEY,
    sent2_key: str = DOC2_KEY,
    label_key: str = LABEL_KEY,
    drop_rate: float = 0.0
):
    """Load sentence pairs and labels from a JSON Lines file.

    Args:
        file: Path to a file with one JSON object per line.
        sent1_key: Field holding the first sentence of the pair.
        sent2_key: Field holding the second sentence of the pair.
        label_key: Field holding the label.
        drop_rate: Probability of skipping each line; 0.0 keeps every line.
            Sampling uses the global ``random`` state, so results are only
            reproducible if the caller seeds it.

    Returns:
        ``(docs, labels)`` where ``docs[i]`` is ``[sentence1, sentence2]`` and
        ``labels[i]`` is the matching label.

    Reading stops early once ``MAX_SAMPLE`` examples have been kept, unless
    ``MAX_SAMPLE`` is -1. The number of kept examples is printed.
    """
    docs = []
    labels = []

    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(file, 'r', encoding='utf-8') as fh:
        for line in fh:
            if not line.strip():
                # Tolerate blank lines instead of failing inside json.loads.
                continue
            # random() lies in [0.0, 1.0), so a strict "<" drops with probability
            # exactly drop_rate and never drops anything when drop_rate is 0.0.
            if random() < drop_rate:
                continue

            datapoint = json.loads(line)
            docs.append([datapoint[sent1_key], datapoint[sent2_key]])
            labels.append(datapoint[label_key])

            if MAX_SAMPLE != -1 and len(docs) == MAX_SAMPLE:
                break
    print("# samples: ", len(docs))
    return docs, labels

In [5]:
# Load the training file (read_data's default `file`), sub-sampled by DROP_RATE.
docs, labels = read_data(drop_rate=DROP_RATE)

# samples:  2400


In [6]:
# Peek at the first two [claim, evidence] pairs to confirm the fields were read as expected.
docs[:2]

[['Margaret of Valois criticized memoirs .',
  'She was famous for her beauty and sense of style , notorious for a licentious lifestyle , and also proved a competent memoirist .'],
 ['Legion is a background character in Legion .',
  "Hawley wanted to show Haller as an `` unreliable narrator `` , including mixing 1960s design with modern-day elements , and filming the series through the title character 's distorted view of reality ."]]

In [7]:
# Labels aligned by position with the pairs shown above.
labels[:2]

['NOT ENOUGH INFO', 'NOT ENOUGH INFO']

In [8]:
# Hold out TEST_FRAC of the loaded examples for evaluation, keeping the label
# distribution the same in both parts.
docs_train, docs_test, labels_train, labels_test = train_test_split(
    docs,
    labels,
    test_size=TEST_FRAC,
    random_state=42,  # fixed seed so the split is identical on every run
    stratify=labels,
)

## Bias Classifier

In [9]:
# Hand-crafted feature functions handed to the Classifier. The lambdas take the
# two sentences of a pair as (s1, s2) and use only one of them.
# NOTE(review): presumably Classifier calls every entry as f(s1, s2), so the two
# overlap helpers are expected to accept the same two arguments — confirm.
feature_extractors = [
    lambda s1, s2: count_negations(s1),  # negation count of the first sentence
    lambda s1, s2: count_negations(s2),  # negation count of the second sentence
    get_lexical_overlap,
    get_entities_overlap
]

In [10]:
# Options forwarded to Classifier.
# NOTE(review): presumably top_ks[i] is how many n-grams of order n_grams[i]
# are kept after ranking by LMI — confirm against the Classifier implementation.
config = {
    "n_grams": [1, 2],
    "top_ks": [50, 50], # select by LMI
    "verbose": True,  # the training cell prints the top n-grams and their LMI scores
}

In [11]:
# Build the bias model from the label set, feature functions and options defined above.
classifier = Classifier(
    possible_labels=POSSIBLE_LABELS,
    feature_extractors=feature_extractors,
    config=config
)

## Training

In [12]:
# Fit on the training portion of the split only, then persist the fitted model
# to SAVED_MODEL_PATH so it can be reloaded later.
classifier.fit(docs_train, labels_train)
classifier.save(SAVED_MODEL_PATH)

------ Top N-grams for sentence 1 ------
1-gram LMI:  {'SUPPORTS': {'is': 0.002163038779813946, 'an': 0.0016999043853222335, 'person': 0.00142695624124845, 'of': 0.0013563445102635363, 'the': 0.0012599035305118262, 'film': 0.000881325435050041, 'was': 0.0007728863491820543, 'place': 0.0006833153529354253, 'acts': 0.0006833153529354253, 'state': 0.0006264412671317985, 'world': 0.0005703903592827062, 'one': 0.0005455494149122634, 'actor': 0.0005450555113568687, 'character': 0.0005346332096052878, 'there': 0.0004378982416593059, 'entertainment': 0.0004348370427770889, 'album': 0.0003861043237676886, 'acted': 0.000385355139612428, 'won': 0.0003774017357034175, 'georgia': 0.0003727174652375047, '24': 0.0003727174652375047, 'performer': 0.0003721210102776486, 'involved': 0.0003721210102776486, 'matt': 0.0003721210102776486, 'plays': 0.0003370201712105341, 'located': 0.0003370201712105341, 'industry': 0.00032602064011348744, 'and': 0.0003252818499873436, 'united': 0.0003190783942325149, 'awar

1-gram LMI:  {'SUPPORTS': {'the': 0.0009211924923548207, 'and': 0.0005781972950453232, 'world': 0.00041554233858412474, 'as': 0.00041146774091682057, 'at': 0.00039270888010437173, 'award': 0.00032265423893179976, 'playing': 0.0002622765361717602, 'film': 0.0002428493773706249, 'million': 0.00020507862941581556, 'globe': 0.00020322464841687456, 'november': 0.00019845444682935735, 'performance': 0.00018978112975060187, 'to': 0.00018596471067832613, 'also': 0.00018183511614641815, 'state': 0.0001770759408983469, 'no': 0.00017476283690868877, 'over': 0.00016475830970595574, 'golden': 0.00016475830970595574, 'won': 0.00016015750360703606, 'tennis': 0.00015866958258743485, 'titled': 0.0001579374273942727, 'air': 0.00015721345178101767, 'georgia': 0.00015713122538999722, 'war': 0.00015541159409138, 'london': 0.00015506667601106907, 'he': 0.00015379104034938363, 'best': 0.0001439276215445653, 'time': 0.00014330042320772048, 'career': 0.00014131804633712772, 'social': 0.00013978922185338289, 'a

In [13]:
# Inference examples: two pairs sharing the same claim but with different evidence.
# `inference` returns one label->probability dict per input pair.
x = [['Roman Atwood is a content creator .',
  'He is best known for his vlogs , where he posts updates about his life on a daily basis .'],
 ['Roman Atwood is a content creator .',
  "He also has another YouTube channel called `` RomanAtwood '' , where he posts pranks ."]] 
classifier.inference(x)

[{'SUPPORTS': 0.46302873070427597,
  'REFUTES': 0.07559391671207723,
  'NOT ENOUGH INFO': 0.46137735258364687},
 {'SUPPORTS': 0.41902837885038285,
  'REFUTES': 0.16724135788396502,
  'NOT ENOUGH INFO': 0.4137302632656521}]

## Evaluation

In [14]:
# Score the model on the portion held out by train_test_split. These examples
# were not used for fitting, so this is not a training-set accuracy.
y_preds = classifier.predict(docs_test)
print("Accuracy on held-out split: %.3f"% accuracy_score(labels_test, y_preds))

Accuracy on train set: 0.583


In [15]:
# validate load process of the classifier
test_classifier = Classifier(
    possible_labels=POSSIBLE_LABELS,
    feature_extractors=feature_extractors,
    config=config
)
test_classifier.load(SAVED_MODEL_PATH)

# Predict with the *reloaded* model: using `classifier` here would only repeat
# the previous cell and say nothing about whether save/load round-trips.
# The printed accuracy should match the in-memory model's held-out accuracy.
y_preds = test_classifier.predict(docs_test)
print("Accuracy on held-out split (reloaded model): %.3f"% accuracy_score(labels_test, y_preds))

Accuracy on train set: 0.583


### On original test set

In [16]:
# Read the dev file with the default field names. drop_rate defaults to 0.0, so
# nothing is sub-sampled, but the global MAX_SAMPLE cap (if set) still applies.
eval_docs, eval_labels = read_data(DEV_DATA_FILE)

# samples:  100


In [17]:
# Accuracy of the bias model on the examples loaded from DEV_DATA_FILE.
y_preds = classifier.predict(eval_docs)
print("Accuracy on original test set: %.3f"% accuracy_score(eval_labels, y_preds))

Accuracy on original test set: 0.430


### On symmetric (challenge) test set

In [18]:
# The symmetric test file is read with different field names for the second
# sentence and the label; the first-sentence key stays at its default (DOC1_KEY).
# This rebinds eval_docs / eval_labels from the previous section.
eval_docs, eval_labels = read_data(TEST_DATA_FILE, sent2_key="evidence_sentence", label_key="label")

# samples:  717


In [19]:
# Accuracy of the bias model on the examples loaded from TEST_DATA_FILE.
y_preds = classifier.predict(eval_docs)
print("Accuracy on challenge test set: %.3f"% accuracy_score(eval_labels, y_preds))

Accuracy on challenge test set: 0.312


## Write predicted probability to the training set

In [20]:
def get_weight(prob_score_bias_class: float, ground_truth_label: str, bias_label: str = BIAS_CLASS) -> float:
    """Inverse-probability weight for one example.

    Args:
        prob_score_bias_class: Predicted probability of the bias class.
        ground_truth_label: Gold label of the example.
        bias_label: Label treated as the bias class.

    Returns:
        ``1 / p`` when the gold label is the bias class, otherwise
        ``1 / (1 - p)``, where ``p`` is ``prob_score_bias_class``.

    No clamping is applied: with plain Python floats a probability of exactly
    0 (bias-class example) or 1 (any other example) raises ZeroDivisionError.
    """
    is_bias_example = ground_truth_label == bias_label
    # Probability mass on the side of the bias/non-bias split where the gold label lies.
    matching_prob = prob_score_bias_class if is_bias_example else 1 - prob_score_bias_class
    return 1 / matching_prob

In [21]:
# test cases
def test_bias_label():
    """Gold label equal to the bias label -> weight is 1 / p."""
    observed = get_weight(0.2, "A", bias_label="A")
    expected = 1/0.2
    np.testing.assert_almost_equal(expected, observed, 5)


def test_not_bias_label():
    """Gold label different from the bias label -> weight is 1 / (1 - p)."""
    observed = get_weight(0.2, "B", bias_label="A")
    expected = 1/0.8
    np.testing.assert_almost_equal(expected, observed, 5)


# Run both checks immediately; a failure raises AssertionError and stops the cell.
test_bias_label()
test_not_bias_label()

In [22]:
# Open (and truncate) the weighted training file for writing. The handle stays
# open across cells and is closed at the end of the export cell below.
f_output = open(OUTPUT_TRAIN_DATA_FILE, 'w')

In [23]:
# Re-read the training file line by line, score each example with the bias
# model, and write the example back out with its weight and bias probabilities.
N_SAMPLE = 0

# try/finally guarantees the output handle is flushed and closed even when
# JSON parsing, inference or get_weight raises part-way through the file.
try:
    with open(TRAIN_DATA_FILE, 'r', encoding='utf-8') as fh:
        for line in fh:
            if not line.strip():
                # Tolerate blank lines instead of failing inside json.loads.
                continue
            datapoint = json.loads(line)
            x = [[datapoint[DOC1_KEY], datapoint[DOC2_KEY]]]

            # One pair per call, so take the single result dict.
            probs = classifier.inference(x)[0]
            prob = probs[BIAS_CLASS]
            weight = get_weight(
                prob_score_bias_class=prob,
                ground_truth_label=datapoint[LABEL_KEY],
                bias_label=BIAS_CLASS
            )
            if datapoint.get("weight") is not None:
                del datapoint["weight"] # only for fever
            f_output.write("%s\n"%json.dumps({
                **datapoint,
                WEIGHT_KEY: weight,
                "bias_probs": inference_prob_to_index(probs),
                "bias_prob": prob
            }))

            N_SAMPLE += 1
            # Optional cap on the number of exported rows (-1 means no cap).
            if MAX_TEST_SAMPLE != -1 and N_SAMPLE == MAX_TEST_SAMPLE:
                break
finally:
    f_output.close()