# Bias Model for Paraphrase Identification (QQP / PAWS)

In [1]:
import json
from random import random
import os

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

import sys
sys.path.append("../")
from my_package.models.traditional.classifier import Classifier
from my_package.utils.handcrafted_features.counter import count_negations
from my_package.utils.handcrafted_features.overlap import get_lexical_overlap, get_entities_overlap



## Configurations

In [2]:
# --- Input files (JSON Lines, one datapoint per line) ---
TRAIN_DATA_FILE = "../data/paraphrase_identification/qqp.train.jsonl"
DEV_DATA_FILE = "../data/paraphrase_identification/qqp.dev.jsonl"
TEST_DATA_FILE = "../data/paraphrase_identification/paws.dev_and_test.jsonl"

# --- Output: copy of the training file with a per-sample weight added ---
OUTPUT_TRAIN_DATA_FILE = "../data/paraphrase_identification/weighted_qqp.train.jsonl"
WEIGHT_KEY = "sample_weight"  # JSON key under which the weight is written

# --- Field names inside each JSON datapoint ---
DOC1_KEY, DOC2_KEY = "sentence1", "sentence2"
LABEL_KEY = "is_duplicate"

# --- Label space (labels are handled as strings throughout the notebook) ---
POSSIBLE_LABELS = ("0", "1")
BIAS_CLASS = "1"  # class whose predicted probability drives the weighting

# --- Sampling / splitting knobs ---
# Cap on the number of rows read from a file: -1 means "no cap",
# any positive integer (e.g. 2000) stops reading after that many rows.
MAX_SAMPLE = -1
DROP_RATE = 0.0  # probability of randomly skipping a row while loading the training data
TEST_FRAC = 0.2  # fraction of the loaded training data held out by train_test_split

# NOTE(review): not referenced by any cell visible in this notebook.
MAX_TEST_SAMPLE = -1

## Dataset

In [3]:
def read_data(
    file: str = TRAIN_DATA_FILE,
    sent1_key: str = DOC1_KEY,
    sent2_key: str = DOC2_KEY,
    label_key: str = LABEL_KEY,
    drop_rate: float = 0.0,
    max_sample: int = None
):
    """Load sentence pairs and their labels from a JSON Lines file.

    Args:
        file: path of the JSON Lines file to read (one JSON object per line).
        sent1_key: key of the first sentence in each datapoint.
        sent2_key: key of the second sentence in each datapoint.
        label_key: key of the label in each datapoint.
        drop_rate: a row is kept only when ``random() > drop_rate``, so
            ``0.0`` keeps (practically) every row.
        max_sample: stop after this many kept rows; ``-1`` means no cap.
            When omitted (``None``) the module-level ``MAX_SAMPLE`` is used,
            which is the behaviour callers had before this parameter existed.

    Returns:
        ``(docs, labels)`` where ``docs`` is a list of ``[sentence1, sentence2]``
        pairs and ``labels`` is the parallel list of labels converted to ``str``.
    """
    if max_sample is None:
        max_sample = MAX_SAMPLE

    docs = []
    labels = []

    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(file, 'r', encoding='utf-8') as fh:
        for line in fh:
            # A blank line (e.g. a stray trailing newline) is not a datapoint;
            # feeding it to json.loads would raise.
            if not line.strip():
                continue
            # Random sub-sampling: skip this row unless the draw exceeds drop_rate.
            if not random() > drop_rate:
                continue

            datapoint = json.loads(line)
            docs.append([datapoint[sent1_key], datapoint[sent2_key]])
            # Labels are normalised to strings so they compare equal to the
            # string labels used by the rest of the notebook.
            labels.append(str(datapoint[label_key]))

            if max_sample != -1 and len(docs) == max_sample:
                break

    print("# samples: ", len(docs))
    return docs, labels

In [4]:
# Load the training sentence pairs and their string labels; each row is kept
# only when a random draw exceeds DROP_RATE.
docs, labels = read_data(drop_rate=DROP_RATE)

# samples:  200


In [5]:
# Peek at the first two loaded sentence pairs.
docs[:2]

[['What are the best books for IBPS PO, SBI SO, SBI PO?',
  'What are the best books for Bank P.O./IBPS preparation?'],
 ['What are some of the results of The Congress of Vienna?',
  'What were the results of the Congress of Vienna?']]

In [6]:
# Peek at the labels of the first two pairs (strings, see read_data).
labels[:2]

['1', '1']

In [7]:
# Hold out TEST_FRAC of the loaded training data for evaluation.
# The split is stratified on the labels and uses a fixed random_state.
docs_train, docs_test, labels_train, labels_test = train_test_split(
    docs, labels,
    stratify=labels, test_size=TEST_FRAC,
    random_state=42
)

## Bias Classifier

In [8]:
# Handcrafted feature functions handed to the bias classifier.
# NOTE(review): presumably each one scores a sentence pair — confirm against
# my_package.utils.handcrafted_features.
feature_extractors = [
    get_lexical_overlap,
    get_entities_overlap
]

In [9]:
# Settings passed to the Classifier.
# NOTE(review): "n_grams" and "top_ks" look like parallel lists (top 50 unigrams
# and top 50 bigrams) — confirm against the Classifier implementation.
config = {
    "n_grams": [1, 2],
    "top_ks": [50, 50], # select by LMI
    "verbose": True
}

In [10]:
# Build the bias classifier over the string label space, using the handcrafted
# feature extractors and the config defined above.
classifier = Classifier(
    possible_labels=POSSIBLE_LABELS,
    feature_extractors=feature_extractors,
    config=config
)

## Training

In [11]:
# Fit on the training portion of the split only; docs_test stays held out.
classifier.fit(docs_train, labels_train)

------ Top N-grams for sentence 1 ------
1-gram LMI:  {'1': {'how': 0.004840899014482804, 'i': 0.0036665460924711676, 'some': 0.0034999784672137494, 'can': 0.0029910741126893603, 'books': 0.002320591279283061, 'weight': 0.002142262579475879, 'find': 0.001624227635084917, 'that': 0.001512734768787952, 'is': 0.001500109029329849, 'should': 0.0014901639813195062, 'lose': 0.0014281750529839192, 'about': 0.0014281750529839192, 'step': 0.0014281750529839192, 'circuit': 0.0014281750529839192, 'watch': 0.0014281750529839192, 'hire': 0.0014281750529839192, 'hacker': 0.0014281750529839192, 'life': 0.0014281750529839192, 'under': 0.0014281750529839192, 'movies': 0.0014281750529839192, 'what': 0.0014040516549593271, 'instagram': 0.000941422222121777, '2016': 0.000941422222121777, 'girl': 0.000941422222121777, 'after': 0.000941422222121777, 'quora': 0.000941422222121777, 'so': 0.000941422222121777, 'phone': 0.000941422222121777, 'with': 0.0008940983887917038, 'quickly': 0.0007140875264919596, 'week

In [12]:
# Inference examples: each item is a [sentence1, sentence2] pair, and
# classifier.inference returns one {label: probability} dict per pair.
# NOTE(review): these two pairs look like FEVER claim/evidence text rather than
# QQP question pairs — confirm they are the intended smoke-test inputs.
x = [['Roman Atwood is a content creator .',
  'He is best known for his vlogs , where he posts updates about his life on a daily basis .'],
 ['Roman Atwood is a content creator .',
  "He also has another YouTube channel called `` RomanAtwood '' , where he posts pranks ."]] 
classifier.inference(x)

[{'0': 0.693356785861582, '1': 0.306643214138418},
 {'0': 0.7805909204344456, '1': 0.21940907956555442}]

## Evaluation

In [13]:
# Score the split of the training file that was held out by train_test_split.
y_preds = classifier.predict(docs_test)

heldout_accuracy = accuracy_score(labels_test, y_preds)
print("Accuracy on original train set: %.3f" % heldout_accuracy)

# F1 is reported once per class by switching which label counts as positive.
heldout_f1_pos = f1_score(labels_test, y_preds, pos_label='1')
print("F1-pos on original train set: %.3f" % heldout_f1_pos)

heldout_f1_neg = f1_score(labels_test, y_preds, pos_label='0')
print("F1-neg on original train set: %.3f" % heldout_f1_neg)

Accuracy on original train set: 0.675
F1-pos on original train set: 0.316
F1-neg on original train set: 0.787


### On original test set (QQP dev split)

In [14]:
# Load the QQP dev file with the default drop_rate of 0.0; read_data still
# applies the MAX_SAMPLE cap.
eval_docs, eval_labels = read_data(DEV_DATA_FILE)

# samples:  200


In [15]:
# Score the bias classifier on the evaluation data loaded in the previous cell.
y_preds = classifier.predict(eval_docs)

dev_accuracy = accuracy_score(eval_labels, y_preds)
print("Accuracy on original test set: %.3f" % dev_accuracy)

# F1 is reported once per class by switching which label counts as positive.
dev_f1_pos = f1_score(eval_labels, y_preds, pos_label='1')
print("F1-pos on original test set: %.3f" % dev_f1_pos)

dev_f1_neg = f1_score(eval_labels, y_preds, pos_label='0')
print("F1-neg on original test set: %.3f" % dev_f1_neg)

Accuracy on original test set: 0.665
F1-pos on original test set: 0.396
F1-neg on original test set: 0.768


### On challenge test set (PAWS)

In [16]:
# Load the PAWS challenge file with the default drop_rate of 0.0; read_data
# still applies the MAX_SAMPLE cap. This rebinds eval_docs / eval_labels.
eval_docs, eval_labels = read_data(TEST_DATA_FILE)

# samples:  200


In [17]:
# Score the bias classifier on the challenge data loaded in the previous cell.
y_preds = classifier.predict(eval_docs)

challenge_accuracy = accuracy_score(eval_labels, y_preds)
print("Accuracy on challenge test set: %.3f" % challenge_accuracy)

# F1 is reported once per class by switching which label counts as positive.
challenge_f1_pos = f1_score(eval_labels, y_preds, pos_label='1')
print("F1-pos on challenge test set: %.3f" % challenge_f1_pos)

challenge_f1_neg = f1_score(eval_labels, y_preds, pos_label='0')
print("F1-neg on challenge test set: %.3f" % challenge_f1_neg)

Accuracy on challenge test set: 0.530
F1-pos on challenge test set: 0.347
F1-neg on challenge test set: 0.633


## Write predicted probability to the training set

In [18]:
def get_weight(prob_score_bias_class: float, ground_truth_label: str, bias_label: str = BIAS_CLASS, eps: float = 1e-12) -> float:
    """Return the inverse-probability weight of one training sample.

    The weight is the reciprocal of the probability the bias model assigns to
    the sample's ground-truth class: ``1 / p`` when the ground truth is the
    bias class, ``1 / (1 - p)`` otherwise.

    Args:
        prob_score_bias_class: probability ``p`` predicted for ``bias_label``.
        ground_truth_label: gold label of the sample; it must be of the same
            type as ``bias_label`` (an int ``1`` does not equal the str ``"1"``).
        bias_label: label of the bias class.
        eps: lower bound applied to the denominator so that a probability of
            exactly 0 or 1 gives a large finite weight instead of raising
            ``ZeroDivisionError``. Results are unchanged whenever the
            denominator is already at least ``eps``.

    Returns:
        The sample weight as a float.
    """
    if ground_truth_label == bias_label:
        return 1 / max(prob_score_bias_class, eps)
    return 1 / max(1 - prob_score_bias_class, eps)

In [23]:
# Sanity checks for get_weight, executed as soon as this cell runs.
def test_bias_label():
    """When the ground truth is the bias class the weight is 1 / p."""
    observed = get_weight(0.2, "A", bias_label="A")
    np.testing.assert_almost_equal(1/0.2, observed, decimal=5)


def test_not_bias_label():
    """When the ground truth is another class the weight is 1 / (1 - p)."""
    observed = get_weight(0.2, "B", bias_label="A")
    np.testing.assert_almost_equal(1/0.8, observed, decimal=5)


for check in (test_bias_label, test_not_bias_label):
    check()

In [25]:
# Open (and truncate) the weighted training file. The handle is written to and
# closed by the next cell, so re-run this cell before re-running that one.
f_output = open(OUTPUT_TRAIN_DATA_FILE, 'w')

In [26]:
# Re-read the training file row by row, score each pair with the bias
# classifier, and write the row back out with its sample weight and the
# predicted bias-class probability attached.
N_SAMPLE = 0

try:
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(TRAIN_DATA_FILE, 'r', encoding='utf-8') as fh:
        for line in fh:
            # A blank line (e.g. a stray trailing newline) is not a datapoint.
            if not line.strip():
                continue

            datapoint = json.loads(line)
            x = [[datapoint[DOC1_KEY], datapoint[DOC2_KEY]]]

            prob = classifier.inference(x)[0][BIAS_CLASS]
            weight = get_weight(
                prob_score_bias_class=prob,
                # BIAS_CLASS is a string, so the raw JSON label is stringified
                # (as read_data does) before comparison; an integer label would
                # otherwise never equal the bias class and every row would be
                # weighted as a non-bias sample.
                ground_truth_label=str(datapoint[LABEL_KEY]),
                bias_label=BIAS_CLASS
            )
            # Drop a pre-existing "weight" field (only present in FEVER data).
            if datapoint.get("weight", None) is not None:
                del datapoint["weight"]
            f_output.write("%s\n"%json.dumps({**datapoint, WEIGHT_KEY: weight, "prob": prob}))

            N_SAMPLE += 1
            if MAX_SAMPLE != -1 and N_SAMPLE == MAX_SAMPLE:
                break
finally:
    # Always release the output handle, even if a row fails to parse or score.
    f_output.close()