# Bias Model for QQP (paraphrase identification)

In [1]:
import json
from random import random
import os
from typing import Dict, List

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

import sys
sys.path.append("../")
from my_package.models.traditional.classifier import Classifier
from my_package.utils.handcrafted_features.counter import count_negations
from my_package.utils.handcrafted_features.overlap import get_lexical_overlap, get_entities_overlap

## Configurations

In [16]:
# Prefix substituted into the QQP file names below to choose between the
# small sample files and the full data set.
DUMMY_PREFIX = "" # "sample_" for example and "" for the real one

# Input JSONL files. TEST_DATA_FILE (PAWS) does not use DUMMY_PREFIX.
TRAIN_DATA_FILE = "../data/paraphrase_identification/%sqqp.train.jsonl"%DUMMY_PREFIX
VAL_DATA_FILE = "../data/paraphrase_identification/%sqqp.val.jsonl"%DUMMY_PREFIX
DEV_DATA_FILE = "../data/paraphrase_identification/%sqqp.dev.jsonl"%DUMMY_PREFIX
TEST_DATA_FILE = "../data/paraphrase_identification/paws.dev_and_test.jsonl"

# Key under which the computed weight is stored in each output record, and
# the output files / model location written by later cells.
WEIGHT_KEY = "sample_weight"
# NOTE(review): this name ends in a lowercase "e"; the "Val" cell refers to
# it with exactly this spelling, so both places must be renamed together.
OUTPUT_VAL_DATA_FILe = "../data/paraphrase_identification/%sweighted_qqp.val.jsonl"%DUMMY_PREFIX
OUTPUT_TRAIN_DATA_FILE = "../data/paraphrase_identification/%sweighted_qqp.train.jsonl"%DUMMY_PREFIX
SAVED_MODEL_PATH = "../results/qqp/bias_model"

# Field names read from every JSONL record.
DOC1_KEY = "sentence1"
DOC2_KEY = "sentence2"
LABEL_KEY = "is_duplicate"

# Labels are handled as strings throughout (read_data applies str() to them).
POSSIBLE_LABELS = ("0", "1")
# Label whose predicted probability is used when computing sample weights.
BIAS_CLASS = "1"

# Cap on the number of records read_data / write_weight_to_file process.
MAX_SAMPLE = -1 # -1 for non-maximal mode or a finite number e.g. 2000
# Passed to read_data as drop_rate when loading the training file.
DROP_RATE = 0.0
# Passed to train_test_split as test_size.
TEST_FRAC = 0.2

# NOTE(review): not referenced by any cell visible in this notebook.
MAX_TEST_SAMPLE = -1

In [3]:
def inference_prob_to_index(x: Dict[str, float]) -> List[float]:
    """Turn one label -> probability mapping into a list ordered by label.

    Args:
        x: Probabilities for a single example, keyed by the label strings
            "0" and "1".

    Returns:
        ``[x["0"], x["1"]]``, i.e. position ``i`` holds the probability of
        label ``str(i)``.

    Raises:
        KeyError: if ``x`` lacks either the "0" or the "1" key.
    """
    return [
        x["0"],
        x["1"]
    ]

## Dataset

In [4]:
def read_data(
    file: str = TRAIN_DATA_FILE,
    sent1_key: str = DOC1_KEY,
    sent2_key: str = DOC2_KEY,
    label_key: str = LABEL_KEY,
    drop_rate: float = 0.0
):
    """Read sentence pairs and their labels from a JSONL file.

    Args:
        file: Path to a file holding one JSON object per line.
        sent1_key: Key of the first sentence in each record.
        sent2_key: Key of the second sentence in each record.
        label_key: Key of the label in each record.
        drop_rate: Probability of skipping a record; 0.0 keeps every record.

    Returns:
        A ``(docs, labels)`` tuple where ``docs`` is a list of
        ``[sentence1, sentence2]`` pairs and ``labels`` holds the matching
        labels converted with ``str()``.

    Reading stops once ``MAX_SAMPLE`` records have been kept, unless
    ``MAX_SAMPLE`` is -1. The number of kept records is printed.
    """
    docs = []
    labels = []

    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(file, 'r', encoding='utf-8') as fh:
        for line in fh:
            # A blank line (e.g. a trailing newline at the end of the file)
            # is not a record and would make json.loads raise.
            if not line.strip():
                continue
            # random() lies in [0.0, 1.0), so "< drop_rate" drops a record
            # with probability exactly drop_rate and never when it is 0.0.
            if random() < drop_rate:
                continue

            datapoint = json.loads(line)
            docs.append([datapoint[sent1_key], datapoint[sent2_key]])
            labels.append(str(datapoint[label_key]))

            if MAX_SAMPLE != -1 and len(docs) == MAX_SAMPLE:
                break
    print("# samples: ", len(docs))
    return docs, labels

In [5]:
# Load sentence pairs and labels; the file argument is left at read_data's
# default (TRAIN_DATA_FILE).
docs, labels = read_data(drop_rate=DROP_RATE)

# samples:  394287


In [6]:
# Show the first two [sentence1, sentence2] pairs.
docs[:2]

[['How can I stop my dog from chewing my shoes?',
  'How do you stop a English Bulldog/Pitbull mix puppy from biting my shoes?'],
 ['What are the most interesting products and innovations that Guess is coming out with in 2016?',
  'What are the most interesting products and innovations that Twitter is coming out with in 2016?']]

In [7]:
# Show the labels of those two pairs (strings, since read_data applies str()).
labels[:2]

['1', '0']

In [8]:
# Hold out TEST_FRAC of the loaded training file for a quick sanity check.
# The split is stratified on the labels and seeded so it is repeatable.
# Note that docs_test / labels_test therefore come from the training file,
# not from a separate test file.
docs_train, docs_test, labels_train, labels_test = train_test_split(
    docs, labels,
    stratify=labels, test_size=TEST_FRAC,
    random_state=42
)

## Bias Classifier

In [9]:
# Handcrafted feature functions handed to the Classifier below.
# count_negations is imported at the top of the notebook but is not used here.
feature_extractors = [
    get_lexical_overlap,
    get_entities_overlap
]

In [10]:
# Options passed to the Classifier constructor.
# NOTE(review): presumably "top_ks" pairs up with "n_grams" (top 50 unigrams
# and top 50 bigrams); confirm against the Classifier implementation.
config = {
    "n_grams": [1, 2],
    "top_ks": [50, 50], # select by LMI
    "verbose": True
}

In [11]:
# Build the bias classifier from the label set, feature functions and options
# defined in the cells above.
classifier = Classifier(
    possible_labels=POSSIBLE_LABELS,
    feature_extractors=feature_extractors,
    config=config
)

## Training

In [12]:
# Fit on the training portion of the split, then save the fitted model to
# SAVED_MODEL_PATH so that it can be loaded again by a later cell.
classifier.fit(docs_train, labels_train)
classifier.save(SAVED_MODEL_PATH)

------ Top N-grams for sentence 1 ------
1-gram LMI:  {'1': {'how': 0.003131597238347227, 'do': 0.002285449958695732, 'can': 0.001863018506616515, 'you': 0.0010957448135881257, 'i': 0.0010934890462615734, '500': 0.0010924789905773387, 'quora': 0.0008671009729796184, 'notes': 0.0008354251104414944, 'best': 0.0007808110606229144, 'life': 0.0007216823342085798, '1000': 0.0007171122063811738, 'money': 0.0007014760067833251, 'your': 0.0006488218292881769, 'improve': 0.0006406790898520766, 'war': 0.0006391822526546039, 'who': 0.0006346506392816293, 'think': 0.0006034940326607351, '2000': 0.0005783712303056499, 'rupee': 0.0005783712303056499, 'rs': 0.000557622236268245, 'ever': 0.0005436145423007438, 'really': 0.0005250121260953952, 'english': 0.0005149166155177826, 'is': 0.00048464443080930474, 'battle': 0.0004498442902377277, 'my': 0.00041647518841883704, 'ways': 0.00041365486451378685, 'india': 0.00040373374784453655, 'president': 0.00039213955954272965, 'currency': 0.00038558082020376656,

2-gram LMI:  {'1': {'how_do': 0.0016762468815476141, 'how_can': 0.0016395348572630377, 'do_i': 0.001461593834207281, 'what_is': 0.0008912696773281356, 'can_i': 0.0008527877422811793, 'what_are': 0.0007193761031778532, 'the_best': 0.0007018282994500884, 'on_quora': 0.0006347783285563914, 'how_did': 0.0005441703883386535, 'is_the': 0.0005237711414931545, 'will_the': 0.0005225435351750253, '500_and': 0.0005080849634338861, 'and_1000': 0.0005080849634338861, 'ways_to': 0.0004757183909928511, 'donald_trump': 0.0004757183909928511, 'i_get': 0.00044938680525354415, 'are_some': 0.0004389830245536441, 'are_the': 0.0004388604422926, '1000_notes': 0.00043550139722904526, 'the_ban': 0.00043550139722904526, 'can_we': 0.0004235141376241519, 'make_money': 0.00039956599495776857, 'do_you': 0.00039755334475431226, 'the_indian': 0.00037844729328952577, 'can_one': 0.00036291783102420435, 'ban_of': 0.00036291783102420435, 'rupee_notes': 0.00036291783102420435, 'some_good': 0.00035407073805783734, 'improve

In [13]:
# Inference example: two hand-written [sentence1, sentence2] pairs.
# inference() returns one dict per pair, mapping each label to its probability.
x = [['Roman Atwood is a content creator .',
  'He is best known for his vlogs , where he posts updates about his life on a daily basis .'],
 ['Roman Atwood is a content creator .',
  "He also has another YouTube channel called `` RomanAtwood '' , where he posts pranks ."]] 
classifier.inference(x)

[{'0': 0.93581505376543, '1': 0.06418494623457006},
 {'0': 0.9199047220951436, '1': 0.08009527790485636}]

## Evaluation

In [14]:
# Score the classifier on the held-out split of the training file.
# NOTE(review): the printed labels say "original train set", but docs_test is
# the held-out part that the classifier was not fitted on.
y_preds = classifier.predict(docs_test)

print("Accuracy on original train set: %.3f"% accuracy_score(labels_test, y_preds))
print("F1-pos on original train set: %.3f"% f1_score(labels_test, y_preds, pos_label='1'))
print("F1-neg on original train set: %.3f"% f1_score(labels_test, y_preds, pos_label='0'))

Accuracy on original train set: 0.695
F1-pos on original train set: 0.527
F1-neg on original train set: 0.775


In [12]:
# Validate the save/load round trip: build a fresh Classifier with the same
# arguments, load the saved model from SAVED_MODEL_PATH, and score it on the
# same held-out split so the numbers can be compared with the cell above.
test_classifier = Classifier(
    possible_labels=POSSIBLE_LABELS,
    feature_extractors=feature_extractors,
    config=config
)
test_classifier.load(SAVED_MODEL_PATH)

# This rebinds y_preds from the previous evaluation cell.
y_preds = test_classifier.predict(docs_test)

print("Accuracy on original train set: %.3f"% accuracy_score(labels_test, y_preds))
print("F1-pos on original train set: %.3f"% f1_score(labels_test, y_preds, pos_label='1'))
print("F1-neg on original train set: %.3f"% f1_score(labels_test, y_preds, pos_label='0'))

### On original test set

In [16]:
# Load the QQP dev file. drop_rate stays at its default of 0.0; the MAX_SAMPLE
# cap inside read_data still applies.
eval_docs, eval_labels = read_data(DEV_DATA_FILE)

# samples:  200


In [17]:
# Score the fitted classifier on the QQP dev file loaded in the cell above.
y_preds = classifier.predict(eval_docs)

print("Accuracy on original test set: %.3f"% accuracy_score(eval_labels, y_preds))
print("F1-pos on original test set: %.3f"% f1_score(eval_labels, y_preds, pos_label='1'))
print("F1-neg on original test set: %.3f"% f1_score(eval_labels, y_preds, pos_label='0'))

Accuracy on original test set: 0.710
F1-pos on original test set: 0.554
F1-neg on original test set: 0.785


### On PAWS (challenge) test set

In [18]:
# Load the PAWS file; this rebinds eval_docs / eval_labels from the dev-set
# cells, so the next cell must run after this one.
eval_docs, eval_labels = read_data(TEST_DATA_FILE)

# samples:  677


In [19]:
# Score the fitted classifier on the PAWS data loaded in the cell above.
y_preds = classifier.predict(eval_docs)

print("Accuracy on challenge test set: %.3f"% accuracy_score(eval_labels, y_preds))
print("F1-pos on challenge test set: %.3f"% f1_score(eval_labels, y_preds, pos_label='1'))
print("F1-neg on challenge test set: %.3f"% f1_score(eval_labels, y_preds, pos_label='0'))

Accuracy on challenge test set: 0.409
F1-pos on challenge test set: 0.419
F1-neg on challenge test set: 0.399


## Write predicted probability to the training set

In [18]:
def get_weight(
    prob_score_bias_class: float,
    ground_truth_label: str,
    bias_label: str = BIAS_CLASS,
    eps: float = 1e-12
) -> float:
    """Return the inverse of the probability given to the true label.

    Args:
        prob_score_bias_class: Predicted probability of ``bias_label``.
        ground_truth_label: Gold label of the example.
        bias_label: Label that ``prob_score_bias_class`` refers to.
        eps: Lower bound applied to the denominator. A probability of exactly
            0.0 or 1.0 would otherwise raise ZeroDivisionError (Python float)
            or yield ``inf`` (NumPy float), which cannot be written as valid
            JSON. Probabilities that leave the denominator at or above
            ``eps`` give the same result as an unclamped division.

    Returns:
        ``1 / p`` when the gold label equals ``bias_label`` and
        ``1 / (1 - p)`` otherwise, with the denominator clamped to ``eps``.

    Raises:
        AssertionError: if the two labels are of different types, since
            comparing them (e.g. ``1`` with ``"1"``) would never match.
    """
    assert type(ground_truth_label) == type(bias_label)
    if ground_truth_label == bias_label:
        prob_of_truth = prob_score_bias_class
    else:
        prob_of_truth = 1 - prob_score_bias_class
    return 1 / max(prob_of_truth, eps)

In [21]:
# Sanity checks for get_weight, executed as soon as this cell runs.
def test_bias_label():
    """Gold label equal to the bias label: the weight is 1 / p."""
    computed = get_weight(0.2, "A", bias_label="A")
    np.testing.assert_almost_equal(1/0.2, computed, decimal=5)


def test_not_bias_label():
    """Gold label different from the bias label: the weight is 1 / (1 - p)."""
    computed = get_weight(0.2, "B", bias_label="A")
    np.testing.assert_almost_equal(1/0.8, computed, decimal=5)


for _check in (test_bias_label, test_not_bias_label):
    _check()

In [14]:
def write_weight_to_file(
    DATA_FILE: str,
    OUTPUT_DATA_FILE: str,
    _classifier
) -> None:
    """Copy a JSONL file, adding the bias model's weight to every record.

    Each output record holds the input fields plus:
      * WEIGHT_KEY: the value returned by get_weight for the record,
      * "bias_probs": the probabilities as a list (inference_prob_to_index),
      * "bias_prob": the probability of BIAS_CLASS.

    Args:
        DATA_FILE: Path of the input JSONL file.
        OUTPUT_DATA_FILE: Path of the JSONL file to write (overwritten).
        _classifier: Object whose ``inference`` method takes a list of
            ``[sentence1, sentence2]`` pairs and returns one
            label -> probability dict per pair.

    Processing stops after MAX_SAMPLE records unless MAX_SAMPLE is -1.
    """
    N_SAMPLE = 0

    # Both files are managed by the same "with" so the output handle is closed
    # even when a record fails to parse or score. The input is opened first so
    # a missing input file does not truncate an existing output file.
    with open(DATA_FILE, 'r', encoding='utf-8') as fh, \
            open(OUTPUT_DATA_FILE, 'w', encoding='utf-8') as f_output:
        for line in fh:
            # A blank line (e.g. at the end of the file) is not a record.
            if not line.strip():
                continue

            datapoint = json.loads(line)
            x = [[datapoint[DOC1_KEY], datapoint[DOC2_KEY]]]

            # Only one pair is scored per call, so take the single result.
            probs = _classifier.inference(x)[0]
            prob = probs[BIAS_CLASS]
            weight = get_weight(
                prob_score_bias_class=prob,
                ground_truth_label=str(datapoint[LABEL_KEY]),
                bias_label=BIAS_CLASS
            )
            # Drop a non-null "weight" field already present in the input;
            # the original note says this only occurs in FEVER-format data.
            if datapoint.get("weight") is not None:
                del datapoint["weight"]
            f_output.write("%s\n"%json.dumps({
                **datapoint,
                WEIGHT_KEY: weight,
                "bias_probs": inference_prob_to_index(probs),
                "bias_prob": prob
            }))

            N_SAMPLE += 1
            if MAX_SAMPLE != -1 and N_SAMPLE == MAX_SAMPLE:
                break

### Train

In [23]:
# Score every record of the QQP training file with the fitted bias classifier
# and write the weighted copy to OUTPUT_TRAIN_DATA_FILE.
write_weight_to_file(
    DATA_FILE = TRAIN_DATA_FILE,
    OUTPUT_DATA_FILE = OUTPUT_TRAIN_DATA_FILE,
    _classifier = classifier
)

### Val

In [19]:
# Same as the training cell, but for the QQP validation file.
# OUTPUT_VAL_DATA_FILe is spelled exactly as in the configuration cell.
write_weight_to_file(
    DATA_FILE = VAL_DATA_FILE,
    OUTPUT_DATA_FILE = OUTPUT_VAL_DATA_FILe,
    _classifier = classifier
)