In [2]:
import numpy as np
import torch
from tqdm import tqdm
import unicodedata

from collections import defaultdict, OrderedDict, Counter
from dataclasses import dataclass
import datetime as dt
from itertools import chain
import os
import pathlib
from pathlib import Path
import string
import pandas as pd
import unicodedata as ud
from time import time
from typing import Dict, Type, Callable, List
import sys
import ujson

from sentence_transformers import CrossEncoder, util
import textwrap

from colbert.infra import Run, RunConfig, ColBERTConfig
from colbert import Indexer, Searcher
from colbert.data import Queries, Collection
from colbert import Trainer

sys.path.insert(0, '/home/drchajan/devel/python/FC/ColBERTv2') # ignore other ColBERT installations

%load_ext autoreload
%autoreload 2

In [3]:
def _create_parent_dir(fname):
    pdir = Path(fname).parent
    pdir.mkdir(parents=True, exist_ok=True)

def read_jsonl(jsonl):
    with open(jsonl, 'r') as json_file:
        data = []
        for jline in json_file:
            rec = ujson.loads(jline)
            data.append(rec)
    return data

def write_jsonl(jsonl, data, mkdir=False):
    if mkdir:
        _create_parent_dir(jsonl)
    # data is an iterable (list) of JSON-compatible structures (OrderedDict)
    with open(jsonl, 'w', encoding='utf8') as json_file:
        for r in data:
            ujson.dump(r, json_file, ensure_ascii=False, default=str)
            json_file.write("\n")
            
def read_json(fname):
    with open(fname, 'r') as json_file:
        data = ujson.load(json_file)
    return data

def write_json(fname, data, indent=3, mkdir=False):
    if mkdir:
        _create_parent_dir(json)
    with open(str(fname), 'w', encoding='utf8') as json_file:
        ujson.dump(data, json_file, ensure_ascii=False, indent=indent, default=str)

def fever_detokenize(txt):
    # updated detokenize, most models are not trained with this...
    txt = txt.replace(" .", ".").replace(" ,", ",").replace(" ?", "?").replace(" :", ":").replace(" ;", ";")
    txt = txt.replace("`` ", '"').replace(" ''", '"').replace(" '", "'")
    txt = txt.replace("-LRB-", "(").replace("-RRB-", ")")
    txt = txt.replace("-LSB-", "/").replace("-RSB-", "/")
    txt = txt.replace("-COLON-", ":")
    txt = txt.replace("( ", "(").replace(" )", ")")
    return txt

In [4]:
LANG = "en"
FEVER_CORPUS = "/mnt/data/factcheck/fever/data-en-latest/enwiki.jsonl"
FEVER_ROOT = "/mnt/data/factcheck/fever/data-en-lrev/fever-data"
FEVER_PREDICTIONS = "/mnt/data/factcheck/fever/data-en-lrev/predictions"
COLBERT_ROOT = "/mnt/data/factcheck/fever/data-en-lrev/colbertv2"

ANSERINI_ROOT = Path(COLBERT_ROOT, "anserini_for_hard_negatives")
ANSERINI_COLLECTION = str(Path(ANSERINI_ROOT, "collection"))
ANSERINI_INDEX = str(Path(ANSERINI_ROOT, "index"))
ANSERINI_RETRIEVED = Path(ANSERINI_ROOT, "retrieved")

In [5]:
corpus = read_jsonl(Path(COLBERT_ROOT, "collection.jsonl"))

In [6]:
original_id2pid = read_json(Path(COLBERT_ROOT, "original_id2pid.json"))

We will use Anserini in the first stage to get hard negatives. 

In [7]:
id2txt = {doc["id"]: doc["text"] for doc in corpus}
model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', max_length=512)

In [12]:
class EnFEVER_LREV_ID_Fixer:
    '''Our snapshot of EnFEVER most likely does not exactly match the snapshot used by the authors of the FEVER paper. Some of the evidence documents are missing. It seems that most often they have only slightly different names differing in use of underscores ("_"). This class tries to match them. 
    '''
    def __init__(self, corpus):
        self.fixed_id2original_id = defaultdict(list)
        for r in tqdm(corpus):
            original_id = unicodedata.normalize("NFC", r["id"])
            fixed_id = original_id.replace("_", "")
            self.fixed_id2original_id[fixed_id].append(original_id)

    def fix(self, id_):
        fixed_id = unicodedata.normalize("NFC", id_).replace("_", "")
        if fixed_id not in self.fixed_id2original_id:
            return id_
        original_ids = self.fixed_id2original_id[fixed_id]
        assert len(original_ids) == 1, f"{id_} => {fixed_id} => {original_ids}"
        return original_ids[0]
    
enfever_lrev_id_fixer = EnFEVER_LREV_ID_Fixer(corpus)

100%|██████████| 6038180/6038180 [00:12<00:00, 487824.92it/s] 


In [13]:
def rerank_predictions(corpus, in_prediction_jsonl, out_prediction_jsonl):
    # id2txt = {doc["id"]: doc["text"] for doc in corpus}
    # model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', max_length=512)
    in_preds = read_jsonl(in_prediction_jsonl) 
    for r in tqdm(in_preds):
        claim = r["claim"]
        predictions = r["predicted_pages"]
        print(r["predicted_pages"])
        docs = [id2txt[id_] for id_ in predictions]
        data = [(claim, doc) for doc in docs]
        scores = model.predict(data)
        idxs = np.argsort(scores, kind='stable')[::-1]
        r["predicted_pages"] = [predictions[idx] for idx in idxs]
        print(r["predicted_pages"])
        break

# rerank_predictions(corpus, "/mnt/data/factcheck/fever/data-en-lrev/predictions/paper_test/colbert/enfever_lrev128/k500.jsonl", None)
rerank_predictions(corpus, "/mnt/data/factcheck/fever/data-en-lrev/predictions/paper_test/seal/fever_mbart-large-cc25_en_XX+unsupervised6M_cp25000_with_titles_k500.jsonl", None)

  0%|          | 0/9999 [00:00<?, ?it/s]

['Fences_-LRB-film-RRB-', 'Hacksaw_Ridge', 'List_of_accolades_received_by_Bridge_of_Spies_-LRB-film-RRB-', 'Grease_-LRB-film-RRB-', 'Rogue_One', 'No_Country_for_Old_Men_-LRB-film-RRB-', 'List_of_accolades_received_by_The_Martian_-LRB-film-RRB-', 'The_Help_-LRB-film-RRB-', 'List_of_accolades_received_by_Room', 'List_of_accolades_received_by_The_Revenant_-LRB-2015_film-RRB-', 'List_of_accolades_received_by_Manchester_by_the_Sea_-LRB-film-RRB-', 'List_of_accolades_received_by_Fences_-LRB-film-RRB-', 'Manchester_by_the_Sea_-LRB-film-RRB-', 'List_of_accolades_received_by_Hell_or_High_Water', 'Deepwater_Horizon_-LRB-film-RRB-', 'List_of_accolades_received_by_Zero_Dark_Thirty', 'Hidden_Figures', 'Goodfellas', 'The_Hobbit-COLON-_The_Battle_of_the_Five_Armies', 'Gravity_-LRB-2013_film-RRB-', 'The_Big_Short_-LRB-film-RRB-', 'Anil_Kapoor', 'Psych_-LRB-season_1-RRB-', 'The_Call_-LRB-2013_film-RRB-', 'The_Godfather_Part_II', 'Traci_Lords_filmography', 'List_of_accolades_received_by_Suicide_Squad', 




KeyError: 'Fences_-LRB-film-RRB-'

In [11]:
[v for v in id2txt.keys() if v.startswith('Grease_-LRB')]

['Grease_-LRB-_lubricant_-RRB-',
 'Grease_-LRB-_film_-RRB-',
 'Grease_-LRB-_song_-RRB-',
 'Grease_-LRB-_musical_-RRB-',
 'Grease_-LRB-_video_game_-RRB-']

In [7]:
def read_fever_texts_from_sqlite_db(db_file, detokenize=True, detokenize_id=False, fix_tabs=False):
    import sqlite3
    # note that empty lines are possible, and we store them
    doc2text = defaultdict(list)
    with sqlite3.connect(db_file, detect_types=sqlite3.PARSE_DECLTYPES) as connection:
        cursor = connection.cursor()
        cursor.execute(f"SELECT id, text FROM documents")
        for id_, txt in cursor.fetchall():
            if detokenize:
                if detokenize_id:
                    id_ = fever_detokenize(id_)
                txt = fever_detokenize(txt)
            if fix_tabs:
                txt = txt.replace("\t", " ")
            doc2text[id_].append(txt)
        return doc2text


# this is for debugging only, the text is reconstructed from lines to be alligned with training data
# doc2text = read_fever_texts_from_sqlite_db("/mnt/data/factcheck/fever/data-en/fever/fever.db", detokenize_id=True)
doc2text = read_fever_texts_from_sqlite_db("/mnt/data/factcheck/fever/data-en-lrev/fever/fever.db")

In [8]:
[v for v in doc2text.keys() if v.startswith('Grease_-LRB')]

['Grease_-LRB-video_game-RRB-',
 'Grease_-LRB-film-RRB-',
 'Grease_-LRB-lubricant-RRB-',
 'Grease_-LRB-song-RRB-',
 'Grease_-LRB-musical-RRB-']