In [87]:
import os
import json
import statistics
import pandas as pd
from tqdm import tqdm
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download('punkt_tab')
nltk.download('stopwords')
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/felikskong/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/felikskong/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [88]:
# Relative directory that holds every JSON file this notebook reads or writes.
DATA_DIR = "data"


def load_json(fname):
    """Parse the JSON file ``fname`` located under ``DATA_DIR``.

    Args:
        fname: File name (or path) joined onto ``DATA_DIR``.

    Returns:
        The decoded JSON object, or ``None`` when the file does not exist.
        In the missing-file case a ``[WARN]`` line is printed instead of
        raising, so callers must be prepared to receive ``None``.
    """
    json_path = os.path.join(DATA_DIR, fname)

    if os.path.exists(json_path):
        with open(json_path, "r", encoding="utf-8") as handle:
            return json.load(handle)

    print(f"[WARN] {json_path} not found, skip.")
    return None

# Load the three claim splits and the evidence corpus, in that order.
# load_json returns None for a missing file, so any of these names may be None.
train_claims, dev_claims, test_claims, evidence = (
    load_json(fname)
    for fname in (
        "train-claims.json",
        "dev-claims.json",
        "test-claims-unlabelled.json",
        "evidence.json",
    )
)

def claim_stats(claim_dict, split_name):
    """Summarise one claim split as a row of the dataset overview table.

    Args:
        claim_dict: Mapping of claim id -> claim record, or ``None`` when the
            split could not be loaded. A record may contain a
            ``"claim_label"`` value and an ``"evidences"`` list; both are
            optional (a missing ``"evidences"`` counts as zero evidence).
        split_name: Value stored in the ``"split"`` column.

    Returns:
        A dict with the keys ``"split"``, ``"# claims"``, ``"avg #evidence"``
        (mean evidence count per claim, rounded to 2 decimals) and
        ``"label distribution"`` (label -> count). A ``None`` or empty split
        yields zero counts and an empty distribution under the same keys, so
        every row lines up in a single DataFrame.
    """
    # Treat a split that failed to load like an empty one so the returned
    # schema is identical in every case.
    records = list(claim_dict.values()) if claim_dict else []

    labels = [rec["claim_label"] for rec in records if "claim_label" in rec]
    evidence_counts = [len(rec.get("evidences", [])) for rec in records]

    return {
        "split": split_name,
        "# claims": len(records),
        "avg #evidence": round(statistics.mean(evidence_counts), 2) if evidence_counts else 0,
        "label distribution": pd.Series(labels).value_counts().to_dict() if labels else {},
    }

# One overview row per claim split, in train / dev / test order.
summary = [
    claim_stats(split, split_name)
    for split, split_name in (
        (train_claims, "train"),
        (dev_claims, "dev"),
        (test_claims, "test"),
    )
]

# The evidence corpus reuses the claim columns: "# claims" carries the number
# of passages and "avg #evidence" the mean whitespace-token count per passage.
if evidence is not None:
    token_lens = [len(text.split()) for text in evidence.values()]
    evidence_row = {
        "split": "evidence-corpus",
        "# claims": len(evidence),
        "avg #evidence": f"{statistics.mean(token_lens):.1f} tokens",
        "label distribution": "-",
    }
    summary.append(evidence_row)

overview = pd.DataFrame(summary)
print(overview.to_string(index=False))


          split  # claims avg #evidence                                                         label distribution
          train      1228          3.36 {'SUPPORTS': 519, 'NOT_ENOUGH_INFO': 386, 'REFUTES': 199, 'DISPUTED': 124}
            dev       154          3.19     {'SUPPORTS': 68, 'NOT_ENOUGH_INFO': 41, 'REFUTES': 27, 'DISPUTED': 18}
           test       153             0                                                                         {}
evidence-corpus   1208827   19.7 tokens                                                                          -


In [None]:
# WordNetLemmatizer reads the NLTK "wordnet" corpus, which the setup cell does
# not fetch (it only downloads punkt_tab and stopwords). Download it here so
# lemmatisation does not fail with a LookupError on a fresh environment.
nltk.download('wordnet', quiet=True)

# Shared lemmatizer instance and English stop-word set used by the tokenizer helpers.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def lemmatize(word):
    """Return the lemma of ``word``, trying the verb reading before the noun one.

    The word is first lemmatised as a verb. If that changes the word, the
    verb lemma is returned; otherwise the result of lemmatising it as a noun
    is returned (which may again be the word itself).
    """
    as_verb = lemmatizer.lemmatize(word, "v")
    if as_verb != word:
        return as_verb
    return lemmatizer.lemmatize(word, "n")

def is_keep_token(tok: str) -> bool:
    """Tell whether a token should survive filtering.

    A token is kept only if it consists solely of ASCII characters and its
    lowercase form is not in ``stop_words``.
    """
    # Non-ASCII tokens are rejected outright, before the stop-word lookup.
    if not tok.isascii():
        return False
    return tok.lower() not in stop_words


def text_tokenizer(text: str, min_tokens: int = 5) -> list[str]:
    """Tokenise ``text`` into lowercase, filtered and lemmatised tokens.

    The text is lowercased and split with ``word_tokenize``. Tokens rejected
    by ``is_keep_token`` are dropped; purely alphabetic tokens are replaced by
    ``lemmatize(token)`` and every other kept token is passed through as is.

    Args:
        text: Raw input text.
        min_tokens: Minimum number of surviving tokens. When fewer remain the
            whole text is discarded and an empty list is returned. Defaults
            to 5, the threshold that was previously hard-coded; pass 0 to
            keep every text regardless of length.

    Returns:
        The cleaned token list, or ``[]`` when fewer than ``min_tokens``
        tokens survive filtering.
    """
    cleaned = [
        lemmatize(tok) if tok.isalpha() else tok
        for tok in word_tokenize(text.lower())
        if is_keep_token(tok)
    ]
    return cleaned if len(cleaned) >= min_tokens else []

In [None]:
# Fix the corpus order once: evidence_text_list[i] is the passage for
# evidence_ids[i], so row i of each fitted matrix below maps back to that id.
evidence_ids = list(evidence.keys())
evidence_text_list = [evidence[eid] for eid in evidence_ids]

# token_pattern=None makes it explicit that the default regex tokenizer is not
# used (text_tokenizer replaces it); without it scikit-learn warns that
# token_pattern is ignored whenever a custom tokenizer is supplied.
tfidf_vectorizer = TfidfVectorizer(max_features=5000, tokenizer=text_tokenizer, token_pattern=None)
tfidf_vectors = tfidf_vectorizer.fit_transform(evidence_text_list)

bow_vectorizer = CountVectorizer(max_features=5000, tokenizer=text_tokenizer, token_pattern=None)
bow_vectors = bow_vectorizer.fit_transform(evidence_text_list)



In [None]:
dev_claims_ids = list(dev_claims.keys())

# Number of evidence passages retrieved per claim.
top_k = 3

# Claim texts in the same order as dev_claims_ids; shared by both retrievers.
dev_claim_texts = [dev_claims[cid]['claim_text'] for cid in dev_claims_ids]

# ---- TF-IDF retrieval ----
dev_claims_vectors = tfidf_vectorizer.transform(dev_claim_texts)
tfidf_cosine_similarities = cosine_similarity(dev_claims_vectors, tfidf_vectors)

# Each similarity row belongs to the claim at the same position; negating the
# scores makes argsort rank the most similar evidence rows first.
top_k_evidence_tfidf = {
    cid: np.argsort(-scores)[:top_k].tolist()
    for cid, scores in zip(dev_claims_ids, tfidf_cosine_similarities)
}

# Map ranked row indices back to evidence ids and attach the claim fields.
dev_claims_retrieved_tfidf = {
    cid: {
        "claim_text": dev_claims[cid]['claim_text'],
        "claim_label": dev_claims[cid]['claim_label'],
        "evidences": [evidence_ids[row] for row in ranked_rows],
    }
    for cid, ranked_rows in top_k_evidence_tfidf.items()
}

# ---- Bag-of-words retrieval (same steps with the count vectorizer) ----
dev_claims_bow_vectors = bow_vectorizer.transform(dev_claim_texts)
bow_cosine_similarities = cosine_similarity(dev_claims_bow_vectors, bow_vectors)

top_k_evidence_bow = {
    cid: np.argsort(-scores)[:top_k].tolist()
    for cid, scores in zip(dev_claims_ids, bow_cosine_similarities)
}

dev_claims_retrieved_bow = {
    cid: {
        "claim_text": dev_claims[cid]['claim_text'],
        "claim_label": dev_claims[cid]['claim_label'],
        "evidences": [evidence_ids[row] for row in ranked_rows],
    }
    for cid, ranked_rows in top_k_evidence_bow.items()
}

In [None]:
def evaluate(retrieved_dict: dict[str, dict], gold_claims=None) -> tuple[float, float, float]:
    """Score retrieved evidence ids against gold evidence ids.

    Precision, recall and F1 are computed per claim from the overlap between
    the predicted and the gold evidence id sets, then averaged over all gold
    claims.

    Args:
        retrieved_dict: Mapping of claim id -> record whose ``"evidences"``
            entry lists the predicted evidence ids. It must contain every
            claim id present in the gold data (otherwise ``KeyError``).
        gold_claims: Mapping of claim id -> record whose ``"evidences"``
            entry lists the gold evidence ids. Defaults to the module-level
            ``dev_claims`` when omitted, which is the split this function
            was originally hard-wired to.

    Returns:
        ``(mean precision, mean recall, mean F1)`` over the gold claims.
    """
    if gold_claims is None:
        gold_claims = dev_claims

    precisions, recalls, f1s = [], [], []

    for cid, claim in gold_claims.items():
        pred = set(retrieved_dict[cid]["evidences"])
        truth = set(claim["evidences"])
        correct = len(pred & truth)

        if correct:
            prec = correct / len(pred)
            rec = correct / len(truth)
            f1 = 2 * prec * rec / (prec + rec)
        else:
            # No overlap (this also covers an empty prediction or gold set):
            # score zero instead of dividing by zero.
            prec = rec = f1 = 0.0

        precisions.append(prec)
        recalls.append(rec)
        f1s.append(f1)

    return np.mean(precisions), np.mean(recalls), np.mean(f1s)

# (display name, retrieved-evidence dict) for every retriever to be scored.
systems = list(zip(
    ("TF-IDF", "BoW"),
    (dev_claims_retrieved_tfidf, dev_claims_retrieved_bow),
))

# Print one aligned score line per system.
for system_name, system_output in systems:
    precision, recall, f1_score = evaluate(system_output)
    print(f"{system_name:<7} →  Precision: {precision:.4f}  |  Recall: {recall:.4f}  |  F1: {f1_score:.4f}")

TF-IDF  →  Precision: 0.0866  |  Recall: 0.0951  |  F1: 0.0836
BoW     →  Precision: 0.0606  |  Recall: 0.0765  |  F1: 0.0613


In [None]:
test_claims_ids = list(test_claims.keys())

# Claim texts in the same order as test_claims_ids; shared by both retrievers.
test_claim_texts = [test_claims[cid]['claim_text'] for cid in test_claims_ids]

# ---- TF-IDF retrieval on the test claims ----
test_claims_vectors = tfidf_vectorizer.transform(test_claim_texts)
test_cosine_similarities = cosine_similarity(test_claims_vectors, tfidf_vectors)

# Negated scores make argsort rank the most similar evidence rows first.
top_k_evidence_test_tfidf = {
    cid: np.argsort(-scores)[:top_k].tolist()
    for cid, scores in zip(test_claims_ids, test_cosine_similarities)
}

# The test records carry no label, only the claim text and retrieved ids.
test_claims_retrieved_tfidf = {
    cid: {
        "claim_text": test_claims[cid]['claim_text'],
        "evidences": [evidence_ids[row] for row in ranked_rows],
    }
    for cid, ranked_rows in top_k_evidence_test_tfidf.items()
}

output_file = os.path.join(DATA_DIR, "test_claims_retrieved_tfidf.json")
with open(output_file, 'w') as out_fh:
    json.dump(test_claims_retrieved_tfidf, out_fh, indent=4)

print(f"Test claims with retrieved evidences saved to {output_file}.")

# ---- Bag-of-words retrieval on the test claims ----
test_claims_vectors_bow = bow_vectorizer.transform(test_claim_texts)
test_cosine_similarities_bow = cosine_similarity(test_claims_vectors_bow, bow_vectors)

top_k_evidence_test_bow = {
    cid: np.argsort(-scores)[:top_k].tolist()
    for cid, scores in zip(test_claims_ids, test_cosine_similarities_bow)
}

test_claims_retrieved_bow = {
    cid: {
        "claim_text": test_claims[cid]['claim_text'],
        "evidences": [evidence_ids[row] for row in ranked_rows],
    }
    for cid, ranked_rows in top_k_evidence_test_bow.items()
}

output_file_bow = os.path.join(DATA_DIR, "test_claims_retrieved_bow.json")
with open(output_file_bow, 'w') as out_fh:
    json.dump(test_claims_retrieved_bow, out_fh, indent=4)

print(f"Test claims with retrieved evidences saved to {output_file_bow}.")


Test claims with retrieved evidences saved to data/test_claims_retrieved_tfidf.json.
Test claims with retrieved evidences saved to data/test_claims_retrieved_bow.json.
