In [1]:
# Import all the packages
import numpy as np
import json
from collections import Counter
from sentence_transformers import SentenceTransformer, CrossEncoder, InputExample
import pickle
import faiss
from tqdm import tqdm
import nltk
import re
from datasets import Dataset
from nltk.tokenize import sent_tokenize
from torch.utils.data import DataLoader
import random
nltk.download('punkt')
nltk.download('punkt_tab')




[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [2]:
train_claims_path = './data/train-claims.json'
dev_claims_path = './data/dev-claims.json'
evidence_path = './data/evidence.json'

## 训minilm的

In [3]:
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
import json
import random

# Load the data
with open(train_claims_path, 'r') as f:
    train_claims = json.load(f)
with open(evidence_path, 'r') as f:
    evidence_dict = json.load(f)


# Construct the training sample list (claim, evidence_text) -> label defaults to 1.0
train_samples = []
missed = 0

for claim in train_claims.values():
    claim_text = claim['claim_text']
    evidence_ids = claim.get('evidences', [])
    for eid in evidence_ids:
        if eid in evidence_dict:
            ev_text = evidence_dict[eid]
            train_samples.append(InputExample(texts=[claim_text, ev_text], label=1.0))
        else:
            missed += 1


print(f"Total training pairs: {len(train_samples)}")
print(f"Missing evidence ids: {missed}")

# Load the pre-trained model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Build the DataLoader
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=32)

# Define the loss function
train_loss = losses.MultipleNegativesRankingLoss(model)

# Start training
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=5,  
    warmup_steps=100,
    show_progress_bar=True
)

# Save the model
model.save('./model/my_finetuned_minilm_retriever')
print("Finetuned model saved.")


Total training pairs: 4122
Missing evidence ids: 0


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.7078


Finetuned model saved.


# 训msmarco reranker的

In [4]:
from sentence_transformers import CrossEncoder, InputExample
from torch.utils.data import DataLoader
import json
import random



with open(train_claims_path, 'r') as f:
    train_claims = json.load(f)
with open(evidence_path, 'r') as f:
    evidence_dict = json.load(f)


# Construct positive and negative samples
train_samples = []

def generate_samples(claims_data):
    samples = []
    for claim in claims_data.values():
        claim_text = claim["claim_text"]
        evidence_ids = claim.get("evidences", [])
        pos_evidence_texts = [evidence_dict[eid] for eid in evidence_ids if eid in evidence_dict]

        # Positive samples
        for ev in pos_evidence_texts:
            samples.append(InputExample(texts=[claim_text, ev], label=1.0))

        # Negative samples
        neg_pool = [e for eid, e in evidence_dict.items() if eid not in evidence_ids]
        for _ in range(len(pos_evidence_texts)):
            neg_ev = random.choice(neg_pool)
            samples.append(InputExample(texts=[claim_text, neg_ev], label=0.0))

    return samples

train_samples.extend(generate_samples(train_claims))

print(f"Total training samples: {len(train_samples)}")

# Build the DataLoader (InputExample is a valid format)
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=16)

# Load MS MARCO CrossEncoder
model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2", num_labels=1)

# Train the model
model.fit(
    train_dataloader=train_dataloader,
    epochs=5,
    warmup_steps=100,
    show_progress_bar=True
)

# Save the model
model.save('./model/my_finetuned_msmarco_reranker')
print("Finetuned model saved.")


Total training samples: 8244


Step,Training Loss
500,0.1872
1000,0.0232
1500,0.0096
2000,0.0072
2500,0.0019


Finetuned model saved.


In [5]:
model = SentenceTransformer('./model/my_finetuned_minilm_retriever')
reranker =  CrossEncoder('./model/my_finetuned_msmarco_reranker')

## encode evidence dictionary

In [6]:
import nltk
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from lemminflect import getAllInflections
nlp = spacy.load("en_core_web_sm")

# load data
with open(train_claims_path, 'r') as f:
    train_claims = json.load(f)

with open(evidence_path, 'r') as f:
    evidence_dict = json.load(f)

# Extract all nouns from claim_text and count their frequencies 
all_nouns = []
for claim_obj in train_claims.values():
    doc = nlp(claim_obj["claim_text"])
    nouns = [token.lemma_.lower() for token in doc if token.pos_ == "NOUN"]
    all_nouns.extend(nouns)

# Select the top 100 most frequent nouns as keywords
top_keywords = set(word for word, _ in Counter(all_nouns).most_common(100))

all_forms = set()
for lemma in top_keywords:
  all_forms.add(lemma)
  # Get all possible noun forms
  infl_map = getAllInflections(lemma, upos="NOUN")
  # infl_map is a dict: { 'NNS': ['cats'], 'NNPS': ['children'], ... }
  for forms in infl_map.values():
      all_forms.update(forms)


In [7]:
def contains_climate_keywords(text: str, all_forms: set) -> bool:
    # Lowercase the text and split into words then check the set
    words = re.findall(r"\b[a-z']+\b", text.lower())
    return any(word in all_forms for word in words)


def is_english(text: str, threshold: float = 0.5) -> bool:
    # Clean the text, only keep letters and spaces
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    if len(text) == 0:  # If the text is empty after cleaning, return False
        return False
    # Calculate the proportion of English characters
    english_char_count = sum(1 for char in text if char.isalpha())
    return (english_char_count / len(text)) >= threshold

def clean_and_split(eid, text):
    result_ids = []
    result_texts = []
    sentences = sent_tokenize(text)
    for i, sent in enumerate(sentences):
        sent = sent.lower()
        sent = re.sub(r'[^a-z0-9\s.,!?]', '', sent)  # Remove punctuation
        sent = re.sub(r'\s+', ' ', sent).strip()
        if len(sent.split()) >= 5:  # Optional: Filter out too short texts
          result_ids.append(f"{eid}_s{i}")  # Use the original eID plus sentence index
          result_texts.append(sent)
    return result_ids, result_texts



In [8]:
# Load the evidence embeddings
word_embedding_path = './word_embedding/evidence_embeddings.npy'
word_embedding_meta_path = "./word_embedding/evidence_meta.pkl"


with open(evidence_path, 'r') as f:
    evidence_dict = json.load(f)
# 1. Remove non-English
eids  = list(evidence_dict.keys())
texts = list(evidence_dict.values())
english_pairs = [
  (eid, txt)
  for eid, txt in zip(eids, texts)
  if is_english(txt)
]
print(f"Step1: English keep {len(english_pairs)}/{len(texts)}")

# 2. Remove non-climate-related
climate_pairs = [
    (eid, txt)
    for eid, txt in english_pairs
    if contains_climate_keywords(txt, all_forms)
]
print(f"Step2: Climate-related keep {len(climate_pairs)}/{len(english_pairs)}")


# The results of cleaning and splitting
cleaned_evidence_ids = []
cleaned_evidence_texts = []
original_evidence_ids = []  # Record the original evidence_id of each sentence

# Iterate through the evidence data and clean and split
for eid, text in climate_pairs:
    cleaned_ids, cleaned_texts = clean_and_split(eid, text)
    cleaned_evidence_ids.extend(cleaned_ids)
    cleaned_evidence_texts.extend(cleaned_texts)
    original_evidence_ids.extend([eid] * len(cleaned_ids))  # Each sentence records the original eID

# Encode the cleaned sentences using Sentence-BERT
evidence_embeddings = model.encode(
    cleaned_evidence_texts,
    convert_to_numpy=True,
    normalize_embeddings=True,
    show_progress_bar=True
)

# Save the encoded embeddings (embeddings)
np.save(word_embedding_path, evidence_embeddings)

# Save the evidence_ids and texts after splitting and the corresponding original evidence_id mapping
with open(word_embedding_meta_path, "wb") as f:
    pickle.dump((cleaned_evidence_ids, cleaned_evidence_texts, original_evidence_ids), f)


Step1: English keep 1207838/1208827
Step2: Climate-related keep 385471/1207838


Batches:   0%|          | 0/12133 [00:00<?, ?it/s]

## test and predict

In [None]:
# Load the evidence embeddings
word_embedding_path = './word_embedding/evidence_embeddings.npy'
word_embedding_meta_path = "./word_embedding/evidence_meta.pkl"

# Load numpy embeddings
evidence_embeddings = np.load(word_embedding_path)

# Load evidence_ids, evidence_texts, and original_evidence_ids
with open(word_embedding_meta_path, "rb") as f:
    evidence_ids, evidence_texts, original_evidence_ids = pickle.load(f)


dimension = evidence_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)  # Inner Product = Cosine similarity if normalized
index.add(evidence_embeddings)


def clean_claim(claim: str) -> str:
    # Lowercase
    claim = claim.lower()
    # Remove punctuation
    claim = re.sub(r'[^a-z0-9\s]', '', claim)
    # Remove extra spaces
    claim = re.sub(r'\s+', ' ', claim).strip()
    return claim

# Create mapping
evidence_dict = dict(zip(evidence_ids, evidence_texts))

with open(evidence_path, 'r') as f:
    original_evidence_dict = json.load(f)

def retrieve_evidence(claim_id, claim_data, retrieval=100, top_k=5):
    claim_text = claim_data["claim_text"]
    cleaned_claim = clean_claim(claim_text)

    # Step 1: Coarse retrieval (model + FAISS)
    claim_embedding = model.encode([cleaned_claim], convert_to_numpy=True, normalize_embeddings=True)
    scores, indices = index.search(claim_embedding, retrieval * 3)

    # Step 2: Remove duplicate candidates
    seen_original_ids = set()
    candidates = []
    for i in indices[0]:
        eid = evidence_ids[i]
        text = evidence_dict[eid]
        original_id = original_evidence_ids[i]

        if original_id not in seen_original_ids:
            candidates.append((original_id, eid, text))
            seen_original_ids.add(original_id)

        if len(candidates) >= retrieval:
            break

    # Step 3: Reranking (CrossEncoder)
    pairs = [(claim_text, original_evidence_dict[orig_id]) for (orig_id, _, _) in candidates]  
    similarity_scores = reranker.predict(pairs)

    reranked = sorted(zip(candidates, similarity_scores), key=lambda x: x[1], reverse=True)

    # Only return the original document-level evidence ID
    top_k_original_ids = [orig_id for (orig_id, _, _), _ in reranked[:top_k]]

    # Construct the final result dict
    result = {
        "claim_text": claim_text,
        "evidences": top_k_original_ids
    }

    return result


In [11]:
from tqdm import tqdm
import numpy as np
import json

# Load dev claims
with open(dev_claims_path, 'r') as f:
    dev_claims = json.load(f)

claim_ids = list(dev_claims.keys())

retrieval_values = [100,200,500,1000]
top_k_values = [3, 4, 5]

best_f1 = 0
best_setting = {}

for retrieval in retrieval_values:
    for top_k in top_k_values:
        recalls = []
        precisions = []
        f1s = []

        for cid in tqdm(claim_ids, desc=f"Evaluating R={retrieval}, K={top_k}"):
            truth = set(dev_claims[cid]["evidences"])
            
            retrieved_info = retrieve_evidence(cid, dev_claims[cid], retrieval=retrieval, top_k=top_k)
            retrieved = set(retrieved_info["evidences"])

            hit = len(truth & retrieved)

            recall = hit / len(truth) if len(truth) > 0 else 0
            precision = hit / top_k if top_k > 0 else 0

            if precision + recall > 0:
                f1 = 2 * precision * recall / (precision + recall)
            else:
                f1 = 0

            recalls.append(recall)
            precisions.append(precision)
            f1s.append(f1)

        avg_recall = np.mean(recalls)
        avg_precision = np.mean(precisions)
        avg_f1 = np.mean(f1s)

        print(f"\nRetrieval={retrieval}, Top-K={top_k}")
        print(f"   - Avg Recall   : {avg_recall:.2%}")
        print(f"   - Avg Precision: {avg_precision:.2%}")
        print(f"   - Avg F1       : {avg_f1:.2%}")

        if avg_f1 > best_f1:
            best_f1 = avg_f1
            best_setting = {'retrieval': retrieval, 'top_k': top_k}

print(f"\nBest Setting: Retrieval={best_setting['retrieval']}, Top-K={best_setting['top_k']}, F1={best_f1:.2%}")


Evaluating R=100, K=3: 100%|██████████| 154/154 [00:21<00:00,  7.14it/s]



Retrieval=100, Top-K=3
   - Avg Recall   : 22.71%
   - Avg Precision: 20.35%
   - Avg F1       : 19.85%


Evaluating R=100, K=4: 100%|██████████| 154/154 [00:21<00:00,  7.21it/s]



Retrieval=100, Top-K=4
   - Avg Recall   : 26.36%
   - Avg Precision: 18.02%
   - Avg F1       : 19.90%


Evaluating R=100, K=5: 100%|██████████| 154/154 [00:23<00:00,  6.63it/s]



Retrieval=100, Top-K=5
   - Avg Recall   : 27.86%
   - Avg Precision: 15.45%
   - Avg F1       : 18.59%


Evaluating R=200, K=3: 100%|██████████| 154/154 [00:38<00:00,  4.03it/s]



Retrieval=200, Top-K=3
   - Avg Recall   : 22.25%
   - Avg Precision: 19.70%
   - Avg F1       : 19.32%


Evaluating R=200, K=4: 100%|██████████| 154/154 [00:41<00:00,  3.74it/s]



Retrieval=200, Top-K=4
   - Avg Recall   : 25.65%
   - Avg Precision: 17.37%
   - Avg F1       : 19.25%


Evaluating R=200, K=5: 100%|██████████| 154/154 [00:43<00:00,  3.56it/s]



Retrieval=200, Top-K=5
   - Avg Recall   : 27.47%
   - Avg Precision: 15.06%
   - Avg F1       : 18.20%


Evaluating R=500, K=3: 100%|██████████| 154/154 [01:45<00:00,  1.46it/s]



Retrieval=500, Top-K=3
   - Avg Recall   : 20.89%
   - Avg Precision: 18.61%
   - Avg F1       : 18.25%


Evaluating R=500, K=4: 100%|██████████| 154/154 [01:49<00:00,  1.41it/s]



Retrieval=500, Top-K=4
   - Avg Recall   : 25.31%
   - Avg Precision: 16.88%
   - Avg F1       : 18.84%


Evaluating R=500, K=5: 100%|██████████| 154/154 [01:50<00:00,  1.39it/s]



Retrieval=500, Top-K=5
   - Avg Recall   : 26.56%
   - Avg Precision: 14.55%
   - Avg F1       : 17.57%


Evaluating R=1000, K=3: 100%|██████████| 154/154 [03:40<00:00,  1.43s/it]



Retrieval=1000, Top-K=3
   - Avg Recall   : 20.78%
   - Avg Precision: 18.61%
   - Avg F1       : 18.21%


Evaluating R=1000, K=4: 100%|██████████| 154/154 [03:41<00:00,  1.44s/it]



Retrieval=1000, Top-K=4
   - Avg Recall   : 25.18%
   - Avg Precision: 16.72%
   - Avg F1       : 18.69%


Evaluating R=1000, K=5: 100%|██████████| 154/154 [03:39<00:00,  1.42s/it]


Retrieval=1000, Top-K=5
   - Avg Recall   : 26.61%
   - Avg Precision: 14.55%
   - Avg F1       : 17.59%

Best Setting: Retrieval=100, Top-K=4, F1=19.90%





In [None]:
from tqdm import tqdm
import numpy as np
import json

# Load dev claims
with open(dev_claims_path, 'r') as f:
    dev_claims = json.load(f)

claim_ids = list(dev_claims.keys())

retrieval = 100
top_k = 4

recalls = []
precisions = []
f1s = []

for cid in tqdm(claim_ids, desc=f"Evaluating R={retrieval}, K={top_k}"):
    truth = set(dev_claims[cid]["evidences"])
    
    retrieved_info = retrieve_evidence(cid, dev_claims[cid], retrieval=retrieval, top_k=top_k)
    retrieved = set(retrieved_info["evidences"])

    hit = len(truth & retrieved)

    recall = hit / len(truth) if len(truth) > 0 else 0
    precision = hit / top_k if top_k > 0 else 0

    if precision + recall > 0:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0

    recalls.append(recall)
    precisions.append(precision)
    f1s.append(f1)

avg_recall = np.mean(recalls)
avg_precision = np.mean(precisions)
avg_f1 = np.mean(f1s)

print(f"\nEvaluation Results (Retrieval={retrieval}, Top-K={top_k})")
print(f"   - Avg Recall   : {avg_recall:.2%}")
print(f"   - Avg Precision: {avg_precision:.2%}")
print(f"   - Avg F1       : {avg_f1:.2%}")

In [10]:
import json
import re
import numpy as np
from tqdm import tqdm

# Load test claims data
with open('./data/test-claims-unlabelled.json', 'r') as f:
    test_claims = json.load(f)

# Clean function
def clean_claim(claim: str) -> str:
    claim = claim.lower()
    claim = re.sub(r'[^a-z0-9\s]', '', claim)
    claim = re.sub(r'\s+', ' ', claim).strip()
    return claim

# Execute retrieval and save
test_predictions = {}

for claim_id, claim_data in tqdm(test_claims.items(), desc="Retrieving Evidence"):
    result = retrieve_evidence(claim_id, claim_data, retrieval=100, top_k=4)

    # Remove claim_label (because test has no label)
    if "claim_label" in result:
        del result["claim_label"]

    test_predictions[claim_id] = result

# Save the predictions
with open("test-claims-predictions.json", "w") as f:
    json.dump(test_predictions, f, indent=2)

print("Retrieval with reranker completed and saved to test-claims-predictions.json")


Retrieving Evidence: 100%|██████████| 153/153 [00:16<00:00,  9.14it/s]

Retrieval with reranker completed and saved to test-claims-predictions.json



