In [None]:
# Import all the packages
import json
import os
import pickle
import re
from collections import Counter

import faiss
import nltk
import numpy as np
from datasets import Dataset
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
# Input files, given relative to the notebook's working directory.
train_claims_path = './data/train-claims.json'  # labelled claims used for fine-tuning
dev_claims_path = './data/dev-claims.json'  # labelled claims used for evaluation (optionally also for training)
evidence_path = './data/evidence.json'  # evidence id -> evidence text

## Fine-tune the MiniLM retriever

In [None]:
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
import json
import random

# Set `train = True` to fine-tune the bi-encoder retriever; leave it False to
# reuse a model that an earlier run already saved to disk.
train = False
# When True the dev claims are added to the training pairs and the model is
# saved under the "final_" name instead of the "my_" name.
dev_train_included = False
if train:
    # Load the data. The encoding is explicit so the JSON decodes the same way
    # on every platform (the default on Windows is a locale code page).
    with open(train_claims_path, 'r', encoding='utf-8') as f:
        train_claims = json.load(f)
    with open(evidence_path, 'r', encoding='utf-8') as f:
        evidence_dict = json.load(f)
    if dev_train_included:
        with open(dev_claims_path, 'r', encoding='utf-8') as f:
            dev_claims = json.load(f)

    def _build_positive_pairs(claims_data):
        """Build (claim, evidence) training pairs for one claim set.

        Args:
            claims_data: mapping of claim id -> claim record holding
                'claim_text' and, optionally, a list of 'evidences' ids.

        Returns:
            (pairs, missing): a list of InputExample objects with label 1.0,
            one per evidence id found in `evidence_dict`, and the number of
            evidence ids that were not found there.
        """
        pairs = []
        missing = 0
        for claim in claims_data.values():
            claim_text = claim['claim_text']
            for eid in claim.get('evidences', []):
                if eid in evidence_dict:
                    pairs.append(InputExample(texts=[claim_text, evidence_dict[eid]], label=1.0))
                else:
                    missing += 1
        return pairs, missing

    # Positive pairs only: MultipleNegativesRankingLoss is given no explicit
    # negative examples here.
    train_samples, missed = _build_positive_pairs(train_claims)
    if dev_train_included:
        dev_pairs, dev_missed = _build_positive_pairs(dev_claims)
        train_samples.extend(dev_pairs)
        missed += dev_missed

    print(f"Total training pairs: {len(train_samples)}")
    print(f"Missing evidence ids: {missed}")

    # Load the pretrained model
    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

    # Build the DataLoader
    train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=32)

    # Define the loss
    train_loss = losses.MultipleNegativesRankingLoss(model)

    # Train
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=5,  # 2-3 is also an option, depending on available compute
        warmup_steps=100,
        show_progress_bar=True
    )

    # Save the model; the directory name records whether dev data was used.
    if dev_train_included:
        save_dir = './model/final_finetuned_minilm_retriever'
    else:
        save_dir = './model/my_finetuned_minilm_retriever'
    model.save(save_dir)
    print("✅ Finetuned model saved.")


Total training pairs: 4613
Missing evidence ids: 0


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.6943


✅ Finetuned model saved.


## Fine-tune the MS MARCO reranker

In [None]:
from sentence_transformers import CrossEncoder, InputExample
from torch.utils.data import DataLoader
import json
import random

# Set `train = True` to fine-tune the cross-encoder reranker; leave it False
# to reuse a model that an earlier run already saved to disk.
train = False
# When True the dev claims are added to the training samples and the model is
# saved under the "final_" name instead of the "my_" name.
dev_train_included = False

if train:
    # Explicit encoding so the JSON decodes the same way on every platform.
    with open(train_claims_path, 'r', encoding='utf-8') as f:
        train_claims = json.load(f)
    with open(evidence_path, 'r', encoding='utf-8') as f:
        evidence_dict = json.load(f)
    if dev_train_included:
        with open(dev_claims_path, 'r', encoding='utf-8') as f:
            dev_claims = json.load(f)

    # Build positive and negative samples
    train_samples = []

    # Every evidence id, materialised once. Negatives are drawn from this list
    # by rejection sampling instead of rebuilding a filtered copy of the whole
    # corpus for each claim.
    all_evidence_ids = list(evidence_dict)

    def generate_samples(claims_data):
        """Create labelled (claim, evidence) samples for one claim set.

        For each claim, every gold evidence found in `evidence_dict` yields a
        sample with label 1.0, followed by the same number of samples with
        label 0.0 whose evidence is drawn uniformly at random from the corpus
        entries that are not gold evidence for that claim.

        Args:
            claims_data: mapping of claim id -> claim record holding
                'claim_text' and, optionally, a list of 'evidences' ids.

        Returns:
            A list of InputExample objects (positives first, per claim).
        """
        samples = []
        for claim in claims_data.values():
            claim_text = claim["claim_text"]
            evidence_ids = claim.get("evidences", [])
            positive_ids = set(evidence_ids)  # O(1) membership checks
            pos_evidence_texts = [evidence_dict[eid] for eid in evidence_ids if eid in evidence_dict]

            # Positive samples
            for ev in pos_evidence_texts:
                samples.append(InputExample(texts=[claim_text, ev], label=1.0))

            # Negative samples. Skip the claim when the corpus holds nothing
            # but its own gold evidence, otherwise the loop below never ends.
            positives_in_corpus = sum(1 for eid in positive_ids if eid in evidence_dict)
            if positives_in_corpus >= len(all_evidence_ids):
                continue
            for _ in range(len(pos_evidence_texts)):
                neg_id = random.choice(all_evidence_ids)
                while neg_id in positive_ids:
                    neg_id = random.choice(all_evidence_ids)
                samples.append(InputExample(texts=[claim_text, evidence_dict[neg_id]], label=0.0))

        return samples

    train_samples.extend(generate_samples(train_claims))
    if dev_train_included:
        train_samples.extend(generate_samples(dev_claims))

    print(f"✅ Total training samples: {len(train_samples)}")

    # Build the DataLoader over the InputExample list
    train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=16)

    # Load the MS MARCO CrossEncoder with a single output score
    model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2", num_labels=1)

    # Train
    model.fit(
        train_dataloader=train_dataloader,
        epochs=5,
        warmup_steps=100,
        show_progress_bar=True
    )

    # Save the model; the directory name records whether dev data was used.
    if dev_train_included:
        save_dir = './model/final_finetuned_msmarco_reranker'
    else:
        save_dir = './model/my_finetuned_msmarco_reranker'
    model.save(save_dir)
    print("✅ Finetuned model saved.")


In [None]:
# Pick the saved model directories that match the training configuration:
# the "final_" models were trained on train+dev, the "my_" models on train only.
retriever_dir, reranker_dir = (
    ('./model/final_finetuned_minilm_retriever', './model/final_finetuned_msmarco_reranker')
    if dev_train_included
    else ('./model/my_finetuned_minilm_retriever', './model/my_finetuned_msmarco_reranker')
)
model = SentenceTransformer(retriever_dir)  # bi-encoder used for first-stage retrieval
reranker = CrossEncoder(reranker_dir)  # cross-encoder used to re-score candidates

## encode evidence dictionary

In [None]:
import nltk
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from lemminflect import getAllInflections

# spaCy pipeline used below for part-of-speech tags and lemmas.
nlp = spacy.load("en_core_web_sm")

# Explicit encoding so the JSON decodes the same way on every platform.
with open(train_claims_path, 'r', encoding='utf-8') as f:
    train_claims = json.load(f)

with open(evidence_path, 'r', encoding='utf-8') as f:
    evidence_dict = json.load(f)

# === Collect the lower-cased lemma of every noun in the training claims ===
# nlp.pipe streams the texts through the pipeline in batches, which avoids the
# per-call overhead of invoking nlp() once per claim.
all_nouns = []
claim_texts = (claim_obj["claim_text"] for claim_obj in train_claims.values())
for doc in nlp.pipe(claim_texts):
    all_nouns.extend(token.lemma_.lower() for token in doc if token.pos_ == "NOUN")

# === Keep the 100 most frequent noun lemmas as keywords ===
top_keywords = set(word for word, _ in Counter(all_nouns).most_common(100))

# === Expand every keyword with its inflected noun forms ===
all_forms = set()
for lemma in top_keywords:
    all_forms.add(lemma)
    # getAllInflections returns a mapping whose values are collections of
    # inflected forms of the lemma; every form is added to the lookup set.
    infl_map = getAllInflections(lemma, upos="NOUN")
    for forms in infl_map.values():
        all_forms.update(forms)


In [None]:
def contains_climate_keywords(text: str, all_forms: set) -> bool:
    """Report whether any word of ``text`` appears in ``all_forms``.

    The text is lower-cased and split into runs of ``a-z`` and apostrophes
    delimited by word boundaries, so the lookup is case-insensitive and only
    whole words can match.

    Args:
        text: the text to scan.
        all_forms: set of lower-case keyword forms to look for.

    Returns:
        True as soon as one word is found in ``all_forms``, otherwise False.
    """
    for token in re.findall(r"\b[a-z']+\b", text.lower()):
        if token in all_forms:
            return True
    return False


def is_english(text: str, threshold: float = 0.5) -> bool:
  """Heuristically decide whether ``text`` is written in English.

  The score is the share of alphabetic characters that are ASCII letters.
  Digits, punctuation and whitespace are ignored on both sides of the ratio,
  so numeric-heavy English text is not penalised, while text written mostly
  in another script scores low even if it contains a few English words.

  The ratio has to be taken over the letters of the original text: stripping
  non-ASCII characters first would leave only ASCII letters and whitespace,
  and almost any text containing a single English word would then pass.

  Args:
    text: the text to classify.
    threshold: minimum share of ASCII letters (0.0-1.0) required.

  Returns:
    True when the share of ASCII letters is at least ``threshold``;
    False when it is lower or when the text contains no letters at all.
  """
  letters = [char for char in text if char.isalpha()]
  if not letters:  # nothing to judge (empty, digits or punctuation only)
    return False
  english_char_count = sum(1 for char in letters if char.isascii())
  return (english_char_count / len(letters)) >= threshold

def clean_and_split(eid, text):
  """Split one evidence text into cleaned sentences.

  Each sentence produced by ``sent_tokenize`` is lower-cased, stripped of every
  character other than ``a-z``, ``0-9``, whitespace and ``. , ! ?`` (so that
  basic sentence punctuation is kept), and has its whitespace runs collapsed to
  single spaces. Sentences with fewer than five whitespace-separated tokens
  after cleaning are dropped.

  Args:
    eid: id of the evidence the text belongs to.
    text: the raw evidence text.

  Returns:
    ``(ids, texts)``: two parallel lists. Each id is ``f"{eid}_s{i}"`` where
    ``i`` is the sentence's position in the tokenizer output, so the numbering
    has gaps wherever a short sentence was dropped.
  """
  kept_ids = []
  kept_texts = []
  for position, raw_sentence in enumerate(sent_tokenize(text)):
    normalized = re.sub(r'[^a-z0-9\s.,!?]', '', raw_sentence.lower())
    normalized = re.sub(r'\s+', ' ', normalized).strip()
    if len(normalized.split()) < 5:
      continue  # too short to be useful
    kept_ids.append(f"{eid}_s{position}")
    kept_texts.append(normalized)
  return kept_ids, kept_texts



In [None]:
encode = False  # Set to True to (re)build the evidence embeddings; False reuses the files already on disk
# The embedding and metadata files are kept separate per model variant
# ("final_" = retriever trained on train+dev) so they cannot be mixed up.
if dev_train_included:
  word_embedding_path = './word_embedding/final_evidence_embeddings.npy'
  word_embedding_meta_path = "./word_embedding/final_evidence_meta.pkl"
else:
  word_embedding_path = './word_embedding/evidence_embeddings.npy'
  word_embedding_meta_path = "./word_embedding/evidence_meta.pkl"

if encode:
  # Explicit encoding so the JSON decodes the same way on every platform.
  with open(evidence_path, 'r', encoding='utf-8') as f:
    evidence_dict = json.load(f)

  # --- 1. Drop evidence that is not English --- #
  english_pairs = [
    (eid, txt)
    for eid, txt in evidence_dict.items()
    if is_english(txt)
  ]
  print(f"Step1: English keep {len(english_pairs)}/{len(evidence_dict)}")

  # --- 2. Drop evidence that contains none of the keyword forms --- #
  climate_pairs = [
    (eid, txt)
    for eid, txt in english_pairs
    if contains_climate_keywords(txt, all_forms)
  ]
  print(f"Step2: Climate-related keep {len(climate_pairs)}/{len(english_pairs)}")

  # --- 3. Clean and split into sentences --- #
  cleaned_evidence_ids = []
  cleaned_evidence_texts = []
  original_evidence_ids = []  # for each sentence, the id of the evidence it came from

  for eid, text in climate_pairs:
    cleaned_ids, cleaned_texts = clean_and_split(eid, text)
    cleaned_evidence_ids.extend(cleaned_ids)
    cleaned_evidence_texts.extend(cleaned_texts)
    original_evidence_ids.extend([eid] * len(cleaned_ids))  # one entry per kept sentence

  # --- 4. Embed the cleaned sentences with the retriever --- #
  # Embeddings are L2-normalised so an inner-product search ranks by cosine similarity.
  evidence_embeddings = model.encode(
    cleaned_evidence_texts,
    convert_to_numpy=True,
    normalize_embeddings=True,
    show_progress_bar=True
  )

  # --- 5. Persist embeddings and metadata --- #
  # Neither np.save nor open() creates missing parent directories, so make
  # sure they exist before the (expensive) result is written.
  os.makedirs(os.path.dirname(word_embedding_path), exist_ok=True)
  os.makedirs(os.path.dirname(word_embedding_meta_path), exist_ok=True)

  np.save(word_embedding_path, evidence_embeddings)

  # Sentence ids, sentence texts and the originating evidence ids, stored as
  # three parallel lists aligned with the rows of the embedding matrix.
  with open(word_embedding_meta_path, "wb") as f:
      pickle.dump((cleaned_evidence_ids, cleaned_evidence_texts, original_evidence_ids), f)


## test and predict

In [None]:
# Load the sentence embeddings written by the encoding cell. The matrix is
# coerced to C-contiguous float32, the layout FAISS operates on, in case the
# file was produced with a different dtype.
evidence_embeddings = np.ascontiguousarray(np.load(word_embedding_path), dtype=np.float32)

# Load the parallel metadata lists: sentence ids, sentence texts and the id of
# the evidence each sentence came from.
# NOTE: pickle.load can execute arbitrary code; only open metadata files that
# were produced by this notebook.
with open(word_embedding_meta_path, "rb") as f:
  evidence_ids, evidence_texts, original_evidence_ids = pickle.load(f)

# Fail fast when the embedding file and the metadata file come from different
# runs: every later lookup maps a row number to these lists, so a mismatch
# would silently return the wrong evidence.
if not (
  len(evidence_ids) == len(evidence_texts) == len(original_evidence_ids) == evidence_embeddings.shape[0]
):
  raise ValueError(
    "Embedding matrix and metadata are out of sync: "
    f"{evidence_embeddings.shape[0]} rows vs "
    f"{len(evidence_ids)}/{len(evidence_texts)}/{len(original_evidence_ids)} metadata entries"
  )


dimension = evidence_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)  # Inner product equals cosine similarity because the vectors were saved normalised
index.add(evidence_embeddings)


def clean_claim(claim: str) -> str:
  """Normalise a claim before it is embedded.

  Lower-cases the text, removes every character that is not ``a-z``, ``0-9``
  or whitespace, and collapses whitespace runs into single spaces with no
  leading or trailing space. Stop words are kept.

  Args:
    claim: the raw claim text.

  Returns:
    The normalised claim text.
  """
  lowered = claim.lower()
  alnum_only = re.sub(r'[^a-z0-9\s]', '', lowered)
  return re.sub(r'\s+', ' ', alnum_only).strip()

# Sentence-level lookup: split-sentence id -> cleaned sentence text.
# NOTE: this rebinds `evidence_dict`, which earlier cells used for the raw
# corpus; from here on the raw corpus is available as `original_evidence_dict`.
evidence_dict = dict(zip(evidence_ids, evidence_texts))

# Raw corpus: original evidence id -> unmodified evidence text. Read with an
# explicit encoding so the JSON decodes the same way on every platform.
with open(evidence_path, 'r', encoding='utf-8') as f:
    original_evidence_dict = json.load(f)

def retrieve_evidence(claim_id, claim_data, retrieval=100, top_k=5):
    """Retrieve and rerank evidence for one claim.

    Stage 1 embeds the cleaned claim with the bi-encoder and searches the
    sentence-level FAISS index. Stage 2 collapses the hits to distinct
    original evidence ids. Stage 3 scores each (claim, full evidence text)
    pair with the cross-encoder and keeps the best ``top_k``.

    Args:
        claim_id: id of the claim (not used for retrieval; kept for callers).
        claim_data: claim record; only ``claim_data["claim_text"]`` is read.
        retrieval: maximum number of distinct evidence documents passed to
            the reranker.
        top_k: number of evidence ids to return.

    Returns:
        ``{"claim_text": <original claim text>, "evidences": [<evidence id>, ...]}``
        with at most ``top_k`` original (document-level) evidence ids, best
        first. The list is empty when the index yields no candidates.
    """
    claim_text = claim_data["claim_text"]
    cleaned_claim = clean_claim(claim_text)

    # Step 1: coarse retrieval (bi-encoder + FAISS). Three times as many
    # sentences as requested documents are fetched because several sentences
    # can belong to the same document. The request is capped at the index
    # size: FAISS pads missing results with the label -1.
    claim_embedding = model.encode([cleaned_claim], convert_to_numpy=True, normalize_embeddings=True)
    search_k = min(retrieval * 3, index.ntotal)

    # Step 2: de-duplicate hits down to distinct original evidence ids,
    # preserving the retrieval order.
    candidate_ids = []
    if search_k > 0:
        _, indices = index.search(claim_embedding, search_k)
        seen_original_ids = set()
        for i in indices[0]:
            if len(candidate_ids) >= retrieval:
                break
            if i < 0:
                # Padding label; as a list index it would silently pick the
                # last sentence of the corpus.
                continue
            original_id = original_evidence_ids[i]
            if original_id not in seen_original_ids:
                seen_original_ids.add(original_id)
                candidate_ids.append(original_id)

    if not candidate_ids:
        # Nothing to rerank; avoid calling the cross-encoder on an empty batch.
        return {"claim_text": claim_text, "evidences": []}

    # Step 3: rerank with the cross-encoder, using the raw claim and the full,
    # unmodified evidence text rather than the cleaned sentence.
    pairs = [(claim_text, original_evidence_dict[orig_id]) for orig_id in candidate_ids]
    similarity_scores = reranker.predict(pairs)

    reranked = sorted(zip(candidate_ids, similarity_scores), key=lambda item: item[1], reverse=True)

    # Only the document-level evidence ids are returned.
    top_k_original_ids = [orig_id for orig_id, _ in reranked[:top_k]]

    return {
        "claim_text": claim_text,
        "evidences": top_k_original_ids
    }


In [None]:
from tqdm import tqdm
import numpy as np
import json

# Load the dev claims. The encoding is explicit so the JSON decodes the same
# way on every platform.
with open(dev_claims_path, 'r', encoding='utf-8') as f:
    dev_claims = json.load(f)

claim_ids = list(dev_claims.keys())
top_k = 4  # number of evidence ids requested per claim

# Per-claim metrics, averaged after the loop (macro average over claims).
recalls = []
precisions = []
f1s = []

for cid in tqdm(claim_ids, desc="Evaluating"):
    truth = set(dev_claims[cid]["evidences"])

    # retrieve_evidence returns a dict; the ids are under "evidences".
    retrieved_info = retrieve_evidence(cid, dev_claims[cid], retrieval=100, top_k=top_k)
    retrieved = set(retrieved_info["evidences"])

    hit = len(truth & retrieved)

    # Recall: share of the gold evidence that was retrieved.
    recall = hit / len(truth) if truth else 0
    recalls.append(recall)

    # Precision@k: hits divided by the number of requested results, even when
    # fewer than top_k ids came back.
    precision = hit / top_k if top_k > 0 else 0
    precisions.append(precision)

    # F1: harmonic mean of the two, defined as 0 when both are 0.
    if precision + recall > 0:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0
    f1s.append(f1)

# Report the averages
print(f"\n📊 Average Recall@{top_k}:    {np.mean(recalls):.2%}")
print(f"📊 Average Precision@{top_k}: {np.mean(precisions):.2%}")
print(f"📊 Average F1@{top_k}:        {np.mean(f1s):.2%}")


Evaluating: 100%|██████████| 154/154 [00:17<00:00,  9.06it/s]


📊 Average Recall@4:    26.36%
📊 Average Precision@4: 18.02%
📊 Average F1@4:        19.90%





In [None]:
import json
import re
import numpy as np
from tqdm import tqdm

# Load the unlabelled test claims. The encoding is explicit so the JSON
# decodes the same way on every platform.
with open('./data/test-claims-unlabelled.json', 'r', encoding='utf-8') as f:
    test_claims = json.load(f)

# clean_claim (used inside retrieve_evidence) is defined in the retrieval cell
# above and is deliberately not redefined here: a second definition would
# silently shadow the first and the two copies could drift apart.

# Run retrieval for every test claim and collect the results.
test_predictions = {}

for claim_id, claim_data in tqdm(test_claims.items(), desc="Retrieving Evidence"):
    result = retrieve_evidence(claim_id, claim_data, retrieval=100, top_k=4)

    # Test claims carry no label; drop the key if a result ever contains one.
    result.pop("claim_label", None)

    test_predictions[claim_id] = result

# Save the predictions
with open("test-claims-predictions.json", "w", encoding='utf-8') as f:
    json.dump(test_predictions, f, indent=2)

print("✅ Retrieval with reranker completed and saved to test-claims-predictions.json")


Retrieving Evidence: 100%|██████████| 153/153 [00:16<00:00,  9.11it/s]

✅ Retrieval with reranker completed and saved to test-claims-predictions.json



