In [1]:
import os
import json
import pandas as pd

def csv2json(data_path):
    """Read a UTF-8 encoded CSV file and return its rows as a list of dicts.

    Args:
        data_path: Path of the CSV file to load.

    Returns:
        A list with one dict per CSV row, keyed by column name.
    """
    return pd.read_csv(data_path, encoding="utf-8").to_dict(orient='records')

# Convert each CSV split into a pretty-printed UTF-8 JSON file that shares the
# CSV's stem (e.g. train.csv -> train.json).
data_paths = ["train.csv", "val.csv"]
for data_path in data_paths:
    # Everything before the last dot is the stem; swap the extension for "json".
    data_save_path = data_path.rpartition('.')[0] + ".json"
    data = csv2json(data_path)
    with open(data_save_path, mode='w', encoding="utf-8") as f:
        f.write(json.dumps(data, indent=4, ensure_ascii=False))

# 检索【维基百科信息（../index）】和【训练集例题（../index_ref）】构造数据

In [5]:
import os
import json
import pandas as pd
from pyserini.search.lucene import LuceneSearcher
from jinja2 import Template

# Load the two Jinja2 templates. `ref_template` is rendered into the reference
# search query in select_kb; `kb_template` is only referenced by the currently
# commented-out knowledge-base search.
# The encoding is pinned to UTF-8 so reading the templates does not depend on
# the platform's default locale encoding.
# NOTE(review): assumes both .j2 files are stored as UTF-8 — confirm.
with open("kb_search_template.j2", encoding="utf-8") as f:
    kb_template = Template(f.read())

with open("ref_search_template.j2", encoding="utf-8") as f:
    ref_template = Template(f.read())

def render(template, item):
    """Render ``template`` with the question and options taken from ``item``.

    Items may use lowercase keys (``question`` / ``options``) or capitalised
    keys (``Question`` / ``Options``); the presence of ``"question"`` decides
    which pair is read.

    Args:
        template: Object exposing ``render(question=..., options=...)``.
        item: Mapping holding the question text and a newline-separated
            options string.

    Returns:
        Whatever ``template.render`` returns.
    """
    # Handle the key-casing difference between datasets.
    if "question" in item:
        question_key, options_key = "question", "options"
    else:
        question_key, options_key = "Question", "Options"

    question_text = item[question_key]
    # Drop the first two characters of each option line (the leading letter
    # and the separator that follows it).
    option_texts = []
    for line in item[options_key].split("\n"):
        option_texts.append(line[2:])
    return template.render(question=question_text, options=option_texts)


def select_kb(data_path, kb_path, kb_num, ref_path, ref_num, data_save_path):
    """Attach retrieved reference examples to every item of a JSON dataset.

    Each item is rendered into a query with ``ref_template``, the Lucene index
    at ``ref_path`` is searched for the top ``ref_num`` hits, and the ``raw``
    field of every hit is stored under the item's ``'ref'`` key. Every item is
    kept, even when the search returns no hits, so no question is lost. The
    result is written to ``data_save_path`` as JSON, with an intermediate
    checkpoint every 1000 items.

    Args:
        data_path: Path of the input dataset; must end with ``.json``.
        kb_path: Path of the knowledge-base index. Currently unused because
            the knowledge-base search below is commented out.
        kb_num: Number of knowledge-base hits. Currently unused, as above.
        ref_path: Path of the Lucene index searched for reference examples.
        ref_num: Number of reference hits requested per item.
        data_save_path: Path of the output JSON file.

    Raises:
        ValueError: If ``data_path`` does not end with ``.json``.
    """
    # Only JSON input is supported here; CSV files are expected to have been
    # converted to JSON beforehand.
    if not data_path.endswith('.json'):
        raise ValueError('Unsupported data format')

    # Read as UTF-8 explicitly instead of relying on the platform's default
    # locale encoding.
    with open(data_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    def _save(items):
        # Written as UTF-8 because ensure_ascii=False emits non-ASCII text
        # verbatim.
        with open(data_save_path, 'w', encoding='utf-8') as out:
            json.dump(items, out, ensure_ascii=False, indent=4)

    # Load the indexes (knowledge-base search is currently disabled).
    # kb_searcher = LuceneSearcher(kb_path)
    # kb_searcher.set_language('zh')
    ref_searcher = LuceneSearcher(ref_path)
    ref_searcher.set_language('zh')

    new_data = []
    for i, item in enumerate(data):
        # Knowledge-base search (disabled):
        # kb_query = render(kb_template, item)
        # kb_hits = kb_searcher.search(kb_query, k=kb_num)

        # Reference-example search.
        ref_query = render(ref_template, item)
        ref_hits = ref_searcher.search(ref_query, k=ref_num)

        new_item = item.copy()
        # new_item['kb'] = [hit_item.lucene_document.get("raw") for hit_item in kb_hits]
        new_item['ref'] = [hit_item.lucene_document.get("raw") for hit_item in ref_hits]

        # Appended unconditionally: filtering on the hit count could drop
        # questions from the output.
        new_data.append(new_item)

        # Periodic checkpoint so partial results are on disk during long runs.
        if i % 1000 == 0:
            _save(new_data)
            print(f'Processing {i}...')

    # Save the complete result.
    _save(new_data)

# Retrieval run configuration: input dataset, index locations and hit counts.
data_path = "test_answer_kb.json"
kb_index = "../index"
ref_index = "../index_ref"
kb_num = 4
ref_num = 8
# Output file name: the input stem followed by the hit counts.
data_save_path = "{}_kb{}_ref{}.json".format(data_path.rpartition('.')[0], kb_num, ref_num)
select_kb(data_path, kb_index, kb_num, ref_index, ref_num, data_save_path)

Processing 0...
Processing 1000...
Processing 2000...


# 检索外部知识构造数据集（先使用TF-IDF提取keywords，将keywords作为检索的query）（未用）

In [None]:
import keyword
import os
import json
import pandas as pd
from pyserini.search.lucene import LuceneSearcher

import re
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

def preprocess_text(text, stopwords):
    """Segment Chinese text and return the surviving tokens joined by spaces.

    Digit runs are removed first, the remainder is segmented with jieba, and
    any token that is a stopword or shorter than two characters is discarded.

    Args:
        text: Raw input string.
        stopwords: Container of tokens to discard (membership-tested).

    Returns:
        The kept tokens joined by single spaces.
    """
    digit_free = re.sub(r'\d+', '', text)  # strip digits
    # Disabled alternative: also strip every non-Chinese character.
    # digit_free = re.sub(r'[^\u4e00-\u9fa5]', '', digit_free)
    kept_tokens = []
    for token in jieba.lcut(digit_free):  # jieba word segmentation
        if token in stopwords or len(token) <= 1:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

def extract_keywords(texts, top_n=5):
    """Extract up to ``top_n`` TF-IDF keywords for each text.

    A ``TfidfVectorizer`` is fitted on ``texts``; for every text the terms are
    ranked by TF-IDF score and the best ``top_n`` are returned, highest first.
    Terms with a zero score (i.e. not contributing to that text) are never
    returned, so a text with fewer than ``top_n`` scored terms yields a
    shorter list instead of being padded with unrelated vocabulary words.

    Args:
        texts: Iterable of whitespace-tokenised strings.
        top_n: Maximum number of keywords per text. Values <= 0 yield empty
            keyword lists.

    Returns:
        A list with one keyword list per input text.
    """
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(texts)
    feature_names = vectorizer.get_feature_names_out()
    n_docs = tfidf_matrix.shape[0]

    # Guard: a slice such as [-0:] would select the whole vocabulary.
    if top_n <= 0:
        return [[] for _ in range(n_docs)]

    keywords_list = []
    for i in range(n_docs):
        # Dense TF-IDF weights of the i-th text.
        tfidf_scores = tfidf_matrix[i, :].toarray()[0]
        # Indices of the top_n highest scores, best first.
        ranked = tfidf_scores.argsort()[-top_n:][::-1]
        # Skip zero-score terms: they do not occur in (or matter to) this text.
        keywords = [feature_names[index] for index in ranked if tfidf_scores[index] > 0]
        keywords_list.append(keywords)

    return keywords_list

def select_kb(data_path, index_path, kb_num, kw_num, data_save_path):
    """Attach keyword-retrieved passages to each item of a dataset.

    The dataset is loaded from JSON or CSV, ``kw_num`` TF-IDF keywords are
    extracted from each item's question and options, and the keywords are used
    as the query against the Lucene index at ``index_path``. The ``raw`` field
    of the top ``kb_num`` hits is stored under ``'kb'`` and the keywords under
    ``'keywords'``. The result is written to ``data_save_path`` as JSON, with
    an intermediate checkpoint every 1000 items.

    NOTE: this definition shadows the earlier ``select_kb`` defined in this
    notebook, which takes a different parameter list.

    Args:
        data_path: Input dataset path ending in ``.json`` or ``.csv``. CSV
            input must provide the columns ``Question``, ``Options``,
            ``Answer`` and ``Explanation``.
        index_path: Path of the Lucene index to search.
        kb_num: Number of hits requested per item.
        kw_num: Number of keywords extracted per item.
        data_save_path: Path of the output JSON file.

    Raises:
        ValueError: If ``data_path`` has an unsupported extension.
    """
    # Load the data. Files are read as UTF-8 explicitly rather than with the
    # platform's default locale encoding.
    if data_path.endswith('.json'):
        with open(data_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    elif data_path.endswith('.csv'):
        table = pd.read_csv(data_path, encoding='utf-8')
        data = [
            {
                'question': row['Question'],
                'options': row['Options'],
                'answer': row['Answer'],
                'explanation': row['Explanation'],
            }
            for _, row in table.iterrows()
        ]
    else:
        # Previously fell through and failed later with an unbound `data`.
        raise ValueError('Unsupported data format')

    # Extract keywords. The stopwords are held in a set for constant-time
    # membership tests; the local name avoids shadowing the imported
    # `stopwords` module.
    with open("baidu_stopwords.txt", 'r', encoding='utf-8') as f:
        stopword_set = {line.strip() for line in f}
    texts = [item['question'] + "\n" + item['options'] for item in data]
    texts = [preprocess_text(text, stopword_set) for text in texts]
    keywords_list = extract_keywords(texts, kw_num)
    data = [{**item, 'keywords': keywords} for item, keywords in zip(data, keywords_list)]

    def _save(items):
        # Written as UTF-8 because ensure_ascii=False emits non-ASCII text
        # verbatim.
        with open(data_save_path, 'w', encoding='utf-8') as out:
            json.dump(items, out, ensure_ascii=False, indent=4)

    # Load the index
    searcher = LuceneSearcher(index_path)
    searcher.set_language('zh')

    # Select the kb
    new_data = []
    for i, item in enumerate(data):
        query = ' '.join(item['keywords'])
        # An empty query cannot match anything; skip the search call.
        hits = searcher.search(query, k=kb_num) if query else []
        new_item = item.copy()
        new_item['kb'] = [hit_item.lucene_document.get("raw") for hit_item in hits]
        # NOTE(review): items without any hit are dropped from the output —
        # confirm this is intended for this dataset.
        if len(hits) > 0:
            new_data.append(new_item)
        # Periodic checkpoint so partial results are on disk during long runs.
        if i % 1000 == 0:
            _save(new_data)
            print(f'Processing {i}...')

    # Save the new data
    _save(new_data)

# Keyword-retrieval run configuration: input dataset, index location, number
# of hits per item and number of keywords per item.
data_path = "train.csv"
kb_path = "../index"
kb_num = 4
kw_num = 5
# Output file name: the input stem followed by the keyword and hit counts.
data_save_path = "{}_index_by_keyword{}_kb{}.json".format(data_path.rpartition('.')[0], kw_num, kb_num)
select_kb(data_path, kb_path, kb_num, kw_num, data_save_path)

Processing 0...
Processing 1000...
Processing 2000...
Processing 3000...
Processing 4000...
Processing 5000...
Processing 6000...
Processing 7000...
Processing 8000...
Processing 9000...
Processing 10000...
Processing 11000...
Processing 12000...
Processing 13000...
Processing 14000...
Processing 15000...
Processing 16000...
Processing 17000...
Processing 18000...
Processing 19000...
Processing 20000...
Processing 21000...
Processing 22000...
Processing 23000...
Processing 24000...
Processing 25000...
Processing 26000...
Processing 27000...
Processing 28000...
Processing 29000...
Processing 30000...
Processing 31000...
Processing 32000...
Processing 33000...
Processing 34000...
Processing 35000...
Processing 36000...
Processing 37000...
Processing 38000...
Processing 39000...
Processing 40000...
Processing 41000...
Processing 42000...
Processing 43000...
Processing 44000...
Processing 45000...
Processing 46000...
Processing 47000...
Processing 48000...
Processing 49000...
Processing 50

In [5]:
# Download the NLTK 'stopwords' and 'punkt' resources (requires network access).
# NOTE(review): the keyword-extraction cell in this notebook reads its
# stopwords from baidu_stopwords.txt, so these downloads may not be needed —
# confirm before relying on them.
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Error loading stopwords: <urlopen error [Errno 110]
[nltk_data]     Connection timed out>


KeyboardInterrupt: 