In [1]:
import pandas as pd
from pathlib import Path
import os
import glob
import re

In [2]:
# Base directory (under the current user's home) holding the CSV files that
# the cells below read and write.
HOME_DIR = Path.home().joinpath('Desktop', 'clir', 'data', 'wikidata', 'base_data_file')


In [6]:
# --------------------- Filter the QIDs attached to each query ---------------------
def filter_qid(original_file: str, filtered_file: str):
    """Write a cleaned copy of the query -> QID CSV.

    Reads ``original_file``, keeps only the ``query_id``, ``query_text`` and
    ``q_item_qid`` columns, drops rows whose ``q_item_qid`` is NaN, removes
    exact duplicate rows (first occurrence wins) and writes the result to
    ``filtered_file`` without the index. A completion message is printed.

    Args:
        original_file: Path of the CSV to read.
        filtered_file: Path of the CSV to write.

    Raises:
        KeyError: If one of the three expected columns is absent.
    """
    kept_columns = ['query_id', 'query_text', 'q_item_qid']

    cleaned = (
        pd.read_csv(original_file, encoding='utf-8')
        .loc[:, kept_columns]
        # Rows without a linked entity QID are dropped.
        .dropna(subset=['q_item_qid'])
        .drop_duplicates(keep='first')
    )

    cleaned.to_csv(filtered_file, index=False, encoding='utf-8')

    print(f"数据处理完成 文件存储在了{filtered_file}")

# Raw query -> QID mapping (input) and its cleaned copy (output).
query_entity_qid_file = str(HOME_DIR / 'base_test1_qid.csv')
query_entity_qid_filtered_file = str(HOME_DIR / 'base_test1_qid_filtered.csv')

# Write to the dedicated "_filtered" path. The earlier call passed the input
# path for both arguments, which overwrote the raw file in place and left
# query_entity_qid_filtered_file unused.
filter_qid(query_entity_qid_file, query_entity_qid_filtered_file)

In [12]:
# --------------------- Drop rows whose English entity/property info is empty ---------------------
def filter_item_info(original_file: str, filtered_file: str):
    """Write a cleaned copy of an item-info CSV.

    Reads ``original_file``, drops every row where ``label_en`` or
    ``description_en`` is NaN, removes exact duplicate rows, then keeps only
    the first row for each ``item_qid``. The result is written to
    ``filtered_file`` without the index and a completion message is printed.

    Args:
        original_file: Path of the CSV to read.
        filtered_file: Path of the CSV to write.
    """
    required_columns = ['label_en', 'description_en']

    records = pd.read_csv(original_file, encoding='utf-8')
    deduplicated = (
        records
        .dropna(subset=required_columns, how='any')
        # Exact duplicates first, then one row per item_qid.
        .drop_duplicates()
        .drop_duplicates(subset="item_qid", keep="first")
    )

    deduplicated.to_csv(filtered_file, index=False, encoding='utf-8')

    print(f"数据处理完成 文件存储在了{filtered_file}")

# Input: adjacent-item info CSV; output: its filtered counterpart.
item_info_file = str(HOME_DIR.joinpath('base_test2_adj_item_info.csv'))
item_filtered_info_file = str(HOME_DIR.joinpath('base_test2_adj_item_info_filtered.csv'))

filter_item_info(original_file=item_info_file, filtered_file=item_filtered_info_file)


数据处理完成 文件存储在了C:\Users\bajiuqier\Desktop\clir\data\wikidata\base_data\base_test2_adj_item_info_filtered.csv


In [6]:
# --------------------- Filter triplets (entity - relation - entity) ---------------------
def filter_triplet_id(original_file: str, filtered_file: str):
    """Write only the well-formed, complete, unique triplet rows to a new CSV.

    Steps:
      1. Read ``original_file``.
      2. Drop every row that contains a NaN in any column.
      3. Cast all remaining values to ``str``.
      4. Keep rows whose ``adj_item_qid`` matches ``^Q\\d+$`` and whose
         ``property_qid`` matches ``^P\\d+$``.
      5. Drop exact duplicate rows (first occurrence wins).
      6. Write the result to ``filtered_file`` without the index and print a
         completion message.

    Args:
        original_file: Path of the CSV to read.
        filtered_file: Path of the CSV to write.
    """
    triplet_id_df = pd.read_csv(original_file, encoding='utf-8')

    # Drop incomplete rows BEFORE the string cast. Casting first turns NaN
    # into the literal text 'nan', after which dropna() removes nothing and
    # rows with a missing value in a column that is not regex-checked
    # (e.g. item_qid) leak into the output.
    triplet_id_df = triplet_id_df.dropna().astype(str)

    # "Q" followed by digits for entities, "P" followed by digits for properties.
    # na=False makes a non-string value count as "no match" instead of NaN.
    is_entity = triplet_id_df['adj_item_qid'].str.match(r'^Q\d+$', na=False)
    is_property = triplet_id_df['property_qid'].str.match(r'^P\d+$', na=False)

    triplet_id_filtered_df = triplet_id_df[is_entity & is_property].drop_duplicates(keep='first')

    triplet_id_filtered_df.to_csv(filtered_file, index=False, encoding='utf-8')

    print(f"数据处理完成 文件存储在了{filtered_file}")

# NOTE(review): the 'base_test11' prefix differs from the 'base_test1' /
# 'base_test2' prefixes used by the other cells — confirm the file name.
triplet_id_file = str(HOME_DIR.joinpath('base_test11_triplet_id.csv'))
triplet_id_filtered_file = str(HOME_DIR.joinpath('base_test11_triplet_id_filtered.csv'))

filter_triplet_id(original_file=triplet_id_file, filtered_file=triplet_id_filtered_file)

数据处理完成 文件存储在了C:\Users\bajiuqier\Desktop\clir\data\wikidata\base_data\base_test11_triplet_id_filtered.csv


In [22]:
# --------------------- Extract a fragment of the triplet ids ---------------------
# For every (item, property) pair keep at most n adjacent items.
def get_triplet_id_fragment(original_file: str, filtered_file: str, n: int):
    """Cap the number of rows per ``(item_qid, property_qid)`` group at ``n``.

    Reads ``original_file`` and groups the rows by ``item_qid`` and
    ``property_qid``. Groups with at most ``n`` rows are kept whole; larger
    groups are reduced to ``n`` rows sampled with a fixed random seed. The
    groups are concatenated with a fresh index and written to
    ``filtered_file`` without the index.

    When the input yields no groups (for example a header-only file), an
    empty CSV with the input's columns is written instead of raising.

    Args:
        original_file: Path of the CSV to read.
        filtered_file: Path of the CSV to write.
        n: Maximum number of rows kept per group.
    """
    triplet_id_df = pd.read_csv(original_file, encoding='utf-8')

    # Fixed seed so repeated runs pick the same rows.
    seed = 33

    grouped = triplet_id_df.groupby(['item_qid', 'property_qid'])

    # Small groups pass through unchanged; large groups are down-sampled.
    result_list = [
        group if len(group) <= n else group.sample(n, random_state=seed)
        for _, group in grouped
    ]

    if result_list:
        result_df = pd.concat(result_list, ignore_index=True)
    else:
        # pd.concat raises ValueError on an empty list; fall back to an
        # empty frame that still carries the input's column headers.
        result_df = triplet_id_df.iloc[0:0]

    result_df.to_csv(filtered_file, index=False, encoding='utf-8')

# Keep at most three adjacent items per (item, property) pair.
triplet_id_file = str(HOME_DIR.joinpath('triplet_id_filtered.csv'))
triplet_id_fragment_file = str(HOME_DIR.joinpath('triplet_id_fragment.csv'))

get_triplet_id_fragment(
    original_file=triplet_id_file,
    filtered_file=triplet_id_fragment_file,
    n=3,
)



In [17]:
# After the translation engine has filled the missing values, check whether
# any nulls remain; if only a few are left, translate and fill them by hand.
item_info_filled_file = str(HOME_DIR / 'base_test2_query_entity_info_filled.csv')
item_info_filled_df = pd.read_csv(item_info_filled_file, encoding='utf-8')

# Index labels of every row that still has at least one null cell.
rows_with_null = item_info_filled_df.isnull().any(axis=1)
empty_index_list = item_info_filled_df.index[rows_with_null].to_list()

if empty_index_list:
    # Report the QIDs of the affected entities.
    empty_qid_list = item_info_filled_df.loc[empty_index_list, 'item_qid'].to_list()
    print("存在空值的实体的 qid 列表：")
    print(f"{empty_qid_list}")
else:
    print("不存在空值")
    


不存在空值


In [None]:
# Yandex 翻译 调用测试
# import requests

# text = "Yu Fei"

# url = f"https://translate.yandex.com/?source_lang=en&target_lang=kk&text={text}" 

# response = requests.get(url)

# print(response.json)


In [7]:
from utils import merge_csv_files

ADJ_ITEM_INFO_HOME_DIR = Path.home().joinpath('Desktop', 'clir', 'data', 'wikidata', 'base_adj_item_info')

# Numbered adjacent-item info part files (base_adj_item_info_<digits>.csv).
# Presumably merge_csv_files combines every file in folder_path whose name
# matches the pattern into output_file — confirm against utils.
pattern = r'base_adj_item_info_\d+\.csv'
folder_path = str(ADJ_ITEM_INFO_HOME_DIR)
output_file = str(ADJ_ITEM_INFO_HOME_DIR.joinpath('base_adj_item_info.csv'))

merge_csv_files(
    folder_path=folder_path,
    output_file=output_file,
    pattern=pattern,
)

合并完成，输出文件: C:\Users\bajiuqier\Desktop\clir\data\wikidata\base_adj_item_info\base_adj_item_info.csv


In [31]:
# --------------------- Re-arrange the triplet ids ---------------------
def rearranging_triplet_id(triplet_id_file: str, filter_reference_file: str, final_triplet_id_file: str):
    """Keep only triplets whose adjacent item appears in a reference file.

    Reads both CSVs, collects the ``item_qid`` values of
    ``filter_reference_file`` and writes to ``final_triplet_id_file`` the rows
    of ``triplet_id_file`` whose ``adj_item_qid`` is among them (no index
    column). A completion message is printed.

    Args:
        triplet_id_file: CSV with an ``adj_item_qid`` column.
        filter_reference_file: CSV with an ``item_qid`` column.
        final_triplet_id_file: Path of the CSV to write.
    """
    triplets = pd.read_csv(triplet_id_file, encoding='utf-8')
    reference = pd.read_csv(filter_reference_file, encoding='utf-8')

    allowed_qids = set(reference["item_qid"])
    keep_mask = triplets["adj_item_qid"].isin(allowed_qids)

    triplets.loc[keep_mask].to_csv(final_triplet_id_file, index=False, encoding='utf-8')

    print(f"数据处理完成 文件存储在了{final_triplet_id_file}")

# Restrict the filtered triplets to adjacent items that have filled-in info.
triplet_id_file = HOME_DIR.joinpath('base_test2_triplet_id_filtered.csv')
filter_reference_file = HOME_DIR.joinpath('base_test2_adj_item_info_filled.csv')
final_triplet_id_file = HOME_DIR.joinpath('base_test2_triplet_id_final.csv')

rearranging_triplet_id(triplet_id_file, filter_reference_file, final_triplet_id_file)

数据处理完成 文件存储在了C:\Users\bajiuqier\Desktop\clir\data\wikidata\base_data\base_test2_triplet_id_final.csv


In [33]:
# --------------------- Re-arrange the query entity qids ---------------------
def rearranging_query_entity_qid(query_entity_qid_file: str, filter_reference_file: str, final_query_entity_qid_file: str):
    """Keep only query rows whose entity appears in a reference file.

    Reads both CSVs, collects the ``item_qid`` values of
    ``filter_reference_file`` and writes to ``final_query_entity_qid_file``
    the rows of ``query_entity_qid_file`` whose ``q_item_qid`` is among them
    (no index column). A completion message is printed.

    Args:
        query_entity_qid_file: CSV with a ``q_item_qid`` column.
        filter_reference_file: CSV with an ``item_qid`` column.
        final_query_entity_qid_file: Path of the CSV to write.
    """
    queries = pd.read_csv(query_entity_qid_file, encoding='utf-8')
    reference = pd.read_csv(filter_reference_file, encoding='utf-8')

    allowed_qids = set(reference["item_qid"])
    keep_mask = queries["q_item_qid"].isin(allowed_qids)

    queries.loc[keep_mask].to_csv(final_query_entity_qid_file, index=False, encoding='utf-8')

    print(f"数据处理完成 文件存储在了{final_query_entity_qid_file}")

# Restrict the query -> entity mapping to entities present in the final triplets.
query_entity_qid_file = HOME_DIR.joinpath('base_test2_query_entity_qid_filtered.csv')
filter_reference_file = HOME_DIR.joinpath('base_test2_triplet_id_final.csv')
final_query_entity_qid_file = HOME_DIR.joinpath('base_test2_query_entity_qid_final.csv')

rearranging_query_entity_qid(query_entity_qid_file, filter_reference_file, final_query_entity_qid_file)

数据处理完成 文件存储在了C:\Users\bajiuqier\Desktop\clir\data\wikidata\base_data\base_test2_query_entity_qid_final.csv


In [3]:
# Merge the test-related files.

# Merge the two query_entity_qid files.
test1_query_entity_qid_file = str(HOME_DIR.joinpath("base_test1_query_entity_qid_final.csv"))
test2_query_entity_qid_file = str(HOME_DIR.joinpath("base_test2_query_entity_qid_final.csv"))
test_query_entity_qid_file = str(HOME_DIR.joinpath("base_test_query_entity_qid_final.csv"))

test1_query_entity_qid_df = pd.read_csv(test1_query_entity_qid_file, encoding='utf-8')
test2_query_entity_qid_df = pd.read_csv(test2_query_entity_qid_file, encoding='utf-8')

test_query_entity_qid_df = pd.concat(
    [test1_query_entity_qid_df, test2_query_entity_qid_df],
    ignore_index=True,
)

# Refuse to write the merged file if either key column repeats.
if test_query_entity_qid_df.duplicated(subset="query_id").any():
    raise ValueError("合并文件后 query_id 存在重复数据 请检查 并删除")
if test_query_entity_qid_df.duplicated(subset="q_item_qid").any():
    raise ValueError("合并文件后 q_item_qid 存在重复数据 请检查 并删除")

test_query_entity_qid_df.to_csv(test_query_entity_qid_file, index=False, encoding='utf-8')
print(f"数据合并完成 文件存储在了{test_query_entity_qid_file}")



数据合并完成 文件存储在了C:\Users\bajiuqier\Desktop\clir\data\wikidata\base_data_file\base_test_query_entity_qid_final.csv


In [3]:
# Merge the two triplet_id files.
test1_triplet_id_file = str(HOME_DIR.joinpath("base_test1_triplet_id_final.csv"))
test2_triplet_id_file = str(HOME_DIR.joinpath("base_test2_triplet_id_final.csv"))
test_triplet_id_file = str(HOME_DIR.joinpath("base_test_triplet_id_final.csv"))

test1_triplet_id_df = pd.read_csv(test1_triplet_id_file, encoding='utf-8')
test2_triplet_id_df = pd.read_csv(test2_triplet_id_file, encoding='utf-8')

test_triplet_id_df = pd.concat(
    [test1_triplet_id_df, test2_triplet_id_df],
    ignore_index=True,
)

# The distinct item_qid count of the merged frame equals the sum of the two
# per-file counts only when no item_qid occurs in both files.
item_num = sum(
    len(set(part["item_qid"]))
    for part in (test1_triplet_id_df, test2_triplet_id_df)
)
item_merged_num = len(set(test_triplet_id_df["item_qid"]))

if item_num != item_merged_num:
    raise ValueError("合并文件后 存在重复 item_qid 数据 请检查 并删除")

test_triplet_id_df.to_csv(test_triplet_id_file, index=False, encoding='utf-8')
print(f"数据合并完成 文件存储在了{test_triplet_id_file}")



数据合并完成 文件存储在了C:\Users\bajiuqier\Desktop\clir\data\wikidata\base_data_file\base_test_triplet_id_final.csv


In [4]:
# Merge the two query_entity_info files.

test1_query_entity_info_file = str(HOME_DIR.joinpath("base_test1_query_entity_info_filled.csv"))
test2_query_entity_info_file = str(HOME_DIR.joinpath("base_test2_query_entity_info_filled.csv"))
test_query_entity_info_file = str(HOME_DIR.joinpath("base_test_query_entity_info_filled.csv"))

test1_query_entity_info_df = pd.read_csv(test1_query_entity_info_file, encoding='utf-8')
test2_query_entity_info_df = pd.read_csv(test2_query_entity_info_file, encoding='utf-8')

test_query_entity_info_df = pd.concat(
    [test1_query_entity_info_df, test2_query_entity_info_df],
    ignore_index=True,
)

# Refuse to write the merged file if any item_qid repeats.
if test_query_entity_info_df.duplicated(subset="item_qid").any():
    raise ValueError("合并文件后 item_qid 存在重复数据 请检查 并删除")

test_query_entity_info_df.to_csv(test_query_entity_info_file, index=False, encoding='utf-8')
print(f"数据合并完成 文件存储在了{test_query_entity_info_file}")


数据合并完成 文件存储在了C:\Users\bajiuqier\Desktop\clir\data\wikidata\base_data_file\base_test_query_entity_info_filled.csv


In [10]:
# Merge the two adj_item_info files.
test1_adj_item_info_file = str(HOME_DIR.joinpath("base_test1_adj_item_info_filled.csv"))
test2_adj_item_info_file = str(HOME_DIR.joinpath("base_test2_adj_item_info_filled.csv"))
test_adj_item_info_file = str(HOME_DIR.joinpath("base_test_adj_item_info_filled.csv"))

test1_adj_item_info_df = pd.read_csv(test1_adj_item_info_file, encoding='utf-8')
test2_adj_item_info_df = pd.read_csv(test2_adj_item_info_file, encoding='utf-8')

test_adj_item_info_df = pd.concat(
    [test1_adj_item_info_df, test2_adj_item_info_df],
    ignore_index=True,
)

# Unlike the other merges, a repeated item_qid is tolerated here: only the
# first row for each item_qid is kept before saving.
if test_adj_item_info_df.duplicated(subset="item_qid").any():
    print("数据合并完成 文件中 item_qid 存在重复项")

    test_adj_item_info_df = test_adj_item_info_df.drop_duplicates(subset="item_qid", keep="first")
    print("对重复项 进行仅保留一条数据 处理")

    saved_message = f"文件存储在了{test_adj_item_info_file}"
else:
    saved_message = f"数据合并完成 文件存储在了{test_adj_item_info_file}"

test_adj_item_info_df.to_csv(test_adj_item_info_file, index=False, encoding='utf-8')
print(saved_message)


数据合并完成 文件中 item_qid 存在重复项
对重复项 进行仅保留一条数据 处理
文件存储在了C:\Users\bajiuqier\Desktop\clir\data\wikidata\base_data_file\base_test_adj_item_info_filled.csv


In [16]:
# Merge the two test_qrels files.
test1_qrels_file = str(HOME_DIR / "base_test1_qrels.csv")
test2_qrels_file = str(HOME_DIR / "base_test2_qrels.csv")
test_qrels_file = str(HOME_DIR / "base_test_qrels.csv")

test1_qrels_df = pd.read_csv(test1_qrels_file, encoding='utf-8')
test2_qrels_df = pd.read_csv(test2_qrels_file, encoding='utf-8')

test_qrels_df = pd.concat([test1_qrels_df, test2_qrels_df], ignore_index=True)

# The distinct query_id count of the merged frame equals the sum of the two
# per-file counts only when no query_id occurs in both files.
query_id_num = len(set(test1_qrels_df["query_id"])) + len(set(test2_qrels_df["query_id"]))
query_id_merged_num = len(set(test_qrels_df["query_id"]))

if query_id_num == query_id_merged_num:
    test_qrels_df.to_csv(test_qrels_file, index=False, encoding='utf-8')
    print(f"数据合并完成 文件存储在了{test_qrels_file}")
else:
    # The check is on query_id; the message previously named item_qid
    # (copied from the triplet merge), pointing at the wrong column.
    raise ValueError("合并文件后 存在重复 query_id 数据 请检查 并删除")

数据合并完成 文件存储在了C:\Users\bajiuqier\Desktop\clir\data\wikidata\base_data_file\base_test_qrels.csv
