In [2]:
import pandas as pd
from pathlib import Path
import os
import glob
import re

In [3]:
# Directory (under the current user's home) that holds the base Wikidata CSV files
# read and written by the cells below.
HOME_DIR = Path.home().joinpath('Desktop', 'clir', 'data', 'wikidata', 'base_data')


In [6]:
# --------------------- Filter the query QIDs ---------------------
def filter_qid(original_file: str, filtered_file: str):
    """Reduce a query/QID CSV to its key columns, dropping QID-less and duplicate rows.

    Args:
        original_file: Path of the CSV to read. It must contain the columns
            'query_id', 'query_text' and 'q_item_qid'.
        filtered_file: Path the filtered CSV is written to (UTF-8, no index column).
    """
    # A previously commented-out alternative column set was:
    # ['query', 'search_term', 'id', 'label', 'description']
    kept_columns = ['query_id', 'query_text', 'q_item_qid']

    raw_df = pd.read_csv(original_file, encoding='utf-8')

    # Select the kept columns, discard rows without a QID, then keep only the
    # first occurrence of every duplicated row.
    deduped_df = (
        raw_df[kept_columns]
        .dropna(subset=['q_item_qid'])
        .drop_duplicates(keep='first')
    )

    # Persist the result.
    deduped_df.to_csv(filtered_file, index=False, encoding='utf-8')

    print(f"数据处理完成 文件存储在了{filtered_file}")

# Raw query -> entity QID table, and the separate file its filtered copy goes to.
query_entity_qid_file = str(HOME_DIR / 'base_test1_qid.csv')
query_entity_qid_filtered_file = str(HOME_DIR / 'base_test1_qid_filtered.csv')

# Write to the dedicated "_filtered" path. Passing the input path as the output
# (as this call did before) overwrote the raw file and left the variable above unused.
filter_qid(query_entity_qid_file, query_entity_qid_filtered_file)

In [5]:
# --------------------- Drop rows whose English entity/property info is empty ---------------------
def filter_item_info(original_file: str, filtered_file: str):
    """Remove incomplete and duplicated rows from an item-info CSV.

    Args:
        original_file: Path of the CSV to read. It must contain the columns
            'label_en', 'description_en' and 'item_qid'.
        filtered_file: Path the cleaned CSV is written to (UTF-8, no index column).
    """
    required_columns = ['label_en', 'description_en']

    info_df = pd.read_csv(original_file, encoding='utf-8')

    cleaned_df = (
        info_df
        # A row is dropped when either English field is empty.
        .dropna(subset=required_columns, how='any')
        # Remove fully identical rows first ...
        .drop_duplicates()
        # ... then keep only the first row seen for each item_qid.
        .drop_duplicates(subset="item_qid", keep="first")
    )

    cleaned_df.to_csv(filtered_file, index=False, encoding='utf-8')

    print(f"数据处理完成 文件存储在了{filtered_file}")

# Adjacent-item info table to clean, and the destination of its cleaned copy.
item_info_file = str(HOME_DIR.joinpath('base_train_adj_item_info.csv'))
item_filtered_info_file = str(HOME_DIR.joinpath('base_train_adj_item_filtered_info.csv'))

filter_item_info(original_file=item_info_file, filtered_file=item_filtered_info_file)


In [9]:
# --------------------- Filter triplets (entity - relation - entity) ---------------------
def filter_triplet_id(original_file: str, filtered_file: str):
    """Clean a triplet-ID CSV and write the surviving rows to ``filtered_file``.

    A row is removed when any of its columns is missing, when 'adj_item_qid' is
    not of the form ``Q<digits>``, when 'property_qid' is not of the form
    ``P<digits>``, or when it duplicates an earlier row.

    Args:
        original_file: Path of the CSV to read. It must contain the columns
            'adj_item_qid' and 'property_qid'.
        filtered_file: Path the filtered CSV is written to (UTF-8, no index column).
    """
    # Read the CSV file.
    triplet_id_df = pd.read_csv(original_file, encoding='utf-8')

    # Drop rows with missing values BEFORE casting to str: astype(str) turns NaN
    # into the literal text 'nan', after which dropna() can no longer detect it.
    triplet_id_df = triplet_id_df.dropna().astype(str)

    # Keep rows whose adjacent item is "Q" + digits and whose property is "P" + digits.
    # na=False makes any remaining missing value count as a non-match.
    is_entity_qid = triplet_id_df['adj_item_qid'].str.match(r'^Q\d+$', na=False)
    is_property_qid = triplet_id_df['property_qid'].str.match(r'^P\d+$', na=False)
    triplet_id_filtered_df = triplet_id_df[is_entity_qid & is_property_qid]

    # Remove duplicated rows, keeping the first occurrence.
    triplet_id_filtered_df = triplet_id_filtered_df.drop_duplicates(keep='first')

    # Save the result as CSV.
    triplet_id_filtered_df.to_csv(filtered_file, index=False, encoding='utf-8')

    print(f"数据处理完成 文件存储在了{filtered_file}")

# Raw triplet table to clean, and the file the cleaned triplets are written to.
triplet_id_file = str(HOME_DIR.joinpath('base_train_triplet_id.csv'))
triplet_id_filtered_file = str(HOME_DIR.joinpath('base_train_triplet_id_dddd.csv'))

filter_triplet_id(original_file=triplet_id_file, filtered_file=triplet_id_filtered_file)

数据处理完成 文件存储在了C:\Users\bajiuqier\Desktop\clir\data\wikidata\base_data\base_train_triplet_id_dddd.csv


In [22]:
# --------------------- Build a triplet_id fragment ---------------------
# For every property of every item, keep at most n adj_items.
def get_triplet_id_fragment(original_file: str, filtered_file: str, n: int, seed: int = 33):
    """Down-sample a triplet CSV to at most ``n`` rows per (item_qid, property_qid) pair.

    Groups with ``n`` rows or fewer are kept whole; larger groups are reduced to
    ``n`` randomly sampled rows. The result is written to ``filtered_file``.

    Args:
        original_file: Path of the CSV to read. It must contain the columns
            'item_qid' and 'property_qid'.
        filtered_file: Path the fragment CSV is written to (UTF-8, no index column).
        n: Maximum number of rows kept per (item_qid, property_qid) group.
        seed: Random state passed to every ``sample`` call, so repeated runs on the
            same input select the same rows. Defaults to 33, the value that was
            previously hard-coded.

    Note:
        ``groupby`` is called with its defaults, under which rows whose 'item_qid'
        or 'property_qid' is missing form no group and are not written.
    """
    # Read the original file.
    triplet_id_df = pd.read_csv(original_file, encoding='utf-8')

    # Group the rows by (item, property).
    grouped = triplet_id_df.groupby(['item_qid', 'property_qid'])

    # Keep small groups whole; sample n rows from the larger ones.
    result_list = [
        group if len(group) <= n else group.sample(n, random_state=seed)
        for _, group in grouped
    ]

    if result_list:
        result_df = pd.concat(result_list, ignore_index=True)
    else:
        # pd.concat raises ValueError on an empty list; for an input without any
        # group, write a header-only file with the original columns instead.
        result_df = triplet_id_df.iloc[0:0]

    # Save the result to the new CSV file.
    result_df.to_csv(filtered_file, index=False, encoding='utf-8')

# Filtered triplet table to sample from, and the file the fragment is written to.
triplet_id_file = str(HOME_DIR.joinpath('triplet_id_filtered.csv'))
triplet_id_fragment_file = str(HOME_DIR.joinpath('triplet_id_fragment.csv'))

get_triplet_id_fragment(original_file=triplet_id_file, filtered_file=triplet_id_fragment_file, n=3)



In [15]:
# After the translation engine has filled the missing values, check whether any
# empty cells remain. If only a few are left, translate and fill them by hand.
# Read the file.
item_info_filled_file = str(HOME_DIR / 'base_train_query_entity_filled_info.csv')
item_info_filled_df = pd.read_csv(item_info_filled_file, encoding='utf-8')

# Boolean mask of the rows that still contain at least one empty cell,
# and the index labels of those rows as a list.
has_empty_cell = item_info_filled_df.isnull().any(axis=1)
empty_index_list = item_info_filled_df[has_empty_cell].index.to_list()

if not empty_index_list:
    print("不存在空值")
else:
    # One vectorized lookup instead of a per-row loop with chained indexing.
    empty_qid_list = item_info_filled_df.loc[has_empty_cell, 'item_qid'].to_list()
    print("存在空值的实体的 qid 列表：")
    print(f"{empty_qid_list}")
    


不存在空值


In [None]:
# Yandex Translate call test (disabled: every line of this cell is commented out)
# import requests

# text = "Yu Fei"

# url = f"https://translate.yandex.com/?source_lang=en&target_lang=kk&text={text}" 

# response = requests.get(url)

# print(response.json)


In [3]:
from utils import merge_csv_files

# Folder (under the current user's home) with the adjacent-item info CSV files.
ADJ_ITEM_INFO_HOME_DIR = Path.home().joinpath(
    'Desktop', 'clir', 'data', 'wikidata', 'base_train_adj_item_info'
)

# Merge all the info files. The original note said "query-entity" info, but the
# pattern below targets the numbered base_train_adj_item_info_<n>.csv files.
# NOTE(review): how merge_csv_files applies `pattern` is defined in utils — confirm there.
folder_path = str(ADJ_ITEM_INFO_HOME_DIR)
pattern = r'base_train_adj_item_info_\d+\.csv'
output_file = str(ADJ_ITEM_INFO_HOME_DIR.joinpath('base_train_adj_item_info.csv'))

merge_csv_files(folder_path=folder_path, output_file=output_file, pattern=pattern)

合并完成，输出文件: C:\Users\bajiuqier\Desktop\clir\data\wikidata\base_train_adj_item_info\base_train_adj_item_info.csv


In [4]:
# --------------------- Re-arrange the triplet ids ---------------------
def rearranging_triplet_id(triplet_id_file: str, filter_reference_file: str, final_triplet_id_file: str):
    """Keep only the triplets whose adjacent item is listed in a reference table.

    Args:
        triplet_id_file: Path of the triplet CSV; it must contain 'adj_item_qid'.
        filter_reference_file: Path of the reference CSV; its 'item_qid' column
            provides the set of allowed adjacent-item QIDs.
        final_triplet_id_file: Path the retained triplets are written to
            (UTF-8, no index column).
    """
    triplets_df = pd.read_csv(triplet_id_file, encoding='utf-8')
    reference_df = pd.read_csv(filter_reference_file, encoding='utf-8')

    # QIDs present in the reference table.
    allowed_qids = set(reference_df["item_qid"])

    # Retain the triplets whose adjacent item is one of those QIDs.
    keep_mask = triplets_df["adj_item_qid"].isin(allowed_qids)
    triplets_df.loc[keep_mask].to_csv(final_triplet_id_file, index=False, encoding='utf-8')

    print(f"数据处理完成 文件存储在了{final_triplet_id_file}")

# Sampled triplets, the filled adjacent-item info used as the filter reference,
# and the destination of the retained triplets.
triplet_id_file = HOME_DIR.joinpath('base_train_triplet_id_fragment_3.csv')
filter_reference_file = HOME_DIR.joinpath('base_train_adj_item_info_filled.csv')
final_triplet_id_file = HOME_DIR.joinpath('base_train_triplet_id_fragment_3_final.csv')

rearranging_triplet_id(triplet_id_file, filter_reference_file, final_triplet_id_file)

数据处理完成 文件存储在了C:\Users\bajiuqier\Desktop\clir\data\wikidata\base_data\base_train_triplet_id_fragment_3_final.csv


In [7]:
# --------------------- Re-arrange the query entity qids ---------------------
def rearranging_query_entity_qid(query_entity_qid_file: str, filter_reference_file: str, final_query_entity_qid_file: str):
    """Keep only the query rows whose entity QID is listed in a reference table.

    Args:
        query_entity_qid_file: Path of the query/entity CSV; it must contain 'q_item_qid'.
        filter_reference_file: Path of the reference CSV; its 'item_qid' column
            provides the set of allowed entity QIDs.
        final_query_entity_qid_file: Path the retained rows are written to
            (UTF-8, no index column).
    """
    queries_df = pd.read_csv(query_entity_qid_file, encoding='utf-8')
    reference_df = pd.read_csv(filter_reference_file, encoding='utf-8')

    # QIDs present in the reference table.
    allowed_qids = set(reference_df["item_qid"])

    # Retain the query rows whose entity is one of those QIDs.
    keep_mask = queries_df["q_item_qid"].isin(allowed_qids)
    queries_df.loc[keep_mask].to_csv(final_query_entity_qid_file, index=False, encoding='utf-8')

    print(f"数据处理完成 文件存储在了{final_query_entity_qid_file}")

# Filtered query/entity table, the final triplet table used as the filter
# reference, and the destination of the retained query rows.
query_entity_qid_file = HOME_DIR.joinpath('base_train_query_entity_qid_filtered.csv')
filter_reference_file = HOME_DIR.joinpath('base_train_triplet_id_fragment_3_final.csv')
final_query_entity_qid_file = HOME_DIR.joinpath('base_train_query_entity_qid_final.csv')

rearranging_query_entity_qid(query_entity_qid_file, filter_reference_file, final_query_entity_qid_file)

数据处理完成 文件存储在了      query_id  query_text q_item_qid
0         2625       7月19日      Q2726
1          345          数论     Q12479
2         4150          儒家      Q9581
3          595        人口密度     Q22856
4         2818     田纳西·威廉斯    Q134262
...        ...         ...        ...
7992   6839169  波奧亞 (夏威夷州)   Q3878252
7994   6844149     魔進戰隊煌輝者  Q79818233
7996   6845479       碳酸乙烯酯    Q421145
7998   6852733      溫特施托克山   Q6981225
8000   6861930      升变王车易位   Q2048586

[5087 rows x 3 columns]
