In [4]:
import pandas as pd
from pathlib import Path
import os
import glob
import re

In [5]:
# Base directory for the Wikidata CSV files read and written by the CSV-processing cells below.
HOME_DIR = Path.home() / 'Desktop' / 'clir' / 'data' / 'wikidata' / 'base_data'


In [6]:
# --------------------- Filter the QIDs found for each query ---------------------
# Input CSV with the raw QID search results, and the output path for its filtered copy.
QID_search_results_file = str(HOME_DIR / 'base_test1_qid.csv')
QID_filtered_search_results_file = str(HOME_DIR / 'base_test1_qid_filtered.csv')

def filter_qid(original_file: str, filtered_file: str):
    """Write a cleaned copy of a query-to-QID CSV.

    Reads ``original_file`` (UTF-8 CSV), keeps only the ``query_id``,
    ``query`` and ``qid`` columns, drops every row whose ``qid`` is missing,
    removes exact duplicate rows (the first occurrence is kept) and writes the
    result to ``filtered_file`` as UTF-8 CSV without the index column.

    Args:
        original_file: Path of the CSV to read.
        filtered_file: Path of the CSV to write.

    Raises:
        KeyError: If one of the three kept columns is absent from the input.
    """
    kept_columns = ['query_id', 'query', 'qid']

    (
        pd.read_csv(original_file, encoding='utf-8')
        .loc[:, kept_columns]            # project onto the columns of interest
        .dropna(subset=['qid'])          # a row without a qid carries no mapping
        .drop_duplicates(keep='first')   # identical rows collapse to the first one
        .to_csv(filtered_file, index=False, encoding='utf-8')
    )

# Run the QID filter on the search results and write the filtered CSV.
filter_qid(QID_search_results_file, QID_filtered_search_results_file)

# QID_df = pd.read_csv(QID_search_results_file, encoding='utf-8')

# columns_to_keep = ['query_id', 'query', 'qid']
# QID_filtered_df = QID_df[columns_to_keep]

# ddd = str(HOME_DIR / 'base_test1_qid.csv')

# QID_filtered_df.to_csv(ddd, index=False, encoding='utf-8')

In [7]:
# --------------------- Drop rows whose entity/property English info is empty ---------------------
def filter_item_info(original_file: str, filtered_file: str):
    """Write a copy of an item-info CSV without rows lacking English text.

    A row is kept only when both ``label_en`` and ``description_en`` are
    present; a row missing either one is discarded. All columns of the kept
    rows are written to ``filtered_file`` as UTF-8 CSV without the index.

    Args:
        original_file: Path of the CSV to read (UTF-8).
        filtered_file: Path of the CSV to write.

    Raises:
        KeyError: If ``label_en`` or ``description_en`` is not a column of
            the input.
    """
    required_columns = ['label_en', 'description_en']

    raw_info = pd.read_csv(original_file, encoding='utf-8')

    # True only for rows in which every required column holds a value.
    has_english_info = raw_info[required_columns].notna().all(axis=1)

    raw_info[has_english_info].to_csv(filtered_file, index=False, encoding='utf-8')

# Input and output CSV paths for the query-entity info filter.
query_entity_info_file = str(HOME_DIR / 'base_test2_query_entity_info.csv')
query_entity_filtered_info_file = str(HOME_DIR / 'base_test2_query_entity_filtered_info.csv')

# Drop rows with a missing English label or description and save the filtered copy.
filter_item_info(query_entity_info_file, query_entity_filtered_info_file)


In [10]:
# --------------------- Filter triplets (entity-relation-entity) ---------------------
def filter_triplet_id(original_file: str, filtered_file: str):
    """Write only the complete triplets whose adjacent item is a QID.

    Steps, in order:
      1. read ``original_file`` (UTF-8 CSV);
      2. drop every row containing a missing value in any column;
      3. cast ``adj_item_qid`` to ``str`` so the ``.str`` accessor works even
         when the column was parsed as numeric;
      4. keep rows whose ``adj_item_qid`` is exactly ``Q`` followed by digits;
      5. drop exact duplicate rows, keeping the first occurrence;
      6. write the result to ``filtered_file`` as UTF-8 CSV without the index.

    Args:
        original_file: Path of the CSV to read.
        filtered_file: Path of the CSV to write.

    Raises:
        KeyError: If the input has no ``adj_item_qid`` column.
    """
    complete_rows = pd.read_csv(original_file, encoding='utf-8').dropna()

    # Replace the column with its string form (column position is unchanged).
    complete_rows = complete_rows.assign(
        adj_item_qid=complete_rows['adj_item_qid'].astype(str)
    )

    # Literal values, dates, etc. do not match and are discarded.
    points_to_entity = complete_rows['adj_item_qid'].str.match(r'^Q\d+$', na=False)

    (
        complete_rows[points_to_entity]
        .drop_duplicates(keep='first')
        .to_csv(filtered_file, index=False, encoding='utf-8')
    )

# Input and output CSV paths for the triplet filter.
triplet_id_file = str(HOME_DIR / 'triplet_id_595.csv')
triplet_id_filtered_file = str(HOME_DIR / 'triplet_id_595_filtered.csv')

# Keep only complete, de-duplicated triplets whose adjacent item is a QID.
filter_triplet_id(triplet_id_file, triplet_id_filtered_file)

In [22]:
# --------------------- Get a fragment of triplet_id ---------------------
# For every property of every item, keep at most n adj_items
def get_triplet_id_fragment(original_file: str, filtered_file: str, n: int, seed: int = 33):
    """Write a down-sampled copy of a triplet CSV.

    Rows are grouped by ``(item_qid, property_qid)``. A group with at most
    ``n`` rows is kept whole; a larger group is reduced to ``n`` rows drawn at
    random without replacement. Groups are written in sorted key order with a
    fresh 0..N-1 index (the index itself is not written to the file).

    Rows whose ``item_qid`` or ``property_qid`` is missing belong to no group
    (pandas' default ``groupby`` behaviour) and are therefore not written.

    Args:
        original_file: Path of the CSV to read (UTF-8). Must contain the
            columns ``item_qid`` and ``property_qid``.
        filtered_file: Path of the CSV to write (UTF-8, no index column).
        n: Maximum number of rows kept per ``(item_qid, property_qid)`` pair.
        seed: ``random_state`` passed to every per-group ``sample`` call, so
            the output is reproducible. Defaults to 33.

    Raises:
        ValueError: If ``n`` is negative.
        KeyError: If a grouping column is absent from the input.
    """
    if n < 0:
        # Fail before any I/O; DataFrame.sample would reject this later anyway.
        raise ValueError(f"n must be non-negative, got {n}")

    triplet_id_df = pd.read_csv(original_file, encoding='utf-8')

    # One frame per group: the whole group when it is small enough,
    # otherwise a reproducible random subset of exactly n rows.
    fragments = [
        group if len(group) <= n else group.sample(n, random_state=seed)
        for _, group in triplet_id_df.groupby(['item_qid', 'property_qid'])
    ]

    if fragments:
        result_df = pd.concat(fragments, ignore_index=True)
    else:
        # pd.concat raises ValueError on an empty list; with no groups at all
        # (empty input or only missing keys) emit a header-only file instead.
        result_df = triplet_id_df.iloc[0:0]

    result_df.to_csv(filtered_file, index=False, encoding='utf-8')

# NOTE(review): this cell reads 'triplet_id_filtered.csv', whereas the triplet-filter cell earlier
# in this notebook writes 'triplet_id_595_filtered.csv' — confirm this input exists and is the intended file.
# `triplet_id_file` is also rebound here to a different path than in that earlier cell.
triplet_id_file = str(HOME_DIR / 'triplet_id_filtered.csv')
triplet_id_fragment_file = str(HOME_DIR / 'triplet_id_fragment.csv')

# Keep at most 3 rows per (item_qid, property_qid) pair.
get_triplet_id_fragment(triplet_id_file, triplet_id_fragment_file, n=3)



In [23]:
from transformers import AutoModel, AutoTokenizer

# Directory holding the xlm-roberta-base model files, resolved relative to the user's home directory.
model_path = str(Path.home() / 'Desktop' / 'clir' / 'models' / 'models--xlm-roberta-base')

# Load only the tokenizer from model_path; AutoModel is imported above but not used in this cell.
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [24]:
# Display the tokenizer's repr to inspect its configuration.
tokenizer

XLMRobertaTokenizerFast(name_or_path='C:\Users\yanghe\Desktop\clir\models\models--xlm-roberta-base', vocab_size=250002, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	250001: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False, special=True),
}