In [1]:
import pandas as pd
import numpy as np

In [7]:
import json
import re
from tqdm.auto import tqdm

# Read the entity-to-id mapping from disk, then rebuild it with lowercase keys
# so that lookups against it are case-insensitive.
with open('/home/Nema/UniCRS_GraphRAG/GraphRAG/entity2id.json', encoding='utf-8') as f:
    _raw_entity2id = json.load(f)
entity2id = dict((name.lower(), ident) for name, ident in _raw_entity2id.items())

def simplify_uri(uri):
    """Reduce a DBpedia resource URI to a normalized, human-readable name.

    For strings that start with ``<http://dbpedia.org/resource/`` the prefix
    and the closing ``>`` are removed, underscores become spaces, the text is
    lowercased, and parenthetical annotations such as ``(film)`` are dropped.
    Any other string is only lowercased.

    The whole resource name after the prefix is kept, so names that contain a
    slash (e.g. ``Face/Off``) are not truncated to their last path segment.

    Args:
        uri (str): A DBpedia resource URI in angle brackets, or any other string.

    Returns:
        str: The normalized name.
    """
    prefix = "<http://dbpedia.org/resource/"
    if not uri.startswith(prefix):
        return uri.lower()
    # Slice off the known prefix rather than splitting on "/": resource names
    # may themselves contain "/" and must not lose their leading segments.
    name = uri[len(prefix):].strip(">").replace("_", " ").lower()
    # Remove parenthetical annotations like "(film)"
    return re.sub(r"\s*\(.*?\)", "", name).strip()

def find_matching_entity(simplified_entity, entity2id):
    """Find the entity2id key that best matches a simplified entity name.

    Every key is compared after removing parenthetical annotations such as
    ``(film)`` and surrounding whitespace. A key whose normalized form equals
    ``simplified_entity`` is returned as soon as it is seen; if no key matches
    exactly, the first key (in mapping order) whose normalized form contains
    ``simplified_entity`` as a substring is returned.

    Args:
        simplified_entity (str): Normalized name to look up (expected to be
            lowercase, like the keys it is compared against).
        entity2id (dict): Mapping whose keys are candidate entity names.

    Returns:
        str | None: The matching key exactly as stored in ``entity2id``, or
        ``None`` when nothing matches or ``simplified_entity`` is empty.
    """
    # An empty string is a substring of every key; treat it as "no match"
    # instead of returning whatever key happens to come first.
    if not simplified_entity:
        return None

    strip_parenthetical = re.compile(r"\s*\(.*?\)").sub
    first_partial = None
    for key in entity2id:
        normalized_key = strip_parenthetical("", key).strip()
        if normalized_key == simplified_entity:
            # An exact match always beats an earlier substring hit.
            return key
        if first_partial is None and simplified_entity in normalized_key:
            first_partial = key
    return first_partial

def _filter_message_field(message, uri_field, name_field, entity2id, cache):
    """Replace message[uri_field] with matched entity2id keys, keeping message[name_field] aligned."""
    kept_keys, kept_names = [], []
    for idx, uri in enumerate(message[uri_field]):
        # The same URI recurs across many messages; remember its lookup result
        # so entity2id is scanned only once per distinct URI.
        if uri not in cache:
            cache[uri] = find_matching_entity(simplify_uri(uri), entity2id)
        matched_key = cache[uri]
        if matched_key:
            kept_keys.append(matched_key)
            kept_names.append(message[name_field][idx])
    message[uri_field] = kept_keys
    message[name_field] = kept_names


def process_file(src_file, tgt_file, entity2id):
    """
    Filter the entities and movies of every message in a JSONL file.

    Each non-blank line of ``src_file`` is parsed as a JSON record with a
    ``messages`` list. For every message, each URI in ``entity`` and ``movie``
    is simplified with ``simplify_uri`` and looked up with
    ``find_matching_entity``. URIs that match are replaced by the matching
    ``entity2id`` key; URIs that do not match are dropped together with the
    item at the same index in ``entity_name`` / ``movie_name``. The modified
    record is written to ``tgt_file`` as one JSON object per line.

    Blank lines in the input are skipped. Lookup results are cached for the
    duration of the call.

    Args:
        src_file (str): Path to the input JSONL file.
        tgt_file (str): Path to the output JSONL file (overwritten).
        entity2id (dict): Mapping of entities to IDs.
    """
    lookup_cache = {}
    with open(src_file, encoding='utf-8') as f, open(tgt_file, 'w', encoding='utf-8') as tgt:
        for line in tqdm(f, desc=f"Processing {src_file}"):
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # rather than failing in json.loads.
            if not line.strip():
                continue
            record = json.loads(line)
            for message in record['messages']:
                _filter_message_field(message, 'entity', 'entity_name', entity2id, lookup_cache)
                _filter_message_field(message, 'movie', 'movie_name', entity2id, lookup_cache)

            # Write the processed record to the target file
            tgt.write(json.dumps(record, ensure_ascii=False) + '\n')


if __name__ == "__main__":
    # Each pair is (raw input JSONL, processed output JSONL); the pairs are
    # handled in the order listed: test, valid, train.
    file_pairs = (
        (
            '/home/Nema/UniCRS_GraphRAG/data/redial/test_data_dbpedia_raw.jsonl',
            '/home/Nema/UniCRS_GraphRAG/data/redial/test_data_dbpedia.jsonl',
        ),
        (
            '/home/Nema/UniCRS_GraphRAG/data/redial/valid_data_dbpedia_raw.jsonl',
            '/home/Nema/UniCRS_GraphRAG/data/redial/valid_data_dbpedia.jsonl',
        ),
        (
            '/home/Nema/UniCRS_GraphRAG/data/redial/train_data_dbpedia_raw.jsonl',
            '/home/Nema/UniCRS_GraphRAG/data/redial/train_data_dbpedia.jsonl',
        ),
    )

    # Run the filtering pass over every input/output pair
    for raw_path, processed_path in file_pairs:
        process_file(raw_path, processed_path, entity2id)

    print("Processing completed for all files.")


Processing /home/Nema/UniCRS_GraphRAG/data/redial/test_data_dbpedia_raw.jsonl: 0it [00:00, ?it/s]

Processing /home/Nema/UniCRS_GraphRAG/data/redial/valid_data_dbpedia_raw.jsonl: 0it [00:00, ?it/s]

Processing /home/Nema/UniCRS_GraphRAG/data/redial/train_data_dbpedia_raw.jsonl: 0it [00:00, ?it/s]

Processing completed for all files.
