In [None]:
import warnings
import logging
from rdflib import Graph, URIRef, Literal, XSD
import json
import re
import pandas as pd
import json
import requests
from time import sleep
from openai import OpenAI
import os


# Natural language translation of uniquely identified entities

In [None]:
# === Suppress RDFLib literal datatype conversion warnings ===
# NOTE: simplefilter("ignore") is not scoped to RDFLib; it silences every
# warning issued through the `warnings` module from this point on.
warnings.simplefilter("ignore")
# Raise the "rdflib.term" logger threshold so only ERROR and above are emitted.
logging.getLogger("rdflib.term").setLevel(logging.ERROR)

def get_last_part(uri):
    """Return the last segment of a URI (the text after the final '/' or '#').

    Trailing '/' or '#' characters are ignored so that a URI such as
    ``http://example.org/res/E1/`` yields ``"E1"`` instead of an empty string.

    Args:
        uri: A string (rdflib's ``URIRef`` is a ``str`` subclass and is handled
            by the same path) or any other object.

    Returns:
        The last segment for string input; ``str(uri)`` unchanged for any
        non-string input.
    """
    if not isinstance(uri, str):
        return str(uri)
    # Drop trailing separators first; otherwise the split would end in ''.
    return re.split(r'[/#]', uri.rstrip('/#'))[-1]

def uri_to_qname(graph, uri):
    """Return the local name of ``uri`` with any namespace prefix removed.

    ``graph.qname(uri)`` is asked for a prefixed name (e.g. ``mwo:description``)
    and only the text after the last ':' is kept. If that lookup fails for any
    reason, the last path/fragment segment from ``get_last_part`` is used.

    Args:
        graph: Object exposing a ``qname(uri)`` method.
        uri: The URI to shorten.

    Returns:
        The shortened name as a string.
    """
    try:
        # split(':')[-1] returns the whole string when no ':' is present.
        return graph.qname(uri).split(':')[-1]
    except Exception:
        # Best-effort fallback; KeyboardInterrupt/SystemExit still propagate.
        return get_last_part(uri)

def get_literal_value(literal):
    """Convert a literal to a Python value, tolerating malformed datatypes.

    ``literal.toPython()`` is tried first. When it raises ``ValueError`` or
    ``TypeError``, the string form is used instead: everything from the first
    ``^^`` onwards is dropped and surrounding double quotes are stripped.
    """
    try:
        value = literal.toPython()
    except (ValueError, TypeError):
        lexical, _, _ = str(literal).partition('^^')
        value = lexical.strip('"')
    return value

def parse_ttl_to_json(ttl_path, output_json="../data/MSE-KG/KGPreprocess/MSE_KG.json", error_log="../data/MSE-KG/errors.json"):
    """Parse a Turtle file into a nested dict and dump it as JSON.

    The result maps each shortened subject name to a dict of shortened
    predicate names, each holding a list of object values rendered as strings.

    Args:
        ttl_path: Path of the Turtle file to parse.
        output_json: Path the resulting JSON is written to.
        error_log: Path a JSON list of conversion problems is written to; the
            file is only written when at least one problem was recorded.

    Returns:
        The nested ``{subject: {predicate: [values]}}`` dictionary.
    """
    g = Graph()
    g.parse(ttl_path, format="turtle")

    results = {}
    errors = []

    # g.subjects() yields a subject once per triple it appears in. De-duplicate
    # (keeping first-seen order) so each entity is processed exactly once and
    # error entries are not repeated.
    for entity in dict.fromkeys(g.subjects()):
        # Apply the same URI simplification to entity URIs
        entity_key = uri_to_qname(g, entity)
        # setdefault: if two distinct URIs shorten to the same key, their
        # properties are merged rather than the later one wiping the earlier.
        entity_props = results.setdefault(entity_key, {})

        for predicate, obj in g.predicate_objects(subject=entity):
            pred_key = uri_to_qname(g, predicate)

            try:
                if isinstance(obj, URIRef):
                    # URI object: keep only its shortened name
                    obj_value = uri_to_qname(g, obj)
                elif isinstance(obj, Literal):
                    if obj.datatype == XSD.time:
                        # xsd:time failures are logged but the raw text is kept
                        try:
                            obj_value = obj.toPython()
                        except Exception:
                            obj_value = str(obj)
                            errors.append({
                                "entity": entity_key,
                                "predicate": pred_key,
                                "value": str(obj),
                                "datatype": str(obj.datatype),
                                "reason": "Invalid xsd:time"
                            })
                    else:
                        obj_value = get_literal_value(obj)
                else:
                    obj_value = str(obj)

                # Everything is stored as text so the output is JSON-serialisable
                if not isinstance(obj_value, str):
                    obj_value = str(obj_value)

                entity_props.setdefault(pred_key, []).append(obj_value)

            except Exception as e:
                # Record the failure and continue with the next triple
                errors.append({
                    "entity": entity_key,
                    "predicate": pred_key,
                    "value": str(obj),
                    "error": str(e)
                })

    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    if errors:
        with open(error_log, "w", encoding="utf-8") as f:
            json.dump(errors, f, ensure_ascii=False, indent=2)
        print(f"⚠️  {len(errors)} literal values failed to parse. See {error_log} for details.")

    print(f"✅ Parsed {len(results)} chunks. Output saved to {output_json}")
    return results

if __name__ == "__main__":
    # Run the TTL -> JSON step; the output and error-log paths use
    # parse_ttl_to_json's default arguments.
    ttl_file = "../data/MSE-KG/KG_QA/Output.ttl"  # Modify this path to your TTL file
    parse_ttl_to_json(ttl_file)

In [None]:
# Load the parsed KG JSON for inspection; the frame is transposed after loading.
df_kg = pd.read_json("../data/MSE-KG/KGPreprocess/MSE_KG.json", encoding="utf-8").T
df_kg

## Handling Wikidata Identifiers (Q-items) 

Source: https://www.wikidata.org/wiki/Wikidata:Identifiers

Wikidata identifiers: Each Wikidata entity is identified by an entity ID, which is a number prefixed by a letter. 

Only items (also known as Q-items), whose IDs are prefixed with Q (e.g. Q12345), are handled here.

In [None]:
def get_wikidata_label(q_number):
    """Fetch a display label for a Wikidata Q-number.

    The English label is preferred; otherwise the first label in the response
    is used. Any failure (network error, HTTP error status, unexpected payload)
    is printed and results in ``None`` so that callers can carry on.

    Args:
        q_number: Wikidata item ID such as ``"Q42"``.

    Returns:
        The label text, or ``None`` when no label could be obtained.
    """
    url = f"https://www.wikidata.org/wiki/Special:EntityData/{q_number}.json"
    try:
        response = requests.get(url, timeout=10)
        # Fail on 4xx/5xx here instead of trying to decode an error page.
        response.raise_for_status()
        entities = response.json()['entities']

        # NOTE(review): for merged/redirected items the payload appears to be
        # keyed by the target ID rather than the requested one, so fall back to
        # the first entity returned - confirm against the Wikidata docs.
        entity = entities.get(q_number)
        if entity is None:
            entity = next(iter(entities.values()))

        labels = entity.get('labels') or {}

        # Get English label if available
        if 'en' in labels:
            return labels['en']['value']

        # Fallback to first available label
        if labels:
            return next(iter(labels.values()))['value']

    except Exception as e:
        # Deliberately best-effort: one bad lookup must not abort the run.
        print(f"Failed to fetch label for {q_number}: {str(e)}")
    return None

def create_label_mapping(kg_data):
    """Build a ``{Q-number: label}`` map for every Wikidata ID found in 'sameAs'.

    Both bare IDs (``Q123``) and values containing ``wikidata.org/entity/Q``
    are recognised. Each not-yet-resolved ID triggers one lookup, preceded by a
    one-second pause; IDs whose lookup yields nothing are left out of the map.
    """
    resolved = {}
    for props in kg_data.values():
        if 'sameAs' not in props:
            continue
        for candidate in props['sameAs']:
            if not isinstance(candidate, str):
                continue

            # Accept both the short form and the full entity URI
            if candidate.startswith('Q') and candidate[1:].isdigit():
                qid = candidate
            elif 'wikidata.org/entity/Q' in candidate:
                qid = candidate.split('/')[-1]
            else:
                continue

            if qid in resolved:
                continue

            sleep(1)  # Respectful API delay
            fetched = get_wikidata_label(qid)
            if fetched:
                resolved[qid] = fetched
    return resolved

def replace_references(data, label_map):
    """Recursively replace Wikidata references with their display labels.

    Dict values and list items are processed recursively (dict keys are left
    alone). A string is replaced when it is itself a key of ``label_map``, or
    when it contains ``wikidata.org/entity/Q`` and its last '/'-separated
    segment is a key of ``label_map``. Everything else is returned unchanged.

    Args:
        data: Arbitrarily nested dict/list/str structure.
        label_map: Mapping from Q-number to label.

    Returns:
        A new structure with the replacements applied.
    """
    if isinstance(data, dict):
        return {k: replace_references(v, label_map) for k, v in data.items()}
    if isinstance(data, list):
        return [replace_references(item, label_map) for item in data]
    if isinstance(data, str):
        # Bare IDs such as "Q123" are covered by this direct lookup.
        if data in label_map:
            return label_map[data]
        # Full entity URI: look up its trailing Q-number.
        if 'wikidata.org/entity/Q' in data:
            return label_map.get(data.split('/')[-1], data)
    return data

def process_kg_data(input_file, output_file):
    """Read the KG JSON, swap Wikidata IDs for labels, and write the result."""
    with open(input_file, 'r', encoding='utf-8') as src:
        kg_data = json.load(src)

    # Resolve the Q-numbers first, then rewrite the whole structure with them
    qid_labels = create_label_mapping(kg_data)
    relabelled = replace_references(kg_data, qid_labels)

    with open(output_file, 'w', encoding='utf-8') as dst:
        json.dump(relabelled, dst, ensure_ascii=False, indent=2)

    print(f"Processing complete. Results saved to {output_file}")

if __name__ == "__main__":
    # Wikidata step: read the parsed KG JSON and write a copy in which
    # Wikidata references have been replaced by labels.
    input_json = "../data/MSE-KG/KGPreprocess/MSE_KG.json"
    output_json = "../data/MSE-KG/KGPreprocess/MSE_KG_Wikidata.json"
    process_kg_data(input_json, output_json)

In [None]:
# Load the Wikidata-resolved KG JSON for inspection (transposed after loading).
df_wk = pd.read_json("../data/MSE-KG//KGPreprocess/MSE_KG_Wikidata.json", encoding="utf-8").T
df_wk

## Process tail entities

In [None]:
import json

def create_label_mapping(kg_data):
    """Map every entity ID to a display label (or ``None``) by fixed priority.

    Priority: first 'label' entry, first string in 'sameAs', the name of a
    'Person' built from 'firstName'/'surname', first 'title' entry, else None.
    """
    def _pick(props):
        # Rule 1: an explicit label wins
        if 'label' in props and props['label']:
            return props['label'][0]

        # Rule 2: first string-valued sameAs entry
        if 'sameAs' in props:
            for candidate in props['sameAs']:
                if isinstance(candidate, str):
                    return candidate

        # Rule 3: persons are named from whichever name parts are present
        if 'type' in props and 'Person' in props['type']:
            given = props.get('firstName', [''])[0]
            family = props.get('surname', [''])[0]
            if given and family:
                return f"{given} {family}"
            if given or family:
                return given or family

        # Rule 4: fall back to the title
        if 'title' in props and props['title']:
            return props['title'][0]

        # Rule 5: nothing usable
        return None

    return {e_name: _pick(props) for e_name, props in kg_data.items()}

def replace_references(data, label_map):
    """Recursively swap entity-ID strings for their display labels.

    Strings that are keys of ``label_map`` are replaced by the mapped label,
    unless that label is falsy, in which case the original string is kept.
    Dict values and list items are processed recursively; dict keys and all
    other values are returned untouched.
    """
    if isinstance(data, str):
        if data not in label_map:
            return data
        return label_map[data] or data  # Keep the ID when no label exists
    if isinstance(data, list):
        return [replace_references(element, label_map) for element in data]
    if isinstance(data, dict):
        return {key: replace_references(value, label_map) for key, value in data.items()}
    return data

def process_kg_data(input_file, output_file):
    """Load the KG JSON, replace entity-ID references by labels, and save it."""
    with open(input_file, 'r', encoding='utf-8') as reader:
        kg_data = json.load(reader)

    # One label per entity ID, then substitute throughout the structure
    id_to_label = create_label_mapping(kg_data)
    relabelled = replace_references(kg_data, id_to_label)

    with open(output_file, 'w', encoding='utf-8') as writer:
        json.dump(relabelled, writer, ensure_ascii=False, indent=2)

    print(f"Processing complete. Results saved to {output_file}")

if __name__ == "__main__":
    # Tail-entity step: consumes the Wikidata-resolved JSON and writes a copy
    # in which entity-ID references are replaced by display labels.
    input_json = "../data/MSE-KG/KGPreprocess/MSE_KG_Wikidata.json"
    output_json = "../data/MSE-KG/KGPreprocess/MSE_KG_Tail_Entity.json"
    process_kg_data(input_json, output_json)

In [None]:
# Load the tail-entity JSON for inspection (transposed after loading).
df_label = pd.read_json("../data/MSE-KG/KGPreprocess/MSE_KG_Tail_Entity.json", encoding="utf-8").T
df_label

## Process all relations


In [None]:
import json

# Load the original JSON file
with open("../data/MSE-KG/KGPreprocess/MSE_KG_Tail_Entity.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Step 1: Build a mapping from property keys to their shortest label
key_label_map = {}
for key, value in data.items():
    if not isinstance(value, dict):
        continue
    labels = value.get("label")
    if isinstance(labels, list) and labels:
        # Shortest label by stripped length; the first one wins on ties
        key_label_map[key] = min((label.strip() for label in labels), key=len)

# Step 2: Process all entity entries and replace keys using the label map
transformed_data = {}

for entity_key, entity_value in data.items():
    # Only transform entities (e.g. "E426504"), not property definitions
    if not entity_key.startswith("E"):
        transformed_data[entity_key] = entity_value
        continue

    new_entity = {}
    for prop_key, prop_value in entity_value.items():
        # Replace property name with its shortest label if available
        new_key = key_label_map.get(prop_key, prop_key)
        if new_key in new_entity:
            # Two property keys resolved to the same label: merge their values
            # instead of letting the later property overwrite the earlier one.
            existing = new_entity[new_key]
            existing = existing if isinstance(existing, list) else [existing]
            incoming = prop_value if isinstance(prop_value, list) else [prop_value]
            new_entity[new_key] = existing + incoming
        else:
            new_entity[new_key] = prop_value

    transformed_data[entity_key] = new_entity

# Step 3: Write the transformed output to a new JSON file
with open("../data/MSE-KG/KGPreprocess/MSE_KG_Relation.json", "w", encoding="utf-8") as f:
    json.dump(transformed_data, f, ensure_ascii=False, indent=2)

print("Transformation complete. Output saved to MSE_KG_Relation.json")


In [None]:
# Load the relation-renamed JSON for inspection (transposed after loading).
df_relation = pd.read_json("../data/MSE-KG/KGPreprocess/MSE_KG_Relation.json", encoding="utf-8").T
df_relation

## Process all head entities

In [None]:
# Keep only the entries whose key starts with 'E' from the relation-renamed JSON
file_path = "../data/MSE-KG/KGPreprocess/MSE_KG_Relation.json"
with open(file_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Filter in a single pass; insertion order of the source is preserved
e_data = {name: props for name, props in data.items() if name.startswith('E')}

# Persist the filtered entries
output_path = "../data/MSE-KG/KGPreprocess/MSE_KG_Entity.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(e_data, f, indent=2, ensure_ascii=False)

print(f"Extraction completed! Saved to {output_path}")
print(f"Number of keys extracted: {len(e_data)}")

In [None]:
# Load the extracted 'E' entries for inspection (transposed after loading).
df_entity = pd.read_json("../data/MSE-KG/KGPreprocess/MSE_KG_Entity.json", encoding="utf-8").T
df_entity

In [None]:
import json

def get_display_name(properties):
    """Pick an entity's display name using a fixed five-step priority.

    Order: first 'label', first string in 'sameAs', Person name built from
    'firstName'/'surname', first 'title'. Returns ``None`` if nothing applies.
    """
    # Step 1: explicit label
    labels = properties.get('label')
    if labels:
        return labels[0]

    # Step 2: first string among the sameAs values
    if 'sameAs' in properties:
        first_ref = next(
            (ref for ref in properties['sameAs'] if isinstance(ref, str)),
            None,
        )
        if first_ref is not None:
            return first_ref

    # Step 3: persons are named from the available name parts
    if 'type' in properties and 'Person' in properties['type']:
        given = properties.get('firstName', [''])[0]
        family = properties.get('surname', [''])[0]
        if given and family:
            return f"{given} {family}"
        if given:
            return given
        if family:
            return family

    # Step 4: title
    titles = properties.get('title')
    if titles:
        return titles[0]

    # Step 5: no usable name
    return None

def transform_kg_data(kg_data):
    """Re-key the KG by display name and drop the 'label'/'sameAs' properties.

    Each entity is stored under the name returned by ``get_display_name``, or
    under its original key when that returns ``None``. When several entities
    resolve to the same display name their properties are merged (values
    appended, skipping ones already present) instead of the later entity
    silently replacing the earlier one.

    Args:
        kg_data: Mapping of entity ID to its property dict.

    Returns:
        A new mapping of display name to property dict.
    """
    new_kg_data = {}

    for e_name, properties in kg_data.items():
        display_name = get_display_name(properties)

        # If no display name found, keep the original E-name
        new_key = display_name if display_name is not None else e_name

        # The name sources themselves are not carried over
        new_properties = {
            k: v for k, v in properties.items()
            if k not in ['label', 'sameAs']
        }

        if new_key not in new_kg_data:
            new_kg_data[new_key] = new_properties
            continue

        # Name collision: fold this entity's values into the existing entry.
        # New lists are built so the input structure is never mutated.
        target = new_kg_data[new_key]
        for prop, values in new_properties.items():
            if prop not in target:
                target[prop] = values
                continue
            current = target[prop] if isinstance(target[prop], list) else [target[prop]]
            incoming = values if isinstance(values, list) else [values]
            target[prop] = current + [v for v in incoming if v not in current]

    return new_kg_data

def process_kg_data(input_file, output_file):
    """Load the entity JSON, re-key it by display name, and write the result."""
    with open(input_file, 'r', encoding='utf-8') as source:
        kg_data = json.load(source)

    # Re-key and strip the naming properties
    renamed = transform_kg_data(kg_data)

    with open(output_file, 'w', encoding='utf-8') as target:
        json.dump(renamed, target, ensure_ascii=False, indent=2)

    print(f"Processing completed, results saved to {output_file}")

if __name__ == "__main__":
    # Head-entity step: re-key the extracted entities by display name.
    input_json = "../data/MSE-KG/KGPreprocess/MSE_KG_Entity.json"
    output_json = "../data/MSE-KG/KGPreprocess/MSE_KG_NL.json"
    process_kg_data(input_json, output_json)

In [None]:
# Load the display-name-keyed JSON (transposed) and print a summary of the frame.
df_entity_processed = pd.read_json("../data/MSE-KG/KGPreprocess/MSE_KG_NL.json", encoding="utf-8").T
df_entity_processed.info()

Triple Textualization: To support vector-based semantic retrieval, RDF triples can be converted into readable text fragments. For example, each triple can be transformed into a natural language sentence such as: "<subject>'s <predicate> is <object>". If the entities and relations in the knowledge graph have labels or descriptive properties, these can be leveraged to generate richer text. For instance, for a given entity, all its related triples can be compiled into a descriptive "document" (with the entity as the paragraph topic, including all its predicate-object pairs). In this way, the relevant facts about each entity or relation become indexable textual passages.

## Triplets & Chunks

In [None]:
import json
from collections import defaultdict
import re

def process_object_value(obj):
    """Return ``obj`` as text with all whitespace runs collapsed to one space.

    Line breaks count as whitespace, and leading/trailing whitespace is removed.
    """
    # str.split() without arguments splits on any whitespace run (including
    # \r and \n) and discards empty pieces, so one join does all the cleanup.
    return ' '.join(str(obj).split())


def convert_json_to_triples(json_path,
                          output_rdf="../data/MSE-KG/KGPreprocess/triples_original.txt",
                          output_grouped_rdf="../data/MSE-KG/KGPreprocess/triples_paragraph.txt"):
    """Convert the labelled KG JSON into textual ``(<s>, <p>, <o>)`` triples.

    Two files are written: one triple per line, and one line per subject that
    holds all of that subject's triples joined by ", " and wrapped in an outer
    pair of parentheses.

    Args:
        json_path: Path of the JSON file mapping subject -> {predicate: objects}.
        output_rdf: Destination for the one-triple-per-line file.
        output_grouped_rdf: Destination for the one-subject-per-line file.
    """
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Clean subject names (line breaks -> space, trimmed). If two subjects
    # clean to the same text, the later one's properties replace the earlier.
    data = {
        re.sub(r'[\r\n]+', ' ', subject).strip(): props
        for subject, props in data.items()
    }

    rdf_triples = []                    # One entry per triple
    grouped_rdf = defaultdict(list)     # Triples collected per subject

    for subject_uri, props in data.items():
        for predicate, objects in props.items():
            object_list = objects if isinstance(objects, list) else [objects]
            for obj in object_list:
                # All object values are treated as strings
                cleaned_obj = process_object_value(obj)
                # Angle brackets delimit the triple parts, so drop stray ones
                escaped = cleaned_obj.replace('<', '').replace('>', '')

                # Format: (<s>, <p>, <o>)
                triple = f"(<{subject_uri}>, <{predicate}>, <{escaped}>)"
                rdf_triples.append(triple)
                grouped_rdf[subject_uri].append(triple)

    # One line per subject: all of its triples inside outer parentheses
    grouped_lines = [f"({', '.join(triples)})" for triples in grouped_rdf.values()]

    def write_file(path, lines):
        # Lines are newline-separated with no trailing newline.
        with open(path, "w", encoding="utf-8") as out:
            out.write("\n".join(lines))

    write_file(output_rdf, rdf_triples)
    write_file(output_grouped_rdf, grouped_lines)

    print(f"Conversion complete:\n"
          f"- RDF triples: {len(rdf_triples):,}\n"
          f"- Grouped RDF triples: {len(grouped_lines):,}")

if __name__ == "__main__":
    # Build both triple files from the display-name-keyed JSON, using the
    # function's default output paths.
    convert_json_to_triples("../data/MSE-KG/KGPreprocess/MSE_KG_NL.json")

# Verbalization

## Sentence-Level Verbalization

In [None]:
# Prompt template for sentence-level verbalization: role, rules and two worked
# examples. The text is sent to the model verbatim, so it is runtime data.
verbalization_prompt="""
## Role
You are a knowledge graph verbalization expert that converts triples into natural English sentences while strictly following these rules:

## Instructions:
1. **Preservation Rule**  
   - Subject and object must appear exactly as in the original triple
2. **Predicate Handling**  
   - Maintain the original meaning.  
   - Prefer using words from the original predicate.  
   - Make the connection sound natural in English
3. Ensure that the generated sentence is grammatically correct in English.
4. Directly output only the converted sentence without any additional text or explanation.


## Examples
### Standard Case
**Triple:**  
`(<PyTorch>, <developedBy>, <Meta AI Research>)`

**Output:**  
PyTorch is developed by Meta AI Research.

### Complex Case
**Triple:**  
`(<Lewis, Patrick, et al. "Retrieval-augmented generation for knowledge-intensive nlp tasks." Advances in neural information processing systems 33 (2020): 9459-9474.>, <introduced>, <Retrieval-Augmented Generation (RAG) model>)`

**Output:**  
"Lewis, Patrick, et al. 'Retrieval-augmented generation for knowledge-intensive nlp tasks.' Advances in neural information processing systems 33 (2020): 9459-9474." introduced Retrieval-Augmented Generation (RAG) model.

**Triple:**  
`(<subject>, <predicate>, <object>)`

"""

In [None]:
# Initialize DeepSeek client
# NOTE: API_KEY is not defined in this cell; it must already exist in the
# session before this runs.
client = OpenAI(
    api_key=API_KEY,  # Replace with your actual API key
    base_url="https://api.deepseek.com"
)

# File paths
input_file = "../data/MSE-KG/KGPreprocess/triples_original.txt"
output_file = "../data/MSE-KG/KGPreprocess/verbalized_triples.txt"

# Batch settings
BATCH_SIZE = 100  # Adjust based on token size

# Load all input lines
with open(input_file, 'r', encoding='utf-8') as f:
    lines = [line.strip() for line in f if line.strip()]
total_lines = len(lines)

# Resume support: the number of lines already in the output file is taken as
# the number of input triples already handled. This only holds while every
# batch contributes exactly one output line per input triple.
start_line = 0
if os.path.exists(output_file):
    with open(output_file, 'r', encoding='utf-8') as f:
        start_line = sum(1 for _ in f)


# Start batch processing
with open(output_file, 'a', encoding='utf-8') as outfile:
    for i in range(start_line, total_lines, BATCH_SIZE):
        batch = lines[i:i + BATCH_SIZE]

        # Construct prompt: template, numbered triples, output-format reminder
        numbered = "\n".join(
            f"{idx}. {triple}" for idx, triple in enumerate(batch, start=1)
        )
        prompt = f"{verbalization_prompt}\n\n{numbered}\n\nReturn each sentence on a new line."

        try:
            response = client.chat.completions.create(
                model="deepseek-chat",
                messages=[
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": prompt}
                ],
                stream=False
            )
            result_text = response.choices[0].message.content.strip()
        except Exception as e:
            # Report the failure instead of hiding it, then mark every triple
            # of the batch so the output stays aligned with the input.
            print(f"ERROR: batch starting at input line {i + 1} failed: {e}")
            for _ in batch:
                outfile.write("ERROR: Processing failed\n")
            outfile.flush()
            continue

        # Blank lines carry no sentence; writing them would only inflate the
        # line count that the resume logic relies on.
        results = [part.strip() for part in result_text.split("\n") if part.strip()]

        # Write output lines
        print("\nGenerated verbalizations:")
        for res_line in results:
            print(f"- {res_line}")
            outfile.write(res_line + "\n")

        # Handle potential missing responses
        missing = len(batch) - len(results)
        if missing > 0:
            for _ in range(missing):
                outfile.write("ERROR: Missing output\n")
        elif missing < 0:
            print(f"WARNING: batch starting at input line {i + 1} returned "
                  f"{-missing} extra line(s); resuming by line count may drift.")

        # Persist each finished batch so an interrupted run can be resumed
        outfile.flush()

##  Paragraph-Level Verbalization

In [None]:
# Prompt template for paragraph-level verbalization: instructions plus one
# worked example. The text is sent to the model verbatim, so it is runtime data.
chunk_prompt = """
You are an assistant that generates descriptive paragraph chunks about a given entity based on a list of RDF-style triples. Each triple shares the same head entity, and your goal is to verbalize these triples into a coherent, natural English paragraph by following these instructions:

1. Begin the paragraph directly with the head entity name
2. Use head/tail entities exactly as given
3. Generate appropriate connecting language from predicates
4. Avoid unnecessary repetition of the head entity
5. Create a single, fluent paragraph mentioning every triple
6. Never comment on or question triple validity
7. Include all triples, even if inconsistent
8. Exclude all notes or explanations

Example:

Input triples:
((<Alan Turing>, <instance of>, <human>), (<Alan Turing>, <given name>, <Alan>), (<Alan Turing>, <family name>, <Turing>), (<Alan Turing>, <occupation>, <mathematician>), (<Alan Turing>, <occupation>, <computer scientist>), (<Alan Turing>, <occupation>, <cryptanalyst>), (<Alan Turing>, <educated at>, <University of Cambridge>), (<Alan Turing>, <educated at>, <Princeton University>), (<Alan Turing>, <date of birth>, <1912-06-23>), (<Alan Turing>, <date of death>, <1954-06-07>), (<Alan Turing>, <notable work>, <Turing machine>), (<Alan Turing>, <notable work>, <On Computable Numbers>))

Output paragraph:
Alan Turing is a human whose given name is Alan and family name is Turing. He was born on June 23, 1912, and died on June 7, 1954. Turing was a mathematician, computer scientist, and cryptanalyst. He studied at both the University of Cambridge and Princeton University. His most notable works include the concept of the Turing machine and the influential paper On Computable Numbers.

Task:
Verbalize the following new triples into a coherent paragraph. Output only the final paragraph without any additional text or explanation.

Input triples: ((subject A, predicate A, object A), (subject A, predicate B, object B), (subject A, predicate C, object C)) ...
Output paragraph:
"""



In [None]:
import os
import logging
import time
from datetime import datetime
from openai import OpenAI

# ==== Setup ====
BATCH_SIZE = 100  # Number of triple groups per progress/log batch
INPUT_FILE = "../data/MSE-KG/KGPreprocess/triples_paragraph.txt"
OUTPUT_FILE = "../data/MSE-KG/KGPreprocess/verbalized_paragraphs.txt"
LOG_FILE = "../log/verbalization_test.log"


# ==== Logging ====
os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)
logging.basicConfig(
    filename=LOG_FILE,
    filemode='w',  # Overwrite old logs during testing
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)

# ==== DeepSeek Client ====
# NOTE: API_KEY is not defined in this cell; it must already exist in the
# session before this runs.
client = OpenAI(
    api_key=API_KEY,  # Replace with your actual API key
    base_url="https://api.deepseek.com"
)

# ==== Load Input File ====
if not os.path.exists(INPUT_FILE):
    raise FileNotFoundError(f"{INPUT_FILE} not found!")

with open(INPUT_FILE, 'r', encoding='utf-8') as f:
    paragraphs = [line.strip() for line in f if line.strip()]

total_lines = len(paragraphs)
logging.info(f"Total lines to process: {total_lines}")
print(f"\n🚀 Starting processing. Total lines to process: {total_lines}\n")

# ==== Processing ====
start_time = time.time()
processed_count = 0

with open(OUTPUT_FILE, 'w', encoding='utf-8') as f_out:
    for i in range(0, total_lines, BATCH_SIZE):
        batch_start_time = time.time()
        batch = paragraphs[i:i + BATCH_SIZE]
        line_range = f"{i + 1}-{i + len(batch)}"

        # Print progress information. The estimate extrapolates from the i
        # items finished so far and is clamped so it is never negative.
        progress = (i / total_lines) * 100
        elapsed_time = time.time() - start_time
        estimated_total = (elapsed_time / i) * total_lines if i > 0 else 0
        remaining_time = max(estimated_total - elapsed_time, 0)

        print(f"\n📊 Progress: {progress:.1f}% | Processed: {i}/{total_lines} | "
              f"Elapsed: {elapsed_time:.1f}s | Remaining: {remaining_time:.1f}s")
        print(f"🔄 Processing batch {line_range}...")

        logging.info(f"Processing batch {line_range}")
        batch_failures = 0

        for j, triple_group in enumerate(batch, 1):
            print(f"\n🔍 Input triples ({j}/{len(batch)}):")
            print(triple_group[:200] + ("..." if len(triple_group) > 200 else ""))

            # Errors are handled per item, so one failed request costs only
            # its own line and not the results of the rest of the batch.
            try:
                response = client.chat.completions.create(
                    model="deepseek-chat",
                    messages=[
                        {"role": "system", "content": "You are a helpful assistant that creates fluent paragraphs from structured data."},
                        {"role": "user", "content": chunk_prompt + triple_group}
                    ],
                    temperature=0.3
                )
                result = response.choices[0].message.content.strip()
            except Exception as e:
                batch_failures += 1
                error_msg = f"Error in batch {line_range}, item {j}: {str(e)}"
                logging.error(error_msg)
                print(f"❌ Error: {error_msg}")
                f_out.write("ERROR: Could not process this triple group\n")
                continue

            print("\n💬 Generated result:")
            print(result)

            # Collapse internal line breaks so each input line maps to exactly
            # one output line, like the ERROR placeholders do.
            f_out.write(" ".join(result.split()) + "\n")
            processed_count += 1

        # Persist the finished batch before starting the next one
        f_out.flush()

        batch_time = time.time() - batch_start_time
        if batch_failures:
            logging.warning(f"Batch {line_range} finished with {batch_failures} failed item(s) in {batch_time:.2f}s")
            print(f"⚠️ Batch {line_range} completed with {batch_failures} error(s) (Time: {batch_time:.1f}s)")
        else:
            logging.info(f"Successfully processed batch {line_range} in {batch_time:.2f}s")
            print(f"✅ Batch {line_range} completed (Time: {batch_time:.1f}s)")

# Final statistics
total_time = time.time() - start_time
avg_time = total_time / processed_count if processed_count > 0 else 0

print(f"\n🎉 Processing completed! Total time: {total_time:.1f} seconds")
print(f"📝 Successfully processed: {processed_count}/{total_lines} items")
print(f"⏱️ Average processing time per item: {avg_time:.2f} seconds")

logging.info(f"Test processing completed in {total_time:.2f}s")
logging.info(f"Successfully processed {processed_count}/{total_lines} items")

# Post Processing for Sentence-Level Verbalization

In [None]:
import re

def clean_file(input_file, output_file):
    """
    Copy a text file line by line while tidying it up:
    1. Lines whose stripped text starts with "let " or "here " (any case) are dropped
    2. Blank lines are dropped
    3. A leading number followed by a dot (e.g., "100. ") is removed
    Every kept line is written stripped and terminated by a newline.
    """
    numbering = re.compile(r'^\d+\.\s*')
    dropped_prefixes = ('let ', 'here ')

    with open(input_file, 'r', encoding='utf-8') as src, \
         open(output_file, 'w', encoding='utf-8') as dst:
        for raw in src:
            text = raw.strip()
            # Nothing to keep: empty line, or a line with a dropped prefix.
            # The prefix test runs before the numbering is removed.
            if not text or text.lower().startswith(dropped_prefixes):
                continue
            dst.write(numbering.sub('', text) + '\n')

if __name__ == "__main__":
    # The sentence-level verbalization step writes its output into the
    # KGPreprocess/ directory, so that is where it has to be read from.
    input_filename = '../data/MSE-KG/KGPreprocess/verbalized_triples.txt'
    output_filename = '../data/MSE-KG/verbalized_triples_np.txt'

    clean_file(input_filename, output_filename)
    print(f"Processing complete. Results saved to {output_filename}")