# Construct Hybrid Datasets and Prepare Training Datasets

## 1. Construct Hybrid Datasets

In [None]:
import json
import re
from collections import defaultdict

def clean_entity(entity):
    """Normalise a definiendum or lemma into an identifier-like token.

    Mathematical symbols and non-ASCII characters are dropped, spaces become
    underscores, and every character that is not an ASCII letter, digit or
    underscore is removed.

    Returns the cleaned string, or "" when the input is not a non-empty
    string or when nothing survives cleaning.
    """
    if not isinstance(entity, str) or not entity:
        return ""

    # Symbols that are stripped before any other processing
    banned = frozenset("+-*/=∑∏√∫<>∈∉{}[]()")

    # Single pass: discard math symbols and every non-ASCII character
    kept = [ch for ch in entity if ch not in banned and ord(ch) < 128]

    # Spaces become underscores, then only [A-Za-z0-9_] is allowed through
    underscored = "".join(kept).replace(" ", "_")
    return re.sub(r'[^A-Za-z0-9_]', '', underscored)

def extract_lemma(token):
    """Reduce a raw lemma token to a cleaned, crudely lemmatised form.

    The token is lower-cased, stripped of surrounding whitespace and trailing
    punctuation, passed through a few suffix rules ("ies" -> "y", drop
    "es" / "s" / "ing"), and finally normalised with clean_entity.

    Returns the cleaned lemma, or None when the token is not a non-empty
    string, is the abbreviation "i.e", or is empty or a single character
    (either before or after cleaning).
    """
    # Values loaded from JSON may be numbers, lists or dicts; only strings
    # can be lemmatised, anything else is treated as invalid.
    if not token or not isinstance(token, str):
        return None

    token = token.lower().strip()
    token = token.rstrip('.,;:!?')  # Remove ending punctuation

    # Basic lemmatization
    if token.endswith('ies'):
        token = token[:-3] + 'y'
    elif token.endswith('es') and len(token) > 3:
        token = token[:-2]
    elif token.endswith('s') and len(token) > 3:
        token = token[:-1]
    elif token.endswith('ing') and len(token) > 4:
        token = token[:-3]

    # Additional cleaning
    if len(token) == 1 or token == 'i.e':
        return None

    # Apply the same cleaning rules as definiendum
    token = clean_entity(token)

    # Re-check the length: cleaning can shrink a token such as "(x)" down to
    # a single letter (or to nothing), which must be discarded as well.
    if len(token) < 2:
        return None

    return token

def clean_triple(triple):
    """Clean a single triple.

    Parameters:
    triple: dict with "subject", "role", "lemma" and optionally
            "explanation" keys.

    Returns a new dict with a lower-cased cleaned subject, an upper-cased
    role (hyphens replaced by underscores), the cleaned lemma and the
    untouched explanation, or None when the triple is not a dict or any of
    subject / role / lemma is missing or invalid.
    """
    if not isinstance(triple, dict):
        return None

    # Clean role (ensure uppercase and correct format).  The role is checked
    # first because it is the cheapest test; a null or non-string role coming
    # from JSON cannot be normalised and invalidates the triple.
    role = triple.get("role", "")
    if not isinstance(role, str):
        return None
    cleaned_role = role.upper().replace("-", "_").strip()
    if not cleaned_role:
        return None

    # Clean subject
    cleaned_subject = clean_entity(triple.get("subject", ""))
    if not cleaned_subject:
        return None

    # Clean lemma
    cleaned_lemma = extract_lemma(triple.get("lemma", ""))
    if not cleaned_lemma:
        return None

    # Return cleaned triple
    return {
        "subject": cleaned_subject.lower(),
        "role": cleaned_role,
        "lemma": cleaned_lemma,
        "explanation": triple.get("explanation", "")  # Keep original explanation
    }

def normalize_concept_name(name):
    """Return the matching key for a concept name.

    Non-string or empty names yield ""; everything else is passed through
    clean_entity so both datasets are keyed by the same cleaning rules.
    """
    usable = isinstance(name, str) and bool(name)
    return clean_entity(name) if usable else ""

def load_and_index_data(embeddings_file, triples_file):
    """Read both JSON inputs.

    Parameters:
    embeddings_file: path of the JSON file holding the embeddings data
    triples_file: path of the JSON file holding the triples data

    Returns a tuple (embeddings_data, triples_data) with the parsed content
    of the two files, in that order.
    """
    loaded = []
    # The embeddings file is read first, then the triples file
    for path in (embeddings_file, triples_file):
        with open(path, "r", encoding="utf-8") as handle:
            loaded.append(json.load(handle))

    embeddings_data, triples_data = loaded
    return embeddings_data, triples_data

def debug_triples_structure(triples_data):
    """Print a small sample of the triples data layout.

    Looks at the first two modules, the first two definitions of each and
    the first two concepts of each definition, printing the concept name,
    its keys and, when a "triples" key is present, the type, length and
    first entry of that value.
    """
    print("=== Debugging Triples Data Structure ===")

    sample_modules = list(triples_data.items())[:2]
    for module_name, module_content in sample_modules:
        print(f"\nModule: {module_name}")

        for definition in module_content.get("definitions", [])[:2]:
            concepts = definition.get("semantic_analysis", {}).get("concepts", [])

            for concept in concepts[:2]:
                print(f"  Concept: {concept.get('name')}")
                print(f"  All fields: {list(concept.keys())}")

                if "triples" in concept:
                    triples_value = concept["triples"]
                    print(f"  triples type: {type(triples_value)}")

                    # Only list values have a meaningful count / first entry
                    if isinstance(triples_value, list):
                        print(f"  triples count: {len(triples_value)}")
                        if triples_value:
                            first_triple = triples_value[0]
                            for field in ("subject", "role", "lemma"):
                                print(f"  First triple - {field}: {first_triple.get(field)}")
                print()

def build_concept_mapping(triples_data):
    """Build a mapping from normalised concept name to its cleaned triples.

    Parameters:
    triples_data: dict of module name -> module content, where each module
                  may hold "definitions" whose "semantic_analysis" lists
                  "concepts" carrying a "name" and a "triples" value.

    Returns a defaultdict(list) keyed by the normalised concept name; each
    entry is a dict with the cleaned "triples", the "source_module" and the
    "original_name".  Counters describing what was kept or dropped are
    printed once the whole input has been processed.
    """
    concept_mapping = defaultdict(list)
    stats = defaultdict(int)

    for module_name, module_content in triples_data.items():
        for definition in module_content.get("definitions", []):
            semantic_analysis = definition.get("semantic_analysis", {})
            concepts = semantic_analysis.get("concepts", [])

            for concept in concepts:
                concept_name = concept.get("name")

                # Safely get triples field
                triples = concept.get("triples", [])

                # Handle triples field type issues
                if isinstance(triples, str):
                    try:
                        parsed = json.loads(triples)
                    except json.JSONDecodeError:
                        print(f"Warning: Unable to parse string triples: {triples[:100]}...")
                        triples = []
                        stats['invalid_string_triples'] += 1
                    else:
                        # A JSON string may decode to a number, dict or plain
                        # string; only a list is a usable collection of
                        # triples, anything else must not be iterated.
                        if isinstance(parsed, list):
                            triples = parsed
                            stats['string_triples_parsed'] += 1
                        else:
                            triples = []
                            stats['non_list_triples'] += 1

                elif not isinstance(triples, list):
                    triples = []
                    stats['non_list_triples'] += 1

                if concept_name and triples:
                    normalized_name = normalize_concept_name(concept_name)
                    if not normalized_name:
                        stats['invalid_concept_names'] += 1
                        continue

                    # Clean triples, dropping the ones clean_triple rejects
                    cleaned_triples = [
                        cleaned
                        for cleaned in map(clean_triple, triples)
                        if cleaned
                    ]

                    stats['original_triples'] += len(triples)
                    stats['cleaned_triples'] += len(cleaned_triples)
                    stats['filtered_triples'] += (len(triples) - len(cleaned_triples))

                    if cleaned_triples:
                        concept_mapping[normalized_name].append({
                            "triples": cleaned_triples,
                            "source_module": module_name,
                            "original_name": concept_name
                        })
                        stats['concepts_mapped'] += 1
                    else:
                        stats['concepts_no_valid_triples'] += 1

                elif concept_name:
                    stats['concepts_no_triples'] += 1
                else:
                    stats['concepts_no_name'] += 1

    print(f"Triples data mapping completed: {dict(stats)}")
    return concept_mapping

def safe_extend_triples(all_triples, data):
    """Append the triples stored in data["triples"] to all_triples in place.

    A list is appended directly.  A string is decoded as JSON and appended
    only when it decodes to a list.  A warning is printed for undecodable
    strings and for values of any other type.
    """
    raw = data.get("triples", [])

    if isinstance(raw, str):
        try:
            decoded = json.loads(raw)
        except json.JSONDecodeError:
            print("Warning: Unable to parse triples string")
            return
        # Decoded values that are not lists are silently ignored
        if isinstance(decoded, list):
            all_triples.extend(decoded)
        return

    if not isinstance(raw, list):
        print(f"Warning: Unknown triples type: {type(raw)}")
        return

    all_triples.extend(raw)

def merge_datasets_by_concept_name(embeddings_data, concept_mapping):
    """Attach cleaned triples to the embeddings concepts by exact name match.

    Every named concept in embeddings_data is normalised with
    normalize_concept_name and looked up in concept_mapping.  Matching
    concepts receive the concatenated triples of all mapping entries plus a
    "triples_metadata" record; the others receive an empty triple list and
    "no_match" metadata.  Concepts without a (valid) name are only counted.

    Parameters:
    embeddings_data: dict of module name -> module content; modified in place
    concept_mapping: mapping of normalised concept name -> list of entries
                     with "triples", "source_module" and "original_name"

    Returns a tuple (embeddings_data, stats, match_details) where stats is a
    defaultdict(int) of counters and match_details holds one record per
    concept that was looked up.
    """
    stats = defaultdict(int)
    match_details = []

    for module_name, module_content in embeddings_data.items():
        stats['modules_processed'] += 1

        for definition in module_content.get("definitions", []):
            concepts = definition.get("semantic_analysis", {}).get("concepts", [])

            for concept in concepts:
                concept_name = concept.get("name")
                stats['concepts_total'] += 1

                if not concept_name:
                    stats['concepts_no_name'] += 1
                    continue

                # Standardize concept name for matching
                normalized_name = normalize_concept_name(concept_name)
                if not normalized_name:
                    stats['invalid_target_names'] += 1
                    continue

                # Find matching triples
                if normalized_name in concept_mapping:
                    matched_data = concept_mapping[normalized_name]

                    # Merge all matching triples
                    all_triples = []
                    for data in matched_data:
                        safe_extend_triples(all_triples, data)

                    # Add to embeddings concept
                    concept["triples"] = all_triples

                    # Add matching metadata.  The unique values are sorted so
                    # the saved JSON does not depend on set iteration order,
                    # which varies between interpreter runs for strings.
                    concept["triples_metadata"] = {
                        "match_type": "exact_name",
                        "matched_count": len(matched_data),
                        "source_modules": sorted(
                            {data.get("source_module", "") for data in matched_data}
                        ),
                        "original_names": sorted(
                            {data.get("original_name", "") for data in matched_data}
                        ),
                        "cleaned_triples_count": len(all_triples)
                    }

                    stats['concepts_matched'] += 1
                    stats['triples_added'] += len(all_triples)

                    match_details.append({
                        'target_module': module_name,
                        'target_concept': concept_name,
                        'normalized_name': normalized_name,
                        'matched_count': len(matched_data),
                        'triples_count': len(all_triples),
                        'match_type': 'exact_name'
                    })

                else:
                    # No matching triples found
                    concept["triples"] = []
                    concept["triples_metadata"] = {
                        "match_type": "no_match",
                        "matched_count": 0,
                        "cleaned_triples_count": 0
                    }
                    stats['concepts_not_matched'] += 1

                    match_details.append({
                        'target_module': module_name,
                        'target_concept': concept_name,
                        'normalized_name': normalized_name,
                        'matched_count': 0,
                        'triples_count': 0,
                        'match_type': 'no_match'
                    })

    return embeddings_data, stats, match_details

def main():
    """Run the merge pipeline end to end.

    Loads the embeddings and triples JSON files, prints a structural sample
    of the triples data, builds the cleaned concept mapping, merges it into
    the embeddings data and writes the merged data and the per-concept match
    details to disk.  Any exception is reported with its traceback and not
    re-raised.
    """
    # Input / output locations
    output_file = "test.json"
    triples_file = "./informal_data_with_triples/merged_with_triples.json"
    embeddings_file = "../embedding_results/merged_with_embeddings_sentence-t5-large_cleaned.json"

    try:
        print("Loading data...")
        embeddings_data, triples_data = load_and_index_data(embeddings_file, triples_file)

        print("Debugging triples data structure...")
        debug_triples_structure(triples_data)

        print("Building concept mapping and cleaning triples...")
        concept_mapping = build_concept_mapping(triples_data)
        if not concept_mapping:
            # Nothing to merge: stop before touching any output file
            print("Warning: Concept mapping is empty, please check triples file structure")
            return

        print("Merging data by concept name...")
        merged_data, stats, match_details = merge_datasets_by_concept_name(embeddings_data, concept_mapping)

        # Persist the merged dataset
        with open(output_file, "w", encoding="utf-8") as out:
            json.dump(merged_data, out, indent=4, ensure_ascii=False)

        # Report the merge counters
        print("\n=== Merge Statistics ===")
        for label in stats:
            print(f"{label}: {stats[label]}")

        # Persist the per-concept match records
        with open("merge_match_details_cleaned.json", "w", encoding="utf-8") as out:
            json.dump(match_details, out, indent=2, ensure_ascii=False)

        print(f"\nMerge completed! Results saved to: {output_file}")

        # Report the cleaning counters.
        # NOTE(review): 'original_triples' / 'cleaned_triples' are counted
        # inside build_concept_mapping, whose stats are printed there but not
        # returned; the merge stats used here do not set these keys, so this
        # branch looks like it never runs.
        print("\n=== Cleaning Effect ===")
        if 'original_triples' in stats and 'cleaned_triples' in stats:
            original_total = stats['original_triples']
            cleaned_total = stats['cleaned_triples']
            print(f"Original triples count: {original_total}")
            print(f"Cleaned triples count: {cleaned_total}")
            print(f"Filtered triples count: {stats['filtered_triples']}")
            if original_total > 0:
                retention_rate = cleaned_total / original_total * 100
            else:
                retention_rate = 0
            print(f"Retention rate: {retention_rate:.1f}%")

        # Show a handful of merged concepts
        print("\n=== Example Merge Results ===")
        for _, module_content in list(merged_data.items())[:2]:
            for definition in module_content.get("definitions", [])[:1]:
                sample = definition.get("semantic_analysis", {}).get("concepts", [])[:2]
                for concept in sample:
                    print(f"Concept: {concept.get('name')}")
                    print(f"Triples count: {len(concept.get('triples', []))}")
                    for triple in (concept.get('triples') or [])[:3]:
                        print(f"  - {triple.get('subject')} {triple.get('role')} {triple.get('lemma')}")
                    print()

    except Exception as e:
        print(f"Error: {e}")
        import traceback
        traceback.print_exc()

# Run the merge pipeline only when this code is executed directly
if __name__ == "__main__":
    main()

In [None]:
import json

# Path of the merged dataset to preview
output_json = "test.json"

# Load the merged dataset
with open(output_json, "r", encoding="utf-8") as f:
    merged_data = json.load(f)

# Print the triples of the first five concepts that have any
print("\n=== Example triples output ===")
count = 0  # number of concepts printed so far
for module_name, module_content in merged_data.items():
    for definition in module_content.get("definitions", []):
        semantic = definition.get("semantic_analysis", {})
        for concept in semantic.get("concepts", []):
            triples = concept.get("triples", [])
            if triples:
                print(f"Concept: {concept.get('name')}")
                print("Triples:")
                for t in triples:
                    # Direct indexing: every triple must carry all three keys
                    print(f"  - subject: {t['subject']}, role: {t['role']}, lemma: {t['lemma']}")
                print()
                count += 1
                if count >= 5:
                    break
        # Propagate the break out of the definition loop
        if count >= 5:
            break
    # Propagate the break out of the module loop
    if count >= 5:
        break

## 2. Prepare Training Dataset

### 2.1 Extract Triples

In [None]:
import json
import os
import re
from collections import defaultdict

def clean_entity(entity):
    """Normalise a definiendum or lemma into an identifier-like token.

    Mathematical symbols and non-ASCII characters are dropped, spaces become
    underscores, and every character that is not an ASCII letter, digit or
    underscore is removed.

    Returns the cleaned string, or "" when the input is not a non-empty
    string or when nothing survives cleaning.
    """
    if not isinstance(entity, str) or not entity:
        return ""

    # Symbols that are stripped before any other processing
    banned = frozenset("+-*/=∑∏√∫<>∈∉{}[]()")

    # Single pass: discard math symbols and every non-ASCII character
    kept = [ch for ch in entity if ch not in banned and ord(ch) < 128]

    # Spaces become underscores, then only [A-Za-z0-9_] is allowed through
    underscored = "".join(kept).replace(" ", "_")
    return re.sub(r'[^A-Za-z0-9_]', '', underscored)

def extract_lemma(token):
    """Reduce a raw lemma token to a cleaned, crudely lemmatised form.

    The token is lower-cased, stripped of surrounding whitespace and trailing
    punctuation, passed through a few suffix rules ("ies" -> "y", drop
    "es" / "s" / "ing"), and finally normalised with clean_entity.

    Returns the cleaned lemma, or None when the token is not a non-empty
    string, is the abbreviation "i.e", or is empty or a single character
    (either before or after cleaning).
    """
    # Values loaded from JSON may be numbers, lists or dicts; only strings
    # can be lemmatised, anything else is treated as invalid.
    if not token or not isinstance(token, str):
        return None

    token = token.lower().strip()
    token = token.rstrip('.,;:!?')  # Remove ending punctuation

    # Basic lemmatization
    if token.endswith('ies'):
        token = token[:-3] + 'y'
    elif token.endswith('es') and len(token) > 3:
        token = token[:-2]
    elif token.endswith('s') and len(token) > 3:
        token = token[:-1]
    elif token.endswith('ing') and len(token) > 4:
        token = token[:-3]

    # Additional cleaning
    if len(token) == 1 or token == 'i.e':
        return None

    # Apply the same cleaning rules as definiendum
    token = clean_entity(token)

    # Re-check the length: cleaning can shrink a token such as "(x)" down to
    # a single letter (or to nothing), which must be discarded as well.
    if len(token) < 2:
        return None

    return token

def process_json_to_triples(json_file_path, output_file):
    """
    Extract triples from JSON file and save in specified format

    Each concept's "extracted_triples" entries are written as one line
    "concept<TAB>ROLE<TAB>lemma", where the concept is the cleaned,
    lower-cased concept name (the triple's own "subject" is not used), the
    role is upper-cased with hyphens replaced by underscores, and the lemma
    is cleaned with extract_lemma.  Concepts with an invalid name and triples
    that are not dicts or have an unusable role or lemma are skipped, so no
    line with an empty column is ever written.

    Parameters:
    json_file_path: JSON file path
    output_file: Output file path (overwritten)

    Returns:
    List of the tab-separated triple lines that were written
    """

    # Read JSON file
    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    extracted_triples = []

    # Iterate through all modules
    for module_content in data.values():
        # Iterate through all definitions
        for definition in module_content.get("definitions", []):
            concepts = definition.get("semantic_analysis", {}).get("concepts", [])

            # Iterate through all concepts
            for concept in concepts:
                # Clean concept name
                cleaned_concept = clean_entity(concept.get("name", ""))
                if not cleaned_concept:
                    continue

                # Use cleaned concept name as subject
                subject_clean = cleaned_concept.lower()

                # Process each triple (a null value is treated as "no triples")
                for triple in concept.get("extracted_triples", []) or []:
                    if not isinstance(triple, dict):
                        continue

                    # Format role (ensure uppercase); a missing or non-string
                    # role would otherwise produce an empty relation column
                    role = triple.get("role", "")
                    if not isinstance(role, str):
                        continue
                    role_clean = role.upper().replace("-", "_").strip()
                    if not role_clean:
                        continue

                    # Clean lemma using the same rules as definiendum
                    lemma_clean = extract_lemma(triple.get("lemma", ""))
                    if not lemma_clean:
                        continue

                    # Add to triple list
                    extracted_triples.append(f"{subject_clean}\t{role_clean}\t{lemma_clean}")

    triples_count = len(extracted_triples)

    # Write to output file
    with open(output_file, 'w', encoding='utf-8') as f:
        for triple in extracted_triples:
            f.write(triple + '\n')

    print(f"Extracted {triples_count} triples from {json_file_path}")
    print(f"Results saved to {output_file}")

    return extracted_triples

def process_json_folder(json_folder_path, output_file):
    """
    Process all JSON files in a folder

    Every "*.json" file in the folder is passed to process_json_to_triples
    and the triples of all files are written together to output_file.  A
    file that fails to process is reported and skipped.

    Parameters:
    json_folder_path: JSON folder path
    output_file: Output file path (overwritten)

    Returns:
    List of all extracted triple lines, or None when the folder contains no
    JSON files
    """

    # Get all JSON files (sorted so the output order is reproducible)
    json_files = sorted(f for f in os.listdir(json_folder_path) if f.endswith('.json'))

    if not json_files:
        print(f"No JSON files found in folder {json_folder_path}")
        return

    all_triples = []

    # Process each JSON file
    for json_file in json_files:
        json_path = os.path.join(json_folder_path, json_file)
        try:
            all_triples.extend(process_json_to_triples(json_path, output_file))
        except Exception as e:
            print(f"Error processing file {json_file}: {e}")

    # process_json_to_triples truncates output_file on every call, so
    # appending per file would leave only the last file's triples (twice).
    # Writing the combined list once at the end yields exactly one copy of
    # every triple.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(f"{triple}\n" for triple in all_triples)

    total_triples = len(all_triples)
    print(f"\nExtracted {total_triples} triples from {len(json_files)} JSON files")
    print(f"All results saved to {output_file}")

    # Statistics
    print_statistics(all_triples)

    return all_triples

def print_statistics(triples):
    """Print summary counts for a list of "concept<TAB>relation<TAB>lemma" lines.

    Lines that do not split into exactly three tab-separated fields are
    ignored by the per-field tallies but still count towards the total.
    Nothing is printed for an empty list.
    """
    if not triples:
        return

    # One tally per column, filled in column order
    columns = ("concept", "relation", "lemma")
    tallies = {column: defaultdict(int) for column in columns}

    for line in triples:
        fields = line.split('\t')
        if len(fields) != 3:
            continue
        for column, value in zip(columns, fields):
            tallies[column][value] += 1

    def ranked(tally, limit=None):
        """Entries ordered by descending count, optionally truncated."""
        ordered = sorted(tally.items(), key=lambda entry: entry[1], reverse=True)
        return ordered if limit is None else ordered[:limit]

    print("\n=== Statistics ===")
    print(f"Total triples: {len(triples)}")
    print(f"Unique concepts: {len(tallies['concept'])}")
    print(f"Unique lemmas: {len(tallies['lemma'])}")
    print(f"Unique relation types: {len(tallies['relation'])}")

    print("\nRelation type distribution:")
    for relation, count in ranked(tallies["relation"]):
        print(f"  {relation}: {count}")

    print("\nTop 10 most frequent concepts:")
    for concept, count in ranked(tallies["concept"], 10):
        print(f"  {concept}: {count}")

    print("\nTop 10 most frequent lemmas:")
    for lemma, count in ranked(tallies["lemma"], 10):
        print(f"  {lemma}: {count}")

def validate_triples(input_file):
    """
    Check every non-blank line of a triples file and print a summary

    A line is valid when it has exactly three tab-separated fields, the
    concept and lemma consist only of lowercase letters, digits and
    underscores, and the relation only of uppercase letters and underscores.
    Each invalid line is printed with its line number.
    """
    lower_token = r'^[a-z0-9_]+$'
    relation_token = r'^[A-Z_]+$'
    tally = {"valid": 0, "invalid": 0}

    with open(input_file, 'r', encoding='utf-8') as f:
        for line_num, raw in enumerate(f, 1):
            line = raw.strip()
            if not line:
                continue

            parts = line.split('\t')
            if len(parts) != 3:
                print(f"Invalid format at line {line_num}: {line}")
                tally["invalid"] += 1
                continue

            concept, relation, lemma = parts
            # Each field must be non-empty and match its character set
            checks = (
                (lower_token, concept),
                (relation_token, relation),
                (lower_token, lemma),
            )
            if all(value and re.match(pattern, value) for pattern, value in checks):
                tally["valid"] += 1
            else:
                print(f"Invalid content at line {line_num}: {line}")
                tally["invalid"] += 1

    print(f"\nValidation results for {input_file}:")
    print(f"Valid triples: {tally['valid']}")
    print(f"Invalid triples: {tally['invalid']}")
    print(f"Total lines: {tally['valid'] + tally['invalid']}")

# Usage example
# Extract triples from every JSON file in the folder, then validate the result
if __name__ == "__main__":
    # Process single JSON file
    # json_file_path = "path/to/your/file.json"
    # output_file = "my_dataset.txt"
    # process_json_to_triples(json_file_path, output_file)
    

    # Process entire folder of JSON files
    json_folder_path = "./informal_data_with_triples"  # Replace with your JSON folder path
    output_file = "../multi_relational_hyperbolic_embeddings/data/my_dataset_cleaned/my_dataset.txt"

    # Process all JSON files
    all_triples = process_json_folder(json_folder_path, output_file)
    
    # Validate the output file (checks the line format and allowed characters)
    validate_triples(output_file)

### 2.2 Cleaning the Data for Multi-Relation Models

In [None]:
import re
from collections import defaultdict
from nltk.corpus import stopwords
from nltk import download

# Download stopwords (required for the first run)
download('stopwords')
stop_words = set(stopwords.words('english'))

# Define regex for valid mathematical concepts: only letters, numbers, underscores allowed, with a minimum length of 2
# NOTE(review): this pattern is not referenced by the code visible in this cell
valid_definiendum_pattern = re.compile(r"^[A-Za-z0-9_]{2,}$")

# Common mathematical symbols; removed if present
math_symbols = set("+-*/=∑∏√∫<>∈∉{}[]()")

def clean_definiendum(definiendum):
    """Normalise a definiendum into an identifier-like token.

    Characters listed in the module-level math_symbols set and all non-ASCII
    characters are dropped, spaces become underscores, and anything that is
    not an ASCII letter, digit or underscore is removed.

    Returns the cleaned string, or "" when the input is not a non-empty
    string or when nothing survives cleaning.
    """
    if not isinstance(definiendum, str) or not definiendum:
        return ""

    # Single pass: discard math symbols and every non-ASCII character
    ascii_only = "".join(
        ch for ch in definiendum if ch not in math_symbols and ord(ch) < 128
    )

    # Spaces become underscores, then only [A-Za-z0-9_] is allowed through
    return re.sub(r'[^A-Za-z0-9_]', '', ascii_only.replace(" ", "_"))

def extract_lemma(token):
    """Lower-case a token, trim trailing punctuation and apply crude suffix rules.

    Returns the reduced token, or None when the result is a single character
    or the abbreviation "i.e".
    """
    word = token.lower().strip().rstrip('.,;:!?')
    size = len(word)

    # Basic lemmatization: the first matching suffix rule wins
    if word.endswith('ies'):
        word = word[:-3] + 'y'
    elif size > 3 and word.endswith('es'):
        word = word[:-2]
    elif size > 3 and word.endswith('s'):
        word = word[:-1]
    elif size > 4 and word.endswith('ing'):
        word = word[:-3]

    # Single letters and "i.e" carry no meaning and are discarded
    discard = len(word) == 1 or word == 'i.e'
    return None if discard else word


def clean_triples_file(input_file, output_file):
    """
    Clean triple file directly and show what was removed
    Format: concept\trelation\tlemma

    Each well-formed line is kept only if its concept survives
    clean_definiendum and its lemma survives extract_lemma and is not an
    English stopword.  Relations are upper-cased; when a relation contains a
    hyphen, everything up to the first hyphen (e.g. a "B-" / "I-" prefix) is
    dropped and remaining hyphens are removed.

    Parameters:
    input_file: path of the tab-separated triples file to read
    output_file: path the cleaned triples are written to (overwritten)

    Returns:
    Tuple (removed_triples, removed_data): the original triples that were
    dropped, and a dict recording the removal reasons plus the number of
    malformed input lines
    """

    # Track removed data
    removed_data = {
        'too_short': [],
        'stopwords': [],
        'special_chars_removed': [],
        'invalid_definiendum': [],
        'malformed_lines': 0
    }

    # Read all triples
    triples = []
    with open(input_file, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            parts = line.split('\t')
            if len(parts) != 3:
                print(f"Skip malformed line: {line}")
                removed_data['malformed_lines'] += 1
                continue
            triples.append(tuple(parts))

    # Cleaning rules
    def clean_lemma(lemma, original_lemma):
        """Clean lemma and track changes"""
        if not lemma:
            return None

        cleaned_lemma = extract_lemma(lemma)
        if cleaned_lemma is None:
            removed_data['too_short'].append((original_lemma, lemma))
            return None

        if cleaned_lemma in stop_words:
            removed_data['stopwords'].append((original_lemma, cleaned_lemma))
            return None

        return cleaned_lemma

    def clean_concept(concept):
        """Clean concept using the same rules as clean_definiendum"""
        cleaned_concept = clean_definiendum(concept)
        if not cleaned_concept:
            removed_data['invalid_definiendum'].append(concept)
        return cleaned_concept

    # Apply cleaning
    cleaned_triples = []
    removed_triples = []

    for concept, relation, lemma in triples:
        # Clean concept
        cleaned_concept = clean_concept(concept)
        if not cleaned_concept:
            removed_triples.append((concept, relation, lemma))
            continue

        # Clean lemma
        cleaned_lemma = clean_lemma(lemma, lemma)
        if not cleaned_lemma:
            removed_triples.append((concept, relation, lemma))
            continue

        # Format the relation label: merge B-/I- prefixed labels by dropping
        # the prefix, then convert to upper case
        if "-" in relation:
            relation = relation.split("-", 1)[1].replace("-", "").upper()
        else:
            relation = relation.upper()

        cleaned_triples.append((cleaned_concept.lower(), relation, cleaned_lemma))

    # Write cleaned file
    with open(output_file, 'w', encoding='utf-8') as f:
        for concept, relation, lemma in cleaned_triples:
            f.write(f"{concept}\t{relation}\t{lemma}\n")

    # Statistics and removal report
    # An input without any valid triple must not raise ZeroDivisionError
    retention_rate = len(cleaned_triples) / len(triples) * 100 if triples else 0.0
    print(f"Original triples count: {len(triples)}")
    print(f"Cleaned triples count: {len(cleaned_triples)}")
    print(f"Removed triples count: {len(removed_triples)}")
    print(f"Retention rate: {retention_rate:.1f}%")

    # Detailed removal report
    print("\n=== REMOVAL REPORT ===")

    # Removed triples
    if removed_triples:
        print(f"\nRemoved triples ({len(removed_triples)}):")
        for concept, relation, lemma in removed_triples[:10]:
            print(f"  {concept}\t{relation}\t{lemma}")
        if len(removed_triples) > 10:
            print(f"  ... and {len(removed_triples) - 10} more")

    # Invalid definienda
    if removed_data['invalid_definiendum']:
        print(f"\nInvalid definienda removed ({len(removed_data['invalid_definiendum'])}):")
        unique_invalid = set(removed_data['invalid_definiendum'])
        for concept in sorted(unique_invalid)[:10]:
            print(f"  '{concept}'")
        if len(unique_invalid) > 10:
            print(f"  ... and {len(unique_invalid) - 10} more")

    # Too short lemmas
    if removed_data['too_short']:
        print(f"\nToo short lemmas removed ({len(removed_data['too_short'])}):")
        unique_short = set(lemma for _, lemma in removed_data['too_short'])
        for lemma in sorted(unique_short)[:10]:
            print(f"  '{lemma}'")
        if len(unique_short) > 10:
            print(f"  ... and {len(unique_short) - 10} more")

    # Stopwords
    if removed_data['stopwords']:
        print(f"\nStopwords removed ({len(removed_data['stopwords'])}):")
        unique_stopwords = set(lemma for _, lemma in removed_data['stopwords'])
        for lemma in sorted(unique_stopwords):
            print(f"  '{lemma}'")

    # Summary by removal reason
    print(f"\n=== SUMMARY BY REMOVAL REASON ===")
    print(f"Total removed: {len(removed_triples)}")
    print(f"- Invalid definienda: {len(removed_data['invalid_definiendum'])}")
    print(f"- Too short lemmas: {len(removed_data['too_short'])}")
    print(f"- Stopwords: {len(removed_data['stopwords'])}")

    return removed_triples, removed_data

# Usage example
# Clean the extracted triples file and write the result as the training file
if __name__ == "__main__":
    input_file = "../multi_relational_hyperbolic_embeddings/data/my_dataset_cleaned/my_dataset.txt"
    cleaned_file = "../multi_relational_hyperbolic_embeddings/data/my_dataset_cleaned/train.txt"

    # Basic cleaning with preservation of mathematical functions
    print("=== Basic Cleaning with Mathematical Function Preservation ===")
    # Returns the dropped triples and the per-reason removal record
    removed_triples, removal_data = clean_triples_file(input_file, cleaned_file)

=== Basic Cleaning with Mathematical Function Preservation ===


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chouyinghan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
