In [1]:
import unicodedata

def classify_character(char):
    """Classify a single character into a coarse script/category label.

    Checks run in this order; the first match wins:

    1. Hiragana block (U+3040-U+309F)            -> "Japanese Hiragana"
    2. Katakana blocks (full-width, phonetic
       extensions, halfwidth forms)              -> "Japanese Katakana"
    3. Han ideograph blocks (basic block plus
       extensions and compatibility ideographs)  -> "Chinese Character"
    4. Unicode general category starting with L  -> "Alphabet"
    5. Unicode general category starting with P  -> "Punctuation"
    6. Anything else (digits, symbols, ...)      -> "Miscellaneous"

    Because the script checks are block-range based, punctuation that lives
    inside a kana block (e.g. U+30FB KATAKANA MIDDLE DOT) is reported under
    that script rather than as "Punctuation".

    Args:
        char: A one-character string.

    Returns:
        One of the six label strings listed above.

    Raises:
        TypeError: If ``char`` is not a single-character string (raised by
            ``unicodedata.category``).
    """
    # Looked up before the range checks so that anything other than a
    # one-character string fails immediately with TypeError.
    category = unicodedata.category(char)

    if '\u3040' <= char <= '\u309F':
        return "Japanese Hiragana"

    if ('\u30A0' <= char <= '\u30FF'            # Katakana
            or '\u31F0' <= char <= '\u31FF'     # Katakana Phonetic Extensions
            or '\uFF66' <= char <= '\uFF9F'):   # Halfwidth Katakana forms
        return "Japanese Katakana"

    # Han ideographs are spread over several blocks. Testing only the basic
    # block would let the others (general category Lo) fall through to
    # "Alphabet".
    if ('\u4E00' <= char <= '\u9FFF'                    # CJK Unified Ideographs
            or '\u3400' <= char <= '\u4DBF'             # Extension A
            or '\uF900' <= char <= '\uFAFF'             # CJK Compatibility Ideographs
            or '\U00020000' <= char <= '\U0002FA1F'     # Extensions B-F, I and Compatibility Supplement
            or '\U00030000' <= char <= '\U000323AF'):   # Extensions G-H
        return "Chinese Character"

    if category.startswith('L'):
        return "Alphabet"
    if category.startswith('P'):
        return "Punctuation"
    return "Miscellaneous"

def classify_text(text):
    """Group the non-whitespace characters of ``text`` by category label.

    Each character for which ``str.isspace()`` is false is passed to
    ``classify_character`` and appended to the list stored under the label
    it returns.

    Args:
        text: The string to analyse.

    Returns:
        A dict mapping each label to the list of characters that received
        it. Characters keep their order of appearance and duplicates are
        retained; keys appear in the order their first character was seen.
    """
    grouped = {}
    for ch in text:
        if not ch.isspace():
            # setdefault creates the list the first time a label shows up.
            grouped.setdefault(classify_character(ch), []).append(ch)
    return grouped

# Example usage: classify a mixed Japanese/English sample string.
sample_text = "こんにちは、世界！Hello, ワールド！123"
classification = classify_text(sample_text)

# One line per category, in the order each category was first encountered.
for category in classification:
    chars = classification[category]
    print(f"{category}: {''.join(chars)}")

Japanese Hiragana: こんにちは
Punctuation: 、！,！
Chinese Character: 世界
Alphabet: Hello
Japanese Katakana: ワールド
Miscellaneous: 123


In [7]:
import os
import csv
import unicodedata
from collections import defaultdict

def classify_character(char):
    """Classify a character into one of six coarse script/category labels.

    Checks run in this order; the first match wins:

    1. Hiragana block (U+3040-U+309F)            -> "Japanese Hiragana"
    2. Katakana blocks (full-width, phonetic
       extensions, halfwidth forms)              -> "Japanese Katakana"
    3. Han ideograph blocks (basic block plus
       extensions and compatibility ideographs)  -> "Chinese Character"
    4. ASCII letters (A-Z, a-z)                  -> "English Alphabet"
    5. Unicode general category starting with P  -> "Punctuation"
    6. Anything else (digits, symbols, whitespace,
       non-ASCII letters, ...)                   -> "Miscellaneous"

    Because the script checks are block-range based, punctuation that lives
    inside a kana block (e.g. U+30FB KATAKANA MIDDLE DOT) is reported under
    that script rather than as "Punctuation".

    Args:
        char: The character to classify; intended to be a one-character
            string.

    Returns:
        One of the six label strings listed above.
    """
    if '\u3040' <= char <= '\u309F':
        return "Japanese Hiragana"

    if ('\u30A0' <= char <= '\u30FF'            # Katakana
            or '\u31F0' <= char <= '\u31FF'     # Katakana Phonetic Extensions
            or '\uFF66' <= char <= '\uFF9F'):   # Halfwidth Katakana forms
        return "Japanese Katakana"

    # Han ideographs are spread over several blocks. Testing only the basic
    # block would drop the others into "Miscellaneous".
    if ('\u4E00' <= char <= '\u9FFF'                    # CJK Unified Ideographs
            or '\u3400' <= char <= '\u4DBF'             # Extension A
            or '\uF900' <= char <= '\uFAFF'             # CJK Compatibility Ideographs
            or '\U00020000' <= char <= '\U0002FA1F'     # Extensions B-F, I and Compatibility Supplement
            or '\U00030000' <= char <= '\U000323AF'):   # Extensions G-H
        return "Chinese Character"

    if char.isascii() and char.isalpha():
        return "English Alphabet"
    if unicodedata.category(char).startswith('P'):
        return "Punctuation"
    return "Miscellaneous"

def process_files(input_dir, output_dir):
    """Record, per character category, which numbered CSV files each character occurs in.

    Every ``*.csv`` file directly inside ``input_dir`` is read with
    ``csv.DictReader``. The file's number is the integer before the first
    ``.`` in its name (``12.csv`` -> 12). Each character of each row's
    ``token`` column is classified with ``classify_character`` and the file
    number is added to that character's set.

    One CSV per category is then written to ``output_dir``, named after the
    lower-cased category with spaces replaced by underscores (for example
    ``japanese_hiragana.csv``). Each file has the header
    ``Character,File Numbers`` and one row per character, whose second
    column is the sorted file numbers formatted as ``{1, 2, 5}``. A file is
    written for each of the six predefined categories even when it has no
    characters.

    Args:
        input_dir: Directory containing the numbered input CSV files.
        output_dir: Directory for the result files; created if missing.

    Returns:
        None. Results are written to disk and a completion message is printed.
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Pre-create the known categories so each one gets an output file even
    # when no character falls into it.
    buckets = {
        name: defaultdict(set)
        for name in (
            "Japanese Hiragana",
            "Japanese Katakana",
            "Chinese Character",
            "English Alphabet",
            "Punctuation",
            "Miscellaneous",
        )
    }

    # os.listdir returns entries in arbitrary order. Sorting keeps the
    # first-seen order of characters, and so the row order of the output
    # files, stable between runs.
    for filename in sorted(os.listdir(input_dir)):
        if not filename.endswith('.csv'):
            continue
        try:
            file_number = int(filename.split('.')[0])
        except ValueError:
            # A stray, non-numbered CSV should not abort the whole run.
            print(f"Skipping '{filename}': name does not start with an integer file number.")
            continue

        # newline='' lets the csv module handle line endings itself, so
        # newlines embedded in quoted fields are parsed correctly.
        with open(os.path.join(input_dir, filename), 'r', newline='', encoding='utf-8') as file:
            for row in csv.DictReader(file):
                # DictReader fills missing trailing fields with None, so a
                # short row yields token=None rather than the .get default.
                token = row.get('token') or ''
                for char in token:
                    category = classify_character(char)
                    bucket = buckets.get(category)
                    if bucket is None:
                        # Label outside the predefined set: still record it.
                        bucket = buckets[category] = defaultdict(set)
                    bucket[char].add(file_number)

    # Save results
    for category, char_dict in buckets.items():
        output_file = os.path.join(output_dir, f"{category.lower().replace(' ', '_')}.csv")
        with open(output_file, 'w', newline='', encoding='utf-8') as file:
            csv_writer = csv.writer(file)
            csv_writer.writerow(['Character', 'File Numbers'])
            for char, file_numbers in char_dict.items():
                numbers = ', '.join(str(num) for num in sorted(file_numbers))
                csv_writer.writerow([char, '{' + numbers + '}'])

    # Report the directory that was actually written to.
    print(f"Processing complete. Results saved in the '{output_dir}' directory.")

# Main execution: run the classification over the stage-4 processed data.
if __name__ == "__main__":
    input_directory, output_directory = (
        '../data/stage_4_processed',
        'character_classification_results',
    )
    process_files(input_dir=input_directory, output_dir=output_directory)

Processing complete. Results saved in the 'character_classification_results' directory.
