In [1]:
import csv
import json
from collections import defaultdict

In [3]:
def process_csv(file_path):
    """Parse a phoneme-inventory CSV into per-language and per-phoneme lookups.

    The file must start with a header row that contains at least the columns
    ``Glottocode``, ``Phoneme``, ``Allophones``, ``tone``, ``stress`` and
    ``syllabic``.

    Args:
        file_path: Path of the CSV file to read. The file is decoded as
            UTF-8; a leading byte-order mark, if present, is discarded so it
            cannot become part of the first column name.

    Returns:
        A 3-tuple ``(languages, phoneme_features, prosodic_features)``:

        * ``languages`` -- defaultdict mapping each glottocode to
          ``{"phonemes": set, "allophones": dict}``. ``allophones`` maps a
          phoneme to the whitespace-split content of its ``Allophones`` cell
          and has no entry when that cell is exactly ``"NA"``.
        * ``phoneme_features`` -- dict mapping each phoneme to a dict of
          every column that is not one of the metadata columns listed below.
          It is keyed by phoneme only, so when the same phoneme occurs in
          several rows the last row read wins.
        * ``prosodic_features`` -- defaultdict mapping glottocode -> phoneme
          -> ``{"tone": ..., "stress": ..., "syllabic": ...}``.

    Raises:
        KeyError: If one of the required columns is missing from the header.
        OSError: If the file cannot be opened.
    """
    # Columns that describe the inventory entry rather than the phoneme's
    # feature values. Built once, outside the row loop, as a set for O(1)
    # membership tests.
    metadata_columns = frozenset([
        'InventoryID', 'Glottocode', 'ISO639-3', 'LanguageName',
        'SpecificDialect', 'GlyphID', 'Phoneme', 'Allophones',
        'MarginalSegment', 'Class', 'Source',
    ])

    languages = defaultdict(lambda: {"phonemes": set(), "allophones": {}})
    phoneme_features = {}
    prosodic_features = defaultdict(dict)

    # 'utf-8-sig' reads plain UTF-8 unchanged but strips a BOM; newline='' lets
    # the csv module do its own line-ending handling, as its docs require.
    with open(file_path, 'r', encoding='utf-8-sig', newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            glottocode = row['Glottocode']
            phoneme = row['Phoneme']
            allophones = row['Allophones']

            # Always add the phoneme to the language's phoneme set
            languages[glottocode]["phonemes"].add(phoneme)

            # Only add to allophones if it's not 'NA'
            if allophones != "NA":
                languages[glottocode]["allophones"][phoneme] = allophones.split()

            # Phoneme features: everything that is not inventory metadata
            phoneme_features[phoneme] = {
                key: value for key, value in row.items()
                if key not in metadata_columns
            }

            # Prosodic features (phoneme-specific for each language)
            prosodic_features[glottocode][phoneme] = {
                "tone": row['tone'],
                "stress": row['stress'],
                "syllabic": row['syllabic']
            }

    return languages, phoneme_features, prosodic_features

In [4]:
def save_json(data, filename):
    """Serialize ``data`` as JSON into the file ``filename``.

    The file is created or overwritten and written as UTF-8. Non-ASCII
    characters are written literally rather than as ``\\uXXXX`` escapes, and
    the output is pretty-printed with a two-space indent.

    Args:
        data: Any object ``json.dump`` can serialize.
        filename: Path of the output file.
    """
    with open(filename, mode='w', encoding='utf-8') as json_file:
        json.dump(
            data,
            json_file,
            indent=2,
            ensure_ascii=False,
        )

In [5]:
# Process the CSV file
csv_file_path = 'phonemes.csv'  # Replace with your actual CSV file path
languages, phoneme_features, prosodic_features = process_csv(csv_file_path)

# Prepare data for JSON files.
# Sets are not JSON-serializable and have no stable iteration order, so each
# phoneme set is written as a sorted list; this keeps language_phonemes.json
# identical from one run to the next.
language_phonemes = {
    glottocode: sorted(data["phonemes"])
    for glottocode, data in languages.items()
}
# Copy the nested allophone mappings into plain dicts/lists for serialization.
language_allophones = {
    glottocode: {
        phoneme: list(allophones)
        for phoneme, allophones in data["allophones"].items()
    }
    for glottocode, data in languages.items()
}

# Save JSON files
save_json(language_phonemes, 'language_phonemes.json')
save_json(phoneme_features, 'phoneme_features.json')
save_json(prosodic_features, 'prosodic_features.json')
save_json(language_allophones, 'language_allophones.json')