In [27]:
import pandas as pd
csv_file = "lithuanian_words_1page.csv"
df = pd.read_csv(csv_file)
print("📝 Original Data Overview:")
print(df.info())

📝 Original Data Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Word             50 non-null     object
 1   Part of Speech   49 non-null     object
 2   Dark Text        50 non-null     object
 3   Description      50 non-null     object
 4   Additional Info  47 non-null     object
 5   Update Date      37 non-null     object
dtypes: object(6)
memory usage: 2.5+ KB
None


In [28]:
from IPython.display import display
display(df.head())

Unnamed: 0,Word,Part of Speech,Dark Text,Description,Additional Info,Update Date
0,a1,♦,a,"rasti, galva, kas; meta, gale | lapė, matė |...","raidė, vartojama žymėti užpakalinės eilės žemu...",Atnaujinta: 2024-07-19
1,a2,jst.,ã,"Ã, kai̇̃p šálta! Ã, prisi̇̀menu, prisi̇̀me...",(ir tęsiamai) vartojamas reiškiant nusistebėji...,Atnaujinta: 2021-10-27
2,a3,jst.,à,"À, kai̇̃p tù mán nusibódai. À, tiẽk tõ...","vartojamas reiškiant nekantrumą, nepasitenkini...",
3,a-,priešd.,a-,"aritmija, asimetrija, ateizmas, apolitiškas, ...","tarptautinė žodžio dalis, reiškianti ko neigim...",Atnaujinta: 2023-05-12
4,ą,,ą,"kąsti, šąla, ką; ranką, žąsis | „Ą, ą“ yrà ...","raidė, vartojama žymėti užpakalinės eilės žemu...",


In [None]:
# Updated Lithuanian-to-IPA mapping
lithuanian_to_ipa = {

    #vowels
    'a': 'a', 'ą': 'ãː', 
    'e': 'ɛ', 'ę': 'ɛ̃ː', 'ė': 'eː',
    'i': 'i', 'į': 'iː', 
    'o': 'o', 
    'u': 'u', 'ų': 'uː', 'ū': 'uː',

    #consonants
    'b': 'b', 
    'c': 't͡s', 'č': 't͡ʃ',
    'd': 'd', 
    'f': 'f', 'g': 'ɡ',
    'h': 'ɦ', 
    'j': 'j', 
    'k': 'k', 
    'l': 'l', 
    'm': 'm', 
    'n': 'n',
    'p': 'p', 
    'r': 'r', 
    's': 's', 'š': 'ʃ', 
    't': 't', 
    'v': 'v',
    'z': 'z', 'ž': 'ʒ',

    #diphthongs
    'uo': 'u̯o', 
    'ie': 'i̯ɛ', 
    'ai': 'ai̯', 
    'au': 'au̯', 
    'ei': 'ei̯', 
    'ui': 'ui̯'
}

# Updated vowels set
vowels = {
    # Short vowels
    'a', 'ɛ', 'i', 'o', 'u',
    # Long vowels (including nasalized forms)
    'aː', 'ãː', 'ɛː', 'ɛ̃ː', 'eː', 'iː', 'ĩː', 'oː', 'uː', 'ũː',
    # Diphthongs
    'u̯o', 'i̯ɛ', 'ai̯', 'au̯', 'ei̯', 'ui̯'
}


def transcribe_to_ipa(word):
    ipa = ''
    i = 0
    while i < len(word):
        if i < len(word) - 1 and word[i:i+2] in lithuanian_to_ipa:
            ipa += lithuanian_to_ipa[word[i:i+2]]
            i += 2
        else:
            ipa += lithuanian_to_ipa.get(word[i], word[i])
            i += 1
    return ipa




def syllabify_from_ipa(ipa):
    syllables = []
    current_syllable = ''
    
    i = 0
    while i < len(ipa):
        char = ipa[i]
        current_syllable += char
        
        # Look ahead for multi-character sequences (e.g., diphthongs, affricates)
        if char in {'t', 'd', 'ʃ', 'ʒ'} and i + 1 < len(ipa) and ipa[i+1] == '͡':
            current_syllable += ipa[i+1:i+3]  # Include tie bar and next char (e.g., t͡ʃ)
            i += 2
        
        # Check if we've hit a vowel/diphthong
        if any(v in current_syllable for v in vowels):
            # Look ahead to see if next segment starts with a vowel
            next_pos = i + 1
            next_segment = ''
            while next_pos < len(ipa) and ipa[next_pos] not in vowels:
                next_segment += ipa[next_pos]
                next_pos += 1
            if next_pos < len(ipa) and ipa[next_pos] in vowels:
                # Split before the next vowel, maximizing onset
                syllables.append(current_syllable)
                current_syllable = ''
        i += 1
    
    # Append the last syllable
    if current_syllable:
        syllables.append(current_syllable)
    
    # Join with syllable breaks
    return '-'.join(syllables)

In [30]:
import pyphen
def hyphenation(word):

    dic = pyphen.Pyphen(lang='lt', left=1, right=1)
    syllables = dic.inserted(word).split('-')

    return "-".join(syllables)

In [33]:

df["IPA notation"] = df["Dark Text"].apply(transcribe_to_ipa)
df['syllables from IPA'] = df['Dark Text'].apply(syllabify_from_ipa)
df["hyphenes"] = df["Dark Text"].apply(hyphenation)

In [34]:
from IPython.display import display
display(df[["Dark Text", "IPA notation", "syllables from IPA", "hyphenes"]])

Unnamed: 0,Dark Text,IPA notation,syllables from IPA,hyphenes
0,a,a,a,a
1,ã,ã,ã,ã
2,à,à,à,à
3,a-,a-,a-,a-
4,ą,ãː,ą,ą
5,abajà,abajà,a-ba-jà,a-ba-jà
6,Abakãnas,Abakãnas,Aba-ka-̃nas,A-ba-kãna-s
7,abãtas,abãtas,a-ba-̃tas,a-bãta-s
8,abãtė,abãteː,a-bãtė,a-bãtė
9,abatijà,abatijà,a-ba-tijà,a-ba-ti-jà
