In [1]:
import pandas as pd
import numpy as np
import re
import os

In [39]:
def clean_text(text):
    return re.sub(r'\(\d\)', '', text).strip()

def parse_txt(file):
    data = []
    with open(file, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        
    current_entry = {}
    current_tag = None
    current_value = []

    for line in lines:
        line = line.strip()
        if line.startswith('\\lx'):
            if current_entry:
                if current_tag:
                    current_entry[current_tag] = ' '.join(current_value).strip()
                data.append(current_entry)
            current_entry = {'lex_isc': clean_text(line.strip('\\lx').strip())}
            current_tag = None
            current_value = []
        elif line.startswith('\\lc'):
            current_entry['lex_citation'] = clean_text(line.strip('\\lc').strip())
            current_tag = None
            current_value = []
        elif line.startswith('\\ps'):
            current_entry['pos'] = line.strip('\\ps').strip()
            current_tag = None
        elif line.startswith('\\gn'):
            if current_tag:
                current_entry[current_tag] = ' '.join(current_value).strip()
            current_tag = 'gloss_es'
            current_value = [line.strip('\\gn').strip()]
        elif line.startswith('\\rn') and 'gloss_es' not in current_entry:
            if current_tag:
                current_entry[current_tag] = ' '.join(current_value).strip()
            current_tag = 'gloss_es'
            current_value = [line.strip('\\rn').strip()]
        elif line.startswith('\\dn'):
            if current_tag:
                current_entry[current_tag] = ' '.join(current_value).strip()
            current_tag = 'def_es'
            current_value = [line.strip('\\dn').strip()]
        elif line.startswith('\\'):
            if current_tag:
                current_entry[current_tag] = ' '.join(current_value).strip()
            current_tag = None
        else:
            if current_tag:
                current_value.append(line)

    if current_entry:
        if current_tag:
            current_entry[current_tag] = ' '.join(current_value).strip()
        data.append(current_entry)
    
    df = pd.DataFrame(data, columns=['lex_isc', 'lex_citation', 'pos', 'gloss_es', 'def_es'])
    return df

In [40]:
file_path = 'data/diccionario.txt'
df = parse_txt(file_path)
df

Unnamed: 0,lex_isc,lex_citation,pos,gloss_es,def_es
0,a,,pron.,él ; ella,"pronombre de tercera persona singular: él, ella."
1,abebi choroin,,v.intr.,correr (con otro),"correr con otro, como en una carrera."
2,aben,,n.,mono capuchino (grande) o mono martí,variedad más grande de mono capuchino o martín...
3,aben ino,,n.,puma o puma,felino de gran tamaño y un solo color sin manc...
4,abo,,pron.,ellos ; ellas,"pronombre de tercera persona plural: ellos, el..."
...,...,...,...,...,...
1709,yunpa,,n.,iguana,lagartija muy grande. Anoles Fam. Polychrotidae
1710,yunpa,,n.,iguana,iguana de gran tamaño y de color verde.
1711,yunsha,,n.,shuyo,Pez similar al huasaco pero de menor tamaño y ...
1712,yushin meshti,,n.,mántido o manta,"insecto de cuerpo delgado, de color verde y pa..."
