In [116]:
from pathlib import Path
import pandas as pd
import re
from tmp import find_words
import html
from collections import Counter

In [2]:
# every dictionary CSV sits exactly one directory below the notebook and has a
# file name beginning with "[" (the bracketed code prefix seen in the listing)
paths = list(Path('.').glob('*/[*.csv'))
paths.sort()

# # ignore the pinyin for now
# paths = [path for path in paths if not ('english-' in path.stem.lower() or '-english' in path.stem.lower())]

paths

[WindowsPath('bhanot/[zsm] Malay-English.csv'),
 WindowsPath('cedict/[---] Pinyin-Chinese_and_English.csv'),
 WindowsPath('cedict/[zho] Chinese_Simplified-English.csv'),
 WindowsPath('cedict/[zho] Chinese_Traditional-English.csv'),
 WindowsPath('dictsinfo-omegawiki/[---] kolsch-english.csv'),
 WindowsPath('dictsinfo-omegawiki/[---] volapuk-english.csv'),
 WindowsPath('dictsinfo-omegawiki/[afr] afrikaans-english.csv'),
 WindowsPath('dictsinfo-omegawiki/[ara] arabic-english.csv'),
 WindowsPath('dictsinfo-omegawiki/[ast] asturian-english.csv'),
 WindowsPath('dictsinfo-omegawiki/[bel] belarusian-english.csv'),
 WindowsPath('dictsinfo-omegawiki/[ben] bengali-english.csv'),
 WindowsPath('dictsinfo-omegawiki/[bre] breton-english.csv'),
 WindowsPath('dictsinfo-omegawiki/[bul] bulgarian-english.csv'),
 WindowsPath('dictsinfo-omegawiki/[cat] catalan-english.csv'),
 WindowsPath('dictsinfo-omegawiki/[ces] czech-english.csv'),
 WindowsPath('dictsinfo-omegawiki/[cym] welsh-english.csv'),
 WindowsPat

In [3]:
def has_text(text):
    """Return True when ``find_words(text)`` yields at least one item, else False.

    Only the first item is ever requested, so the rest of the iterable is not
    consumed.
    """
    return any(True for _ in find_words(text))

def has_digit(text):
    """Return True when ``text`` contains at least one of the ASCII digits 0-9."""
    # Only the ten ASCII digits are checked; the alternative below would also
    # accept every other character for which str.isdigit() is true.
    # return any(char.isdigit() for char in text)
    for digit in '1234567890':
        if digit in text:
            return True
    return False

def norm_text(text):
    """Build an order-independent key from the words that ``find_words`` extracts.

    The words are de-duplicated, sorted, joined with single spaces, and the
    joined string is case-folded.

    NOTE(review): case-folding happens after de-duplication and sorting, so
    words differing only in case are not merged unless ``find_words`` already
    normalises case -- confirm this is intended.
    """
    unique_words = set(find_words(text))
    joined = ' '.join(sorted(unique_words))
    return joined.casefold()

In [4]:

# all tag texts, including deprecated tags, compiled into one huge (but fast) regex
_PATTERN_TAG = '(?:(?:!doctype|a(?:bbr|cronym|ddress|pplet|r(?:ea|ticle)|side|udio)?|b(?:ase(?:font)?|d[io]|ig|' \
               'l(?:ink|ockquote)|ody|r|utton)?|c(?:a(?:nvas|ption)|enter|ite|o(?:de|l(?:group)?))|d(?:ata(?:li' \
               'st)?|d|e(?:l|tails)|fn|i(?:alog|r|v)|l|t)|em(?:bed)?|f(?:i(?:eldset|gure)|o(?:nt|oter|rm)|rame(' \
               '?:set)?)|h(?:1|2|3|4|5|6|ead(?:er)?|group|r|tml)|i(?:frame|mg|n(?:put|s)|sindex)?|k(?:bd|eygen)' \
               '|l(?:abel|egend|i(?:nk)?)|m(?:a(?:in|p|r(?:k|quee))|e(?:nu(?:item)?|t(?:a|er)))|n(?:av|o(?:fram' \
               'es|script))|o(?:bject|l|pt(?:group|ion)|utput)|p(?:aram|laintext|r(?:e|ogress))?|q|r(?:b|p|tc?|' \
               'uby)|s(?:amp|cript|e(?:ction|lect)|mall|ource|pan|t(?:r(?:ike|ong)|yle)|u(?:b|mmary|p))?|t(?:ab' \
               'le|body|d|e(?:mplate|xtarea)|foot|h(?:ead)?|i(?:me|tle)|r(?:ack)?|t)|ul?|v(?:ar|ideo)|wbr))'

# Comments and scripts are matched lazily (`.*?`) so that each one ends at its
# own terminator; a greedy `.*` would also swallow any real text sitting
# between two separate comments/scripts. re.S lets them span several lines.
RE_COMMENT = re.compile(r'(?:<!--(?P<comment>.*?)-->)', flags=re.I | re.S | re.U)
RE_SCRIPT = re.compile(r'(?:<script(?:\s+[^<>]*)?>.*?</script\s*>)', flags=re.I | re.S | re.U)
# opening, closing or self-closing form of any tag name listed in _PATTERN_TAG
RE_TAG = re.compile(fr'(?:</?{_PATTERN_TAG}(?:\s+[^<>]*)?/?>)', flags=re.I | re.U)


def remove_html_tags(text: str, replacement: str = ' ') -> str:
    """Strip HTML comments, whole ``<script>`` elements and known HTML tags.

    Only tag names listed in ``_PATTERN_TAG`` are treated as tags; any other
    text in angle brackets is left untouched.

    :param text: the string to clean
    :param replacement: string substituted for every removed span
    :return: ``text`` with each comment, script element and tag replaced
    """
    text = RE_COMMENT.sub(replacement, text)  # remove all comments first since they could contain half a script
    text = RE_SCRIPT.sub(replacement, text)  # remove entire script, not just the tag
    text = RE_TAG.sub(replacement, text)
    return text

In [64]:
def html_unescape(text):
    """Decode ``<charset c=T>XXXX;...</charset>`` blocks, then HTML entities.

    Each ``XXXX;`` inside a charset block is a 4-digit hexadecimal code point
    and is replaced by the corresponding character. The result is then passed
    through ``html.unescape``.
    """
    def decode_charset(match):
        # group 1 is the run of "XXXX;" items; the trailing ';' leaves an empty piece to skip
        hex_codes = match.group(1).split(';')
        return ''.join(chr(int(code, 16)) for code in hex_codes if code)

    decoded = re.sub(r'<charset c=T>((?:[0-9a-fA-F]{4};)+)</charset>', decode_charset, text)
    return html.unescape(decoded)

In [99]:
tags = set()
for path in paths:
    print(path)

    # keep_default_na=False keeps entries such as "null" as strings instead of NaN
    df = pd.read_csv(path, encoding='utf8', dtype='object', keep_default_na=False)

    # collect (not remove) every 'Other' value that is one parenthesised chunk, e.g. "(adj.)"
    tags.update(df.loc[df['Other'].str.contains(r'^\([^()]*\)$'), 'Other'].unique())

# drop the surrounding parentheses, casefold, trim whitespace and trailing dots, then de-duplicate
tags = sorted({tag[1:-1].casefold().strip().rstrip('.') for tag in tags})
tags

bhanot\[zsm] Malay-English.csv
cedict\[---] Pinyin-Chinese_and_English.csv
cedict\[zho] Chinese_Simplified-English.csv
cedict\[zho] Chinese_Traditional-English.csv
dictsinfo-omegawiki\[---] kolsch-english.csv
dictsinfo-omegawiki\[---] volapuk-english.csv
dictsinfo-omegawiki\[afr] afrikaans-english.csv
dictsinfo-omegawiki\[ara] arabic-english.csv
dictsinfo-omegawiki\[ast] asturian-english.csv
dictsinfo-omegawiki\[bel] belarusian-english.csv
dictsinfo-omegawiki\[ben] bengali-english.csv
dictsinfo-omegawiki\[bre] breton-english.csv
dictsinfo-omegawiki\[bul] bulgarian-english.csv
dictsinfo-omegawiki\[cat] catalan-english.csv
dictsinfo-omegawiki\[ces] czech-english.csv
dictsinfo-omegawiki\[cym] welsh-english.csv
dictsinfo-omegawiki\[dan] danish-english.csv
dictsinfo-omegawiki\[deu] german-english.csv
dictsinfo-omegawiki\[ell] greek-english.csv
dictsinfo-omegawiki\[epo] esperanto-english.csv
dictsinfo-omegawiki\[est] estonian-english.csv
dictsinfo-omegawiki\[eus] basque-english.csv
dictsinfo

dictsinfo-wikipedia\[spa] spanish-english.csv
dictsinfo-wikipedia\[srp] serbian-english.csv
dictsinfo-wikipedia\[swa] swahili-english.csv
dictsinfo-wikipedia\[swe] swedish-english.csv
dictsinfo-wikipedia\[tam] tamil-english.csv
dictsinfo-wikipedia\[tel] telugu-english.csv
dictsinfo-wikipedia\[tgl] tagalog-english.csv
dictsinfo-wikipedia\[tha] thai-english.csv
dictsinfo-wikipedia\[tur] turkish-english.csv
dictsinfo-wikipedia\[ukr] ukrainian-english.csv
dictsinfo-wikipedia\[urd] urdu-english.csv
dictsinfo-wikipedia\[vie] vietnamese-english.csv
dictsinfo-wikipedia\[wln] walloon-english.csv
dictsinfo-wikipedia\[yid] yiddish-english.csv
dictsinfo-wikipedia\[zho] chinese-english.csv
dictsinfo-wikipedia\[zho] chinese_traditional-english.csv
freedict\[afr] Afrikaans-English.csv
freedict\[afr] English-Afrikaans.csv
freedict\[ara] Arabic-English.csv
freedict\[ara] English-Arabic.csv
freedict\[bul] English-Bulgarian.csv
freedict\[ces] Czech-English.csv
freedict\[ces] English-Czech.csv
freedict\[c

freelang\[---] Tujia-English.csv
freelang\[---] Tutchone_(Northern)-English.csv
freelang\[---] Tutchone_(Southern)-English.csv
freelang\[---] Ubykh-English.csv
freelang\[---] Udi-English.csv
freelang\[---] Venetian-English.csv
freelang\[---] Voro-English.csv
freelang\[---] West_Yugur-English.csv
freelang\[---] Xavante-English.csv
freelang\[---] Yugh-English.csv
freelang\[---] Yugur-English.csv
freelang\[---] Yukaghir-English.csv
freelang\[aar] Afar-English.csv
freelang\[aar] English-Afar.csv
freelang\[abk] Abkhaz-English.csv
freelang\[abk] English-Abkhaz.csv
freelang\[ady] Adyghe-English.csv
freelang\[ady] English-Adyghe.csv
freelang\[afr] Afrikaans-English.csv
freelang\[afr] English-Afrikaans.csv
freelang\[ain] Ainu-English.csv
freelang\[ain] English-Ainu.csv
freelang\[ang] English-Old_English.csv
freelang\[ang] Old_English-English.csv
freelang\[ara] Arabic-English.csv
freelang\[ara] English-Arabic.csv
freelang\[arn] English-Mapuche.csv
freelang\[arn] Mapuche-English.csv
freelang\[bak

freelang\[por] English-Brazilian_Portuguese.csv
freelang\[por] English-Portuguese.csv
freelang\[por] Portuguese-English.csv
freelang\[pus] English-Pashto.csv
freelang\[pus] Pashto-English.csv
freelang\[que] Bolivian_Quechua-English.csv
freelang\[que] English-Bolivian_Quechua.csv
freelang\[que] English-Quechua_of_Cuzco.csv
freelang\[que] Quechua_of_Cuzco-English.csv
freelang\[raj] English-Rajasthani.csv
freelang\[raj] Rajasthani-English.csv
freelang\[rar] English-Rarotongan.csv
freelang\[rar] Rarotongan-English.csv
freelang\[rom] English-Romani.csv
freelang\[rom] Romani-English.csv
freelang\[ron] English-Romanian.csv
freelang\[ron] Romanian-English.csv
freelang\[rup] Aromanian-English.csv
freelang\[rup] English-Aromanian.csv
freelang\[rus] English-Russian.csv
freelang\[rus] Russian-English.csv
freelang\[sah] English-Yakut.csv
freelang\[sah] Yakut-English.csv
freelang\[san] English-Sanskrit.csv
freelang\[san] Sanskrit-English.csv
freelang\[sel] English-Selkup.csv
freelang\[sel] Selkup-En

mantuboro\[mar] Marathi-English.csv
mantuboro\[mon] English-Mongolian.csv
mantuboro\[mon] Mongolian-English.csv
mantuboro\[msa] English-Malay.csv
mantuboro\[msa] Malay-English.csv
mantuboro\[mya] Burmese-English.csv
mantuboro\[mya] English-Burmese.csv
mantuboro\[nep] English-Nepali.csv
mantuboro\[nep] Nepali-English.csv
mantuboro\[nld] Dutch-English.csv
mantuboro\[nld] English-Dutch.csv
mantuboro\[nor] English-Norwegian.csv
mantuboro\[nor] Norwegian-English.csv
mantuboro\[pan] English-Punjabi.csv
mantuboro\[pan] Punjabi-English.csv
mantuboro\[pol] English-Polish.csv
mantuboro\[pol] Polish-English.csv
mantuboro\[por] English-Portuguese.csv
mantuboro\[por] Portuguese-English.csv
mantuboro\[pus] English-Pashto.csv
mantuboro\[pus] Pashto-English.csv
mantuboro\[ron] English-Romanian.csv
mantuboro\[ron] Romanian-English.csv
mantuboro\[rus] English-Russian.csv
mantuboro\[rus] Russian-English.csv
mantuboro\[slk] English-Slovak.csv
mantuboro\[slk] Slovak-English.csv
mantuboro\[spa] English-Span

['"',
 '"up" ile',
 "'",
 "'s",
 '+',
 '-laid',
 '...!',
 "...'",
 '...0',
 '...0x',
 '...1',
 '...2',
 '...3',
 '...3x',
 '...=',
 '...d',
 '...i',
 '...p',
 '...x',
 '1/100 albaania leki',
 '28. detsember',
 '<',
 '= i had, i would',
 '= is not',
 '= monosodium glutamate',
 '= not the, neverthe',
 '= shall not',
 '= that will',
 '= we will',
 '>',
 '> fling',
 '?',
 'a',
 'a.b.d',
 'ab',
 'abd',
 'adj',
 'administer =',
 'adv',
 'ain-c',
 'ain-ce',
 'ain-cn',
 'ain-cs',
 'ain-cw',
 'ain-e',
 'ain-ec',
 'ain-en',
 'ain-es',
 'ain-n',
 'ain-nc',
 'ain-nn',
 'ain-ns',
 'ain-s',
 'ain-w',
 'akvaariumikala',
 'al',
 'alay',
 'alt',
 'am',
 'an',
 'an intermation of the vermiform appendix',
 'analysis =',
 'anaph',
 'anat',
 'ant',
 'apanage =',
 'apothecary =',
 'ar',
 'argo',
 'ark',
 'ask',
 'assim',
 'astr',
 'attrib',
 'australien-new zealand',
 'aşağ',
 'b. h',
 'b.h',
 'baglaç',
 'bak',
 'bazen ingilizcede what kelimesi ile baslayan cümlecikler türkçe cümlede fiil içinde belirlenir.

In [118]:
# with open('tmp.txt', 'w', encoding='utf8') as f:
#     for tag in tags:
#         f.write(tag + '\n')

In [128]:
# An optional entry of `tags` followed by at least one dot, enclosed in brackets
# and not touching a word character on either side. The opening and closing
# bracket are matched independently, so they need not be of the same kind.
re_tags_1 = re.compile(
    ''.join([
        r'(?<![\w])[\[({<]\s*',
        '(?:', '|'.join(map(re.escape, tags)), ')?',
        r'\.+\s*[\])}>](?![\w])',
    ]),
    flags=re.I | re.U,
)


def remove_tags_1(text):
    """Delete every ``re_tags_1`` match from ``text`` and strip the result."""
    without_tags = re_tags_1.sub('', text)
    return without_tags.strip()

# One or more dot-terminated abbreviations made of ASCII letters, digits and
# spaces, enclosed in brackets and not touching a word character on either
# side, e.g. "(v.p.)" or "[math.]".
# Every fragment is a raw string: the middle one contains `\.`, which is an
# invalid escape sequence in a normal string literal.
re_tags_2 = re.compile(
    r'(?<![\w])[\[({<]\s*' + r'(?:[a-z0-9 ]+\.)+' + r'\s*[\])}>](?![\w])',
    flags=re.I | re.U,
)


def remove_tags_2(text):
    """Delete every ``re_tags_2`` match from ``text`` and strip the result."""
    return re_tags_2.sub('', text).strip()

# Brackets that enclose nothing but optional whitespace and dots, e.g. "()" or
# "[...]", and that do not touch a word character on either side.
re_tags_3 = re.compile(
    r'(?<![\w])[\[({<]\s*'
    r'\.*\s*[\])}>](?![\w])',
    flags=re.I | re.U,
)


def remove_tags_3(text):
    """Delete every ``re_tags_3`` match from ``text`` and strip the result."""
    cleaned = re_tags_3.sub('', text)
    return cleaned.strip()

# Same shape as re_tags_1, except that the dots after the optional `tags` entry
# are optional too (`\.*` instead of `\.+`).
re_tags_4 = re.compile(
    ''.join([
        r'(?<![\w])[\[({<]\s*',
        '(?:', '|'.join(map(re.escape, tags)), ')?',
        r'\.*\s*[\])}>](?![\w])',
    ]),
    flags=re.I | re.U,
)


def remove_tags_4(text):
    """Delete every ``re_tags_4`` match from ``text`` and strip the result.

    If nothing is left after the removal, the original ``text`` is returned
    unchanged (and unstripped) instead of an empty string.
    """
    stripped = re_tags_4.sub('', text).strip()
    return stripped or text

In [135]:
# Cleaning pass over every dictionary CSV in `paths`: each file is loaded, run
# through whichever step below is currently uncommented, and written back to
# the SAME path (the source files are overwritten in place).
# NOTE(review): the steps look like they were enabled one at a time
# (uncomment, run, re-comment); with all of them commented out this loop only
# re-reads and re-writes each file -- confirm before re-running.
for path in paths:
    print(path)
    
    # read null as a string, not NaN
    df = pd.read_csv(path, encoding='utf8', dtype='object', keep_default_na=False)
    
    # NOTE(review): as written, the `continue` in the step below sits at loop level,
    # not inside the `if`, so enabling it skips everything further down for every file.
#     # delete blanks
#     if len(df) == 0:
#         print('BLANK')
#         path.unlink()
#     continue
    
#     # sort
#     df = df.sort_values(by=['Other', 'English'])
    
#     # dedupe
#     df = df.drop_duplicates()

#     # other must have some text (e.g. cannot be all digits)
#     df = df[df['Other'].apply(has_text)]

#     # SKIPPED: remove non-translations (other contained in english)
#     df['tmp'] = [f' {norm_text(other)} ' not in f' {norm_text(eng)} ' for other, eng in zip(df['Other'], df['English'])]
#     df = df[df['tmp']][['Other', 'English']]

#     # SKIPPED: remove rows where there are digits in both other and english
#     df = df[~(df['Other'].apply(has_digit) & df['English'].apply(has_digit))]

#     # remove html tags (but not other generic text in angled brackets)
#     df['Other'] = df['Other'].apply(remove_html_tags)
#     df['English'] = df['English'].apply(remove_html_tags)

    # NOTE(review): several steps below pass regex patterns to .str.replace without
    # regex=True; pandas >= 2.0 treats the pattern as a literal by default.
#     # normalize whitespace
#     df['Other'] = df['Other'].str.replace('\n', ' ; ').str.split().str.join(' ').str.replace(r';(?: ;)+', ';')
#     df['English'] = df['English'].str.replace('\n', ' ; ').str.split().str.join(' ').str.replace(r';(?: ;)+', ';')

#     # restore html escape sequences
#     df['Other'] = df['Other'].apply(html_unescape)
#     df['English'] = df['English'].apply(html_unescape)

#     # remove ZWSP and ZWNJ and ZWJ
#     df['Other'] = df['Other'].str.replace('\u200B', '')
#     df['English'] = df['English'].str.replace('\u200B', '')
#     df['Other'] = df['Other'].str.replace('\u200C', '')
#     df['English'] = df['English'].str.replace('\u200C', '')
#     df['Other'] = df['Other'].str.replace('\u200D', '')
#     df['English'] = df['English'].str.replace('\u200D', '')

#     # fix weird colons and pipes
#     df['Other'] = df['Other'].str.replace(r':(\s*:)+', ':')
#     df['English'] = df['English'].str.replace(r':(\s*:)+', ':')
#     df['Other'] = df['Other'].str.replace(r'\|(\s*\|)+', '||')
#     df['English'] = df['English'].str.replace(r'\|(\s*\|)+', '||')

#     # someone opened it in excel and turned text into an invalid formula
#     df = df[~df['Other'].str.contains('^#\s*NAME\??$', case=False)]
#     df['English'] = df['English'].str.replace('^#\s*NAME\??$', '', case=False)  # don't drop these rows, just erase the english side
    
#     # someone crawled a website with a wordnet backend, probably
#     df = df[~df['Other'].str.contains('View lemma information', case=False)]
    
#     # spammy entries
#     df = df[~df['Other'].str.contains('http:', case=False)]
#     df = df[~df['English'].str.contains('http:', case=False)]
#     df = df[~df['Other'].str.contains(r''':\\''', case=False)]
#     df = df[~df['English'].str.contains(r''':\\''', case=False)]
#     df = df[~df['Other'].str.contains(r'''\.\\''', case=False)]
#     df = df[~df['English'].str.contains(r'''\.\\''', case=False)]

    # NOTE(review): DataFrame.append (used below) was removed in pandas 2.0;
    # this step needs pd.concat if it is ever re-enabled.
#     # fix "MarÄthi" -> ["Marathi", "Marāṭhī", "Marāthi"]
#     for other in df[df['English'] == 'MarÄthi']['Other'].to_list():
#         df = df.append({'Other': other, 'English':'Marathi'}, ignore_index=True)
#         df = df.append({'Other': other, 'English':'Marāṭhī'}, ignore_index=True)
#         df = df.append({'Other': other, 'English':'Marāthi'}, ignore_index=True)
#     df = df[df['English'] != 'MarÄthi']

#     # headers
#     df = df[~df['Other'].str.contains(r'^//.*//$')]

#     # remove weird url tags
#     df = df[~df['Other'].str.contains(r'<url>')]

#     # remove pos tags
#     print(Counter([found for row in df['Other'].apply(re_tags_1.findall).tolist() for found in row]).most_common())
#     df['Other'] = df['Other'].apply(remove_tags_1)
#     print(Counter([found for row in df['Other'].apply(re_tags_2.findall).tolist() for found in row]).most_common())
#     df['Other'] = df['Other'].apply(remove_tags_2)
#     print(Counter([found for row in df['Other'].apply(re_tags_3.findall).tolist() for found in row]).most_common())
#     df['Other'] = df['Other'].apply(remove_tags_3)
    
    
    # update csv
    # stop before overwriting a file with an empty table
    assert len(df) > 0
    df.to_csv(path, encoding='utf8', index=False)

bhanot\[zsm] Malay-English.csv
cedict\[---] Pinyin-Chinese_and_English.csv
cedict\[zho] Chinese_Simplified-English.csv
cedict\[zho] Chinese_Traditional-English.csv
dictsinfo-omegawiki\[---] kolsch-english.csv
dictsinfo-omegawiki\[---] volapuk-english.csv
dictsinfo-omegawiki\[afr] afrikaans-english.csv
dictsinfo-omegawiki\[ara] arabic-english.csv
dictsinfo-omegawiki\[ast] asturian-english.csv
dictsinfo-omegawiki\[bel] belarusian-english.csv
dictsinfo-omegawiki\[ben] bengali-english.csv
dictsinfo-omegawiki\[bre] breton-english.csv
dictsinfo-omegawiki\[bul] bulgarian-english.csv
dictsinfo-omegawiki\[cat] catalan-english.csv
dictsinfo-omegawiki\[ces] czech-english.csv
dictsinfo-omegawiki\[cym] welsh-english.csv
dictsinfo-omegawiki\[dan] danish-english.csv
dictsinfo-omegawiki\[deu] german-english.csv
dictsinfo-omegawiki\[ell] greek-english.csv
dictsinfo-omegawiki\[epo] esperanto-english.csv
dictsinfo-omegawiki\[est] estonian-english.csv
dictsinfo-omegawiki\[eus] basque-english.csv
dictsinfo

dictsinfo-wikipedia\[san] sanskrit-english.csv
dictsinfo-wikipedia\[scn] sicilian-english.csv
dictsinfo-wikipedia\[slk] slovak-english.csv
dictsinfo-wikipedia\[slv] slovenian-english.csv
dictsinfo-wikipedia\[spa] spanish-english.csv
dictsinfo-wikipedia\[srp] serbian-english.csv
dictsinfo-wikipedia\[swa] swahili-english.csv
dictsinfo-wikipedia\[swe] swedish-english.csv
dictsinfo-wikipedia\[tam] tamil-english.csv
dictsinfo-wikipedia\[tel] telugu-english.csv
dictsinfo-wikipedia\[tgl] tagalog-english.csv
dictsinfo-wikipedia\[tha] thai-english.csv
dictsinfo-wikipedia\[tur] turkish-english.csv
dictsinfo-wikipedia\[ukr] ukrainian-english.csv
dictsinfo-wikipedia\[urd] urdu-english.csv
dictsinfo-wikipedia\[vie] vietnamese-english.csv
dictsinfo-wikipedia\[wln] walloon-english.csv
dictsinfo-wikipedia\[yid] yiddish-english.csv
dictsinfo-wikipedia\[zho] chinese-english.csv
dictsinfo-wikipedia\[zho] chinese_traditional-english.csv
freedict\[afr] Afrikaans-English.csv
freedict\[afr] English-Afrikaans

freelang\[---] Taino-English.csv
freelang\[---] Tanacross-English.csv
freelang\[---] Tofa-English.csv
freelang\[---] Tsakhur-English.csv
freelang\[---] Tujia-English.csv
freelang\[---] Tutchone_(Northern)-English.csv
freelang\[---] Tutchone_(Southern)-English.csv
freelang\[---] Ubykh-English.csv
freelang\[---] Udi-English.csv
freelang\[---] Venetian-English.csv
freelang\[---] Voro-English.csv
freelang\[---] West_Yugur-English.csv
freelang\[---] Xavante-English.csv
freelang\[---] Yugh-English.csv
freelang\[---] Yugur-English.csv
freelang\[---] Yukaghir-English.csv
freelang\[aar] Afar-English.csv
freelang\[aar] English-Afar.csv
freelang\[abk] Abkhaz-English.csv
freelang\[abk] English-Abkhaz.csv
freelang\[ady] Adyghe-English.csv
freelang\[ady] English-Adyghe.csv
freelang\[afr] Afrikaans-English.csv
freelang\[afr] English-Afrikaans.csv
freelang\[ain] Ainu-English.csv
freelang\[ain] English-Ainu.csv
freelang\[ang] English-Old_English.csv
freelang\[ang] Old_English-English.csv
freelang\[ara]

freelang\[pan] Punjabi-English.csv
freelang\[pap] English-Papiamento.csv
freelang\[pap] Papiamento-English.csv
freelang\[phn] English-Phoenician.csv
freelang\[phn] Phoenician-English.csv
freelang\[pli] English-Pali.csv
freelang\[pli] Pali-English.csv
freelang\[pol] English-Polish.csv
freelang\[pol] Polish-English.csv
freelang\[por] Brazilian_Portuguese-English.csv
freelang\[por] English-Brazilian_Portuguese.csv
freelang\[por] English-Portuguese.csv
freelang\[por] Portuguese-English.csv
freelang\[pus] English-Pashto.csv
freelang\[pus] Pashto-English.csv
freelang\[que] Bolivian_Quechua-English.csv
freelang\[que] English-Bolivian_Quechua.csv
freelang\[que] English-Quechua_of_Cuzco.csv
freelang\[que] Quechua_of_Cuzco-English.csv
freelang\[raj] English-Rajasthani.csv
freelang\[raj] Rajasthani-English.csv
freelang\[rar] English-Rarotongan.csv
freelang\[rar] Rarotongan-English.csv
freelang\[rom] English-Romani.csv
freelang\[rom] Romani-English.csv
freelang\[ron] English-Romanian.csv
freelang\

mantuboro\[ita] English-Italian.csv
mantuboro\[ita] Italian-English.csv
mantuboro\[kor] English-Korean.csv
mantuboro\[kor] Korean-English.csv
mantuboro\[lat] English-Latin.csv
mantuboro\[lat] Latin-English.csv
mantuboro\[lav] English-Latvian.csv
mantuboro\[lav] Latvian-English.csv
mantuboro\[lit] English-Lithuanian.csv
mantuboro\[lit] Lithuanian-English.csv
mantuboro\[lus] English-Lushai.csv
mantuboro\[lus] Lushai-English.csv
mantuboro\[mar] English-Marathi.csv
mantuboro\[mar] Marathi-English.csv
mantuboro\[mon] English-Mongolian.csv
mantuboro\[mon] Mongolian-English.csv
mantuboro\[msa] English-Malay.csv
mantuboro\[msa] Malay-English.csv
mantuboro\[mya] Burmese-English.csv
mantuboro\[mya] English-Burmese.csv
mantuboro\[nep] English-Nepali.csv
mantuboro\[nep] Nepali-English.csv
mantuboro\[nld] Dutch-English.csv
mantuboro\[nld] English-Dutch.csv
mantuboro\[nor] English-Norwegian.csv
mantuboro\[nor] Norwegian-English.csv
mantuboro\[pan] English-Punjabi.csv
mantuboro\[pan] Punjabi-English.

prodict\[ukr] Ukrainian-English.csv
prodict\[vie] English-Vietnamese.csv
prodict\[vie] Vietnamese-English.csv
prodict\[zho] Chinese-English.csv
prodict\[zho] English-Chinese.csv
prodict\[zul] English-Zulu.csv
prodict\[zul] Zulu-English.csv


In [9]:
# load one dictionary for inspection, with the same reader options as the cleaning loop
df = pd.read_csv(
    'dictsinfo-omegawiki/[bre] breton-english.csv',
    encoding='utf8',
    dtype='object',
    keep_default_na=False,
)
df

Unnamed: 0,Other,English
0,-va,location
1,-va,place
2,Abc'hazeg,Abkhaz
3,Abc'hazeg,Abkhazian
4,Abkhazia,Abkhazia
...,...,...
1221,zouloueg,Zulu
1222,Åland,Åland Islands
1223,лантан,lanthanum
1224,অস্ট্রেলিয়া,Australia


In [5]:
# todo: hindi -> english
# todo: ???????
# todo: fix control chars
# todo: (mantuboro) $WI_FREQUENCY $WI_CONJ $\d*$
# todo: HTJ char


# (pron. 1st pers.)
# (adj.)
# (v.p.)
# (DN)
# (nm) (nf) (vta) (vai cj) (vai-u) (vai+o) (vai/ii-i)
# (a.b.d.)
# [math.]
# for all removed rows, remove all other rows with the same other_word

# remove brackets
# (m) (f) {m} {f}

# unmatched brackets

# unidecode latin as well

# duplicate quotechar '' -> '

# prefix / suffix
# -xyz

# filter words by script(s)

In [6]:
# NOTE(review): displays whichever `df` is currently bound in the kernel. The
# saved output is not the Breton table loaded by the read_csv cell, so this
# cell depends on execution order -- confirm which file was intended.
df 

Unnamed: 0,Other,English
0,'-ba -khulu,dominate
1,'-de,high
2,'-dla ngamacebo,defraud
3,'-dwa,exceptional
4,'-kwe,after
...,...,...
16837,zwa,live
16838,zwa,obey
16839,zwebela,spy
16840,zwela,regret


In [7]:
# One pattern per bracket kind; each matches a bracketed span that contains no
# further bracket of the same kind (i.e. the innermost pair).
re_brackets_1, re_brackets_2, re_brackets_3, re_brackets_4 = (
    re.compile(pattern)
    for pattern in (
        r'\[[^\[\]]*\]',  # [...]
        r'\([^\(\)]*\)',  # (...)
        r'\{[^\{\}]*\}',  # {...}
        r'\<[^\<\>]*\>',  # <...>
    )
)

def remove_brackets(text):
    if '[' in text:
        text = re_brackets_1.sub('', text)
    if '(' in text:
        text = re_brackets_2.sub('', text)
    if '{' in text:
        text = re_brackets_3.sub('', text)
    if '<' in text:
        text = re_brackets_4.sub('', text)