In [1]:
from urllib.request import urlopen
import io
import gzip

In [2]:
# For downloading and decompressing CCDICT file
with open('cedict_1_0_ts_utf-8_mdbg.txt', 'wb') as f:
    response = urlopen('https://www.mdbg.net/chinese/export/cedict/cedict_1_0_ts_utf-8_mdbg.txt.gz')
    gz = io.BytesIO(response.read())
    txt = gzip.open(gz)
    f.write(txt.read())

In [47]:
def get_cedict_definitions(line): #If I used regex in this function, would it be any slower?
    for i, char in enumerate(line):
        if char == '/':
            begin_index = i + 1
            break
    for i, char in reversed(list(enumerate(line))):
        if char == '/':
            end_index = i
            break
    x = line[begin_index:end_index]
    definitions = x.split('/')
    return definitions

In [63]:
def get_cedict_categ(line):
    definitions = get_cedict_definitions(line)
    try:
        for definition in definitions:
             # This can be expanded to other formats where Buddhism can appear in definitions
            if '(Buddhism)' in definition.split():
                return 'Buddhism'
        # The following must be in order of lowest to highest index required, or else IndexError could be thrown too early
        if definitions[0].split()[1].lower() == 'county':
            return 'county'
        if definitions[0].split()[1].lower() == 'river':
            return 'river'
        if definitions[0].split()[1].lower() == 'lake' or definitions[0].split()[0] == 'Lake':
            return 'lake'
        if definitions[0].split()[2].lower() == 'level' and definitions[0].split()[3].lower() == 'city':
            return 'city'
        if definitions[0].split()[1].lower() == 'district' and definitions[0].split()[4].lower() == 'city':
            return 'district'
    except IndexError as e:
        return 'other'
    return 'other'

In [64]:
print(get_cedict_categ('律動 律动 [lu:4 dong4] /rhythm/to move rhythmically/'))
print(get_cedict_categ('復興 复兴 [Fu4 xing1] /Fuxing district of Handan city 邯鄲市|邯郸市[Han2 dan1 shi4], Hebei/Fuxing or Fuhsing township in Taoyuan county 桃園縣|桃园县[Tao2 yuan2 xian4], north Taiwan'))
print(get_cedict_categ('微山 微山 [Wei1 shan1] /Weishan County in Jining 濟寧|济宁[Ji3 ning2], Shandong/'))
print(get_cedict_categ('菩提 菩提 [pu2 ti2] /bodhi (Sanskrit)/enlightenment (Buddhism)/'))
print(get_cedict_categ('華鎣 华蓥 [Hua2 ying2] /Huaying county level city in Guang\'an 廣安|广安[Guang3 an1], Sichuan/'))
print(get_cedict_categ('萬泉河 万泉河 [Wan4 quan2 He2] /Wanquan River, Hainan/'))
print(get_cedict_categ('貝爾湖 贝尔湖 [Bei4 er3 Hu2] /Buir Lake of Inner Mongolia/'))


other
district
county
Buddhism
county
other
lake


In [67]:


bigrams = []
temp_dict = {}  # contains mutual bigram info except frequency
mutual_bigram_info = {}
with open('cedict_1_0_ts_utf-8_mdbg.txt', 'rt') as cedict, open('ngram_frequencies.txt', 'rt') as bigram_file:
    for line in cedict:
        if line[0] == '#':
            continue
        bigram = line.split()[0]
        bigrams.append(bigram)
        temp_dict[bigram] = get_cedict_categ(line)
    words = set(bigrams) # bigrams is list, words is the same set
    for line in bigram_file:
        info = line.split()
        if info[0] in words:
            mutual_bigram_info[info[0]] = [info[1], temp_dict[info[0]]]

mutual_bigram_info

{'可知': ['8995', 'other'],
 '頭面': ['3350', 'other'],
 '踴躍': ['2742', 'other'],
 '欲求': ['2677', 'other'],
 '三義': ['2530', 'other'],
 '夢中': ['2498', 'other'],
 '作證': ['2375', 'other'],
 '眼見': ['2111', 'other'],
 '頂上': ['2068', 'other'],
 '同上': ['1961', 'other'],
 '正字': ['1789', 'other'],
 '七十': ['1752', 'other'],
 '尋思': ['1738', 'other'],
 '俗字': ['1732', 'other'],
 '童女': ['1716', 'other'],
 '衣食': ['1716', 'other'],
 '可言': ['1715', 'other'],
 '千億': ['1695', 'other'],
 '曠野': ['1640', 'other'],
 '大指': ['1627', 'other'],
 '二分': ['1601', 'other'],
 '惠施': ['1570', 'other'],
 '四面': ['1568', 'other'],
 '鬚髮': ['1560', 'other'],
 '方能': ['1559', 'other'],
 '可解': ['1542', 'other'],
 '今世': ['1501', 'other'],
 '字書': ['1434', 'other'],
 '外人': ['1413', 'other'],
 '國中': ['1392', 'other'],
 '引證': ['1386', 'other'],
 '中行': ['1354', 'other'],
 '作惡': ['1354', 'other'],
 '愚人': ['1330', 'other'],
 '上界': ['1280', 'other'],
 '世世': ['1243', 'other'],
 '枝葉': ['1217', 'other'],
 '來意': ['1214', 'other'],
 '死生': ['119