# Parse external pre-made decks

Note: this notebook needs external data in `downloads/` and cannot run without it. The outputs, however, are checked in to the repo, so you don't need to run it.

In [1]:
import os, re, glob
import pandas as pd
from opencc import OpenCC

def postprocess_chars(text):
    """Return `text` with typographic punctuation swapped for ASCII.

    The one-character ellipsis becomes three periods, and curly single and
    double quotes become their straight counterparts. Every other character
    is passed through untouched.
    """
    ascii_equivalents = dict([
        ('…', '...'),
        ('‘', "'"),
        ('’', "'"),
        ('“', '"'),
        ('”', '"'),
    ])
    pieces = []
    for ch in text:
        if ch in ascii_equivalents:
            pieces.append(ascii_equivalents[ch])
        else:
            pieces.append(ch)
    return ''.join(pieces)

# Slides vocabulary table; the ID column must already be in sorted order.
slides_df = pd.read_csv('data/slides.tsv', sep='\t')
assert list(slides_df['ID']) == sorted(slides_df['ID'])
slides_t2s = slides_df.set_index('Traditional')['Simplified'].to_dict()
slides_id2s = slides_df.set_index('ID')['Simplified'].to_dict()

# Term IDs, plus two derived lesson keys: the first five characters of the ID
# ('Lesson') and the longer prefix captured by the regex below ('Lesson2').
termid_df = pd.read_csv('data/term-ids.tsv', sep='\t', comment='#')
termid_df['Lesson'] = termid_df['ID'].str[:5]
termid_df['Lesson2'] = termid_df['ID'].str.extract('^(B.L..-II?)')

# One lookup dict per key shape, each mapping the key tuple to the term ID.
(
    lesson_trad_to_id,
    lesson_pos_trad_to_id,
    lesson2_trad_to_id,
    lesson2_pos_trad_to_id,
) = (
    termid_df.set_index(key_columns)['ID'].to_dict()
    for key_columns in (
        ['Lesson', 'Traditional'],
        ['Lesson', 'POS', 'Traditional'],
        ['Lesson2', 'Traditional'],
        ['Lesson2', 'POS', 'Traditional'],
    )
)

## Quizlet deck

* Official quizlet decks for B1-B4: https://quizlet.com/mtcbooktest/sets
* Audio seems to be TTS-generated
* Has images for ~600 slides, mostly from the web

In [2]:
# quizlet-to-anki scraped notes exported to .txt
df = pd.read_csv(
    'downloads/flashcards/quizlet.txt',
    sep='\t',
    comment='#',
    names=['Front', 'FrontAudio', 'Back', 'BackAudio', 'Image', 'AddRev', 'Tags'],
)

# Only the card text and the lesson tag are used from here on.
df = df.drop(columns=['FrontAudio', 'BackAudio', 'AddRev', 'Image'])

# Turn the tag into a lesson key: 'B0' becomes 'B' and spaces are removed;
# every resulting key must look like B<1-6>L<two digits>.
df['Lesson'] = df['Tags'].str.replace('B0', 'B').str.replace(' ', '')
assert all(re.match('^B[1-6]L[0-1][0-9]$', lesson) for lesson in df['Lesson'])
df = df.drop(columns=['Tags'])

# Hand-written replacements for quizlet card fronts that do not fit the plain
# `traditional（simplified）` layout parsed by the loop that follows.
# Key: the raw `Front` text exactly as it appears in the export (including any
# `<br>` tags and unbalanced brackets).
# Value: [Traditional, Simplified] to use for that card.
OVERRIDES = {
  'KTV': ['KTV', 'KTV'],
  '故宮博物院（故宮）（故宫博物院（故宮））': ['故宮博物院（故宮）', '故宫博物院（故宫）'],  # 宮->宫: the source left one 宮 unsimplified in the simplified half
  '公共汽車（公車）（公共汽车（公车））': ['公共汽車（公車）', '公共汽车（公车）'],
  '星期（星期）<br>+number': ['星期 + number', '星期 + number'],
  '女（女）<br>+noun': ['女 + noun', '女 + noun'],
  'number+<br>月（月）': ['number + 月', 'number + 月'],
  'number+<br>號（号）': ['number + 號', 'number + 号'],
  '男（男）<br>+noun': ['男 + noun', '男 + noun'],
  '師大（師範大學）（师大（师范大学））': ['師大（師範大學）', '师大（师范大学）'],
  '待遇（待遇': ['待遇', '待遇'],
  '貨比三家不吃虧<br>（货比三家不吃亏）': ['貨比三家不吃虧', '货比三家不吃亏'],
  '(台)斤<br>（(台)斤）': ['（台）斤', '（台）斤'],
  'EMBA(高級管理人員工商管理碩士)<br>（EMBA(高级管理<br>人员工商<br>管理硕士)）': ['EMBA（高級管理人員工商管理碩士）', 'EMBA（高级管理人员工商管理硕士）'],
  '科系（科系': ['科系', '科系'],
  '提不起（勇氣）（提不起（勇气））': ['提不起（勇氣）', '提不起（勇气）'],
  '企業管理系(企管系)（企业管理系(企管系)）': ['企業管理系（企管系）', '企业管理系（企管系）'],
}

# Split every card front into its traditional and simplified forms.
trad_col, simp_col = [], []

for front in df.Front:
    assert '\n' not in front

    override = OVERRIDES.get(front)
    if override is not None:
        trad, simp = override
    else:
        # Fronts containing markup/punctuation or an unexpected number of
        # full-width brackets are printed as a ready-to-paste OVERRIDES stub.
        looks_irregular = (
            re.search('[-+`<>,()&\'"]', front)
            or front.count('（') != 1
            or front.count('）') != 1
        )
        if looks_irregular:
            print("  '%s': ['', '']," % front)
        m = re.match('^(.+)[（](.+)[）]$', front)
        assert m, front
        trad = m[1].strip()
        simp = m[2].strip()

    trad_col.append(trad)
    simp_col.append(simp)

    # Report every term that is absent from the slides or whose simplified
    # form differs from the slides' simplified form.
    if not (trad in slides_t2s and simp == slides_t2s[trad]):
        print('trad "%s"\tsimp "%s"\tslides "%s"' % (trad, simp, slides_t2s.get(trad, '?')))

df['Traditional'] = trad_col
df['Simplified'] = simp_col

trad "臺灣 台灣"	simp "台湾"	slides "?"
trad "請進"	simp "请进"	slides "?"
trad "站"	simp "战"	slides "站"
trad "她"	simp "他"	slides "她"
trad "忘了"	simp "忘了"	slides "?"
trad "睡著"	simp "睡著"	slides "睡着"
trad "年年有餘"	simp "年年有馀"	slides "年年有余"
trad "義大利"	simp "意大利"	slides "义大利"
trad "捨不得"	simp "捨不得"	slides "舍不得"
trad "別的"	simp "別的"	slides "别的"
trad "台北101"	simp "台北101"	slides "?"
trad "2號線"	simp "2号线"	slides "?"
trad "照X光"	simp "照X光"	slides "?"
trad "中間商"	simp "中間商"	slides "中间商"
trad "經營"	simp "經營"	slides "经营"
trad "（台）斤"	simp "（台）斤"	slides "?"
trad "EMBA（高級管理人員工商管理碩士）"	simp "EMBA（高级管理人员工商管理硕士）"	slides "?"
trad "X分之Y"	simp "X分之Y"	slides "?"
trad "無人不知無人不曉"	simp "无人不知无人不晓"	slides "?"
trad "邱吉爾"	simp "邱吉尔"	slides "丘吉尔"
trad "一下...一下..."	simp "一下...一下..."	slides "?"
trad "就...而言"	simp "就...而言"	slides "?"
trad "是...的料"	simp "是...的料"	slides "?"
trad "在...之餘"	simp "在...之余"	slides "?"
trad "小自...大至..."	simp "小自...大至..."	slides "?"
trad "以...為..."	simp "以...为..."	slides "?"


In [3]:
# Maps the part-of-speech line of a quizlet card back to a normalized tag.
# Lookups are exact-match only, so the grouping below is purely for readability.
QUIZLET_POS_MAP = {
    # Well-formed parenthesised tags: '(N)' -> 'N'.
    **{'(%s)' % tag: tag for tag in (
        'N', 'V', 'Vst', 'Ptc', 'Det', 'Vs', 'Adv', 'Vaux', 'Vi', 'Vs-pred',
        'V-sep', 'M', 'Conj', 'Prep', 'Vp', 'Vpt', 'Vp-sep', 'Vs-attr',
        'Vs-sep',
        'N/V',  # annotated '#10' in the original table
        'V/N',  # annotated '#9' in the original table
        'N/Vi',
    )},
    # Tags written without parentheses map to themselves.
    **{tag: tag for tag in (
        'Adv', 'N', 'V', 'M', 'Vs', 'Vs-attr', 'Prep', 'Ptc', 'Vaux', 'V-sep',
        'Det', 'Vst',
    )},
    # Malformed or differently-spelled variants found in the data.
    '(Ptc': 'Ptc',
    '(V': 'V',
    '9N)': 'N',
    '(Vi, N)': 'N/Vi',  # annotated '#1' in the original table
    '(Vi, V)': 'V/Vi',  # annotated '#1' in the original table
}

# Split every card back into pinyin (first line), POS tag (second line) and
# meaning (all remaining lines joined with spaces).
pinyin_col, pos_col, def_col = [], [], []

for back in df.Back:
    assert '\n' not in back

    # Two cards have their <br> breaks in the wrong place; patch them first.
    cleaned = back.replace('zhōngjiān<br>shāng<br>(N)', 'zhōngjiānshāng<br>(N)')
    cleaned = cleaned.replace('xiónghuáng jiǔ<br>', 'xiónghuáng jiǔ<br><br>')
    cleaned = re.sub(r'\s*<br>\s*', '\n', cleaned).strip()
    assert cleaned.count('&') == 0 or 'Bed & Breakfast' in cleaned
    assert cleaned.count('<') == 0, cleaned

    parts = cleaned.split('\n')
    assert len(parts) >= 2, back
    pinyin_line, pos_line = parts[0], parts[1]

    # Surface any POS line the map does not know about.
    if pos_line != '' and pos_line not in QUIZLET_POS_MAP:
        print(back)

    # Breve-a is replaced by caron-a in the pinyin.
    pinyin_col.append(pinyin_line.replace('ă', 'ǎ'))
    pos_col.append(QUIZLET_POS_MAP.get(pos_line, ''))
    def_col.append(' '.join(parts[2:]).strip())

df['Pinyin'] = pinyin_col
df['POS'] = pos_col
df['Meaning'] = def_col

# Resolve each card to a term ID: prefer the (Lesson, POS, Traditional) match
# and fall back to (Lesson, Traditional). A card matching neither raises
# KeyError.
id_col = []
for row in df.itertuples():
    key = (row.Lesson, row.POS, row.Traditional)
    if key in lesson_pos_trad_to_id:
        id_col.append(lesson_pos_trad_to_id[key])
        continue
    id_col.append(lesson_trad_to_id[(row.Lesson, row.Traditional)])
df['ID'] = id_col
df = df.sort_values('ID').reset_index(drop=True).copy()

# Keep only the first row for every term ID. The previous loop removed just
# the last row per duplicated ID, which left duplicates behind whenever an ID
# occurred three or more times; for pairs the result is unchanged.
dup_mask = df.duplicated('ID', keep='first')
if dup_mask.any():
    dup_ids = df.ID.value_counts()[lambda X: X >= 2].index
    print('dups in', dup_ids.values)
    df = df[~dup_mask].copy()

df['Tags'] = ''
df = df[['ID', 'Traditional', 'Simplified', 'Pinyin', 'POS', 'Meaning', 'Tags']].copy()
df.to_csv('data/quizlet.tsv', index=False, sep='\t')

print(len(df))

dups in ['B4L02-II-26' 'B1L12-II-05']
3073


# mquizlet

* Source: https://quizlet.com/Michael5739/sets
* Seems to have been manually typed from the book, lots of diffs vs slides, but good as an independent source for verification.
* Has almost all of book 6, including B6L1-III missing in slides.
* Missing B6L4-III, B6L10.

In [4]:
# Part-of-speech labels as typed in the mquizlet cards -> normalized tag.
# Besides the regular tags this includes variant spellings seen in the data
# ('Av', 'Nl', 'PH', 'N, Vs', 'V/').
# The parsing loop in this cell uses the keys both for exact lookup and as
# prefixes tried in insertion order, so the order of entries matters.
POS_MAP = {
  'Adv': 'Adv',
  'Adv/N': 'Adv/N',
  'Adv/Vs': 'Adv/Vs',
  'Av': 'Adv',
  'Conj': 'Conj',
  'Det': 'Det',
  'Id': 'Id',
  'M': 'M',
  'N': 'N',
  'N, Vs': 'N/Vs',
  'N/V': 'N/V',
  'N/Vi': 'N/Vi',
  'N/Vst': 'N/Vst',
  'Nl': 'N',
  'PH': 'Ph',
  'Ph': 'Ph',
  'Prep': 'Prep',
  'Ptc': 'Ptc',
  'V': 'V',
  'V-sep': 'V-sep',
  'V/': 'V/N', #jiàodǎo
  'V/N': 'V/N',
  'Vaux': 'Vaux',
  'Vi': 'Vi',
  'Vi/N': 'Vi/N',
  'Vp': 'Vp',
  'Vp-sep': 'Vp-sep',
  'Vpt': 'Vpt',
  'Vs': 'Vs',
  'Vs-attr': 'Vs-attr',
  'Vs-sep': 'Vs-sep',
  'Vs/Adv': 'Vs/Adv',
  'Vs/N': 'Vs/N',
  'Vst': 'Vst',
  'Vst/N': 'Vst/N',
}

# Pinyin-based fallback lookup built from the slides: (Lesson2, Pinyin) -> ID.
slides_df['Lesson2'] = slides_df['ID'].str.extract('^(B.L..-II?)')
lesson2_pinyin_to_id = slides_df.set_index(['Lesson2', 'Pinyin']).ID.to_dict()
rows = []
term_ids = set()  # IDs already emitted, so later repeats are reported and skipped

for filepath in sorted(glob.glob('downloads/flashcards/mquizlet/B*')):  # copy pasted text from web pages
    # The file name carries book, lesson number and part number.
    # (The former `([0-9]+)+` nested quantifier matched and captured exactly
    # the same text; it is simplified here.)
    m = re.match(r'B([0-9])L([0-9]+)[AD]([123])$', os.path.basename(filepath))
    assert m, filepath
    lesson = 'B%dL%.2d-%s' % (int(m[1]), int(m[2]), 'I' * int(m[3]))
    # Two file names carry an extra digit after the lesson number; map them
    # back to lessons 08 and 09.
    lesson = lesson.replace('B6L81', 'B6L08')
    lesson = lesson.replace('B6L89', 'B6L09')

    # Read with an explicit encoding and close the handle deterministically.
    with open(filepath, encoding='utf-8') as f:
        lines = f.read().strip().split('\n')
    lines = [line.strip() for line in lines]
    # The cards sit between the 'Original' line and the 'About us' footer.
    start = [i for i, x in enumerate(lines) if x.strip() == 'Original'][0]
    end = [i for i, x in enumerate(lines) if x.startswith('About us')][0]
    lines = lines[start+1:end]
    lines = '\n'.join(lines).strip()
    lines = re.sub('\n\n+', '\n\n', lines)

    # Each card is two lines: hanzi, then 'pinyin; [POS;] meaning'.
    for card in lines.split('\n\n'):
        assert card.count('\n') == 1
        hanzi, text = card.split('\n')
        if ';' not in text: continue
        pinyin, text = text.split(';', maxsplit=1)
        pinyin = pinyin.strip()
        text = text.strip()
        pos = ''
        # NOTE(review): `tag` is set to 'FlaggedTrad' below but never written
        # to the output row — confirm whether a Tags column was intended.
        tag = ''

        # First try 'POS; meaning' ...
        if ';' in text:
            x, y = text.split(';', maxsplit=1)
            x = x.strip()
            if x in POS_MAP:
                pos = POS_MAP[x]
                text = y
        # ... then a POS prefix followed by a space or comma.
        # NOTE(review): there is no `break`, so after a match the remaining
        # prefixes are still tried against the shortened text and a later one
        # can override `pos`. Left as-is to keep the output unchanged.
        if pos == '':
            for pref in POS_MAP.keys():
                if text.startswith(pref + ' ') or  text.startswith(pref + ','):
                    pos = POS_MAP[pref]
                    text = text[len(pref)+1:]

        text = text.strip()

        # correct some systematic diffs
        # NOTE(review): the first replacement drops the space after 'lit.'
        # while the second keeps it — confirm this is the intended wording.
        text = text.replace('(literally ', '(lit.')
        text = text.replace('(literally, ', '(lit. ')
        text = text.replace('(literally)', '(lit.)')
        # Each two-character string is (character to replace, replacement).
        for x, y in ['説說', '内內', '麽麼', '爲為', '强強', '衆眾', '(（', ')）']:
            hanzi = hanzi.replace(x, y)

        # Resolve the term ID, from most to least specific key; a pinyin-only
        # match is the last resort.
        key = (lesson, pos, hanzi)
        key2 = (lesson[:5], hanzi)
        key3 = (lesson, pinyin)
        if key in lesson2_pos_trad_to_id:
            term_id = lesson2_pos_trad_to_id[key]
        elif key2 in lesson_trad_to_id:
            term_id = lesson_trad_to_id[key2]
        elif key3 in lesson2_pinyin_to_id:
            term_id = lesson2_pinyin_to_id[key3]
            tag = 'FlaggedTrad'
        else:
            print('unmerged: %s' % '\t'.join((lesson, pos, hanzi, pinyin, text)))
            continue

        row = {
            'ID': term_id,
            'Traditional': hanzi,
            'Pinyin': pinyin,
            'POS': pos,
            'Meaning': text,
        }

        # Only the first card seen for a term ID is kept.
        if term_id in term_ids:
            print('dup: %s' % row)
            continue

        rows.append(row)
        term_ids.add(term_id)

df = pd.DataFrame(rows)
df.to_csv('data/mquizlet.tsv', sep='\t', index=False)
print(len(df))

dup: {'ID': 'B4L01-II-28', 'Traditional': '心理', 'Pinyin': 'xīnlǐ', 'POS': 'N', 'Meaning': 'psychological, mental'}
unmerged: B4L07-I		以下...以下...	yǐxià... yǐxia...	measure phrase, describing fast-changing alternating scenes of two activities
dup: {'ID': 'B6L01-II-22', 'Traditional': '傾銷', 'Pinyin': 'qīngxiāo', 'POS': 'V', 'Meaning': 'to dump (products)'}
dup: {'ID': 'B6L09-I-0-05', 'Traditional': '遷移', 'Pinyin': 'qiānyí', 'POS': 'Vi', 'Meaning': 'to move (forward)'}
3538


## jiru

* "A Course in Contemporary Chinese (當代中文課程) all vocabs audio" deck:
  * https://ankiweb.net/shared/info/2118503187
  * https://github.com/jiru/ccc
* High quality, fairly clean source, a little edited from the book
* Some extra People and Place tags, but incomplete and not very useful
* Real recorded audio extracts
* B6 missing except for B6L01-I

In [5]:
df = pd.read_csv(
    'downloads/flashcards/anki-2118503187.txt',
    sep='\t',
    comment='#',
    index_col=False,
    names=['Traditional', 'Pinyin', 'Meaning', 'POS', 'Lesson', 'Audio', 'Tags'],
)

# Resolve each note to a term ID: prefer the (Lesson, POS, Traditional) match,
# otherwise fall back to (first five characters of Lesson, Traditional).
id_col = []
for note in df.itertuples():
    full_key = (note.Lesson, note.POS, note.Traditional)
    if full_key not in lesson2_pos_trad_to_id:
        short_key = (note.Lesson[:5], note.Traditional)
        id_col.append(lesson_trad_to_id[short_key])
    else:
        id_col.append(lesson2_pos_trad_to_id[full_key])
df['ID'] = id_col

# Replace typographic quotes and ellipses in the meanings with ASCII.
df['Meaning'] = [postprocess_chars(meaning) for meaning in df['Meaning']]

# Keep only a People/Place marker from the tags; everything else becomes ''.
df['Tags'] = df['Tags'].str.extract('(People|Place)').fillna('')

output_columns = ['ID', 'Traditional', 'Pinyin', 'POS', 'Meaning', 'Tags', 'Audio']
df = df[output_columns].copy()
df.to_csv('data/ankiweb.tsv', index=False, sep='\t')
print(len(df))

4039


## DDSG deck

* From "Dangdai Study Guide": https://ddstudyguide.com/mediawiki/resources/assets/ddsg.apkg
* Based off extracted text from slides, but lowest quality / highest diffs, many unwarranted extra remarks/rewordings

In [6]:
df = pd.read_csv(
    'downloads/flashcards/ddsg.txt',
    sep='\t',
    comment='#',
    index_col=False,
    names=['Meaning', 'Pinyin', 'Unused', 'Traditional', 'Lesson', 'UnusedTags'],
)

# 'Book<b>/Chapter<cc>-<I|II>' -> 'B<b>L<cc>-<I|II>'. A value that does not
# match the pattern makes `.groups()` fail on None.
lesson_keys = []
for raw_lesson in df['Lesson']:
    book, chapter, part = re.match('^Book([0-9])/Chapter([0-9]{2})-(I|II)$', raw_lesson).groups()
    lesson_keys.append('B%sL%s-%s' % (book, chapter, part))
df['Lesson'] = lesson_keys

# The hanzi is wrapped in a fixed HTML snippet; pull out just the text.
df['Traditional'] = df['Traditional'].str.extract('^<div><p><span style="?font-size:32px"?>([^<]+)</span></p></div>$')
assert df['Traditional'].isnull().sum() == 0
assert df['Lesson'].isnull().sum() == 0

# Part-of-speech prefixes found at the start of a meaning -> normalized tag.
# The loops that consume this table try the keys with `startswith` in
# insertion order and stop at the first hit, so the order of entries is
# significant. Keys include the exact trailing characters to strip (note the
# trailing space in '(V ', 'Adv ' and 'N ').
# The '(V/N)' entry used to be listed twice with the same value; the repeated
# key has been removed, which leaves the resulting dict unchanged.
POS_MAP = {
  '(N)': 'N',
  '(V)': 'V',
  '(Vst)': 'Vst',
  '(Ptc)': 'Ptc',
  '(Det)': 'Det',
  '(Vs)': 'Vs',
  '(Adv)': 'Adv',
  '(Vaux)': 'Vaux',
  '(Vi)': 'Vi',
  '(Vs-pred)': 'Vs-pred',
  '(V-sep)': 'V-sep',
  '(M)': 'M',
  '(Conj)': 'Conj',
  '(Prep)': 'Prep',
  '(Vp)': 'Vp',
  '(Ptc': 'Ptc',
  '(Vpt)': 'Vpt',
  '(Vp-sep)': 'Vp-sep',
  '(Vs-attr)': 'Vs-attr',
  '(Vi, N)': 'N/Vi',
  '(Vs-sep)': 'Vs-sep',
  '(Vi, V)': 'V/Vi',
  '(V/N)': 'V/N',
  '(N/V)': 'N/V',
  '(N/Vi)': 'N/Vi',
  '(V ': 'V',

  '(Id)': 'Id',
  '(Ph)': 'Ph',
  '(Adv/N)': 'Adv/N',
  '(N/Vst)': 'N/Vst',
  '(Vst/N)': 'Vst/N',
  '(Vi/N)': 'Vi/N',
  '(Vs/N)': 'Vs/N',
  '(N,V)': 'N/V',
  'Adv ': 'Adv',
  'N ': 'N',
  'IDIOM&nbsp;': 'Id',
}

# Peel a leading POS marker off each meaning and tidy what is left.
text_col, pos_col = [], []

for raw_meaning in df.Meaning:
    meaning = raw_meaning.strip()

    # First POS_MAP key (in table order) that the meaning starts with, if any.
    matched_prefix = next((p for p in POS_MAP if meaning.startswith(p)), None)
    if matched_prefix is None:
        pos = ''
    else:
        pos = POS_MAP[matched_prefix].strip()
        meaning = meaning[len(matched_prefix):].strip()

    # Drop a leading middle-dot bullet, tighten spaced parentheses, and
    # convert typographic punctuation to ASCII.
    meaning = re.sub('^· ', '', meaning).strip()
    meaning = meaning.replace(' ( ', ' (').replace(' ) ', ') ')
    meaning = postprocess_chars(meaning)

    pos_col.append(pos)
    text_col.append(meaning)

df['Meaning'] = text_col
df['POS'] = pos_col

# Resolve each note to a term ID: prefer the (Lesson, POS, Traditional) match,
# otherwise fall back to (first five characters of Lesson, Traditional).
id_col = []
for note in df.itertuples():
    full_key = (note.Lesson, note.POS, note.Traditional)
    if full_key not in lesson2_pos_trad_to_id:
        short_key = (note.Lesson[:5], note.Traditional)
        fallback_id = lesson_trad_to_id[short_key]
        assert fallback_id, short_key
        id_col.append(fallback_id)
    else:
        id_col.append(lesson2_pos_trad_to_id[full_key])
df['ID'] = id_col

output_columns = ['ID', 'Traditional', 'Pinyin', 'POS', 'Meaning']
df = df[output_columns].copy()
df.to_csv('data/ddsg.tsv', sep='\t', index=False)
print(len(df))

3855


# ccc-flashcards

* Source: https://github.com/kevinlang/ccc-flashcards/blob/master/a_course_in_contemporary_chinese.tsv
* https://www.plecoforums.com/threads/flashcards-for-a-course-in-contemporary-chinese.6100/
* Mid-range quality; some odd characters: trailing `'s`, non-standard tone-mark characters.

In [7]:
# Parse the ccc-flashcards TSV: '//' header lines set the current lesson, and
# every other line is a three-column record (hanzi, pinyin, meaning).
rows = []
# NOTE(review): the file is opened without a context manager or an explicit
# encoding, so the handle is never explicitly closed — consider `with open(...)`.
for line in open('downloads/flashcards/ccc-flashcards.tsv'):
    if line.startswith('//'):
        # Header line: remember the lesson for the records that follow.
        m = re.match('//當代中文/Book ([1-6])/(L[0-9][0-9]-II?)$', line.strip())
        assert m, line
        lesson = 'B%s%s' % (m[1], m[2])
        continue

    assert len(line.split('\t')) == 3, line.split('\t')
    line = line.rstrip('\r\n').split('\t')
    assert len(line) == 3
    hanzi = line[0].strip()
    pinyin = line[1].strip()
    text = line[2].strip()

    # incorrect lookalike chars in some terms
    # Each pair is (character as found in the file, replacement). The two
    # members render alike, so keep these literals byte-for-byte when editing.
    for x, y in [('é','é'), ('ù','ù'), ('ì','ì'), ('ǎ','ǎ'), ('à', 'à'), ('ǐ', 'ǐ'), ('í', 'í'), ('ū', 'ū'),
                 ('á','á'), ('ē', 'ē'), ('ǔ', 'ǔ'), ('ǜ', 'ǜ'), ('ú', 'ú'), ('ā', 'ā'), ('ī', 'ī')
                ]:
        #if rows == []: print(x, end='')
        pinyin = pinyin.replace(x, y)
        text = text.replace(x, y)

    # Tighten spaced parentheses and convert typographic punctuation to ASCII.
    text = text.replace(' ( ', ' (')
    text = text.replace(' ) ', ') ')
    text = postprocess_chars(text)

    # Peel a leading POS marker off the meaning; the first matching key wins.
    # NOTE(review): POS_MAP is whichever table was assigned last — in file
    # order that is the DDSG cell's table; confirm when running cells out of order.
    pos = ''
    for pref in POS_MAP.keys():
        if text.startswith(pref):
            pos = POS_MAP[pref]
            text = text[len(pref):].strip()
            break
    pos = pos.strip()
    #if pos == '': print(line[-1])

    # Resolve the term ID: prefer the (lesson, POS, hanzi) match, otherwise
    # fall back to (first five characters of lesson, hanzi).
    key = (lesson[:5], hanzi)
    key2 = (lesson, pos, hanzi)
    if key2 in lesson2_pos_trad_to_id:
        term_id = lesson2_pos_trad_to_id[key2]
    else:
        term_id = lesson_trad_to_id[key]
    assert term_id

    rows.append({
        'ID': term_id,
        'Traditional': hanzi,
        'Pinyin': pinyin,
        'POS': pos,
        'Meaning': text
    })

df = pd.DataFrame(rows)
df.to_csv('data/ccc.tsv', sep='\t', index=False)
print(len(df))

4035
