In [1]:
!pip install -q genanki opencc

import os, re, json, glob
import genanki
import pandas as pd
from opencc import OpenCC

# OpenCC converter loaded with the 'tw2s' configuration; used below to derive
# Simplified text from Traditional text.
opencc_tw2s = OpenCC('tw2s')

# Names of the input decks; each one is read from data/<name>.tsv.
SOURCES = ['slides', 'quizlet', 'ankiweb', 'ddsg', 'ccc', 'mquizlet']
DFS = {}

for name in SOURCES:
    df = pd.read_csv(f'data/{name}.tsv', sep='\t', index_col=False)

    # Every row must carry an ID and a Traditional form.
    for required in ('ID', 'Traditional'):
        assert sum(df[required].isnull()) == 0

    # Duplicate IDs are reported (not fatal) so they can be fixed in the source file.
    if len(set(df.ID)) != len(df):
        id_counts = df.ID.value_counts()
        repeated_ids = id_counts[id_counts >= 2].index
        print(name, df[df.ID.isin(repeated_ids)])

    # Blank out missing cells column by column.
    for col in df.columns:
        df[col] = df[col].fillna('')

    df = df.sort_values('ID').set_index('ID').copy()
    DFS[name] = df

slides_df = DFS['slides']

tags_df = (
    pd.read_csv('data/tags.tsv', sep='\t', comment='#')
    .set_index('ID')
    .copy()
)

# Diffs reviewed against the book + extra terms from book
book_df = pd.read_csv('data/book.tsv', sep='\t', comment='#')
assert len(set(book_df.ID)) == len(book_df), book_df.ID.value_counts().head()
book_df = book_df.sort_values('ID').set_index('ID').copy()

# ID -> list of every Traditional spelling listed for that ID in term-ids.tsv.
termid_df = pd.read_csv('data/term-ids.tsv', sep='\t', comment='#')
trad_variants_mp = termid_df.groupby('ID')['Traditional'].apply(list).to_dict()

# Merge sources

Merges input sources, diffs them for review, generates final outputs: `dangdai.csv`, `dangdai.apkg`, `dangdai-pleco.txt`.

Needs the `data/*.tsv` files produced by `slides.ipynb` and `extdecks.ipynb`. They are checked in to the repo, so those notebooks don't need to be re-run.

## Diff for manual review

Generates `diffs.tsv`; review it and then copy the correct lines into `reviewed.tsv`.

In [2]:
def postprocess_chars_for_diff(text):
    """Return `text` with typographic variants folded to plain ASCII.

    Used before comparing sources so that purely cosmetic differences
    (curly quotes, ellipsis character, ligatures, full-width punctuation)
    are not reported as diffs.
    """
    # A full-width plus with a space on either side becomes ' + '. This has
    # to happen before the per-character pass, which maps a bare '＋' to '+'.
    for spaced_plus in ('＋ ', ' ＋'):
        text = text.replace(spaced_plus, ' + ')

    ascii_table = str.maketrans({
        '…': '...',
        '‘': "'",
        '’': "'",
        '“': '"',
        '”': '"',
        'ﬃ': 'ffi',
        'ﬄ': 'ffl',
        'ﬁ': 'fi',
        'ﬂ': 'fl',
        'ﬀ': 'ff',
        '／': '/',
        '％': '%',
        '～': '~',
        '＂': '"',
        '＝': '=',
        '＋': '+',
    })
    return text.translate(ascii_table)

def quick_diff(dfs, verbose=False):
    """Return the sorted index labels whose values disagree between frames.

    Every column found in any frame is compared, except ID, Tags, Variants,
    Simplified and Audio. Values are stripped and normalised first: POS
    values are split on '/' or ',' and sorted, all other columns go through
    postprocess_chars_for_diff. A label counts as different when a column
    holds more than one distinct normalised value for it, unless:

    * there are exactly two values and the longer is the shorter plus a
      trailing "'" or ".", or
    * the column is POS and the values are exactly '' and 'Ph'.

    With verbose=True each differing (label, column, values) is printed.
    """
    ignored_columns = ('ID', 'Tags', 'Variants', 'Simplified', 'Audio')
    all_columns = set(name for frame in dfs for name in frame.columns)

    differing = set()
    for col in all_columns:
        if col in ignored_columns:
            continue

        # index label -> normalised values seen for this column, across frames
        seen = {}
        for frame in dfs:
            for record in frame.itertuples():
                text = getattr(record, col).strip()
                if col == 'POS':
                    parts = text.replace(',', '/').replace(' ', '').split('/')
                    text = ' '.join(sorted(parts))
                else:
                    text = postprocess_chars_for_diff(text)
                seen.setdefault(record.Index, []).append(text)

        for label, values in seen.items():
            distinct = set(values)
            if len(distinct) <= 1:
                continue
            if len(distinct) == 2:
                shorter, longer = sorted(distinct, key=len)
                # ccc deck has many trailing '
                if longer in (shorter + "'", shorter + "."):
                    continue
            if col == 'POS' and distinct == {'', 'Ph'}:
                continue
            differing.add(label)
            if verbose:
                print(label, col, values)

    return sorted(differing)

# Pairwise diff-count matrix. The trailing number on each line is the diff
# count across all sources listed so far (this row and every row above it).
header_cells = [''] + SOURCES
print(''.join('%7s ' % cell for cell in header_cells))
dd = []
for s1 in SOURCES:
    pair_counts = ['%7d ' % len(quick_diff([DFS[s1], DFS[s2]])) for s2 in SOURCES]
    print('%7s ' % s1 + ''.join(pair_counts), end='')
    dd.append(DFS[s1])
    print(' %d' % len(quick_diff(dd)))

# IDs that differ in at least one compared column across all sources.
delta_ids = quick_diff(list(DFS.values()))

# The first two characters of an ID are the book prefix (B1..B6).
diffs_per_book = pd.Series(delta_ids).str.slice(0,2).value_counts().sort_index()
print('\nDiffs by book:\n%s' % diffs_per_book)

         slides quizlet ankiweb    ddsg     ccc mquizlet 
 slides       0     162     184     226     181     562  0
quizlet     162       0     198     284     198     280  162
ankiweb     184     198       0     286      35     396  276
   ddsg     226     284     286       0     267     501  423
    ccc     181     198      35     267       0     399  428
mquizlet     562     280     396     501     399       0  858

Diffs by book:
B1     86
B2     70
B3    148
B4    228
B5    184
B6    142
Name: count, dtype: int64


In [3]:
# IDs that already have an entry in book_df (data/book.tsv) are treated as
# reviewed and dropped from the list.
#delta_ids_ex_review = list(sorted(set(delta_ids)))
delta_ids_ex_review = list(sorted(set(delta_ids) - set(book_df.index)))
print('%d -> %d to review' % (len(delta_ids), len(delta_ids_ex_review)))

if len(delta_ids_ex_review) > 0:
    # Explicit UTF-8: the rows contain hanzi and tone-marked pinyin, and the
    # default encoding of open() depends on the platform locale.
    with open('diffs.tsv', 'w', encoding='utf-8') as fout:
        OUT_COLS = ['ID', 'Traditional', 'Simplified', 'Pinyin', 'POS', 'Meaning', 'Tags']
        fout.write('\t'.join(OUT_COLS) + '\n')

        for term_id in delta_ids_ex_review:
            # Not every term is present in the slides source, so fall back to
            # blank defaults instead of raising KeyError.
            slides_row = slides_df.loc[term_id].to_dict() if term_id in slides_df.index else {}

            # Joined row text (without Tags) -> merged row, in first-seen order.
            # Sources that agree share one output line and are listed in its Tags.
            merged_rows = {}
            for src in SOURCES:
                if term_id not in DFS[src].index: continue
                row = DFS[src].loc[term_id].to_dict()

                # Start from the slides values, then overlay this source's values.
                merged = {c: '' for c in OUT_COLS}
                merged.update(slides_row)
                merged['ID'] = term_id
                merged['Tags'] = ''
                for col in row:
                    if col not in merged: continue
                    merged[col] = row[col]

                row_key = '\t'.join([merged[c] for c in OUT_COLS if c != 'Tags'])
                if row_key in merged_rows:
                    match = merged_rows[row_key]
                    match['Tags'] = (match['Tags'] + ' %s' % src).strip()
                else:
                    merged['Tags'] = (merged['Tags'] + ' %s' % src).strip()
                    merged_rows[row_key] = merged

            # Each candidate is written with a leading '#'; the TSV readers in
            # this notebook use comment='#', so a line only takes effect once
            # the reviewer removes the '#'.
            for merged in merged_rows.values():
                fout.write('#%s\n' % '\t'.join([merged[c] for c in OUT_COLS]))

    print('Created diffs.tsv')

858 -> 0 to review


## Merge

In [4]:
# Halt the run unless the list of diffs still awaiting review is empty.
assert not delta_ids_ex_review

In [5]:
# Term count by book
# Reference: http://mtc.ntnu.edu.tw/upload_files/resource/download/Contemporary-Chinese/181220.pdf
# 569 659 851 997 926(miscounted?) 957 total 4959

# Union of the IDs in the book file and in every source deck, sorted.
term_ids = sorted(set(book_df.index).union(*(frame.index for frame in DFS.values())))
print(len(term_ids))

# Per-book totals, keyed by the two-character book prefix of each ID.
pd.Series(term_ids).str.slice(0, 2).value_counts().sort_index()

4960


B1    569
B2    659
B3    851
B4    997
B5    927
B6    957
Name: count, dtype: int64

In [6]:
ankiweb_df = DFS['ankiweb']
# NOTE(review): this pinyin -> audio lookup is not referenced again in this cell.
ankiweb_pinyin_to_audio = ankiweb_df.set_index('Pinyin').Audio.to_dict()

slides_df = DFS['slides']

# Final corrections + variant expansions
errata_df = pd.read_csv('data/errata.tsv', sep='\t', comment='#').fillna('').set_index(['ID', 'Column'])
# Context manager closes the handle; explicit UTF-8 because open() otherwise
# decodes with the platform's locale encoding.
with open('data/variants.json', encoding='utf-8') as fin:
    variants_mp = json.load(fin)

rows = []
for term_id in term_ids:
    # Source priority: book entry, then slides, then mquizlet.
    row = None
    if term_id in book_df.index:
        row = book_df.loc[term_id].to_dict()
    elif term_id in slides_df.index:
        row = slides_df.loc[term_id].to_dict()
    else:
        row = DFS['mquizlet'].loc[term_id].to_dict()

    row = dict(row)
    row['ID'] = term_id
    for key in row:
        if row[key] != row[key]:  # only NaN (a missing cell) compares unequal to itself
            row[key] = ''

    # Combine the row's own tags with those from tags.tsv, after checking that
    # tags.tsv refers to a known Traditional spelling of this term.
    tags = row.get('Tags', '')
    if term_id in tags_df.index:
        assert tags_df.loc[term_id, 'Traditional'] in trad_variants_mp[term_id], (row, tags_df.loc[term_id, 'Traditional'], trad_variants_mp[term_id])
        tags += ' ' + tags_df.loc[term_id, 'Tags']
    tags = list(sorted(set(tags.split())))

    # A 'Ph' tag becomes the POS; it is only expected in books 1-4 and only
    # on rows that have no POS yet.
    if 'Ph' in tags:
        assert term_id[1] in '1234'
        assert row['POS'] == '', row
        row['POS'] = 'Ph'

    if 'Simplified' not in row or not row['Simplified']:
        row['Simplified'] = opencc_tw2s.convert(row['Traditional'])

    # Drop source-name tags and the tags matched by the pattern below from the output.
    tags = [t for t in tags if t not in DFS.keys()]
    tags = [t for t in tags if not re.match('^(Ph|Place|book|edit-.*|wikt|simp|sic|Flagged.*)$', t)]
    row['Tags'] = ' '.join(tags)

    # generate with audio.ipynb or set to '' to disable audio
    assert os.path.exists(f'data/media/dangdai-{term_id}.mp3')
    row['Audio'] = f'[sound:dangdai-{term_id}.mp3]'

    # Apply errata. Each correction records the value it expects to replace,
    # so a stale erratum fails loudly instead of being applied blindly.
    for col in row:
        key = row['ID'], col
        if key in errata_df.index:
            #print(key, row[col], errata_df.loc[key].to_dict())
            assert row[col] == errata_df.loc[key]['Original'], (col, row[col], errata_df.loc[key])
            row[col] = errata_df.loc[key]['Corrected']
            assert row[col] == row[col].strip()

    # Attach the variant list as a JSON string; variants.json must agree with
    # the merged headword and pinyin.
    row['Variants'] = ''
    if term_id in variants_mp:
        assert row['Traditional'] == variants_mp[term_id]['Traditional']
        assert row['Pinyin'] == variants_mp[term_id]['Pinyin']
        row['Variants'] = json.dumps(variants_mp[term_id]['Variants'], ensure_ascii=False)

    # Final sanity checks: POS may be empty only for rows tagged 'Name'.
    assert row['Traditional']
    assert row['Simplified']
    assert row['Pinyin']
    assert row['POS'] or 'Name' in tags
    assert row['Meaning']

    rows.append(row)

df = pd.DataFrame(rows)
df = df[['ID', 'Traditional', 'Simplified', 'Pinyin', 'POS', 'Meaning', 'Audio', 'Variants', 'Tags']].set_index('ID').fillna('').copy()
df.to_csv('dangdai.csv', index=True)

dangdai_df = df
print(len(dangdai_df))

4960


## Expand variants

In [7]:
# One output row per variant. A row whose Variants cell is empty yields a
# single copy of itself (an empty override dict).
dangdai_records = pd.read_csv('dangdai.csv', dtype='str').fillna('').to_dict(orient='records')

expanded_rows = []
for row in dangdai_records:
    overrides_list = json.loads(row['Variants']) if row['Variants'] else json.loads('[{}]')
    for var_dict in overrides_list:
        # Variant fields override the base row; the Variants column itself is dropped.
        var = {**row, **var_dict}
        del var['Variants']
        assert '/' not in var['Simplified']
        expanded_rows.append(var)

expanded_df = pd.DataFrame(expanded_rows)
expanded_df.to_csv('dangdai-expanded.csv', index=False)
print('dangdai-expanded.csv: %d rows\n' % len(expanded_df))

dangdai-expanded.csv: 4989 rows



## Export in pleco format

In [8]:
EAC1_TAG = '\uEAC1\uEC00\uEC00\uECCC\uEC99'  # lesson tag color, #00cc99 green

df = pd.read_csv('dangdai.csv', dtype='str').set_index('ID').fillna('').copy()

# Clean up a few variants that don't display nicely, otherwise pleco can cope with variants here fine
df.loc['B1L01-I-18', 'Traditional'] = '臺/台灣'  # / separates char variants in pleco
df.loc['B2L02-I-18', 'Pinyin'] = 'Táiběi Yīlíngyī'
df.loc['B3L08-II-46', 'Traditional'] = '高級管理人員工商管理碩士'
df.loc['B3L08-II-46', 'Simplified'] = '高级管理人员工商管理硕士'
df.loc['B3L08-II-46', 'Pinyin'] = 'gāojí guǎnlǐ rényuán gōngshāng guǎnlǐ shuòshì'

with open('dangdai-pleco.txt', 'w') as fout:
    last_header = ''
    for row in df.itertuples():
        m = re.match('^B([1-6])L([0-9]{2})-(I+)-.*', row.Index)
        header = f'//當代中文/Book {m[1]}/L{m[2]}-{m[3]}'
        if header != last_header:
            fout.write(header + '\n')
            last_header = header

        defn = f'({row.POS}) {row.Meaning}' if row.POS else row.Meaning
        defn += f' {EAC1_TAG}[D{m[1]}L{int(m[2])}]\uEAC2'  # [DnLn] lesson tag in color
        fout.write(f'{row.Simplified}[{row.Traditional}]\t{row.Pinyin}\t{defn}\n')

!ls -l dangdai-pleco.txt

-rw-r--r-- 1 jovyan users 384748 Nov  8 18:09 dangdai-pleco.txt


## Quality

In [9]:
# Reload the merged deck from disk so the checks below run against exactly
# what was written: every column as str, missing cells as ''.
df = (
    pd.read_csv('dangdai.csv', dtype='str')
    .set_index('ID')
    .fillna('')
)

In [10]:
# Optional check, run only when a local Unihan export is available.
if os.path.exists('../unihan/unihan.csv'):
    unihan_df = pd.read_csv('../unihan/unihan.csv', dtype='str').fillna('').set_index('char')
    simplified_of = unihan_df.kSimplifiedVariant
    traditional_of = unihan_df.kTraditionalVariant

    # Allowed shapes of a headword and of an expanded variant headword.
    headword_re = re.compile(r'^([\u4e00-\u9fff（）＝！、，．…1012/]|-$| [+] |X | Y| X |KTV|BBC|EMBA|SSL|LED|number|noun)+$')
    variant_re = re.compile(r'^([\u4e00-\u9fff…！、，．1012]|-$|KTV|BBC| X )+$')
    # Characters that are not looked up in Unihan.
    skipped_chars = '（）＝，、．！…-/KTVBBCEMBAXYSSLLED +number noun 101 2'

    for row in df.itertuples():
        assert headword_re.match(row.Traditional), row.Traditional

        for c in row.Traditional:
            if c not in skipped_chars:
                sc = simplified_of[c]
                tc = traditional_of[c]
                # A character with a kTraditionalVariant entry must also have
                # a kSimplifiedVariant entry.
                assert tc == '' or (sc != '' and tc != ''), (row.Traditional, c, traditional_of[c])

        overrides_list = json.loads(row.Variants) if row.Variants else json.loads('[{}]')
        for var_dict in overrides_list:
            var = {**row._asdict(), **var_dict}
            assert variant_re.match(var['Traditional']), var

In [11]:
# Readings check against possible syllable readings in cedict
if os.path.exists('../cedict/syllables.csv'):
    # hanzi -> set of lower-cased pinyin readings; two extra readings for
    # 一 are seeded before the cedict rows are added.
    readings_mp = {'一':{'yì','yí'}}
    syllables_df = pd.read_csv('../cedict/syllables.csv', dtype='str').fillna('')
    for hanzi, pinyin in zip(syllables_df.Traditional, syllables_df.Pinyin):
        readings_mp.setdefault(hanzi, set()).add(pinyin.lower())

    def gen_readings(trad):
        """Yield every candidate pinyin string for `trad`.

        Characters without an entry in readings_mp are skipped. For each
        known character, every reading is combined with every reading of the
        remainder twice: joined directly (with an apostrophe when the
        remainder starts with an a/e/o vowel) and joined with a space.
        """
        if trad == '':
            yield ''
            return

        head, rest = trad[0], trad[1:]
        if head not in readings_mp:
            yield from gen_readings(rest)
            return

        for reading in readings_mp[head]:
            syllable = reading.lower()
            for tail in gen_readings(rest):
                apostrophe = "'" if tail and tail[0] in 'aāáǎàeēéěèoōóǒò' else ''
                yield syllable + apostrophe + tail
                yield syllable + ' ' + tail

    # Print every (variant of a) term whose pinyin is not among the candidates.
    for row in df.to_dict(orient='records'):
        overrides_list = json.loads(row['Variants']) if row['Variants'] else json.loads('[{}]')
        for var_dict in overrides_list:
            var = {**row, **var_dict}
            readings = list(gen_readings(var['Traditional']))
            # Strip punctuation and anything after a '/' before comparing.
            normalised = re.sub("([-,.?!]|/.*)", '', var['Pinyin']).lower()
            if normalised not in readings:
                print(list(var.values())[:4], 'vs', readings[:10])

['KTV', 'KTV', 'KTV', 'N'] vs ['']
['台北101', '台北101', 'Táiběi 101', ''] vs ['tāiběiyīlíngyī', 'tāi běiyīlíngyī', 'tāiběi yīlíngyī', 'tāi běi yīlíngyī', 'tāiběiyī língyī', 'tāi běiyī língyī', 'tāiběi yī língyī', 'tāi běi yī língyī', 'tāiběiyīlíng yī', 'tāi běiyīlíng yī']
['古蹟', '古迹', 'gǔjī', 'N'] vs ['gǔjì', 'gǔ jì', 'gǔjì ', 'gǔ jì ']
['阿嬤', '阿嬷', 'āmà', 'N'] vs ['āmo', 'ā mo', 'āmo ', 'ā mo ', 'āmó', 'ā mó', 'āmó ', 'ā mó ', 'āmā', 'ā mā']
['照 X 光', '照 X 光', 'zhào X guāng', 'Ph'] vs ['zhàoguāng', 'zhào guāng', 'zhàoguāng ', 'zhào guāng ']
['就…而言', '就…而言', 'jiù...éryán', 'Ph'] vs ["jiù'éryán", 'jiù éryán', "jiù'ér yán", 'jiù ér yán', "jiù'éryán ", 'jiù éryán ', "jiù'ér yán ", 'jiù ér yán ', "jiu'éryán", 'jiu éryán']
['阿喀郎．汗', '阿喀郎．汗', 'Ākèláng Hàn', ''] vs ['ākālànghàn', 'ā kālànghàn', 'ākā lànghàn', 'ā kā lànghàn', 'ākālàng hàn', 'ā kālàng hàn', 'ākā làng hàn', 'ā kā làng hàn', 'ākālànghàn ', 'ā kālànghàn ']
['BBC', 'BBC', 'BBC', ''] vs ['bībī', 'bī bī', 'bībī ', 'bī bī ']


In [12]:
# The merged deck must contain exactly the collected term IDs, each once.
assert sorted(set(df.index)) == term_ids
assert len(df) == len(set(df.index))

# Diff combined to sources
for src in SOURCES:
    n_diffs = len(quick_diff([df, DFS[src]]))
    print('%s %d' % (src, n_diffs), end='   ')

#d = df.reset_index()[['ID', 'Traditional', 'Pinyin', 'POS', 'Meaning']].sort_values('ID').copy()
#d.loc[d.ID.str.match('B[1-4].*') & d.POS == 'Ph', 'POS'] = ''
#d.to_csv('z-dangdai.tsv', sep='\t', index=False)

slides 247   quizlet 147   ankiweb 112   ddsg 324   ccc 131   mquizlet 470   

In [13]:
# Check traditional/simplified match with opencc.
# Most are just variants, or sometimes more common PRC terms going a step more than char-by-char replacement
slides_df = DFS['slides']
for term_id in term_ids:
    trad = df.loc[term_id, 'Traditional']
    simp = df.loc[term_id, 'Simplified']
    simp_cc = opencc_tw2s.convert(trad)
    if simp == simp_cc:
        continue

    # Show the slides' Simplified form alongside, when the term is in the slides.
    simp_sl = ''
    if term_id in slides_df.index:
        simp_sl = slides_df.loc[term_id, 'Simplified']
    print('%-15s %-23s\tsimp %-18s\topencc %-18s\tslides %s' % (term_id, trad, simp, simp_cc, simp_sl))

# TODO: add SimplifiedAlt column for non-direct translations, 妳, etc.

B1L01-I-18      臺灣（＝台灣）                	simp 台湾                	opencc 台湾（＝台湾）           	slides 台湾（＝台灣）
B3L05-II-26     想像                     	simp 想象                	opencc 想像                	slides 想象
B4L04-II-37     邱吉爾                    	simp 丘吉尔               	opencc 邱吉尔               	slides 丘吉尔
B4L12-II-23     骯髒                     	simp 骯脏                	opencc 肮脏                	slides 骯脏
B5L04-II-33     巴塞隆納                   	simp 巴塞隆那              	opencc 巴塞隆纳              	slides 巴塞隆那
B5L08-I-44      索馬利亞                   	simp 索马利亞              	opencc 索马利亚              	slides 索马利亞
B5L10-I-18      藉由                     	simp 藉由                	opencc 借由                	slides 藉由
B6L02-II-32     發光二極體（LED）             	simp 发光二极管（LED）        	opencc 发光二极体（LED）        	slides 发光二极管（LED）
B6L03-I-T-05    瑪莎．葛蘭姆                 	simp 马莎．葛兰姆            	opencc 玛莎．葛兰姆            	slides 马莎．葛兰姆
B6L03-I-T-10    溫蒂．威倫                  	simp 温迪．威伦             	opencc 温蒂．威伦  

In [14]:
# Hanzi that are accepted inside the (otherwise Latin) Meaning column.
MEANING_HZS = '三之了二人以仿作來保倒光兒入全冒出制力化占史名品員啃啊四因國坦基大夫女姓婦子存學定室家寶少師年廢廳彈從心必怖性恐情愛感房改數族日時會板業水河泰清演潔炸熊燈爺片物獄率班生產白的盆監科第節籍糧紅級群習老者聚自至補製說貓身農迷造部都醉量銀長間闖限除隊需題風食髮麻黑'
TONES = 'āáǎàēéěèīíǐìōóǒòūúǔùüǘǚǜĀÁǍÀĒÉĚÈĪÍǏÌŌÓǑÒŪÚǓÙÜǗǙǛ'

meaning_char_re = re.compile('[- a-zA-Z(),.:;=&+%?~!\'"/0-9－]')
pinyin_char_re = re.compile("[- a-z/'()01.,=]")
meaning_extra_chars = frozenset(TONES + MEANING_HZS)
pinyin_extra_chars = frozenset(TONES)

for row in df.itertuples():
    # No field may carry leading or trailing whitespace.
    for text in (row.Traditional, row.Simplified, row.Meaning):
        assert text == text.strip()

    # Report (without failing) any Meaning character outside the whitelist.
    for c in row.Meaning:
        if not (meaning_char_re.match(c.lower()) or c in meaning_extra_chars):
            print('[%c] %s %s' % (c, row.Index, row.Meaning))

    assert row.Pinyin == row.Pinyin.strip()
    # Same for Pinyin, with a narrower whitelist.
    for c in row.Pinyin:
        if not (pinyin_char_re.match(c.lower()) or c in pinyin_extra_chars):
            print('[%c] %s %s' % (c, row.Index, row.Pinyin))

    # POS may be empty only when the Tags text contains 'Name'.
    assert row.POS != '' or 'Name' in row.Tags
    assert row.POS == row.POS.strip()

In [15]:
# Distribution of part-of-speech labels, to eyeball for stray values.
df['POS'].value_counts()

POS
N              1793
V               614
Ph              505
Vs              431
Adv             291
                230
Vi              161
Vst             140
V-sep           101
Vp               99
Id               98
M                76
Vs-attr          62
Ptc              56
Conj             56
Vpt              50
V/N              35
Prep             35
Vaux             30
N/V              25
Vp-sep           23
Det              21
Vs-sep            9
Vs-pred           4
Vs/N              4
Vi/N              3
N/Vi              3
Vi/V              1
Adv/N             1
N/Vst             1
Vst/N             1
Vs-attr/Adv       1
Name: count, dtype: int64

In [16]:
# Distribution of the tag strings that survived the merge filter.
df['Tags'].value_counts()

Tags
                  4708
Name               223
Character Name      29
Name: count, dtype: int64

In [17]:
# Rows without a Variants entry must not contain '/' or parentheses
# (ASCII or full-width) in their headword or pinyin.
rows_without_variants = df[df.Variants.fillna('') == '']
for column in ('Traditional', 'Pinyin'):
    assert all(rows_without_variants[column].str.match('^[^/()（）]+$'))

# Generally don't want spaces in hanzi but book already has some ' + number' etc which need them so a few exceptions
#df[df.Traditional.str.contains('[ ]')]

## Generate anki package

In [18]:
%%bash -e
# Download the font once, then verify its checksum on every run.
if ! [[ -f data/media/_MoeStandardKai.ttf ]]; then
  # --fail: exit non-zero on an HTTP error instead of saving the server's
  # error page as the font (the download is skipped once the file exists, so
  # a bad file would otherwise stick). --location: follow redirects.
  curl --fail --location -o data/media/_MoeStandardKai.ttf https://www.moedict.tw/fonts/truetype/moe/MoeStandardKai.ttf
  # alternatively get from debian/ubuntu repos https://packages.ubuntu.com/source/mantic/moe-standard-fonts
fi
echo '6744c2ffd6c011f3e6ceb93f15f9b324699be41c7e47be86eb40593cf1ee8078  data/media/_MoeStandardKai.ttf' | sha256sum -c

data/media/_MoeStandardKai.ttf: OK


In [19]:
df = pd.read_csv('dangdai.csv', dtype='str').set_index('ID').fillna('')

cols = ['ID', 'Traditional', 'Simplified', 'Pinyin', 'POS', 'Meaning', 'Audio', 'Variants']

model = genanki.Model(
    1696395923,
    'Dangdai',
    fields=[{'name': c} for c in cols],
    templates=[{
        'name': 'Dangdai',
        'qfmt': open('dangdai-qfmt.html').read(),
        'afmt': open('dangdai-afmt.html').read(),
    }],
    css=open('dangdai.css').read(),
)

deck = genanki.Deck(
    1696395265,
    name='dangdai',
    description='A Course in Contemporary Chinese (當代中文課程 B1-B6, Traditional)'
)

for row in df.reset_index().to_dict(orient='records'):
    note = genanki.Note(
        model=model,
        fields=[row[c] for c in cols],
        tags=row['Tags'].split(),
        guid=genanki.guid_for('dangdai', row['ID'])
    )
    deck.add_note(note)

!rm -f dangdai.apkg
package = genanki.Package(deck, media_files=glob.glob('data/media/*'))
package.write_to_file('dangdai.apkg')
!ls -l dangdai.apkg

-rw-r--r-- 1 jovyan users 136471276 Nov  8 18:10 dangdai.apkg


## Cleanup

In [20]:
!rm -rf downloads/slides/unpacked/
!rm -rf downloads/audio/unpacked/