# Parse vocabulary slides

* Downloads and parses the vocabulary slides from the official website: http://mtc.ntnu.edu.tw/chinese-resource.htm
* This is the main source of data.
* Both the traditional and the simplified vocabulary slides are parsed.
* Applies corrections for errors in the slides to produce usable, reasonably clean and mergeable data with consistent typography.
* Output: `data/slides.tsv`

In [1]:
%%bash -e
# Prepare the downloads/ directory: reuse ../downloads/dangdai through a
# symlink when that directory exists, otherwise create an empty local one.
if ! [[ -d downloads/ ]]; then
  if [[ -d ../downloads/dangdai ]]; then ln -s ../downloads/dangdai downloads; else mkdir -p downloads; fi
fi
mkdir -p downloads/slides
cd downloads/slides
# Fetch all twelve archives (B1-B6, traditional + simplified) only when the
# last one in the list is absent. The "simplifiedl" spelling is the same as in
# the B6 download URL below.
# NOTE(review): presumably wget stores the %20-encoded B6 URLs under the
# space-containing name tested here - confirm.
if ! [[ -f 'DangDai B6  e.g. of Grammar & vocabulary  (simplifiedl).zip' ]]; then
    # -nc (no-clobber): do not re-download a file that already exists locally.
    wget -nc 'https://mtc.ntnu.edu.tw/upload_files/resource/download/Contemporary-Chinese/DangDai_B1_e.g._of_Grammar__vocabulary_traditional.zip'
    wget -nc 'https://mtc.ntnu.edu.tw/upload_files/resource/download/Contemporary-Chinese/DangDai_B1_e.g._of_Grammar__vocabulary_simplified.zip'
    wget -nc 'https://mtc.ntnu.edu.tw/upload_files/resource/download/Contemporary-Chinese/DangDai_B2_e.g._of_Grammar__vocabulary_traditional.zip'
    wget -nc 'https://mtc.ntnu.edu.tw/upload_files/resource/download/Contemporary-Chinese/DangDai_B2_e.g._of_Grammar__vocabulary_simplified.zip'
    wget -nc 'https://mtc.ntnu.edu.tw/upload_files/resource/download/Contemporary-Chinese/DangDai_B3_e.g._of_Grammar__vocabulary_traditional.zip'
    wget -nc 'https://mtc.ntnu.edu.tw/upload_files/resource/download/Contemporary-Chinese/DangDai_B3_e.g._of_Grammar__vocabulary_simplified.zip'
    wget -nc 'https://mtc.ntnu.edu.tw/upload_files/resource/download/Contemporary-Chinese/DangDai_B4_e.g._of_Grammar__vocabulary_traditional.zip'
    wget -nc 'https://mtc.ntnu.edu.tw/upload_files/resource/download/Contemporary-Chinese/DangDai_B4_e.g._of_Grammar__vocabulary_simplified.zip'
    wget -nc 'https://mtc.ntnu.edu.tw/upload_files/resource/download/Contemporary-Chinese/DangDai_B5_e.g._of_Grammar__vocabulary_traditional.zip'
    wget -nc 'https://mtc.ntnu.edu.tw/upload_files/resource/download/Contemporary-Chinese/DangDai_B5_e.g._of_Grammar__vocabulary_simplified.zip'
    wget -nc 'https://mtc.ntnu.edu.tw/upload_files/resource/download/Contemporary-Chinese/DangDai%20B6%20%20e.g.%20of%20Grammar%20&%20vocabulary%20%20(traditional).zip'
    wget -nc 'https://mtc.ntnu.edu.tw/upload_files/resource/download/Contemporary-Chinese/DangDai%20B6%20%20e.g.%20of%20Grammar%20&%20vocabulary%20%20(simplifiedl).zip'
fi
# Print the checksums of the archives so the cell output records which
# versions were used.
sha256sum *.zip

# Unpack everything into unpacked/. Both extractors are told not to overwrite
# files that are already there (rar: -o-, 7z: -aos), so re-running is cheap.
mkdir -p unpacked
cd unpacked
# B1-B5 archives (underscore names) are extracted with rar.
for f in ../DangDai_B*.zip; do rar -o- x "$f" >/dev/null; done  # actually .rar
# B6 archives (names with spaces) are extracted with 7z.
for f in ../DangDai*B6*.zip; do 7z -aos x "$f" >/dev/null; done

8ab294ed7f9db3ae2f81bf8d70877993af3d65a639a1c066641a431f2c25baa7  DangDai_B1_e.g._of_Grammar__vocabulary_simplified.zip
e5e8f41776350ff8e7adb089a9bad6a8ccf3530d0adbd884fba23b6f91395937  DangDai_B1_e.g._of_Grammar__vocabulary_traditional.zip
fa1ea0289ce72b764aded52b6dd94b49b5fa5397ff4b8bd7346930b40c09539b  DangDai_B2_e.g._of_Grammar__vocabulary_simplified.zip
551bbeb7185bdceb47643de10673558de21c70faf1af9dd7b05a022b294d2b75  DangDai_B2_e.g._of_Grammar__vocabulary_traditional.zip
343bed50e89a22bcf2b82dfdd353a4e5d66109b0b9855a1db1fcb5ed17349449  DangDai_B3_e.g._of_Grammar__vocabulary_simplified.zip
80af9536c24e367a0116933d0d1ffb64386ff9acd0d121efbc434c7ffa44d873  DangDai_B3_e.g._of_Grammar__vocabulary_traditional.zip
722951aca10fb88ee37f462332e0e1a9f0c5861fcfaa649dd9e868ca1447b09a  DangDai_B4_e.g._of_Grammar__vocabulary_simplified.zip
81e7cb7e2bea95a05eec48d47ca5bf65af738af0ff083691107e68f16d5479e2  DangDai_B4_e.g._of_Grammar__vocabulary_traditional.zip
0cac9f6f0ef4b7e082c173e156ff13cbc239

In [2]:
# Install the third-party packages: python-pptx and opencc are imported in the
# next cell; genanki is not referenced in the cells visible here.
!pip install -q python-pptx opencc genanki

In [3]:
import glob
import os, os.path
import re
import pandas as pd
from pptx import Presentation
from opencc import OpenCC

# OpenCC converter built from the 'tw2s' configuration. It is used further down
# to check that each simplified meaning equals the converted traditional one.
opencc_tw2s = OpenCC('tw2s')

def pptx_glob(cset):
    """Return the vocabulary .pptx paths for one character set.

    cset must be 'trad' or 'simp'. Files are looked up below
    downloads/slides/unpacked/ and ordered by file name only, so the
    directory a file lives in has no influence on its position.
    Asserts that exactly 15+15+12+12+10+10 files were found.
    """
    assert cset in ['trad', 'simp']
    pattern = f'downloads/slides/unpacked/*{cset}*/*Vocab*/B?-L??.pptx'
    found = glob.glob(pattern)
    found.sort(key=os.path.basename)
    assert len(found) == 15+15+12+12+10+10
    return found

def extract_para(filepath):  # => list of lists of text paragraphs for each slide
    """Collect the non-empty text paragraphs of every slide in a .pptx file.

    Returns one list of strings per slide, in slide order. A paragraph's text
    is the concatenation of its runs with tabs turned into spaces and the
    ends stripped; paragraphs that end up empty are dropped.

    Within a slide the paragraphs are ordered by
    (text frame margin_top, text frame margin_left, extraction order).
    NOTE(review): the key uses the text frame's inner margins rather than the
    shape's position on the slide - confirm this is intended.
    """
    slides_text = []
    seq = 0  # running counter over the whole file; keeps extraction order on ties
    for slide in Presentation(filepath).slides:
        entries = []
        text_shapes = (sh for sh in slide.shapes if sh.has_text_frame)
        for shape in text_shapes:
            for paragraph in shape.text_frame.paragraphs:
                joined = ''.join(run.text for run in paragraph.runs)
                cleaned = joined.replace('\t', ' ').strip()
                if not cleaned:
                    continue
                frame = shape.text_frame
                entries.append((frame.margin_top, frame.margin_left, seq, cleaned))
                seq += 1
        # seq is unique, so the text itself never takes part in the comparison
        slides_text.append([entry[-1] for entry in sorted(entries)])
    return slides_text

def preprocess_para(para):
    """Split entries in which the hanzi and the text after it were fused.

    Two cases are split into [head, tail]:
    * the two literal strings for fengqi (traditional / simplified), cut
      after 2 characters;
    * a string that starts with four characters from the CJK range (or an
      ellipsis), is directly followed by an ASCII letter, and later contains
      a space or ellipsis followed by an ASCII letter, cut after 4 characters.
    All other entries are passed through unchanged. Returns a new list.
    """
    fused_two = ('風氣fēngqì', '风气fēngqì')
    fused_four = r'^[\u4e00-\u9fff…一]{4}[a-zA-Z].*[ …][a-zA-Z].*'
    out = []
    for entry in para:
        if entry in fused_two:
            cut = 2
        elif re.match(fused_four, entry):
            cut = 4
        else:
            out.append(entry)
            continue
        out += [entry[:cut], entry[cut:]]
    return out

def extract_hanzi(term_id, para):  # -> hanzi, rest of para
    """Pick the hanzi entry out of a slide's paragraphs.

    term_id: normalized term id; a few ids are special-cased below.
    para: list of paragraph strings of one slide.

    Returns (hanzi, remaining) where hanzi has been passed through
    postprocess_chars(..., True) and remaining is para without the entry
    that was picked.

    Raises AssertionError unless exactly one entry is classified as Chinese
    (special-cased ids excepted), and ValueError from list.index when a
    special-cased literal ('KTV') is not present in para.
    """
    # Entries that contain Latin letters but must still count as the hanzi field.
    HANZI_EXC = set([
        '照 X 光',
        'X 分之 Y',
        'EMBA（高级管理人员 工商管理硕士）',
        'EMBA（高級管理人員 工商管理碩士）',
        '固态光源（SSL）',
        '固態光源（SSL）',
        '發光二極體（LED）',
        '发光二极管（LED）',
    ])

    def is_chinese(text):
        # Whitelisted strings are accepted as-is.
        if text in HANZI_EXC: return True
        # Drop digits, spaces, brackets and a few punctuation marks before the check.
        # NOTE(review): the empty alternative `||` always matches, so the
        # alternatives after it ('＋ number', '＋ noun', 'number ＋', '-$') look
        # unreachable - confirm whether a character was lost from this pattern.
        text = re.sub('([\t0-9 （）(  ) …、/‧]||＋ number|＋ noun|number ＋|-$)', '', text)
        # Accepts any code point >= U+4E00, which also covers full-width forms
        # such as '＋', not only CJK ideographs.
        return len(text) >= 1 and all(ord(c) >= 0x4E00 for c in text)

    hanzi = None
    # Terms whose "hanzi" is a Latin abbreviation and would fail is_chinese().
    if term_id == 'B1L07-I-02':
        hanzi = 'KTV'
    elif term_id == 'B6L06-I-T-02' and 'BBC' in para:
        hanzi = 'BBC'
    else:
        cn_words = [w for w in para if is_chinese(w)]
        assert len(cn_words) == 1, (term_id, para, cn_words)
        hanzi = cn_words[0]

    # Remove the first occurrence of the hanzi entry from the paragraph list.
    # list.index raises ValueError when the entry is missing, so the assert
    # below can never fail on its own.
    i = para.index(hanzi)
    assert i >= 0
    para = para[:i] + para[(i+1):]

    hanzi = postprocess_chars(hanzi, True)

    if term_id == 'B6L03-I-03':  # incorrect hanzi in slide
        hanzi = '舞者'

    return hanzi, para

def postprocess_chars(text, hanzi):
    """Normalize typography of an extracted field.

    text: the string to clean.
    hanzi: truthy when text is the hanzi field; full-width punctuation is then
        kept. For every other field it is replaced by ASCII equivalents.

    Always applied: a full-width plus with a space on either side becomes
    ' + ', curly quotes become straight quotes, Latin ligatures are expanded,
    the full-width slash becomes '/', whitespace runs collapse to one space and
    the ends are stripped.
    """
    for spaced_plus in ('＋ ', ' ＋'):
        text = text.replace(spaced_plus, ' + ')

    substitutions = {
        '‘': "'",
        '’': "'",
        '“': '"',
        '”': '"',
        'ﬃ': 'ffi',
        'ﬄ': 'ffl',
        'ﬁ': 'fi',
        'ﬂ': 'fl',
        'ﬀ': 'ff',
        '／': '/',
    }
    # generally don't want double width characters in meaning field - could lead to font problems
    if not hanzi:
        substitutions['…'] = '...'
        substitutions['％'] = '%'
        substitutions['～'] = '~'
        substitutions['＂'] = '"'
        substitutions['＝'] = '='
        substitutions['＋'] = '+'

    replaced = text.translate(str.maketrans(substitutions))
    return re.sub(r'\s+', ' ', replaced).strip()

def parse_para(para, book_lesson, simplified):
    """Turn the paragraphs of one vocabulary slide into a term record.

    para: list of paragraph strings of one slide (as returned per slide by
        extract_para).
    book_lesson: prefix for the term id, e.g. 'B1L07'.
    simplified: not read anywhere in this function body.

    Returns a dict with the keys 'id', 'hanzi', 'pinyin' and 'meaning'.

    Raises AssertionError when the slide does not contain exactly one
    paragraph that looks like a term id, or when extract_hanzi cannot find
    exactly one hanzi entry.
    """
    para = preprocess_para(para)

    # The term id is the single paragraph starting with 'I-' or 'II-' and
    # containing a digit further on.
    term_id = [s for s in para if re.match('^(I|II)-.*[0-9]+', s)]
    assert len(term_id) == 1, para
    term_id = term_id[0]
    para = [s for s in para if s != term_id]
    term_id = term_id.replace(' ', '')
    # mangle term id to ascii and make it ascii sortable
    term_id = term_id.replace('引-', '0-') # yinyan foreword (B5, before main terms)
    term_id = term_id.replace('摘-', '0-') # zhaiyao summary (B6, before main terms)
    term_id = term_id.replace('專-', 'T-') # zhuanyoumingci technical terms (B6, after main terms)
    term_id = term_id.replace('专-', 'T-')
    term_id = book_lesson + '-' + term_id
    # Zero-pad a single-digit trailing index ('-7' -> '-07') so that ids sort
    # correctly as strings.
    for n in range(10):
        if term_id.endswith('-%d' % n):
            term_id = term_id[:-2] + '-0%d' % n
            break

    # Corrections for ids that are wrong in the slides, keyed by the id as it
    # appears there (after the normalization above).
    term_id_mp = {
        # misindexing in slides
        'B1L14-I-19': 'B1L14-II-01',
        'B1L14-I-20': 'B1L14-II-02',
        'B1L14-I-21': 'B1L14-II-03',
        'B1L14-I-22': 'B1L14-II-04',
        'B1L14-I-23': 'B1L14-II-05',
        'B1L14-II-01': 'B1L14-II-06',
        'B1L14-II-02': 'B1L14-II-07',
        'B1L14-II-03': 'B1L14-II-08',
        'B1L14-II-04': 'B1L14-II-09',
        'B1L14-II-05': 'B1L14-II-10',
        'B1L14-II-06': 'B1L14-II-11',
        'B1L14-II-07': 'B1L14-II-12',
        'B1L14-II-08': 'B1L14-II-13',
        'B1L14-II-09': 'B1L14-II-14',
        'B1L14-II-10': 'B1L14-II-15',
        'B1L14-II-11': 'B1L14-II-16',
        'B1L14-II-12': 'B1L14-II-17',
        # skipped index in slides
        'B2L08-I-19': 'B2L08-I-18',
        'B2L08-I-20': 'B2L08-I-19',
        'B2L08-I-21': 'B2L08-I-20',
        'B2L08-I-22': 'B2L08-I-21',
        # B6L01-II-22 missing in slides, shifted indexes
        'B6L01-II-23': 'B6L01-II-24',
        # extra dup word at B6L05-I-30
        'B6L05-I-31': 'B6L05-I-30',
        'B6L05-I-32': 'B6L05-I-31',
        # misindexing, missing #45 maidan
        'B6L08-I-45': 'B6L08-I-T-01',
        # B6L10 misindexing in slides
        'B6L10-I-0-05': 'B6L10-I-01',
        'B6L10-I-0-06': 'B6L10-I-02',
        'B6L10-I-0-07': 'B6L10-I-03',
        'B6L10-I-0-08': 'B6L10-I-04',
        'B6L10-I-0-09': 'B6L10-I-05',
        'B6L10-I-35': 'B6L10-I-T-01',
        'B6L10-I-36': 'B6L10-I-T-02',
        'B6L10-I-37': 'B6L10-I-T-03',
        'B6L10-I-38': 'B6L10-I-T-04',
        'B6L10-I-39': 'B6L10-II-0-01',
        'B6L10-I-40': 'B6L10-II-0-02',
        'B6L10-I-41': 'B6L10-II-0-03',
        'B6L10-I-42': 'B6L10-II-0-04',
        'B6L10-I-43': 'B6L10-II-0-05',
        'B6L10-I-44': 'B6L10-II-01',
        'B6L10-I-45': 'B6L10-II-02',
        'B6L10-I-46': 'B6L10-II-03',
        'B6L10-I-47': 'B6L10-II-04',
        'B6L10-I-48': 'B6L10-II-05',
        'B6L10-I-49': 'B6L10-II-06',
        'B6L10-I-T-01': 'B6L10-II-07',
        'B6L10-I-T-02': 'B6L10-II-08',
        'B6L10-II-T-01': 'B6L10-II-54',
        'B6L10-II-T-02': 'B6L10-II-55',
    }
    # Bulk shifts for B6L10: I-01..I-20 move up by 5, I-21..I-34 by 4 and
    # II-01..II-45 by 8. The lookup below is applied once, so these do not chain.
    for i in range(1, 21):
        term_id_mp['B6L10-I-%.2d' % i] = 'B6L10-I-%.2d' % (5+i)
    for i in range(21, 35):
        term_id_mp['B6L10-I-%.2d' % i] = 'B6L10-I-%.2d' % (4+i)
    for i in range(1, 46):
        term_id_mp['B6L10-II-%.2d' % i] = 'B6L10-II-%.2d' % (8+i)
    term_id = term_id_mp.get(term_id, term_id)

    # Corrections that need the word as well, because the same id occurs on
    # several slides. The lookup uses para[0].
    # NOTE(review): presumably para[0] is the hanzi entry at this point - confirm.
    # Some keys occur twice because the traditional and simplified spellings
    # are identical; the duplicates map to the same value.
    term_id = {
        ('B5L07-II-39', '不勞而獲'): 'B5L07-II-38',
        ('B5L07-II-39', '不劳而获'): 'B5L07-II-38',
        ('B5L07-II-41', '拉'): 'B5L07-II-40',
        ('B6L10-II-T-03', '連接'): 'B6L10-II-56',
        ('B6L10-II-T-03', '動手'): 'B6L10-II-57',
        ('B6L10-II-T-03', '人生苦短'): 'B6L10-II-58',
        ('B6L10-II-T-03', '攻無不克'): 'B6L10-II-59',
        ('B6L10-II-T-03', '事無不成'): 'B6L10-II-60',
        ('B6L10-II-T-03', '失讀症'): 'B6L10-II-T-01',
        ('B6L10-II-T-03', '李光耀'): 'B6L10-II-T-02',
        ('B6L10-II-T-03', '妥瑞氏症'): 'B6L10-II-T-03',
        ('B6L10-II-T-03', '鏡像神經元'): 'B6L10-II-T-04',
        ('B6L10-II-T-03', '连接'): 'B6L10-II-56',
        ('B6L10-II-T-03', '动手'): 'B6L10-II-57',
        ('B6L10-II-T-03', '人生苦短'): 'B6L10-II-58',
        ('B6L10-II-T-03', '攻无不克'): 'B6L10-II-59',
        ('B6L10-II-T-03', '事无不成'): 'B6L10-II-60',
        ('B6L10-II-T-03', '失读症'): 'B6L10-II-T-01',
        ('B6L10-II-T-03', '李光耀'): 'B6L10-II-T-02',
        ('B6L10-II-T-03', '妥瑞氏症'): 'B6L10-II-T-03',
        ('B6L10-II-T-03', '镜像神经元'): 'B6L10-II-T-04',
    }.get((term_id, para[0]), term_id)

    # Fix some suboptimal text extraction cases
    # (the whole paragraph list is replaced by a hand-written one)
    para = {
        ('B2L01-I-18', '師大'): ['師大（師範大學）', 'Shīdà (Shīfàn Dàxué)', 'NTNU (National Taiwan Normal University)'],
        ('B2L01-I-18', '师大'): ['师大（师范大学）', 'Shīdà (Shīfàn Dàxué)', 'NTNU (National Taiwan Normal University)'],
        ('B2L14-II-24', '企業管理系'): ['企業管理系（企管系）', 'qìyè guǎnlǐ xì', 'department of business management'],
        ('B2L14-II-24', '企业管理系'): ['企业管理系（企管系）', 'qìyè guǎnlǐ xì', 'department of business management'],
        ('B3L08-II-46', 'EMBA（高級管理人員 工商管理碩士）'): ['EMBA（高級管理人員 工商管理碩士）', 'EMBA (gāojí guǎnlǐ rényuán gōngshāng guǎnlǐ shuòshì)', 'EMBA (Executive Master of Business Administration)'],
        ('B3L08-II-46', 'EMBA（高级管理人员 工商管理硕士）'): ['EMBA（高级管理人员 工商管理硕士）', 'EMBA (gāojí guǎnlǐ rényuán gōngshāng guǎnlǐ shuòshì)', 'EMBA (Executive Master of Business Administration)'],
    }.get((term_id, para[0]), para)

    hanzi, para = extract_hanzi(term_id, para)

    # Pinyin is the remaining entry with the most tone-marked vowels. On a tie
    # the shorter string wins, then the lexicographically larger one. With no
    # entries left, para_p[-1] raises IndexError.
    para_p = [(sum(c in 'āáǎàēéěèīíǐìōóǒòūúǔùüǖǘǚǜ' for c in w), -len(w), w) for w in para]
    para_p.sort()
    pinyin = para_p[-1][-1]
    i = para.index(pinyin)
    para = para[:i] + para[(i+1):]
    pinyin = postprocess_chars(pinyin, False)

    # Everything that is left is the meaning (including the part-of-speech
    # marker, which is split off in a later cell). Full-width brackets become
    # ASCII ones with normalized spacing, and a leading full-width dot is removed.
    meaning = ' '.join(para).strip()
    meaning = re.sub(r'\s+', ' ', meaning)
    meaning = meaning.replace('（', ' (')
    meaning = meaning.replace('）', ')')
    meaning = meaning.replace('  (', ' (')
    meaning = re.sub(r' [)]', ')', meaning)
    meaning = re.sub(r'^[．]', '', meaning)
    meaning = postprocess_chars(meaning, False)

    return {'id': term_id, 'hanzi': hanzi, 'pinyin': pinyin, 'meaning': meaning}


# Parse every slide of every lesson for both character sets.
# Result: parsed_terms[cset][term_id] -> record returned by parse_para.
parsed_terms = {}

for cset in ['trad', 'simp']:
    print('%s' % cset)
    files = pptx_glob(cset)
    assert len(files) == 15+15+12+12+10+10  # same check as inside pptx_glob
    parsed_terms[cset] = {}
    term_ids_list = []  # ids in parse order; rebuilt for each character set

    for filepath in files:
        # State for the sequential-id check, reset for every lesson file.
        term_id_prefix, term_id_next = '', 1
        term_aux = 0  # NOTE(review): never read in the visible code
        fix_l6_tech = False
        # 'B1-L07.pptx' -> 'B1L07'
        book_lesson = os.path.basename(filepath).replace('.pptx', '').replace('-', '')

        # The first slide of each file is skipped.
        # NOTE(review): presumably a title slide - confirm.
        for para in extract_para(filepath)[1:]:
            parsed = parse_para(para, book_lesson, cset == 'simp')
            term_id = parsed['id']
    
            # Known duplicate slides: drop them before any id bookkeeping.
            if term_id == 'B5L01-II-11' and 'nìmíng' in para and cset == 'simp':
                continue  # dup
            if term_id == 'B6L05-I-30' and parsed['pinyin'] == 'yìtǔ wéikuài':
                continue  # dup from B6L04
            if term_id == 'B6L10-I-25' and parsed['pinyin'] == 'lǚchéng':
                continue  # dup from B6L09
            if term_id == 'B6L08-I-T-01' and parsed['pinyin'] == 'Niújīn Dàxué':
                continue  # dup

            # tech terms for second vocab in slides still have 'I-' ids leading to id dups
            # When an 'I-T-01' id shows up that is already stored (except in
            # B6L08), this and the following I-T ids of the file are rewritten
            # to 'II-T-' until a non-technical id is seen again.
            term_id = parsed['id']  # same value as assigned above
            if re.match('^B6L..-I-T-[0-9]*$', term_id):
                if term_id.endswith('-01') and term_id in parsed_terms[cset] and book_lesson != 'B6L08':
                    fix_l6_tech = True
                if fix_l6_tech:
                    term_id = term_id.replace('-I-T-', '-II-T-')
                    parsed['id'] = term_id
            else:
                fix_l6_tech = False

            # sequential ids check
            # m[1] is the id without its two-digit index, e.g. 'B6L10-II-T-'.
            m = re.match('^((B.L..)(-I-|-II-)(|[0T]-))[0-9][0-9]$', term_id)
            assert m, (term_id, parsed)
            if term_id_prefix != m[1]:
                # new section: numbering starts again at 01
                term_id_prefix = m[1]
                term_id_next = 1
            # missing terms
            # Ids listed here are allowed to skip ahead; the counter is then
            # resynchronized to the index that was actually seen.
            if term_id not in ['B6L01-II-24']:
                assert term_id.endswith('-' + '%.2d' % term_id_next), (term_id_next, parsed)
                term_id_next += 1
            else:
                term_id_next = int(term_id.split('-')[-1]) + 1

            assert term_id not in parsed_terms[cset]  # uniq id check
            parsed_terms[cset][term_id] = parsed

            # Ids must be strictly increasing (string comparison) over the
            # whole character set, across file boundaries as well.
            if term_ids_list:
                assert term_id > term_ids_list[-1], term_id

            term_ids_list.append(term_id)

# Both character sets must yield the same ids. term_ids_list holds the list of
# the character set processed last ('simp').
assert set(parsed_terms['simp'].keys()) == set(parsed_terms['trad'].keys())
assert term_ids_list == list(sorted(term_ids_list))
term_ids = term_ids_list

# Per-book and total term counts.
for b in range(1, 7):
    print('B%d: %d' % (b, len([s for s in term_ids if s.startswith('B%d' % b)])))
print('Total: %d' % len(term_ids))

trad
simp
B1: 569
B2: 659
B3: 851
B4: 997
B5: 927
B6: 924
Total: 4927


In [4]:
# Extract POS / meaning proper

# Maps the part-of-speech marker found at the start of a meaning string, as
# literally spelled in the slides, to its normalized tag. Keys are tried in
# insertion order with str.startswith, and the first match wins.
# Variant spellings ("(V/ N)", "(Vi, N)", "( N)", a bare "Adv " / "N " without
# brackets) map to the same tags as their regular forms.
# NOTE(review): the lookup below strips the meaning before matching, so the
# key " ( N)" with its leading space looks unreachable - confirm.
POS_MAP = {
  "(Adv)": "Adv",
  "(Adv/N)": "Adv/N",
  "(Conj)": "Conj",
  "(Det)": "Det",
  "(Id)": "Id",
  "(M)": "M",
  "(N)": "N",
  "(N/V)": "N/V",
  "(N/Vst)": "N/Vst",
  "(Ph)": "Ph",
  "(Prep)": "Prep",
  "(Ptc)": "Ptc",
  "(V)": "V",
  "(V-sep)": "V-sep",
  "(V/ N)": "V/N",
  "(V/N)": "V/N",
  "(Vaux)": "Vaux",
  "(Vi)": "Vi",
  "(Vi, N)": "Vi/N",
  "(Vi, V)": "Vi/V",
  "(Vi/N)": "Vi/N",
  "(Vp)": "Vp",
  "(Vp-sep)": "Vp-sep",
  "(Vpt)": "Vpt",
  "(Vs)": "Vs",
  "(Vs-attr)": "Vs-attr",
  "(Vs-pred)": "Vs-pred",
  "(Vs-sep)": "Vs-sep",
  "(Vs/N)": "Vs/N",
  "(Vst)": "Vst",
  "(Vst/N)": "Vst/N",
  "( N)": "N",
  " ( N)": "N",
  "(N/Vi)": "N/Vi",
  "Adv ": "Adv",
  "N ": "N",
}

# Split each traditional record's meaning into a POS tag and the meaning
# proper. Adds 'pos', 'pos_pref' and 'meaning_nopos' to the record.
for term_id in term_ids_list:
    trad = parsed_terms['trad'][term_id]
    simp = parsed_terms['simp'][term_id]
    # Both character sets must agree on pinyin, and on the meaning once the
    # traditional text is converted to simplified.
    assert simp['pinyin'] == trad['pinyin']
    assert simp['meaning'] == opencc_tw2s.convert(trad['meaning']), (trad, simp)

    stripped = trad['meaning'].strip()
    # First POS_MAP key (in dict order) the meaning starts with; '' if none.
    prefix = next((key for key in POS_MAP if stripped.startswith(key)), '')

    trad['pos'] = POS_MAP[prefix] if prefix else ''
    trad['pos_pref'] = prefix
    # With no prefix this is just the stripped meaning.
    trad['meaning_nopos'] = stripped[len(prefix):].strip()

In [5]:
# Build one output row per term and write the table to data/slides.tsv.
rows = []
dd_t2s = {}  # traditional hanzi -> simplified hanzi, as first seen

for term_id in term_ids:
    trad = parsed_terms['trad'][term_id]
    simp = parsed_terms['simp'][term_id]
    assert simp['pinyin'] == trad['pinyin']
    assert simp['meaning'] == opencc_tw2s.convert(trad['meaning'])

    # A traditional form must always pair with the same simplified form.
    trad_hanzi, simp_hanzi = trad['hanzi'], simp['hanzi']
    first_seen = dd_t2s.setdefault(trad_hanzi, simp_hanzi)
    assert first_seen == simp_hanzi

    row = {
        'ID': trad['id'],
        'Traditional': trad_hanzi,
        'Simplified': simp_hanzi,
        'Pinyin': trad['pinyin'],
        'POS': trad['pos'],
        'Meaning': trad['meaning_nopos'],
        'Tags': '',
    }
    rows.append(row)

slides_df = pd.DataFrame(rows)
slides_df.to_csv('data/slides.tsv', sep='\t', index=False)
#slides_df[['ID', 'POS', 'Traditional']].to_csv('gen/term-ids-slides.tsv', sep='\t', index=False)

print(len(slides_df))

!ls -l data/slides.tsv

4927
-rw-r--r-- 1 jovyan users 300501 Oct 20 14:30 data/slides.tsv
