# Chinese (Simplified) Tokenization Model - Experiments - TODO

In [1]:
import os, sys
cwd = os.getcwd()
project_path = cwd[:cwd.find('pygents')+7]
if project_path not in sys.path: sys.path.append(project_path)
os.chdir(project_path) 

from importlib import reload  # Python 3.4+

import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

#force reimport
if 'pygents.util' in sys.modules:
    del sys.modules['pygents.util']
if 'pygents.text' in sys.modules:
    del sys.modules['pygents.text']
if 'pygents.plot' in sys.modules:
    del sys.modules['pygents.plot']
if 'pygents.token' in sys.modules:
    del sys.modules['pygents.token']

from pygents.util import * 
from pygents.text import * 
from pygents.plot import * 
from pygents.token import * 

import jieba


In [2]:
# https://www.vengaglobal.com/blog/simplified-traditional-chinese-mandarin-cantonese/

# Target Market  Written      Spoken
# -------------------------------------
# China          Simplified   Mandarin
# Singapore      Simplified   Mandarin
# Taiwan         Traditional  Mandarin
# Hong Kong      Traditional  Cantonese

# Lexicon:
# http://www.chineselexicaldatabase.com/download.php - used below
# Sun, C. C., Hendrix, P., Ma, J.Q. & Baayen, R. H. (2018). Chinese Lexical Database (CLD): A large-scale lexical database for simplified Mandarin Chinese. Behavior Research Methods, https://doi.org/10.3758/s13428-018-1038-3.

# Corpora:
# https://www.openslr.org/38/ - test-audio corpus, not relevant
# https://github.com/CLUEbenchmark/CLUECorpus2020/ - email request sent
# https://github.com/brightmart/nlp_chinese_corpus - nearly same as above downloaded, used further

# TODO:
# https://metatext.io/datasets/nlp-chinese-corpus - paper with word segmentation


In [3]:
path = '../../nlp/corpora/Chinese/'


In [4]:
def zh_clue_json2text(path,filename):
    with open(os.path.join(path,filename+'.json')) as file:
        with open(os.path.join(path,filename+'.txt'), 'w') as fout:
            while True:
                line = file.readline()
                if not line:
                    break
                j = json.loads(line)
                #print('title',j['title'])
                #print('desc',j['desc'])
                #print('content',j['content'])
                fout.write(j['title'])
                fout.write('\n')    
                fout.write(j['desc'])
                fout.write('\n')    
                fout.write(j['content'])
                fout.write('\n')    


In [5]:
#do this once!
#zh_clue_json2text(path,'clue/new2016zh/news2016zh_valid')


In [6]:
#do this once!
#zh_clue_json2text(path,'clue/new2016zh/news2016zh_train')


## Load and explore full models

In [8]:
model_name = 'data/models/zh_valid_chars_3a'
if os.path.isfile(model_name): # if pre-trained
    print('loading',model_name)
    zh_valid_chars = FreedomTokenizer(name=model_name,max_n=3,mode='chars',debug=False)
else:
    zh_valid_chars = FreedomTokenizer(max_n=3,mode='chars',debug=False)
    #zh_valid_grams = FreedomTokenizer(max_n=3,mode='grams',debug=False)
    with open(join(path, 'clue/new2016zh/news2016zh_valid.txt'),errors='ignore') as f:
        cnt = 0
        while True:
            line = f.readline()
            if not line:
                break
            cnt += 1
            if (cnt % 1000) == 0:
                print(cnt)
            zh_valid_chars.train([line])
            #zh_valid_grams.train([line])

zh_valid_chars.store(model_name)
#zh_valid_grams.store('data/models/zh_valid_grams_3a')

print(zh_valid_chars.count_params())
# 143,129,564 (max_n=3)

#print(zh_valid_grams.count_params())


loading data/models/zh_valid_chars_3a
143129564


In [9]:
del zh_valid_chars

In [10]:
model_name = 'data/models/zh_train_chars_2a'
if os.path.isfile(model_name): # if pre-trained
    print('loading',model_name)
    zh_train_chars = FreedomTokenizer(name=model_name,max_n=2,mode='chars',debug=False)
else:
    print('training',model_name)
    zh_train_chars = FreedomTokenizer(max_n=2,mode='chars',debug=False)
    with open(join(path, 'clue/new2016zh/news2016zh_train.txt'),errors='ignore') as f:
        cnt = 0
        while True:
            line = f.readline()
            if not line:
                break
            cnt += 1
            if (cnt % 100000) == 0:
                print(cnt)
            zh_train_chars.train([line])
            #zh_valid_grams.train([line])
    zh_train_chars.store(model_name)

print(zh_train_chars.count_params())
# 249,859,247 (max_n=2)


loading data/models/zh_train_chars_2a
249859247


In [None]:
del zh_train_chars

In [1]:
model_name = 'data/models/zh_train_chars_3a'
if os.path.isfile(model_name): # if pre-trained
    print('loading',model_name)
    zh_train_chars = FreedomTokenizer(name=model_name,max_n=3,mode='chars',debug=False)
else:
    print('training',model_name)
    zh_train_chars = FreedomTokenizer(max_n=3,mode='chars',debug=False)
    with open(join(path, 'clue/new2016zh/news2016zh_train.txt'),errors='ignore') as f:
        cnt = 0
        while True:
            line = f.readline()
            if not line:
                break
            cnt += 1
            if (cnt % 100000) == 0:
                print(cnt)
            zh_train_chars.train([line])
            #zh_valid_grams.train([line])
    zh_train_chars.store(model_name)

print(zh_train_chars.count_params())
# 249,859,247 (max_n=2)


NameError: name 'zh_train_chars' is not defined

In [None]:
# del zh_train_chars

In [11]:
zh_lexicon_tokenizer = LexiconIndexedTokenizer(lexicon = list(pd.read_csv(os.path.join(path,'lexicon/chineselexicaldatabase2.1.txt'))['Word']))


  exec(code_obj, self.user_global_ns, self.user_ns)


In [12]:
#'Цвели яблони и груши, над рекой плыл туман.'
text = '苹果树和梨树开花，雾气飘过河面。'
                               
expected = JebaTokenizer().tokenize(text) #[r[0] for r in jieba.tokenize(text)]
print(expected)

tokens = zh_lexicon_tokenizer.tokenize(text)
print(round(calc_f1(expected,tokens),2),tokens)
    
print()
for nlist in [[1],[1,2],[2]]:
    for threshold in [0.02,0.05,0.1,0.2,0.3,0.4,0.5]: 
        #tokens = tokenize_with_opposite_metrics(zh_train_chars.model,text,'ddf-','ddf+',[1],threshold=threshold)
        tokens = FreedomBasedTokenizer(zh_train_chars,'ddf-','ddf+',[1],threshold=threshold).tokenize(text)
        print(nlist,threshold,round(calc_f1(expected,tokens),2),tokens)


Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/lb/1m7gbdp17h578qq48pbbtxf40000gn/T/jieba.cache
Loading model cost 0.652 seconds.
Prefix dict has been built successfully.


['苹果树', '和', '梨树', '开花', '，', '雾气', '飘过', '河面', '。']
0.84 ['苹果树', '和', '梨树', '开花', '，', '雾气', '飘', '过', '河面', '。']

[1] 0.02 0.5 ['苹果', '树', '和', '梨树', '开', '花', '，', '雾气飘', '过', '河面', '。']
[1] 0.05 0.82 ['苹果树', '和', '梨树', '开花', '，', '雾气飘过', '河面', '。']
[1] 0.1 0.63 ['苹果树', '和', '梨树开花', '，', '雾气飘过', '河面', '。']
[1] 0.2 0.53 ['苹果树', '和', '梨树开花', '，', '雾气飘过河面', '。']
[1] 0.3 0.53 ['苹果树', '和', '梨树开花', '，', '雾气飘过河面', '。']
[1] 0.4 0.31 ['苹果树和梨树开花', '，', '雾气飘过河面', '。']
[1] 0.5 0.31 ['苹果树和梨树开花', '，', '雾气飘过河面', '。']
[1, 2] 0.02 0.5 ['苹果', '树', '和', '梨树', '开', '花', '，', '雾气飘', '过', '河面', '。']
[1, 2] 0.05 0.82 ['苹果树', '和', '梨树', '开花', '，', '雾气飘过', '河面', '。']
[1, 2] 0.1 0.63 ['苹果树', '和', '梨树开花', '，', '雾气飘过', '河面', '。']
[1, 2] 0.2 0.53 ['苹果树', '和', '梨树开花', '，', '雾气飘过河面', '。']
[1, 2] 0.3 0.53 ['苹果树', '和', '梨树开花', '，', '雾气飘过河面', '。']
[1, 2] 0.4 0.31 ['苹果树和梨树开花', '，', '雾气飘过河面', '。']
[1, 2] 0.5 0.31 ['苹果树和梨树开花', '，', '雾气飘过河面', '。']
[2] 0.02 0.5 ['苹果', '树', '和', '梨树', '开', '花', '，', '雾气飘', '过', '河面', '。']

In [13]:
# https://magichub.com/datasets/chinese-english-parallel-corpus-finance/
test_df = pd.read_csv(os.path.join(path,'magicdata/zh_en_ru_100/CORPUS_ZH_EN_RU.txt'),delimiter='\t')


In [14]:
evaluate_tokenizer_f1(list(test_df[:5]['zh']),JebaTokenizer(),zh_lexicon_tokenizer,debug=True)


然后医疗保险呢？就是我们家，不论是大人啊还是小孩都会去买一个保险
0.84 ['然后', '医疗', '保险', '呢', '？', '就是', '我们', '家', '，', '不论是', '大人', '啊', '还是', '小孩', '都会', '去', '买', '一个', '保险']
当他们买了保险的，按照保险合同的话，是要赔三十万的
0.8 ['当', '他们', '买', '了', '保险', '的', '，', '按照', '保险', '合同', '的话', '，', '是', '要', '赔', '三', '十', '万', '的']
需要意识到买了一个保险的重要性
1.0 ['需要', '意识', '到', '买', '了', '一个', '保险', '的', '重要性']
其实这种现象是真的很普遍，因为比如说你买一个人身意外险你那个你买的越多你那个保额就越多
0.85 ['其实', '这', '种', '现象', '是', '真的', '很', '普遍', '，', '因为', '比如说', '你', '买', '一个', '人身', '意外', '险', '你', '那个', '你', '买', '的', '越', '多', '你', '那个', '保', '额', '就', '越', '多']
这代父母真的很有必要去买一个保险
0.86 ['这', '代', '父母', '真的', '很', '有', '必要', '去', '买', '一个', '保险']


0.87

In [15]:
evaluate_tokenizer_f1(list(test_df['zh']),JebaTokenizer(),zh_lexicon_tokenizer,debug=False)


0.82

In [21]:
for nlist in [[1],[1,2],[2]]:
    for threshold in [0.005,0.01,0.02,0.05,0.1,0.2,0.3,0.4,0.5]: 
        tokenizer = FreedomBasedTokenizer(zh_train_chars,'ddf-','ddf+',nlist,threshold=threshold)
        avg_f1 = evaluate_tokenizer_f1(list(test_df['zh']),JebaTokenizer(),tokenizer,debug=False)
        print(nlist,threshold,avg_f1)


[1] 0.005 0.42
[1] 0.01 0.42
[1] 0.02 0.42
[1] 0.05 0.41
[1] 0.1 0.4
[1] 0.2 0.37
[1] 0.3 0.32
[1] 0.4 0.29
[1] 0.5 0.28
[1, 2] 0.005 0.48
[1, 2] 0.01 0.48
[1, 2] 0.02 0.48
[1, 2] 0.05 0.49
[1, 2] 0.1 0.48
[1, 2] 0.2 0.44
[1, 2] 0.3 0.4
[1, 2] 0.4 0.35
[1, 2] 0.5 0.31
[2] 0.005 0.53
[2] 0.01 0.53
[2] 0.02 0.52
[2] 0.05 0.51
[2] 0.1 0.49
[2] 0.2 0.43
[2] 0.3 0.38
[2] 0.4 0.33
[2] 0.5 0.28


In [22]:
# https://magichub.com/datasets/chinese-english-parallel-corpus-finance/
news_df = pd.read_csv(os.path.join(path,'news/news.2008.zh.shuffled.deduped'),usecols=[0], names=['zh'], header=None)
news_df

Unnamed: 0,zh
0,英国大银行巴克莱没有选择政府资助，而是将自行募集65亿英镑补充其资本金。
1,13日首天需要停驶的，是尾号2和7的车辆。
2,他被誉为当今世界上最令人瞩目的贸易理论家之一，而他在1994年对亚洲金融危机的预言，更使他在...
3,代表英格兰、威尔士和苏格兰各市、郡议会的英国地方政府协会（LGA）将在本周与冰岛大使会面。
4,此前台湾海基会董事长江丙坤曾经表示，陈云林将在10月底、11月初访问台湾。
...,...
322,美国联邦储备局批准富国银行（Wells Fargo）以120亿美元代价并购陷入财政困难的美联...
323,不过苏起强调说 ： “ 马总统在卸任以前，不论他走到哪里，坐在哪里，都是中华民国总统，这不会...
324,克鲁格曼的主要研究领域包括国际贸易、国际金融、货币危机与汇率变化理论。
325,这些机关的存款总值超过8.42亿英镑（14.32亿美元 ） ， 部分议会更把用以支付工资的款...


In [23]:
evaluate_tokenizer_f1(list(news_df['zh']),JebaTokenizer(),zh_lexicon_tokenizer,debug=False)


0.75

In [24]:
for nlist in [[1],[1,2],[2]]:
    for threshold in [0.005,0.01,0.02,0.05,0.1,0.2,0.3,0.4,0.5]: 
        tokenizer = FreedomBasedTokenizer(zh_train_chars,'ddf-','ddf+',nlist,threshold=threshold)
        avg_f1 = evaluate_tokenizer_f1(list(news_df['zh']),JebaTokenizer(),tokenizer,debug=False)
        print(nlist,threshold,avg_f1)


[1] 0.005 0.43
[1] 0.01 0.43
[1] 0.02 0.43
[1] 0.05 0.43
[1] 0.1 0.41
[1] 0.2 0.38
[1] 0.3 0.35
[1] 0.4 0.32
[1] 0.5 0.31
[1, 2] 0.005 0.5
[1, 2] 0.01 0.5
[1, 2] 0.02 0.51
[1, 2] 0.05 0.51
[1, 2] 0.1 0.48
[1, 2] 0.2 0.41
[1, 2] 0.3 0.35
[1, 2] 0.4 0.29
[1, 2] 0.5 0.24
[2] 0.005 0.58
[2] 0.01 0.58
[2] 0.02 0.57
[2] 0.05 0.54
[2] 0.1 0.48
[2] 0.2 0.35
[2] 0.3 0.25
[2] 0.4 0.19
[2] 0.5 0.15


In [None]:
#TODO improve the above compacting model!?

In [None]:
#del zh_train_chars