In [2]:
# https://www.vengaglobal.com/blog/simplified-traditional-chinese-mandarin-cantonese/

# Target Market  Written      Spoken
# -------------------------------------
# China          Simplified   Mandarin
# Singapore      Simplified   Mandarin
# Taiwan         Traditional  Mandarin
# Hong Kong      Traditional  Cantonese

# Lexicon:
# http://www.chineselexicaldatabase.com/download.php - used below
# Sun, C. C., Hendrix, P., Ma, J.Q. & Baayen, R. H. (2018). Chinese Lexical Database (CLD): A large-scale lexical database for simplified Mandarin Chinese. Behavior Research Methods, https://doi.org/10.3758/s13428-018-1038-3.

# Corpora:
# https://www.openslr.org/38/ - test-audio corpus, not relevant
# https://github.com/CLUEbenchmark/CLUECorpus2020/ - email request sent
# https://github.com/brightmart/nlp_chinese_corpus - nearly the same data as the corpus above; downloaded and used below

# TODO:
# https://metatext.io/datasets/nlp-chinese-corpus - paper with word segmentation


In [1]:
import os, sys
# Locate the project root: the prefix of the current working directory that
# ends with the first occurrence of the 'pygents' folder name.
cwd = os.getcwd()
project_dir_name = 'pygents'
marker_pos = cwd.find(project_dir_name)
# str.find returns -1 when the marker is absent; slicing with that value would
# yield an arbitrary 6-character prefix of cwd, so fall back to cwd itself.
project_path = cwd[:marker_pos + len(project_dir_name)] if marker_pos >= 0 else cwd
# Make the project importable and resolve relative data paths from its root.
if project_path not in sys.path: sys.path.append(project_path)
os.chdir(project_path)

from importlib import reload  # Python 3.4+

import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

# Force re-import: evict any already-loaded pygents submodules from the module
# cache so that the imports that follow load them afresh.
for stale_module in ('pygents.util', 'pygents.text', 'pygents.plot', 'pygents.token'):
    sys.modules.pop(stale_module, None)

from pygents.util import * 
from pygents.text import * 
from pygents.plot import * 
from pygents.token import * 


## Lexicon

In [3]:
# Root folder of the Chinese corpora; relative, so it is resolved against the
# current working directory.
path = '../../nlp/corpora/Chinese/'

# Load the Chinese Lexical Database table.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-dtype columns and the
# DtypeWarning that the chunked parser emits for them.
# NOTE(review): the warning seen on the original run is assumed to be that
# DtypeWarning - confirm on the next run.
cld_df = pd.read_csv(os.path.join(path,'lexicon/chineselexicaldatabase2.1.txt'), low_memory=False)
len(cld_df)

  exec(code_obj, self.user_global_ns, self.user_ns)


48644

In [4]:
# Preview the loaded lexicon: one row per word, with per-character columns
# (C1..C4) and the numeric statistics that follow them.
cld_df

Unnamed: 0,Word,C1,C2,C3,C4,Length,C1Structure,C2Structure,C3Structure,C4Structure,...,PSPMI,TScore,PSTScore,C1ConditionalProbability,C12ConditionalProbability,C123ConditionalProbability,C1BackwardConditionalProbability,C12BackwardConditionalProbability,C123BackwardConditionalProbability,EntropyCharacterFrequencies
0,中东,中,东,,,2,SG,SG,,,...,3.8665,-0.4499,6.1560,0.003038,,,0.043024,,,0.8095
1,马队,马,队,,,2,SG,LR,,,...,0.2038,-0.2477,0.0632,0.000725,,,0.000951,,,0.8464
2,门徒,门,徒,,,2,SG,LR,,,...,9.6434,28.4298,86.3191,0.087427,,,0.255630,,,0.3721
3,申讨,申,讨,,,2,SG,LR,,,...,3.7073,-0.0752,0.5747,0.000572,,,0.002027,,,0.8281
4,曲,曲,,,,1,SG,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48639,晓之以理,晓,之,以,理,4,LR,SG,LR,LR,...,-3.6498,-9.8059,-1.0873,1.000000,1.000000,1.0,1.000000,1.0,0.021398,1.4752
48640,人心所向,人,心,所,向,4,SG,SG,LR,HCI,...,-10.8551,-28.8502,-5.2375,0.000748,0.024096,1.0,1.000000,1.0,0.008772,1.3940
48641,门庭若市,门,庭,若,市,4,SG,UD,UD,UD,...,10.7869,0.6575,14.4676,0.444444,1.000000,1.0,1.000000,1.0,0.170213,1.5872
48642,逼良为娼,逼,良,为,娼,4,LRB,SG,SG,LR,...,12.6504,1.1159,24.8872,0.619048,1.000000,1.0,1.000000,1.0,1.000000,0.4557


In [5]:
# Show only the 'Word' column; the double brackets keep the result a DataFrame.
cld_df[['Word']]


Unnamed: 0,Word
0,中东
1,马队
2,门徒
3,申讨
4,曲
...,...
48639,晓之以理
48640,人心所向
48641,门庭若市
48642,逼良为娼


## Corpora

In [6]:
# Check whether previously computed n-gram counters are available.
# context_save_load is called with None in place of the data
# (presumably this requests a load rather than a save - confirm in pygents.util).
try:
    n_counters1 = context_save_load(None,'chinese_news2016zh_train',folder='data/models/')
    len(n_counters1)  # raises if the loader returned None, routing to the fallback below
except Exception:
    # Any load failure is treated as "no cached model".
    # Exception rather than a bare except, so that KeyboardInterrupt and
    # SystemExit still propagate instead of being silently swallowed.
    n_counters1 = None

In [7]:
# Count n-grams over the raw corpus only when no usable cached counters exist.
if n_counters1 is None or len(n_counters1) < 1:
    max_n = 3 # in case of Chinese!?
    n_counters1 = grams_init(max_n)
    cnt = 0  # number of lines processed; stays 0 for an empty file
    corpus_file = os.path.join(path, 'clue/new2016zh/news2016zh_train.txt')
    # Explicit encoding: without it open() uses the locale's default codec, and
    # together with errors='ignore' a non-UTF-8 locale would silently drop bytes
    # of the Chinese text instead of failing.
    # NOTE(review): assumes the corpus file is UTF-8 - confirm.
    with open(corpus_file, encoding='utf-8', errors='ignore') as f:
        # Iterating the file yields one line at a time until EOF.
        for cnt, line in enumerate(f, start=1):
            if (cnt % 10000) == 0:
                print(cnt,line)  # progress marker with a sample line
            text = preprocess_text(line)
            text_grams_count(n_counters1,text,max_n)
    print(cnt)
    # Hand the counters to context_save_load under the same name and folder the
    # cache probe reads from (presumably this persists them - confirm).
    context_save_load(n_counters1,'chinese_news2016zh_train',folder='data/models/')


10000 郑爽和粉丝聊天近照曝光 身材纤瘦惹人怜

20000 全扫描网球赛季的红土狂欢已经于北京时间昨天在罗兰·加洛斯展开，塞雷纳·威廉姆斯是否还是一如继往的强大？一个不在最佳状态的纳达尔，是否让德约拥

30000 为加快推进广东省预拌砂浆发展应用，广东省住房和城乡建设厅于2016年8月颁发了《关于印发〈广东省住房和建设厅关于预拌砂浆生产企业备案的管理办法〉的通知》（粤建散〔2016〕151号，自2016年10月1日起施行）。为贯彻执行该办法，加强信息平台应用技能培训，广东省散装水泥管理办公室于2016年9月28日，在广州召开了预拌砂浆生产企业备案管理办法宣贯暨信息平台培训会议。来自全省各市县散装水泥主管机构、行业协会及相关企业的代表共200人参加了会议。会议由省散办林林副主任主持。 上午，林林副主任作预拌砂浆生产企业备案工作部署。他强调了四点意见：一是提高认识，加快推进；二是把住节点，停旧履新；三是区别处理，完善材料；四是有序发展，调整上报。接着余祥荣主任科员对预拌砂浆备案管理办法作起草说明和条文解释的讲解。尔后，林林副主任和余祥荣主任科员对如何正确执行预拌砂浆备案管理办法进行了现场答疑。下午，刘细华副主任科员对信息平台预拌砂浆备案子系统作了功能介绍与操作演示，黄琼副主任科员对信息平台常见问题进行了讲解。随后，刘细华、黄琼和广州粤建三和软件股份有限公司技术人员对信息平台应用相关问题共同进行了现场答疑。与会代表认真听课，积极提问，对预拌砂浆生产企业备案办法加深了理解，提升了信息平台操作水平。

40000 我的天呐！宝宝经常吃手竟然会这样

50000 2013-2014年间，全世界吹起一阵写代码的风潮，各大媒体疯狂报导，鼓吹年轻人不管所学为何，一定要学写程序。时不时就会看到“程序员是最有钱

60000  点击“阅读原文”查询详情

70000 烈焰粉怎么做 PC版烈焰粉合成攻略

80000 爱锻炼·乐生活精彩人生首选昂派体育

90000 证券代码：000837证券简称：秦川机床公告编号：2015-07秦川机床工具集团股份公司2014年度业绩快报本公司及董事会全体成员保证信息披露内容的真实、准确和完整，没有虚假记载、误导性陈述或重大遗漏。特别提示：本公告所载2014年度的财务数据，仅为初步核算数据，未经会计师事务所审定，与年度报告中披露的最终数据可能存在

KeyboardInterrupt: 

In [None]:
# Turn every counter into a frequency table sorted from most to least frequent:
# columns are the gram, its count, and log10 of the count.
dfs = []
for idx in range(len(n_counters1)):
    gram_counts = n_counters1[idx]
    rows = [(gram, gram_counts[gram]) for gram in gram_counts]
    df = pd.DataFrame(rows, columns=['gram','freq'])
    df['log'] = np.log10(df['freq'])
    df = df.sort_values('freq', ascending=False)
    # The plotting cells read this attribute for the figure title: "1", "2", ...
    df.title = str(idx + 1)
    dfs.append(df)

In [None]:
# First 20 rows of the first frequency table, gram and count only.
dfs[0][['gram','freq']].head(20)
# 的 - possessive/attributive particle ("of")
# 是 - "to be" (also used for "yes")
# 在 - "at / in", "to exist"
# 不 - "not"

In [None]:
# First 20 rows of the second frequency table, gram and count only.
dfs[1][['gram','freq']].head(20)
# 一个 - "one / a"
# 公司 - "company"
# 中国 - "China"
# 我们 - "we / us"
# 可以 - "can / may"

In [None]:
# First 20 rows of the third frequency table, gram and count only.
dfs[2][['gram','freq']].head(20)
# 自己的 - "one's own"
# ，我们 - ", we" (a comma followed by "we / us")
# 互联网 - "the Internet"

In [None]:
#https://chowdera.com/2022/03/202203280859161240.html

#http://anqin007.blogspot.com/2018/12/show-chinese-characters-in-matplotlib.html
# Configure matplotlib so the Chinese grams on the axis render as glyphs.
# `import matplotlib as mpl` replaces `from pylab import mpl`: pylab is a legacy
# convenience namespace whose use matplotlib discourages; the module is the same.
import matplotlib as mpl
# Priority list of sans-serif families. SimHei stays first (the original
# choice); the rest are fallbacks for machines where SimHei is not installed.
mpl.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'Noto Sans CJK SC', 'WenQuanYi Micro Hei', 'Arial Unicode MS']
# Draw minus signs as ASCII '-' rather than the Unicode minus character.
mpl.rcParams['axes.unicode_minus'] = False

plt.rcParams["figure.figsize"] = (20,20)
# One horizontal bar chart per frequency table: first 100 rows, raw counts.
for df in dfs:
    p = df[:100][['gram','freq']].plot.barh(x='gram')
    p.invert_yaxis()  # put the first row at the top of the chart
    p.set_title(df.title,fontsize = 32)
    plt.show()

In [None]:
# Same charts as for the raw counts, but with the log10 frequency column.
plt.rcParams["figure.figsize"] = (20,20)
for df in dfs:
    top_rows = df[['gram','log']].head(100)
    ax = top_rows.plot.barh(x='gram')
    ax.invert_yaxis()  # first row at the top
    ax.set_title(df.title, fontsize=32)
    plt.show()

## Load and explore full models