In [1]:
###### import packages, sentence model, and parse html from site url
import requests
import re
from bs4 import BeautifulSoup
import jieba
import torch
import string
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
sentence_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

url = 'https://www.bbc.com/zhongwen/articles/c4gl97d2rzjo/simp'
site = requests.get(url)
site_soup = BeautifulSoup(site.text, 'html.parser')
# print(site_soup.prettify())

  from .autonotebook import tqdm as notebook_tqdm


In [84]:
###### get text from the html and tokenize
site_p = site_soup.find_all('p')[0:100]
text = ""
punc = "！？｡。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏." # possible punctuation

for p in site_p:
    text += str(p.get_text()) # append new text to the text string

text.translate(str.maketrans('', '', string.punctuation)) # convert str format
text_re = re.sub(r"[%s]+" %punc, "", text) # remove punctuation marks

tokens = jieba.lcut(text_re) # tokenize
tokens_l = a = list(dict.fromkeys(tokens))
tokens_pd = pd.DataFrame(tokens)
tokens_pd.drop_duplicates(inplace=True)
tokens_pd.rename({0: "character"}, axis=1, inplace=True)
print(tokens_pd.shape)

(747, 1)


In [3]:
###### load HSK dictionary data and create sentence embeddings of definitions
HSK_nums = [('1', 1), ('2', 2), ('3', 3), ('4', 4), ('5', 5), ('6', 6), ('7_9', 7)] # HSK definitions to load files

HSK = pd.DataFrame()
for hsk in HSK_nums:
    hsk_df = pd.read_csv("../data/Chinese/HSK/HSK"+hsk[0]+".tsv", sep="\t", header=None)
    hsk_df.drop(0, axis=1, inplace=True)
    hsk_df.rename(columns={1: "character", 2: "pinyin", 3: "definition"}, inplace=True)
    hsk_df.insert(loc=2, column="HSK", value=[hsk[1]]*hsk_df.shape[0])
    hsk_df["embedding"] = hsk_df["definition"].map(sentence_model.encode) # create sentence embedding with model
    HSK = pd.concat([HSK, hsk_df])
HSK.drop_duplicates("character", keep="first", inplace=True)
HSK.reset_index(inplace=True, drop=True)
HSK.head()

Unnamed: 0,character,pinyin,HSK,definition,embedding
0,爱,ài,1,"love, like, be fond of, be keen on, cherish, b...","[0.37996447, 0.29299688, 0.55704415, 0.0736245..."
1,爱好,àihào,1,"love, like, be fond of, be keen on","[0.4404698, 0.29555953, 0.5669606, -0.0190896,..."
2,爸爸,bàba,1,"old man, father, papa, pappa, daddy, pa, beget...","[0.05222196, 0.18916288, 0.2597713, -0.585589,..."
3,白,bái,1,"white, clear, pure, plain, wrongly written/mis...","[-0.17480375, 0.23118941, -0.27152503, 0.10098..."
4,八,bā,1,det.: eight,"[0.25673103, 0.25089717, 0.5315905, -0.3529175..."


In [80]:
###### Cross compute the top embeddings for each vocab word and add to HSK dataframe
num_hsk = HSK.index.size
similarity_t = sentence_model.similarity(HSK["embedding"], HSK["embedding"])
top_choice = torch.flip(np.argsort(similarity_t, axis=1)[:,-21:-1], dims=(1,)).numpy().tolist()
top_choice_HSK = [[HSK["HSK"][i] for i in row] for row in top_choice] # convert from index values to HSK values

In [56]:
max_HSK = 3
char_idx = np.where(HSK["character"]==tokens_pd.iloc[40,0])[0][0]
top_idx = top_choice[char_idx][next(x[0] for x in enumerate(top_choice_HSK[char_idx]) if x[1] < max_HSK)] # iterate through and find index of first element below max_HSK
HSK["character"].loc[top_idx]

'完成'

In [83]:
tokens_pd.head()

Unnamed: 0,character
0,图像
1,来源
2,Getty
3,
4,Images


In [88]:
char_idx = np.where(HSK["character"]==tokens_pd.iloc[3,0])[0][0]

IndexError: index 0 is out of bounds for axis 0 with size 0

In [116]:
def simplify(tokens, HSK_level):
    simplified_tokens = tokens.copy()
    for idx, token in enumerate(tokens):
        try:
            char_idx = np.where(HSK["character"]==token)[0][0]
            if HSK["HSK"][char_idx]>max_HSK:
                top_idx = top_choice[char_idx][next(x[0] for x in enumerate(top_choice_HSK[char_idx]) if x[1] <= max_HSK)] # iterate through and find index of first element below max_HSK
                simplified_tokens[idx] = HSK["character"].loc[top_idx]
            else:
                simplified_tokens[idx] = 0
        except:
            simplified_tokens[idx] = 1
    return simplified_tokens

In [132]:
top_choice[5032]

[6694,
 6675,
 6676,
 4536,
 4657,
 1482,
 10015,
 6098,
 6690,
 1849,
 6686,
 4977,
 2038,
 6688,
 2295,
 4537,
 6674,
 7601,
 8030,
 6691]

In [134]:
HSK.iloc[np.where(HSK["character"]=="推出")]

Unnamed: 0,character,pinyin,HSK,definition,embedding
5032,推出,tuīchū,6,present to public,"[0.08026089, 0.23065105, -0.049383704, -0.2228..."


In [126]:
tokens_new = simplify(tokens_l, 4)
[(tokens_l[i], tokens_new[i]) for i in range(len(tokens_l))]

[('图像', '图'),
 ('来源', '农业'),
 ('Getty', 1),
 (' ', 1),
 ('Images', 1),
 ('今年', 0),
 ('四季度', 1),
 ('以来', 0),
 ('中国政府', 1),
 ('刺激', 1),
 ('经济', 0),
 ('的', 0),
 ('意图', '作用'),
 ('明显', 0),
 ('从', 0),
 ('货币', '钱'),
 ('到', 0),
 ('财政', '资金'),
 ('股市', '市场'),
 ('楼市', 1),
 ('不断', 0),
 ('推出', '公共'),
 ('新', 0),
 ('措施', '办法'),
 ('效果', 0),
 ('喜忧参半', 1),
 ('同时', 0),
 ('经济学家', 1),
 ('和', 0),
 ('金融机构', 1),
 ('对于', '向'),
 ('中国', 0),
 ('前景', '景色'),
 ('分析', '观察'),
 ('开始', 0),
 ('分化', '消失'),
 ('有', 0),
 ('观点', 0),
 ('认为', 0),
 ('回升', '上升'),
 ('达到', 0),
 ('5%', 1),
 ('目标', 0),
 ('并', 0),
 ('无', '没有'),
 ('压力', 0),
 ('也', 0),
 ('两位', 1),
 ('对', 0),
 ('困境', '命运'),
 ('进行', 0),
 ('深入分析', 1),
 ('但', 0),
 ('他们', 0),
 ('发言', 0),
 ('在', 0),
 ('网络', '互联网'),
 ('疯传后', 1),
 ('最终', '最后'),
 ('被', 0),
 ('各个', '各自'),
 ('社交', '交通'),
 ('媒体', 0),
 ('删除', '退出'),
 ('即将', '马上'),
 ('召开', '等'),
 ('中共中央', 1),
 ('工作', 0),
 ('会', 0),
 ('预计', 0),
 ('不会', 1),
 ('给出', 1),
 ('市场', 0),
 ('关注', 0),
 ('赤字', '经济'),
 ('率', '带领'),
 ('目前', 0),
 (