In [None]:
###### import packages, sentence model, and parse html from site url
import requests
import re
from bs4 import BeautifulSoup
import jieba
import nltk
import string
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
import csv
from sentence_transformers import SentenceTransformer, util
# Sentence-embedding model; later cells use it to embed the HSK definitions.
sentence_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

url = 'https://www.bbc.com/zhongwen/articles/c4gl97d2rzjo/simp'
# Bound the request so a stalled connection cannot hang the notebook, and fail
# loudly on an HTTP error instead of silently parsing an error page.
site = requests.get(url, timeout=30)
site.raise_for_status()
site_soup = BeautifulSoup(site.text, 'html.parser')

  from .autonotebook import tqdm as notebook_tqdm


In [57]:
###### get text from the html and tokenize
site_p = site_soup.find_all('p')[0:100]  # only the first 100 <p> elements are used
punc = "！？｡。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏." # possible punctuation

# Concatenate the paragraph texts in document order (no separator, as before).
text = "".join(str(p.get_text()) for p in site_p)

# Strip ASCII punctuation. str.translate returns a NEW string, so the result
# must be assigned back; previously it was discarded and this step did nothing.
text = text.translate(str.maketrans('', '', string.punctuation))
# Strip full-width / CJK punctuation. re.escape keeps any regex-special
# character in `punc` literal inside the character class.
text_re = re.sub(r"[%s]+" % re.escape(punc), "", text)

tokens = jieba.lcut(text_re) # tokenize
# One row per distinct token. Naming the column at construction guarantees the
# "character" column exists even when the page yields no tokens.
tokens_pd = pd.DataFrame({"character": tokens}).drop_duplicates()
print(tokens_pd.shape)

(747, 1)


In [52]:
###### load HSK dictionary data and create sentence embeddings of definitions
# (file-name suffix, numeric level) pairs; the "7_9" file is stored as level 7.
HSK_nums = [('1', 1), ('2', 2), ('3', 3), ('4', 4), ('5', 5), ('6', 6), ('7_9', 7)] # HSK definitions to load files

# Collect the per-level frames and concatenate ONCE at the end: growing a frame
# with pd.concat inside the loop re-copies all earlier rows on every iteration,
# and concatenating onto an empty DataFrame is deprecated in recent pandas.
hsk_frames = []
for hsk in HSK_nums:
    file_suffix, level = hsk
    hsk_df = (
        pd.read_csv("../data/Chinese/HSK/HSK" + file_suffix + ".tsv", sep="\t", header=None)
        .drop(columns=0)  # first column of the TSV is not used
        .rename(columns={1: "character", 2: "pinyin", 3: "definition"})
    )
    hsk_df.insert(loc=2, column="HSK", value=level)  # scalar is broadcast to every row
    # Encode all definitions of this file in one batched call instead of one
    # model call per row; list(...) splits the (n_rows, dim) array into one
    # 1-D vector per row so each cell holds a single embedding.
    hsk_df["embedding"] = list(sentence_model.encode(hsk_df["definition"].tolist()))
    hsk_frames.append(hsk_df)

# ignore_index=True yields a fresh 0..n-1 index across all levels.
HSK = pd.concat(hsk_frames, ignore_index=True)
HSK.head()

Unnamed: 0,character,pinyin,HSK,definition,embedding
0,爱,ài,1,"love, like, be fond of, be keen on, cherish, b...","[0.37996447, 0.29299688, 0.55704415, 0.0736245..."
1,爱好,àihào,1,"love, like, be fond of, be keen on","[0.4404698, 0.29555953, 0.5669606, -0.0190896,..."
2,爸爸,bàba,1,"old man, father, papa, pappa, daddy, pa, beget...","[0.05222196, 0.18916288, 0.2597713, -0.585589,..."
3,白,bái,1,"white, clear, pure, plain, wrongly written/mis...","[-0.17480375, 0.23118941, -0.27152503, 0.10098..."
4,八,bā,1,det.: eight,"[0.25673103, 0.25089717, 0.5315905, -0.3529175..."


In [60]:
###### inner merge HSK with tokens to get dictionary data of tokens in the text
# Keep only the tokens that have an entry in the HSK table; each surviving row
# carries that entry's pinyin, HSK level, definition and embedding.
tokens_with_embed = tokens_pd.merge(HSK, on="character", how="inner")
tokens_with_embed.iloc[0]

character                                                    图像
pinyin                                                  túxiàng
HSK                                                           7
definition                          picture, graph, icon, image
embedding     [-0.13980588, 0.34705573, -0.3828236, -0.55116...
Name: 0, dtype: object

In [61]:
# Per-level (first, last) pairs keyed by the level as a string. Each range
# starts one past the previous range's end, so the bounds look like INCLUSIVE
# row positions into the concatenated HSK table.
# NOTE(review): these are hard-coded; confirm they match the loaded HSK files.
HSK_levels = {
    str(level): bounds
    for level, bounds in enumerate(
        [
            (0, 496),
            (497, 1259),
            (1260, 2225),
            (2226, 3219),
            (3220, 4286),
            (4287, 5420),
            (5421, 11035),
        ],
        start=1,
    )
}

In [65]:
HSK_level = 3
num_hsk = HSK_levels[str(HSK_level)][1]
num_tokens = tokens_with_embed.index.size

simple_char = []
for jj in range(num_tokens):
    cos_sim = []
    if tokens_with_embed["HSK"][jj]<=HSK_level: # token is already simple enough
        simple_char.append(np.nan)
    else:
        for ii in range(num_hsk):
            cos_sim.append(sentence_model.similarity(tokens_with_embed["embedding"][jj], HSK["embedding"][ii])[0][0].float())
        simple_char.append(HSK.iloc[int(np.argmax(cos_sim)),0])

tokens_with_embed["simplified"] = simple_char