In [129]:
###### import packages, sentence model, and parse html from site url
import requests
import re
from bs4 import BeautifulSoup
import jieba
import torch
import string
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
sentence_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
language_model = SentenceTransformer('sentence-transformers/distiluse-base-multilingual-cased-v1')

url = 'https://www.bbc.com/zhongwen/articles/c4gl97d2rzjo/simp'
site = requests.get(url)
site_soup = BeautifulSoup(site.text, 'html.parser')
# print(site_soup.prettify())

In [4]:
##### import CNER model and function
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
ner_pipeline = pipeline(Tasks.named_entity_recognition, 'damo/nlp_raner_named-entity-recognition_chinese-base-news')

In [5]:
HSK = pd.read_pickle("../data/Chinese/HSK_full")
HSK.head()

Unnamed: 0,character,POS,pinyin,HSK,definition,embedding,top_choice,top_choice_level
0,爱,V,ài,1,"love, like, be fond of, be keen on, cherish, b...","[0.37996447, 0.29299688, 0.55704415, 0.0736245...","[1, 3048, 414, 9871, 1893, 8870, 8113, 4108, 2...","[1, 4, 1, 7, 3, 7, 7, 5, 4, 7, 4, 7, 7, 3, 7, ..."
1,爱好,V/N,àihào,1,"love, like, be fond of, be keen on","[0.4404698, 0.29555953, 0.5669606, -0.0190896,...","[0, 3048, 414, 9871, 1893, 2533, 8870, 10518, ...","[1, 4, 1, 7, 3, 4, 7, 7, 4, 7, 5, 4, 7, 7, 5, ..."
2,爸爸,,bàba,1,"old man, father, papa, pappa, daddy, pa, beget...","[0.05222196, 0.18916288, 0.2597713, -0.585589,...","[6333, 453, 229, 3202, 292, 5445, 2729, 1518, ...","[7, 1, 1, 4, 1, 6, 4, 3, 1, 5, 7, 1, 4, 1, 3, ..."
3,白,Adj,bái,1,"white, clear, pure, plain, wrongly written/mis...","[-0.17480375, 0.23118941, -0.27152503, 0.10098...","[3, 6129, 5506, 6064, 6575, 546, 2081, 1559, 7...","[1, 7, 7, 7, 7, 2, 3, 3, 7, 2, 7, 7, 7, 4, 7, ..."
4,白,Adv,bái,1,"white, clear, pure, plain, wrongly written/mis...","[-0.17480375, 0.23118941, -0.27152503, 0.10098...","[3, 6129, 5506, 6064, 6575, 546, 2081, 1559, 7...","[1, 7, 7, 7, 7, 2, 3, 3, 7, 2, 7, 7, 7, 4, 7, ..."


In [80]:
###### get text from the html and tokenize
site_s = site_soup.find_all(string=True) # if paragraph, choose ('p') as func arg
text = ""
punc = ",!?！？｡。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.-" # possible punctuation

for s in site_s:
    text += str(s.get_text()) # append new text to the text string

text.translate(str.maketrans('', '', string.punctuation)) # convert str format
text_re = re.sub(r"[%s]+" %punc, "", text) # remove punctuation marks

tokens = jieba.lcut(text_re, cut_all=False) # tokenize
tokens_l = list(dict.fromkeys(tokens)) #TODO: try: list(set(tokens))

In [None]:
def batch(iterable, n=1): # define batching function to run NER pipeline (accepts max 512 characters)
    l = len(iterable)
    for ndx in range(0, l, n):
        yield iterable[ndx:min(ndx + n, l)]

In [100]:
ner_output = []
for b in batch(text_re, n=510): # batch text for processing in NER pipeline
    ner_output += ner_pipeline(b)['output'] # add batch to ongoing list
tokens_ner = list(set([d['span'] for d in ner_output]))
ner_overlap = list(set(tokens_l) & set(tokens_ner)) # find intersection of NER words and jieba tokens
tokens_no_ner = list(set(tokens_l) - set(tokens_ner))



In [102]:
top_choice = torch.tensor(HSK['top_choice'])
top_choice_HSK = torch.tensor(HSK['top_choice_level'])

In [103]:
def simplify(tokens, max_HSK):
    simplified_tokens = dict()
    for token in tokens:
        try:
            char_idx = np.where(HSK["character"]==token)[0][0]
            if HSK["HSK"][char_idx]>max_HSK:
                top_idx = top_choice[char_idx][next(x[0] for x in enumerate(top_choice_HSK[char_idx]) if x[1] <= max_HSK)] # iterate through and find index of first element below max_HSK
                simplified_tokens[token] = HSK["character"].loc[int(top_idx)]
            else:
                pass# simplified_tokens[idx] = 0
        except:
            pass# simplified_tokens[idx] = 1
    return simplified_tokens

In [127]:
replacement_dict = simplify(tokens_no_ner, 3)

In [130]:
for element in site_soup.find_all(string=True):  # Get all text nodes
    text = element
    for entity in tokens_ner:
        text = text.replace(entity, f'<span style="color: blue;">{entity}</span>')
    for old_word, new_word in replacement_dict.items():
        if old_word in text:
            text = text.replace(old_word, f'<span style="color: red;">{new_word}</span>')
    element.replace_with(BeautifulSoup(text, 'html.parser'))

In [131]:
print(site_soup.prettify())

html
<html class="no-js" dir="ltr" lang="zh-hans">
 <head>
  <script>
   window.__reverb = {};
            window.__reverb.__reverbLoadedPromise = new Promise((resolve, reject) => {
              window.__reverb.__resolveReverbLoaded = resolve;
              window.__reverb.__rejectReverbLoaded = reject;
            });

            window.__reverb.__reverbTimeout = setTimeout(() => {
              window.__reverb.__rejectReverbLoaded();
            }, 5000);
  </script>
  <script async="" src="https://mybbc-analytics.files.bbci.co.uk/reverb-client-js/reverb-3.9.2.js">
  </script>
  <title data-react-helmet="true">
   <span style="color: blue;">
    中
   </span>
   国经济：
   <span style="color: blue;">
    中
   </span>
   央经济工作会前刺激信号频出，两位经济学家被禁言 - BBC News
   <span style="color: blue;">
    中
   </span>
   文
  </title>
  <meta content="IE=edge" data-react-helmet="true" http-equiv="X-UA-Compatible"/>
  <meta charset="utf-8" data-react-helmet="true"/>
  <meta content="noodp, noydir, max-im

In [132]:
with open("output.html", "w", encoding = 'utf-8') as file: 
    # prettify the soup object and convert it into a string 
    file.write(str(site_soup.prettify())) 