In [None]:
import jieba.posseg as pseg
import nltk
import pandas as pd
import numpy as np
import pickle
import seaborn as sns
import torch

In [4]:
df = pd.read_csv('../data/BLCU/literature_wordfreq.release_UTF-8.txt', header = None, sep="\t",)
df.rename(columns={0:"character", 1:"frequency"}, inplace=True)
df.set_index("character", inplace=True)
df["frequency"] = df["frequency"].rank(pct=True)
df.head()

Unnamed: 0_level_0,frequency
character,Unnamed: 1_level_1
的,1.0
了,0.999997
是,0.999994
我,0.999992
在,0.999989


In [5]:
with open("../data/HSK/HSK_levels.pickle", 'rb') as handle:
    HSK_dict = pickle.load(handle)
HSK = pd.DataFrame.from_dict(HSK_dict, orient='index', columns=["HSK"])
HSK.head()

Unnamed: 0,HSK
阿姨,4
啊,4
矮,4
矮小,4
爱国,4


In [None]:
HSK.loc["矮"]["HSK"] #returns the HSK level of the character
df.loc["矮"] #returns the freq rank of the character

np.int64(4)

In [58]:
HSK_data = []
for level in range(4,8):
    # get all characters at HSK level
    HSK4 = HSK.loc[HSK["HSK"]==level]
    # merge to get freq of HSK chars
    HSK4_freq = pd.merge(df, HSK4, how='right', left_index=True, right_index=True)["frequency"]
    print(f"HSK{level} data--- mean:", np.mean(HSK4_freq), "max:", np.max(HSK4_freq), "min:", np.min(HSK4_freq))
    HSK_data.append(HSK4_freq)

HSK4 data--- mean: 0.9753306336389995 max: 0.9999776228783691 min: 0.5621762310214037
HSK5 data--- mean: 0.972203775539278 max: 0.9998545487093995 min: 0.4506150911308278
HSK6 data--- mean: 0.9635792048980317 max: 0.9999021000928651 min: 0.6699668259171823
HSK7 data--- mean: 0.9476793222619491 max: 0.9999104915134767 min: 0.2998436398626045


In [116]:
l1 = list(df.iloc[:50000].index)
l2 = list(df.iloc[:50000].index)

In [None]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('BAAI/bge-large-zh-v1.5')
embeddings1 = model.encode(l1)
embeddings2 = model.encode(l2)
similarity_t = model.similarity(embeddings1, embeddings2)
top_choice = torch.flip(np.argsort(similarity_t, axis=1)[:,-16:-1], dims=(1,)).numpy().tolist()

In [6]:
import pickle
with open("../data/BLCU/top_similar.pickle", 'rb') as handle:
    top_similar = pickle.load(handle)

In [57]:
idx = 15000
top_list = list(df.index[top_similar[idx]])

for word in top_list:
    tagged_word = list(pseg.cut(word))  # POS tagging
    print(f"{word}: {[(w.word, w.flag) for w in tagged_word]}")

大方: [('大方', 'n')]
宽宏大量: [('宽宏大量', 'i')]
气度: [('气度', 'n')]
豁达: [('豁达', 'a')]
宽大: [('宽大', 'a')]
宽容: [('宽容', 'a')]
宽广: [('宽广', 'a')]
态度: [('态度', 'n')]
风度: [('风度', 'n')]
宽阔: [('宽阔', 'a')]
大气: [('大气', 'n')]
大大方方: [('大大方方', 'nz')]
尺度: [('尺度', 'n')]
厚道: [('厚道', 'n')]
从容: [('从容', 'v')]
