In [1]:
import spacy
import jieba
import pandas as pd
from tqdm import tqdm
from math import log

jieba.set_dictionary('data/dict.txt.big')

In [2]:
with open("data/clean/zh_book1.txt", 'r', encoding='utf-8') as file:
    text = file.read()
tokens = jieba.lcut(text, cut_all = False)

Building prefix dict from c:\Users\hugo_\Desktop\iql_lab3\data\dict.txt.big ...
Loading model from cache C:\Users\hugo_\AppData\Local\Temp\jieba.uceaac3f4f1db143f008c31874e8bc8f8.cache
Loading model cost 1.540 seconds.
Prefix dict has been built successfully.


In [3]:
punc = "！？｡。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.　 ﹔\n[]"
clean_tokens = [token for token in tokens if token not in punc]
text_without_punctuation = ','.join(clean_tokens)
print(len(clean_tokens),len(tokens),-len(clean_tokens)+len(tokens))

93188 132398 39210


In [4]:
clean_tokens[0]

'天豹'

In [5]:
# Function to create a DataFrame for each element
def create_dataframe_for_element(element, my_list):
    indexes = [i for i, x in enumerate(my_list) if x == element]
    df = pd.DataFrame({"Token": [element], "Index": [indexes]})
    return df

# Iterate through unique elements in the list and create DataFrames
dfs = [create_dataframe_for_element(element, clean_tokens) for element in tqdm(set(clean_tokens))]

# Concatenate DataFrames into a single DataFrame
positions = pd.concat(dfs, ignore_index=True)

100%|██████████| 14161/14161 [01:32<00:00, 152.42it/s]


In [6]:
positions

Unnamed: 0,Token,Index
0,指點,[123]
1,送母,[5915]
2,監牢,"[26184, 73563]"
3,學生,[5422]
4,力大無窮,"[71428, 89175]"
...,...,...
14156,上牀,"[33297, 33390, 36851, 43612, 48345, 62168, 833..."
14157,田爺,[92557]
14158,九霄雲外,[8944]
14159,燒完,[10861]


In [7]:
#Frequencies
def get_pos(df, token):
    pos = df.loc[df['Token'] == token, 'Index'].iloc[0]
    return pos

def f(x_token:str, y_token:str , tokens:list[str], set_df:pd.DataFrame, distance:int) -> int:
    fr = 0
    for i in get_pos(set_df, x_token):
        y_pos = i+distance
        if y_pos < len(tokens) and y_pos >= 0 and y_token == tokens[y_pos]:
           #print(True, y_pos/len(tokens), x_token, y_token)
           fr += 1
    return fr

def f_x(x_token:str, tokens:list[str], set_df:pd.DataFrame, distance:int) -> int:
    fr = 0
    for y_token in tqdm(set_df.iloc[:,0]):
        fr += f(x_token, y_token, tokens, set_df, distance)
    return fr

def f_y(y_token:str, tokens:list[str], set_df:pd.DataFrame, distance:int) -> int:
    fr = 0
    for x_token in tqdm(set_df.iloc[:,0]):
        fr += f(x_token, y_token, tokens, set_df, distance)
    return fr

def big_F(tokens:list[str], set_df:pd.DataFrame, distance:int) -> int:
    fr = 0
    for x_token in tqdm(set_df.iloc[:,0]):
        for y_token in set_df.iloc[:,0]:
            fr += f(x_token, y_token, tokens, set_df, distance)
    return fr

#--------------------------------------------------#

def fast_f_x(x_token:str, tokens:list[str], set_df:pd.DataFrame, distance:int) -> int:
    fr = 0
    for el in get_pos(set_df, x_token):
        pos = el + distance
        if pos < len(tokens) and pos >= 0:
            fr += 1
    return fr

def fast_f_y(y_token:str, tokens:list[str], set_df:pd.DataFrame, distance:int) -> int:
    fr = 0
    for el in get_pos(set_df, y_token):
        pos = el - distance
        if pos < len(tokens) and pos >= 0:
            fr += 1
    return fr

def fast_big_f(tokens:list[str], set_df:pd.DataFrame, distance:int) -> int:
    return (len(tokens)-distance)

In [8]:
#Probabilities
def p_x_y(x_token:str, y_token:str , tokens:list[str], set_df:pd.DataFrame, distance:int) -> float:
    p = f(x_token, y_token, tokens, set_df, distance) / fast_big_f(tokens, set_df, distance)
    return p

def p_x(x_token:str, tokens:list[str], set_df:pd.DataFrame, distance:int) -> float:
    p = fast_f_x(x_token, tokens, set_df, distance) / fast_big_f(tokens, set_df, distance)
    return p

def p_y(y_token:str , tokens:list[str], set_df:pd.DataFrame, distance:int) -> float:
    p = fast_f_y(y_token, tokens, set_df, distance) / fast_big_f(tokens, set_df, distance)
    return p

In [13]:
#Entropies
def H_X(tokens:list[str], set_df:pd.DataFrame, distance:int) -> float:
    F = fast_big_f(tokens, set_df, distance)
    H = 0
    for x_token in tqdm(set_df.iloc[:,0]):
        fx = fast_f_x(x_token, tokens, set_df, distance)
        if fx != 0:
            H += fx * log(fx)
    H = H/F
    H = log(F) - H 
    return H

def H_Y(tokens:list[str], set_df:pd.DataFrame, distance:int) -> float:
    F = fast_big_f(tokens, set_df, distance)
    H = 0
    for y_token in tqdm(set_df.iloc[:,0]):
        fy = fast_f_y(y_token, tokens, set_df, distance)
        if fy != 0:
            H += fy * log(fy)
    H = H/F
    H = log(F) - H 
    return H

def H_XY(tokens:list[str], set_df:pd.DataFrame, distance:int) -> float:
    F = fast_big_f(tokens, set_df, distance)
    H = 0
    for x_token in tqdm(set_df.iloc[:,0]):
        for y_token in set_df.iloc[:,0]:
            fr = f(x_token, y_token, tokens, set_df, distance)
            if fr != 0:
                H += fr * log(fr)
    H = H/F
    H = log(F) - H 
    return H

In [14]:
#Mutual information

def I(tokens:list[str], set_df:pd.DataFrame, distance:int) -> float:
    HX = H_X(tokens, set_df, distance)
    HY = H_Y(tokens, set_df, distance)
    H = H_XY(tokens, set_df, distance)
    return HX + HY - H

In [None]:
I(clean_tokens, positions, 2) #TODO This takes too long (ask ramon)

In [39]:
fast = fast_f_y('隨即', clean_tokens, positions, 2)
notfast = f_y('隨即', clean_tokens, positions, 2)
print(fast == notfast, fast, notfast)

100%|██████████| 14161/14161 [00:18<00:00, 783.90it/s]

True 22 22





In [None]:
len(clean_tokens)

93188