In [332]:
import spacy
import jieba
import pandas as pd
from tqdm import tqdm
import numpy as np
from collections import Counter
import plotly.graph_objects as go
import plotly.express as px

# Make jieba build its prefix dictionary from the project-local file data/dict.txt.big
# (path is relative to the notebook's working directory).
jieba.set_dictionary('data/dict.txt.big')

In [333]:
# DataFrame helpers for the pairs of words
def create_dataframe(words_list, distance):
    """Build a table of every token pair that lies `distance` positions apart.

    Args:
        words_list: Indexable sequence of tokens.
        distance: Offset between the first and the second token of each pair.

    Returns:
        A DataFrame with one row per pair and the columns 'Token x',
        'Token y', 'Position of Token x' (index of the first token in
        `words_list`) and 'Distance' (the same `distance` on every row).
    """
    columns = ['Token x', 'Token y', 'Position of Token x']
    n_pairs = len(words_list) - distance

    # Walk over every start position that still has a partner `distance` ahead.
    rows = []
    for position in range(n_pairs):
        rows.append((words_list[position], words_list[position + distance], position))

    table = pd.DataFrame(rows, columns=columns)
    return table.assign(Distance=distance)

def collect_positions(df):
    """Collapse a pair table to one row per distinct (Token x, Token y) pair.

    Args:
        df: DataFrame with at least the columns 'Token x', 'Token y' and
            'Position of Token x'.

    Returns:
        A DataFrame with the columns 'Token x', 'Token y' and
        'Position of Token x', where the last column holds the list of all
        positions at which that pair occurs.
    """
    pair_keys = ['Token x', 'Token y']
    by_pair = df.groupby(pair_keys)
    positions_per_pair = by_pair.agg({'Position of Token x': list})
    # Turn the (Token x, Token y) group index back into ordinary columns.
    return positions_per_pair.reset_index()

#Entropies
def H_X(pairs) -> float:
    """Entropy (natural log) of the tokens found in ``pairs[0]``.

    Evaluates ``log(F) - (1/F) * sum(f_x * log(f_x))`` where ``f_x`` is the
    number of times each distinct token occurs in ``pairs[0]`` and ``F`` is
    the value returned by ``big_f(pairs)``.

    Args:
        pairs: Indexable container whose element 0 is the sequence of first
            tokens of every pair.

    Returns:
        The value of the expression above.
    """
    x_frequencies = Counter(pairs[0])
    # NOTE(review): big_f is not defined in the cells visible here; presumably it
    # returns the total number of pairs — confirm it exists on a fresh kernel.
    total = big_f(pairs)

    weighted_logs = sum(
        freq * np.log(freq) for freq in x_frequencies.values() if freq != 0
    )
    mean_weighted_log = weighted_logs / total
    return np.log(total) - mean_weighted_log

def H_Y(pairs) -> float:
    """Entropy (natural log) of the tokens found in ``pairs[1]``.

    Evaluates ``log(F) - (1/F) * sum(f_y * log(f_y))`` where ``f_y`` is the
    number of times each distinct token occurs in ``pairs[1]`` and ``F`` is
    the value returned by ``big_f(pairs)``.

    Args:
        pairs: Indexable container whose element 1 is the sequence of second
            tokens of every pair.

    Returns:
        The value of the expression above.
    """
    y_frequencies = Counter(pairs[1])
    # NOTE(review): big_f is not defined in the cells visible here; presumably it
    # returns the total number of pairs — confirm it exists on a fresh kernel.
    total = big_f(pairs)

    weighted_logs = sum(
        freq * np.log(freq) for freq in y_frequencies.values() if freq != 0
    )
    mean_weighted_log = weighted_logs / total
    return np.log(total) - mean_weighted_log

def H_XY(pairs, pairs_gr) -> float:
    """Joint entropy (natural log) of the token pairs.

    Evaluates ``log(F) - (1/F) * sum(f * log(f))`` where each ``f`` is
    ``len(row[2])`` for a row of ``pairs_gr`` and ``F`` is the value returned
    by ``big_f(pairs)``.

    Args:
        pairs: Passed unchanged to ``big_f`` to obtain ``F``.
        pairs_gr: Iterable of rows; element 2 of each row must support
            ``len()`` and its length is used as that pair's frequency.

    Returns:
        The value of the expression above.
    """
    # NOTE(review): big_f is not defined in the cells visible here; presumably it
    # returns the total number of pairs — confirm it exists on a fresh kernel.
    total = big_f(pairs)

    weighted_logs = 0
    for row in pairs_gr:
        occurrences = len(row[2])
        if occurrences == 0:
            continue
        weighted_logs += occurrences * np.log(occurrences)

    mean_weighted_log = weighted_logs / total
    return np.log(total) - mean_weighted_log

#Mutual information

def I(pairs, pairs_gr) -> float:
    """Mutual information of the pairs: ``H_X + H_Y - H_XY``.

    Args:
        pairs: Forwarded to ``H_X``, ``H_Y`` and ``H_XY``.
        pairs_gr: Grouped pair rows, forwarded to ``H_XY``.

    Returns:
        Sum of the two marginal entropies minus the joint entropy.
    """
    marginal_x = H_X(pairs)
    marginal_y = H_Y(pairs)
    joint = H_XY(pairs, pairs_gr)
    return marginal_x + marginal_y - joint

In [334]:
# Read the whole cleaned book into memory as one UTF-8 string.
with open("data/clean/zh_book1.txt", 'r', encoding='utf-8') as file:
    text = file.read()
# Segment the text into a list of tokens with jieba (cut_all disabled).
tokens = jieba.lcut(text, cut_all = False)

Building prefix dict from c:\Users\hugo_\Desktop\iql_lab3\data\dict.txt.big ...
Loading model from cache C:\Users\hugo_\AppData\Local\Temp\jieba.uceaac3f4f1db143f008c31874e8bc8f8.cache


Loading model cost 2.087 seconds.
Prefix dict has been built successfully.


In [335]:
# Characters treated as punctuation / whitespace and removed from the token stream.
punc = "！？｡。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.　 ﹔\n[]﹍"
# Drop a token when EVERY one of its characters is listed in `punc`.
# The previous `token not in punc` was a substring test on a string: it only removed
# tokens that appear contiguously inside `punc`, so multi-character punctuation runs
# such as "..." or "……" were kept and counted as words. Checking character by character
# removes everything the old test removed (including empty tokens) plus those runs.
clean_tokens = [token for token in tokens if not all(char in punc for char in token)]
text_without_punctuation = ','.join(clean_tokens)
# Report: tokens kept, tokens before filtering, tokens removed.
print(len(clean_tokens), len(tokens), len(tokens) - len(clean_tokens))

93186 132398 39212


In [337]:
# Exclusive upper bound on the distance: the loop below evaluates d = 1 .. max_d - 1.
max_d = 50
# MI[d] receives the mutual information at distance d; index 0 is never written and stays 0.
MI = np.zeros(max_d)

for i in tqdm(range(1, max_d)):

    # Every (token, token i positions later, position) row for this distance.
    pairs = create_dataframe(clean_tokens, i)
    # One row per distinct (Token x, Token y) with the list of positions where it occurs.
    pairs_grouped = collect_positions(pairs)

    # Transpose so that row 0 / row 1 are the 'Token x' / 'Token y' columns,
    # which is how H_X and H_Y index their argument (pairs[0], pairs[1]).
    pairs_np = np.transpose(pairs.to_numpy())
    # Rows stay as (Token x, Token y, positions); H_XY reads element 2 of each row.
    pairs_grouped_np = pairs_grouped.to_numpy()

    MI[i] = I(pairs_np, pairs_grouped_np)

print(MI)

100%|██████████| 49/49 [01:29<00:00,  1.82s/it]

[0.         4.4785212  4.0716655  3.97574931 3.94599839 3.93358154
 3.92621809 3.91956053 3.91610042 3.91780302 3.91306368 3.91412179
 3.91311793 3.91322618 3.91335384 3.9129089  3.90697638 3.90767422
 3.90706941 3.91110741 3.91153067 3.90911863 3.90764111 3.9058589
 3.9057672  3.90277385 3.90555785 3.90520785 3.90664827 3.9052327
 3.90500558 3.90300452 3.90828146 3.90480953 3.9027206  3.90154009
 3.90405181 3.90223376 3.90257428 3.8997887  3.90239686 3.90033816
 3.90165543 3.90136538 3.9053431  3.8992504  3.90098382 3.89771988
 3.89673642 3.90039487]





In [361]:
# Scatter I(d) against d on log-log axes; MI[0] is skipped because no distance 0 is computed.
fig = go.Figure(
    data=[go.Scatter(x=list(range(1, len(MI))), y=MI[1:], mode='markers')]
)
fig.update_layout(
    # The x range is given as 0 .. log10(len(MI)) and the axis type is set to "log" below.
    xaxis=dict(range=[0, np.log10(len(MI))]),
    xaxis_title="d",
    xaxis_type="log",
    yaxis_title="I(d)",
    yaxis_type="log",
)
fig.show()