In [70]:
import pandas as pd
from tqdm import tqdm
import numpy as np
from collections import Counter
import plotly.graph_objects as go
import plotly.express as px
import os
import re #Regular expressions

#Tokenizers 
import spacy
import jieba
import es_core_news_sm, en_core_web_sm, ja_core_news_sm, pl_core_news_sm, de_core_news_sm #spacy models

#NLP objects for (as we can't use shortcuts for loading the objects)
nlp_es= spacy.load("es_core_news_sm") #Spanish
nlp_ja= spacy.load("ja_core_news_sm") #Japanese
nlp_en= spacy.load("en_core_web_sm") #English
jieba.set_dictionary('data/dict.txt.big') #Chinese
from spacy.lang.ru import Russian
nlp_ru= Russian()

In [90]:
#Extracts raw text from each file 
codes_langs= ['zh','es', 'en', 'ru', 'ja'] #ISO code of languages: chinese, spanish, english, russian, japanese.
def extract_raw_texts(list_of_codes):
  raw_files_texts= {}
  path= "data/no_boilerplate/"
  all_file_names= [file for file in os.listdir(path) if file.endswith('.txt')] #enlists names of .txt files 
  for language in list_of_codes:
    files= [f for f in all_file_names if re.findall(language, f)] #separating files by language 
    number_texts= int(len(files))
    for f in files:
       raw= []
       with open(path + f, "r", encoding="utf-8") as file:
         t= file.read()
         raw.append(t)
       raw_files_texts[f]=raw
  return raw_files_texts

#Tokenizer 

def tokenizer(text, model_lang):
    nlp= model_lang #Opens spacy object
    nlp.max_length = 9000000
    doc=nlp(text) #Process text with spacy 
    tokens = [token.text for token in doc if not token.is_space and not token.is_punct and not token.is_digit]
    return tokens

def tokens(dict_raw_texts): #takes real_tokenizer and filters by language to tokenize
    tokens_langs= {} #dictionary to store output
    for item in list(dict_raw_texts.keys()):
        if 'es' in item:
            text= dict_raw_texts[item][0] #gets text from dict in raw_files_names 
            model_lang= nlp_es #loads corresponding model
            tokens= tokenizer(text, model_lang) #tokenizes
            tokens_langs[item]=tokens #appends to output dictionary 
        elif 'zh' in item:
            punc= ["！？｡。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.　 ﹔\\n[]﹍\\n", "\n", ", "," ","，",": ",'。',"-"]
            text= dict_raw_texts[item][0]
            tok= jieba.lcut(text, cut_all = False)
            tokens= [t for t in tok if t not in punc]
            tokens_langs[item]=tokens
       # elif 'ja' in item: CAMBIAR A OTRA LENGUA. Esta da muchos errores
        #    text= dict_raw_texts[item][0]
         #   model_lang= nlp_ja
          #  tokens= tokenizer(text, model_lang) 
           # tokens_langs[item]=tokens 
        elif 'ru' in item:
            text= dict_raw_texts[item][0]
            model_lang= nlp_es 
            tokens= tokenizer(text, model_lang) 
            tokens_langs[item]=tokens 
    return tokens_langs

#Dictionary for the pairs of words
def create_dataframe(words_list, distance):
    #Find all the pairs at given distance 
    pairs = [(words_list[i], words_list[i + distance], i) for i in range(len(words_list) - distance)]
    #Save into a dataframe
    df = pd.DataFrame(pairs, columns=['Token x', 'Token y', 'Position of Token x'])
    df['Distance'] = distance
    return df

def collect_positions(df):
    df_grouped = df.groupby(['Token x', 'Token y']).agg({'Position of Token x': list}).reset_index()
    return df_grouped

#Entropies
def H_X(pairs) -> float:
    token_counts = Counter(pairs[0])
    F = big_f(pairs)
    H = 0

    for token_x, fx in token_counts.items():
        if fx != 0:
            H += fx * np.log(fx)

    H /= F
    H = np.log(F) - H

    return H

def H_Y(pairs) -> float:
    token_counts = Counter(pairs[1])
    F = big_f(pairs)
    H = 0

    for token_y, fy in token_counts.items():
        if fy != 0:
            H += fy * np.log(fy)

    H /= F
    H = np.log(F) - H
    return H

def H_XY(pairs, pairs_gr) -> float:
    F = big_f(pairs)
    H = 0
    for pair in pairs_gr:
        fr = len(pair[2])
        if fr != 0:
            H += fr * np.log(fr)

    H /= F
    H = np.log(F) - H

    return H

#Mutual information

def I(pairs, pairs_gr) -> float: 
    HX = H_X(pairs)
    HY = H_Y(pairs)
    H = H_XY(pairs, pairs_gr)
    return HX + HY - H

In [80]:
raw_texts= extract_raw_texts(codes_langs)

In [91]:
tokenized_texts= tokens(raw_texts)

In [92]:
tokenized_texts['zh_book1.txt']

['Produced',
 'by',
 'Nai',
 'Wen',
 'Cai',
 '\u3000',
 '\u3000',
 '\u3000',
 '\u3000',
 '\u3000',
 '\u3000',
 '\u3000',
 '\u3000',
 '\u3000',
 '\u3000',
 '\u3000',
 '\u3000',
 '\u3000',
 '\u3000',
 '\u3000',
 '\u3000',
 '天豹',
 '圖',
 '\u3000',
 '\u3000',
 '\u3000',
 '\u3000',
 '\u3000',
 '\u3000',
 '\u3000',
 '\u3000',
 '\u3000',
 '\u3000',
 '\u3000',
 '\u3000',
 '\u3000',
 '\u3000',
 '\u3000',
 '[',
 '清',
 ']',
 '佚名',
 '序',
 '\u3000',
 '\u3000',
 '若夫',
 '指帝',
 '天而喻',
 '美',
 '賦',
 '雲雨',
 '以',
 '傳奇',
 '此固',
 '小說家',
 '鏗金戛玉',
 '多',
 '存嬿婉之詞',
 '是',
 '世人',
 '之',
 '不可',
 '與',
 '莊語',
 '也',
 '然謂',
 '柳絮',
 '之才',
 '罕柏舟',
 '之操',
 '如雲',
 '之媛罔崩城',
 '之烈',
 '辭華',
 '之',
 '媖',
 '多',
 '同車',
 '之行',
 '苧羅之姝靡',
 '坐檯',
 '之守',
 '竊香',
 '之姬',
 '無',
 '墜樓',
 '之志',
 '琴心',
 '之女',
 '乏',
 '投梭',
 '之貞',
 '何哉',
 '？',
 '竹箭',
 '不花',
 '芙',
 '蕖',
 '寡節',
 '豈非',
 '騷人墨客',
 '借',
 '古人',
 '以',
 '澆',
 '胸中壘塊',
 '也',
 '\u3000',
 '\u3000',
 '細閱',
 '此書',
 '寓旨',
 '隱躍',
 '如諷',
 '如嘲',
 '全在',
 '浩然之氣',
 '耳',
 '觀施碧霞',
 '賣身',
 '葬母',
 '陷入',
 

In [337]:
max_d = 50
MI = np.zeros(max_d)

for i in tqdm(range(1, max_d)):

    pairs = create_dataframe(clean_tokens, i)
    pairs_grouped = collect_positions(pairs)

    pairs_np = np.transpose(pairs.to_numpy())
    pairs_grouped_np = pairs_grouped.to_numpy()

    MI[i] = I(pairs_np, pairs_grouped_np)

print(MI)

100%|██████████| 49/49 [01:29<00:00,  1.82s/it]

[0.         4.4785212  4.0716655  3.97574931 3.94599839 3.93358154
 3.92621809 3.91956053 3.91610042 3.91780302 3.91306368 3.91412179
 3.91311793 3.91322618 3.91335384 3.9129089  3.90697638 3.90767422
 3.90706941 3.91110741 3.91153067 3.90911863 3.90764111 3.9058589
 3.9057672  3.90277385 3.90555785 3.90520785 3.90664827 3.9052327
 3.90500558 3.90300452 3.90828146 3.90480953 3.9027206  3.90154009
 3.90405181 3.90223376 3.90257428 3.8997887  3.90239686 3.90033816
 3.90165543 3.90136538 3.9053431  3.8992504  3.90098382 3.89771988
 3.89673642 3.90039487]





In [361]:
fig = go.Figure()
fig.add_trace(go.Scatter(y=MI[1:], x=list(range(1, len(MI))), mode='markers'))
fig.update_layout(xaxis=dict(range=[0, np.log10(len(MI))]))
fig.update_layout(
    xaxis_title="d",
    xaxis_type="log",
    yaxis_title="I(d)",
    yaxis_type="log",
)
fig.show()