In [1]:
import pandas as pd
from tqdm import tqdm
import numpy as np
from collections import Counter
import plotly.graph_objects as go
import plotly.express as px
import os
import re #Regular expressions

#Tokenizers 
import spacy
import jieba
import es_core_news_sm, en_core_web_sm, ja_core_news_sm#, pl_core_news_sm, de_core_news_sm #spacy models

#NLP objects for (as we can't use shortcuts for loading the objects)
nlp_es= spacy.load("es_core_news_sm") #Spanish
nlp_ja= spacy.load("ja_core_news_sm") #Japanese
nlp_en= spacy.load("en_core_web_sm") #English
jieba.set_dictionary('data/dict.txt.big') #Chinese
from spacy.lang.ru import Russian
nlp_ru= Russian()

In [2]:
#Extracts raw text from each file 
codes_langs= ['zh','es', 'en', 'ru', 'ja'] #ISO code of languages: chinese, spanish, english, russian, japanese.
def extract_raw_texts(list_of_codes):
  raw_files_texts= {}
  path= "data/no_boilerplate/"
  all_file_names= [file for file in os.listdir(path) if file.endswith('.txt')] #enlists names of .txt files 
  for language in list_of_codes:
    files= [f for f in all_file_names if re.findall(language, f)] #separating files by language 
    number_texts= int(len(files))
    for f in files:
       raw= []
       with open(path + f, "r", encoding="utf-8") as file:
         t= file.read()
         raw.append(t)
       raw_files_texts[f]=raw
  return raw_files_texts

#Tokenizer 

def tokenizer(text, model_lang):
    nlp= model_lang #Opens spacy object
    nlp.max_length = 9000000
    doc=nlp(text) #Process text with spacy 
    tokens = [token.text for token in doc if not token.is_space and not token.is_punct and not token.is_digit]
    return tokens

def tokens(dict_raw_texts): #takes real_tokenizer and filters by language to tokenize
    tokens_langs= {} #dictionary to store output
    for item in list(dict_raw_texts.keys()):
        print(item)
        if 'es' in item:
            text= dict_raw_texts[item][0] #gets text from dict in raw_files_names 
            model_lang= nlp_es #loads corresponding model
            tokens= tokenizer(text, model_lang) #tokenizes
            tokens_langs[item]=tokens #appends to output dictionary 
        elif 'en' in item:
            text= dict_raw_texts[item][0] #gets text from dict in raw_files_names 
            model_lang= nlp_en #loads corresponding model
            tokens= tokenizer(text, model_lang) #tokenizes
            tokens_langs[item]=tokens #appends to output dictionary 
        elif 'zh' in item:
            punc= ["！？｡。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.　 ﹔\\n[]﹍\\n 「", "\n", ", "," ","，",": ",'。',"-"]
            text= dict_raw_texts[item][0]
            tok= jieba.lcut(text, cut_all = False)
            tokens= [t for t in tok if t not in punc]
            tokens_langs[item]=tokens
        elif 'ru' in item:
            text= dict_raw_texts[item][0]
            model_lang= nlp_ru 
            tokens= tokenizer(text, model_lang) 
            tokens_langs[item]=tokens 
    return tokens_langs

#Dictionary for the pairs of words
def create_dataframe(words_list, distance):
    #Find all the pairs at given distance 
    pairs = [(words_list[i], words_list[i + distance], i) for i in range(len(words_list) - distance)]
    #Save into a dataframe
    df = pd.DataFrame(pairs, columns=['Token x', 'Token y', 'Position of Token x'])
    df['Distance'] = distance
    return df

def collect_positions(df):
    df_grouped = df.groupby(['Token x', 'Token y']).agg({'Position of Token x': list}).reset_index()
    return df_grouped

#Entropies
def H_X(pairs) -> float:
    token_counts = Counter(pairs[0])
    F = len(pairs[0])
    H = 0

    for token_x, fx in token_counts.items():
        if fx != 0:
            H += fx * np.log(fx)

    H /= F
    H = np.log(F) - H

    return H

def H_Y(pairs) -> float:
    token_counts = Counter(pairs[1])
    F = len(pairs[0])
    H = 0

    for token_y, fy in token_counts.items():
        if fy != 0:
            H += fy * np.log(fy)

    H /= F
    H = np.log(F) - H
    return H

def H_XY(pairs, pairs_gr) -> float:
    F = len(pairs[0])
    H = 0
    for pair in pairs_gr:
        fr = len(pair[2])
        if fr != 0:
            H += fr * np.log(fr)

    H /= F
    H = np.log(F) - H

    return H

#Mutual information

def I(pairs, pairs_gr) -> float: 
    HX = H_X(pairs)
    HY = H_Y(pairs)
    H = H_XY(pairs, pairs_gr)
    return HX + HY - H

def mutual_information(tokens, max_d):
    MI = np.zeros(max_d)
    for i in range(1, max_d):

        pairs = create_dataframe(tokens, i)
        pairs_grouped = collect_positions(pairs)

        pairs_np = np.transpose(pairs.to_numpy())
        pairs_grouped_np = pairs_grouped.to_numpy()

        MI[i] = I(pairs_np, pairs_grouped_np)
    return MI

In [3]:
raw_texts= extract_raw_texts(codes_langs)

In [4]:
tokenized_texts= tokens(raw_texts)

Building prefix dict from c:\Users\hugo_\Desktop\iql_lab3\data\dict.txt.big ...
Loading model from cache C:\Users\hugo_\AppData\Local\Temp\jieba.uceaac3f4f1db143f008c31874e8bc8f8.cache


zh_book1.txt


Loading model cost 1.493 seconds.
Prefix dict has been built successfully.


zh_book2.txt
es_book1.txt
es_book2.txt
en_book3.txt
en_macbeth.txt
en_quixote.txt
ru_book1.txt
ru_book2.txt
ru_book3.txt


In [10]:
for key in tokenized_texts.keys():
    tokens = tokenized_texts[key]
    print("# of tokens in "+key+"\t -->\t ",len(tokens))

# of tokens in zh_book1.txt	 -->	  108918
# of tokens in zh_book2.txt	 -->	  88237
# of tokens in es_book1.txt	 -->	  188480
# of tokens in es_book2.txt	 -->	  51028
# of tokens in en_book3.txt	 -->	  13219
# of tokens in en_macbeth.txt	 -->	  18747
# of tokens in en_quixote.txt	 -->	  62631
# of tokens in ru_book1.txt	 -->	  3684
# of tokens in ru_book2.txt	 -->	  17826
# of tokens in ru_book3.txt	 -->	  6003


In [None]:
for key in tokenized_texts.keys():
    tokens = tokenized_texts[key]

    MI = mutual_information(tokens, 10)

    print(MI)

In [16]:
tokens = tokenized_texts["en_macbeth.txt"]

MI = mutual_information(tokens, 50)

print(MI)

  0%|          | 0/49 [00:00<?, ?it/s]

100%|██████████| 49/49 [00:18<00:00,  2.64it/s]

[0.         4.14746325 3.9999998  3.92327878 3.92286143 3.90103882
 3.89736819 3.89025172 3.89059414 3.88793221 3.88299102 3.88841594
 3.88389643 3.87846295 3.88353184 3.88352941 3.8803807  3.88809519
 3.88299228 3.88024125 3.88116169 3.88327248 3.87965437 3.87843692
 3.87846146 3.87712468 3.88364628 3.87943087 3.88169933 3.88525184
 3.88706069 3.87769807 3.88168711 3.8808143  3.88360101 3.88125044
 3.87994639 3.8786995  3.87975323 3.88839786 3.87733383 3.88256563
 3.87892979 3.88102478 3.87127757 3.88163412 3.87646517 3.88376992
 3.88330657 3.88242943]





In [20]:
fig = go.Figure()
fig.add_trace(go.Scatter(y=MI[1:], x=list(range(1, len(MI))), mode='markers'))
fig.update_layout(xaxis=dict(range=[0, np.log10(len(MI))]))
fig.update_layout(
    xaxis_title="d",
    xaxis_type="log",
    yaxis_title="I(d)",
    yaxis_type="log",
)
fig.show()

In [3]:
import numpy as np
import random
from collections import Counter
from scipy.stats import norm
from multiprocessing import Pool, cpu_count

# Shuffle the text
def shuffle_text(tokens):
    shuffled = tokens[:]
    random.shuffle(shuffled)
    return shuffled

# Calculate the p-value
def calculate_p_value(observed_mi, shuffled_mis):
    mean_shuffled = np.mean(shuffled_mis)
    std_shuffled = np.std(shuffled_mis)
    z_score = (observed_mi - mean_shuffled) / std_shuffled
    p_value = 1 - norm.cdf(z_score)
    return p_value

In [24]:
tokens = tokenized_texts["es_book1.txt"]

# Worker function to compute mutual information for shuffled text
def worker(shuffled_tokens, max_d):
    return mutual_information(shuffled_tokens, max_d)

max_d = 15
# Number of shuffles
num_shuffles = 20

# Calculate I(d) for original text
print("Calculating I(d) until d =", max_d)
original_mi = mutual_information(tokens, max_d)
print(original_mi)

# Calculate I(d) for shuffled texts
print("Suffling the text ", num_shuffles, " times")
shuffled_mis = np.zeros((num_shuffles, max_d))
pool = Pool(cpu_count())

# Generate shuffled texts
shuffled_tokens_list = [shuffle_text(tokens) for _ in range(num_shuffles)]

# Calculate I(d) for shuffled texts in parallel
print("Parallel computing of I(d) for the suffled texts")
results = pool.starmap(worker, [(shuffled_tokens, max_d) for shuffled_tokens in shuffled_tokens_list])
pool.close()
pool.join()

shuffled_mis = np.array(results)

# Calculate p-values
p_values = calculate_p_value(original_mi, shuffled_mis)
print("\n\n")
# Print results
for d in range(1, max_d):
    print(f"d = {d}, I(d) = {original_mi[d]:.4f}, p-value = {p_values[d]:.4f}")

Calculating I(d) until d =  15
[0.         2.87277418 2.60486295 2.4880772  2.43077449 2.43216144
 2.42425016 2.42358621 2.42435584 2.41881864 2.4200168  2.42016595
 2.41800355 2.41629756 2.42083745]
Suffling the text  20  times
Parallel computing of I(d) for the suffled texts


In [1]:
import os
import time
from multiprocessing import Pool, cpu_count

# Function to compute the square of a number and print the process ID and timestamp
def compute_square(n):
    pid = os.getpid()
    timestamp = time.time()
    print(f"Process {pid} is computing the square of {n} at {timestamp}", flush=True)
    # Simulate a more time-consuming computation
    result = 0
    for _ in range(1000000):
        result += n * n
    return result

# Sequential execution for comparison
def sequential_execution(numbers):
    results = []
    for n in numbers:
        results.append(compute_square(n))
    return results

# Parallel execution using multiprocessing
def parallel_execution(numbers):
    with Pool(cpu_count()) as pool:
        results = pool.map(compute_square, numbers)
    return results

# Test data
numbers = list(range(10))

# Sequential execution
print("Sequential execution:")
start_time = time.time()
sequential_results = sequential_execution(numbers)
sequential_time = time.time() - start_time

print(f"Sequential results: {sequential_results}")
print(f"Sequential execution time: {sequential_time:.2f} seconds")

# Parallel execution
print("\nParallel execution:")
start_time = time.time()
parallel_results = parallel_execution(numbers)
parallel_time = time.time() - start_time

print(f"Parallel results: {parallel_results}")
print(f"Parallel execution time: {parallel_time:.2f} seconds")

# Verify if the results are correct
assert sequential_results == parallel_results, "Results do not match!"
print("Parallelization test completed successfully.")


Sequential execution:
Process 24692 is computing the square of 0 at 1715973024.201843
Process 24692 is computing the square of 1 at 1715973024.2508934
Process 24692 is computing the square of 2 at 1715973024.3164656
Process 24692 is computing the square of 3 at 1715973024.3901784
Process 24692 is computing the square of 4 at 1715973024.4624872
Process 24692 is computing the square of 5 at 1715973024.5299168
Process 24692 is computing the square of 6 at 1715973024.5982296
Process 24692 is computing the square of 7 at 1715973024.6691754
Process 24692 is computing the square of 8 at 1715973024.7398918
Process 24692 is computing the square of 9 at 1715973024.8062575
Sequential results: [0, 1000000, 4000000, 9000000, 16000000, 25000000, 36000000, 49000000, 64000000, 81000000]
Sequential execution time: 0.67 seconds

Parallel execution:
