<a href="https://colab.research.google.com/github/agrigoridou/Tokenization-Zipf-s-Law-N-gram-Models/blob/main/%CE%91_Tokens%2C_Types%2C_Zipf%E2%80%99s_Law_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Προαπαιτούμενες βιβλιοθήκες

In [3]:
!pip install nltk spacy transformers
!python -m spacy download en_core_web_sm

import nltk
from nltk.tokenize import word_tokenize
import spacy
from transformers import BertTokenizer
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter
import math

Collecting en-core-web-sm==3.7.1
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


# Φόρτωση του αρχείου

In [5]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [9]:
# Read the whole untokenized WSJ corpus into memory as one string.
# The encoding is passed explicitly so decoding does not depend on the
# runtime's locale default.
with open(
    "/content/drive/MyDrive/Tokenization-Zipf-s-Law-N-gram-Models/wsj_untokenized.txt",
    "r",
    encoding="utf-8",
) as file:
    text = file.read()


# Μέθοδοι Tokenization

## NLTK Tokenization

In [12]:
# word_tokenize needs the Punkt sentence models. NLTK 3.9+ loads them from the
# 'punkt_tab' resource, older releases from 'punkt'. Both are requested so the
# cell works with whichever NLTK version the unpinned pip install provided.
for punkt_resource in ('punkt', 'punkt_tab'):
    nltk.download(punkt_resource)

nltk_tokens = word_tokenize(text)
nltk_token_count = len(nltk_tokens)          # total number of tokens
nltk_unique_tokens = len(set(nltk_tokens))   # number of distinct token strings (types)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## spaCy Tokenization

In [13]:
nlp = spacy.load("en_core_web_sm")
# Only the token texts are used, so run just the tokenizer (make_doc) rather
# than the whole pipeline. The remaining en_core_web_sm components annotate
# tokens but are not expected to re-split them, and skipping them avoids
# tagging/parsing the entire corpus.
spacy_tokens = [token.text for token in nlp.make_doc(text)]
spacy_token_count = len(spacy_tokens)          # total number of tokens
spacy_unique_tokens = len(set(spacy_tokens))   # number of distinct token strings (types)

## BERT Tokenization

In [14]:
# Tokenize the corpus with the tokenizer published for the cased BERT base model.
BERT_MODEL_NAME = "bert-base-cased"
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)
bert_tokens = tokenizer.tokenize(text)

# Totals: every produced token vs. the set of distinct token strings.
bert_vocabulary = set(bert_tokens)
bert_token_count = len(bert_tokens)
bert_unique_tokens = len(bert_vocabulary)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



# 1. Συνολικά tokens και unique tokens

In [16]:
# Put the counts on a log scale so they are easier to compare.
# np.log1p(n) computes log(1 + n), so a count of 0 is handled as well.
log_nltk_token_count, log_spacy_token_count, log_bert_token_count = (
    np.log1p(total)
    for total in (nltk_token_count, spacy_token_count, bert_token_count)
)
log_nltk_unique_tokens, log_spacy_unique_tokens, log_bert_unique_tokens = (
    np.log1p(unique)
    for unique in (nltk_unique_tokens, spacy_unique_tokens, bert_unique_tokens)
)

# One row per tokenization method, holding the log-scaled results.
df_log_token_counts = pd.DataFrame(
    [
        ('NLTK', log_nltk_token_count, log_nltk_unique_tokens),
        ('spaCy', log_spacy_token_count, log_spacy_unique_tokens),
        ('BERT', log_bert_token_count, log_bert_unique_tokens),
    ],
    columns=['Method', 'Log of Total Tokens', 'Log of Unique Tokens'],
)

print("Log Token Counts per Method:\n", df_log_token_counts)

Log Token Counts per Method:
   Method  Log of Total Tokens  Log of Unique Tokens
0   NLTK                  0.0                   0.0
1  spaCy                  0.0                   0.0
2   BERT                  0.0                   0.0


# 2. Επιλογή πρότασης με τουλάχιστον 10 tokens και tokenization της

In [None]:
import random

# Fixed seed so that Restart & Run All always selects the same sentence.
SAMPLE_SEED = 42
# Minimum number of NLTK tokens a sentence needs to be eligible.
MIN_SAMPLE_TOKENS = 10

sentences = nltk.sent_tokenize(text)
candidate_sentences = [s for s in sentences if len(word_tokenize(s)) >= MIN_SAMPLE_TOKENS]
if not candidate_sentences:
    # random.choice would otherwise fail with an uninformative IndexError.
    raise ValueError(
        f"No sentence with at least {MIN_SAMPLE_TOKENS} tokens found; is the corpus empty?"
    )

# A dedicated Random instance keeps the global random state untouched.
sample_sentence = random.Random(SAMPLE_SEED).choice(candidate_sentences)
print("Sample Sentence:", sample_sentence)

# Tokenize the same sentence with each of the three tokenizers for comparison.
nltk_sample_tokens = word_tokenize(sample_sentence)
spacy_sample_tokens = [token.text for token in nlp(sample_sentence)]
bert_sample_tokens = tokenizer.tokenize(sample_sentence)

print("NLTK Tokens:", nltk_sample_tokens)
print("spaCy Tokens:", spacy_sample_tokens)
print("BERT Tokens:", bert_sample_tokens)

# 3. Πιο συχνά types για κάθε μέθοδο

In [None]:
def get_top_20_freq(tokens, top_n=20):
    """Tabulate the most frequent types in a token sequence.

    Args:
        tokens: Iterable of hashable tokens (e.g. strings).
        top_n: Number of most frequent types to keep. Defaults to 20.

    Returns:
        A pandas DataFrame with one row per type, ordered from most to least
        frequent, with the columns:
          - 'Type': the token string,
          - 'Frequency': its number of occurrences,
          - 'Probability': Frequency divided by the total number of tokens
            (all tokens, not only the ones in the table),
          - 'Rank*Probability': the 1-based rank times the probability, which
            Zipf's law expects to be roughly constant.
    """
    counter = Counter(tokens)
    total_tokens = sum(counter.values())

    df_top = pd.DataFrame(counter.most_common(top_n), columns=['Type', 'Frequency'])
    df_top['Probability'] = df_top['Frequency'] / total_tokens
    # Rows are already sorted by frequency, so the rank is the row position + 1.
    df_top['Rank*Probability'] = [
        rank * probability
        for rank, probability in enumerate(df_top['Probability'], start=1)
    ]
    return df_top

# Build the frequency table once per tokenizer.
df_nltk_top20, df_spacy_top20, df_bert_top20 = (
    get_top_20_freq(method_tokens)
    for method_tokens in (nltk_tokens, spacy_tokens, bert_tokens)
)

# Print each table under its own heading.
for heading, top20_table in (
    ("NLTK Top 20 Types:\n", df_nltk_top20),
    ("spaCy Top 20 Types:\n", df_spacy_top20),
    ("BERT Top 20 Types:\n", df_bert_top20),
):
    print(heading, top20_table)

# 4. Ποσοστά types που εμφανίζονται 1, 2, και 3 φορές και σύγκριση με τον Νόμο του Zipf

In [None]:
def calculate_type_frequencies(tokens):
    """Compare the share of rare types with the Zipf prediction.

    Args:
        tokens: Iterable of hashable tokens.

    Returns:
        A 6-tuple ``(count_1, count_2, count_3, zipf_1, zipf_2, zipf_3)``:
        ``count_k`` is the fraction of distinct types occurring exactly ``k``
        times, and ``zipf_k = 1 / (k * (k + 1))`` is the fraction computed
        from the Zipf formula used here. For an empty token sequence the
        observed fractions are 0.0 instead of raising ZeroDivisionError.
    """
    type_counts = Counter(tokens)
    total_types = len(type_counts)
    # Frequency of frequencies: how many types occur exactly k times.
    types_per_count = Counter(type_counts.values())

    def observed_fraction(k):
        # Guard against an empty vocabulary (no types to divide by).
        return types_per_count[k] / total_types if total_types else 0.0

    def zipf_fraction(k):
        return 1 / (k * (k + 1))

    return (
        observed_fraction(1), observed_fraction(2), observed_fraction(3),
        zipf_fraction(1), zipf_fraction(2), zipf_fraction(3),
    )

# Observed shares of types seen 1, 2 and 3 times in the NLTK tokens,
# followed by the matching Zipf predictions.
(
    nltk_count_1, nltk_count_2, nltk_count_3,
    zipf_1, zipf_2, zipf_3,
) = calculate_type_frequencies(nltk_tokens)

# Printed as (observed, predicted) pairs for 1, 2 and 3 occurrences.
print(
    "NLTK Type Frequencies vs Zipf Predictions:\n",
    nltk_count_1, zipf_1,
    nltk_count_2, zipf_2,
    nltk_count_3, zipf_3,
)


# 5. Διάγραμμα Zipf με λογαριθμική κλίμακα

In [None]:
def plot_zipf_law(tokens, A_values=(0.1, 0.3, 0.5)):
    """Plot observed rank/frequency against Zipf curves on log-log axes.

    The observed curve uses relative frequencies (count / total tokens) so it
    is on the same scale as the Zipf predictions ``A / rank``, whose constants
    ``A`` are probabilities rather than raw counts.

    Args:
        tokens: Iterable of hashable tokens.
        A_values: Iterable of Zipf constants ``A``; one dashed prediction
            curve is drawn per value. The default is an immutable tuple to
            avoid a shared mutable default argument.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    counter = Counter(tokens)
    # Counts sorted from most to least frequent; rank 1 is the most frequent type.
    sorted_freqs = np.array([freq for _, freq in counter.most_common()], dtype=float)
    total_tokens = sorted_freqs.sum()
    # Normalize to probabilities; skip the division when there are no tokens.
    relative_freqs = sorted_freqs / total_tokens if total_tokens else sorted_freqs
    ranks = np.arange(1, len(sorted_freqs) + 1)

    plt.figure(figsize=(10, 6))
    plt.loglog(ranks, relative_freqs, label="Actual Frequencies", color='blue')

    for A in A_values:
        # Zipf's law: probability of the type at a given rank is about A / rank.
        plt.loglog(ranks, A / ranks, linestyle='--', label=f"Zipf Prediction A={A}")

    plt.xlabel("Rank (Log Scale)")
    plt.ylabel("Relative Frequency (Log Scale)")
    plt.legend()
    plt.title("Zipf's Law Prediction vs Actual Frequency")
    plt.show()

# Draw the Zipf plot for the NLTK tokens using the default A values.
plot_zipf_law(nltk_tokens)