<a href="https://colab.research.google.com/github/agrigoridou/Tokenization-Zipf-s-Law-N-gram-Models/blob/main/%CE%92_N_gram_Language_Models_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install nltk numpy



# Προετοιμασία των δεδομένων

Πρώτα, θα φορτώσουμε τα δεδομένα και θα προετοιμάσουμε τα αρχεία για την εκπαίδευση και αξιολόγηση:

In [45]:
import nltk
from nltk.corpus import treebank
from collections import Counter

# Fetch the Penn Treebank sample through NLTK's downloader.
nltk.download('treebank')

# Split by file: the first 150 file ids are used for training and every
# remaining file id is held out for evaluation.
train_files, test_files = treebank.fileids()[:150], treebank.fileids()[150:]

# Flatten each split into a single list of sentences.
train_sents = [tokens for fileid in train_files for tokens in treebank.sents(fileid)]
test_sents = [tokens for fileid in test_files for tokens in treebank.sents(fileid)]


[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


# 1: Υπολογισμός perplexity στα κείμενα αξιολόγησης

In [46]:
def calculate_perplexity(model, sents, n=2, unseen_prob=1e-6):
    """Return the perplexity of an n-gram model over a collection of sentences.

    Every window of ``n`` consecutive tokens in every sentence is looked up
    in ``model``; n-grams missing from the model fall back to
    ``unseen_prob``. The result is
    ``exp(-(sum of log-probabilities) / (number of n-grams))``.

    Sentences are scored exactly as given: no boundary markers are added and
    no tokens are rewritten, so callers must apply whatever preprocessing the
    model's n-grams were built with. Sentences shorter than ``n`` tokens
    contribute no n-grams.

    Args:
        model: Mapping from n-gram tuples of length ``n`` to probabilities.
        sents: Iterable of sentences, each an iterable of tokens.
        n: Order of the n-grams to score (2 for bigrams, 3 for trigrams, ...).
        unseen_prob: Probability assigned to n-grams absent from ``model``.

    Returns:
        The perplexity as a float.

    Raises:
        ValueError: If ``n`` is smaller than 1, if no n-gram at all could be
            extracted from ``sents``, or (raised by ``math.log``) if a looked
            up probability is not positive.
    """
    import math  # local import so the function does not rely on another cell

    if n < 1:
        raise ValueError(f"n must be at least 1, got {n}")

    log_prob = 0.0
    total_ngrams = 0

    for sent in sents:
        tokens = list(sent)
        # Zipping n progressively shifted views of the sentence yields every
        # window of n consecutive tokens as a tuple.
        for ngram in zip(*(tokens[offset:] for offset in range(n))):
            log_prob += math.log(model.get(ngram, unseen_prob))
            total_ngrams += 1

    if total_ngrams == 0:
        # Perplexity is undefined without any scored event; fail with a clear
        # message instead of an opaque ZeroDivisionError.
        raise ValueError(
            f"cannot compute perplexity: no {n}-grams found in the given sentences"
        )

    return math.exp(-log_prob / total_ngrams)


## Υπολογισμός perplexity για τα μοντέλα (bigram και trigram)

In [47]:
# Score the four models on the untouched held-out sentences; the print labels
# associate the *_1 models with k=1 and the *_2 models with k=0.01.
perplexity_bigram_1, perplexity_bigram_2 = (
    calculate_perplexity(model, test_sents, n=2)
    for model in (bigram_model_1, bigram_model_2)
)
perplexity_trigram_1, perplexity_trigram_2 = (
    calculate_perplexity(model, test_sents, n=3)
    for model in (trigram_model_1, trigram_model_2)
)

print(f"Perplexity Bigram k=1: {perplexity_bigram_1}")
print(f"Perplexity Bigram k=0.01: {perplexity_bigram_2}")
print(f"Perplexity Trigram k=1: {perplexity_trigram_1}")
print(f"Perplexity Trigram k=0.01: {perplexity_trigram_2}")

Perplexity Bigram k=1: 48383.25849647069
Perplexity Bigram k=0.01: 20958.120746457113
Perplexity Trigram k=1: 538873.5152463855
Perplexity Trigram k=0.01: 320035.6195985542


# 2: Μετατροπή όλων των κειμένων σε πεζά

In [48]:
def preprocess_lowercase(sents):
    """Return a copy of ``sents`` in which every token is lower-cased.

    Args:
        sents: Iterable of sentences, each an iterable of string tokens.

    Returns:
        A new list of lists with the same shape as the input.
    """
    lowered = []
    for sentence in sents:
        lowered.append([word.lower() for word in sentence])
    return lowered


## Προετοιμασία πεζών δεδομένων

In [49]:
# Build lower-cased copies of both the training and the evaluation split.
train_lower, test_lower = (
    preprocess_lowercase(split) for split in (train_sents, test_sents)
)

## Υπολογισμός perplexity για τα μοντέλα με πεζά γράμματα

In [50]:
# Score the four models on the lower-cased evaluation sentences.
# NOTE(review): train_lower is not used in this cell; the models passed here
# come from elsewhere in the notebook - confirm they were built from
# lower-cased training data, otherwise train and test vocabularies differ.
perplexity_bigram_lower_1, perplexity_bigram_lower_2 = (
    calculate_perplexity(model, test_lower, n=2)
    for model in (bigram_model_1, bigram_model_2)
)
perplexity_trigram_lower_1, perplexity_trigram_lower_2 = (
    calculate_perplexity(model, test_lower, n=3)
    for model in (trigram_model_1, trigram_model_2)
)

print(f"Perplexity Bigram (Lowercase) k=1: {perplexity_bigram_lower_1}")
print(f"Perplexity Bigram (Lowercase) k=0.01: {perplexity_bigram_lower_2}")
print(f"Perplexity Trigram (Lowercase) k=1: {perplexity_trigram_lower_1}")
print(f"Perplexity Trigram (Lowercase) k=0.01: {perplexity_trigram_lower_2}")

Perplexity Bigram (Lowercase) k=1: 20016.401066044844
Perplexity Bigram (Lowercase) k=0.01: 6759.1366038389315
Perplexity Trigram (Lowercase) k=1: 390205.9014657962
Perplexity Trigram (Lowercase) k=0.01: 179792.22119202127


# 3: Αντικατάσταση ψηφίων με '#'

In [51]:
import re

def replace_digits_with_hash(sents):
    """Return a copy of ``sents`` with every digit character replaced by '#'.

    Each digit is replaced individually, so a token keeps its length
    (for example ``"1989"`` becomes ``"####"``).

    Args:
        sents: Iterable of sentences, each an iterable of string tokens.

    Returns:
        A new list of lists with the same shape as the input.
    """
    masked = []
    for sentence in sents:
        masked.append([re.sub(r'\d', '#', word) for word in sentence])
    return masked


## Προετοιμασία δεδομένων με αντικατάσταση ψηφίων

In [52]:
# Build digit-masked copies of both the training and the evaluation split.
train_hash, test_hash = (
    replace_digits_with_hash(split) for split in (train_sents, test_sents)
)

## Υπολογισμός perplexity για τα μοντέλα με αντικατάσταση ψηφίων

In [53]:
# Score the four models on the digit-masked evaluation sentences.
# NOTE(review): train_hash is not used in this cell; the models passed here
# come from elsewhere in the notebook - confirm they were built from
# digit-masked training data, otherwise every '#' token is unseen to them.
perplexity_bigram_hash_1, perplexity_bigram_hash_2 = (
    calculate_perplexity(model, test_hash, n=2)
    for model in (bigram_model_1, bigram_model_2)
)
perplexity_trigram_hash_1, perplexity_trigram_hash_2 = (
    calculate_perplexity(model, test_hash, n=3)
    for model in (trigram_model_1, trigram_model_2)
)

print(f"Perplexity Bigram (Hash) k=1: {perplexity_bigram_hash_1}")
print(f"Perplexity Bigram (Hash) k=0.01: {perplexity_bigram_hash_2}")
print(f"Perplexity Trigram (Hash) k=1: {perplexity_trigram_hash_1}")
print(f"Perplexity Trigram (Hash) k=0.01: {perplexity_trigram_hash_2}")

Perplexity Bigram (Hash) k=1: 79285.10986917574
Perplexity Bigram (Hash) k=0.01: 39430.24056678657
Perplexity Trigram (Hash) k=1: 649137.7630972476
Perplexity Trigram (Hash) k=0.01: 446736.02785341314


# 4: Δημιουργία νέων προτάσεων

In [54]:
def generate_sentence(model, n=2, max_len=20):
    """Sample a sentence from an n-gram model, starting at '<BOS>'.

    At every step the last ``n - 1`` generated tokens form the context, and
    the next token is drawn from the n-grams in ``model`` that begin with
    that context, weighted by their stored values. Generation stops when
    '<EOS>' is drawn, when the context has no known continuation, or when
    the sentence holds ``max_len`` tokens (the leading '<BOS>' included).

    While the sentence is still shorter than ``n - 1`` tokens (only possible
    for ``n > 2``), the whole sentence so far is matched against the leading
    tokens of the n-grams that start with '<BOS>', and the stored values of
    all n-grams sharing that prefix and next token are summed to form the
    weight.

    Args:
        model: Mapping from n-gram tuples to probabilities. Entries whose
            length differs from ``n`` are ignored.
        n: Order of the model (2 for bigrams, 3 for trigrams, ...).
        max_len: Maximum number of tokens in the returned sentence.

    Returns:
        The generated tokens as a list whose first element is '<BOS>'.

    Raises:
        ValueError: If ``n`` is smaller than 2, because at least one token of
            context is required.
    """
    import random  # local import so the function does not rely on another cell

    if n < 2:
        raise ValueError(f"n must be at least 2, got {n}")

    context_size = n - 1

    # Index the model once: context tuple -> {next token: weight}. This
    # replaces a scan over the whole model for every generated token.
    successors = {}

    def _register(context, word, weight):
        options = successors.setdefault(context, {})
        options[word] = options.get(word, 0.0) + weight

    for key in model:
        if len(key) != n:
            continue
        weight = model[key]
        _register(key[:-1], key[-1], weight)
        if key[0] == '<BOS>':
            # Shorter prefixes are only ever needed right after '<BOS>',
            # before a full (n - 1)-token history exists.
            for size in range(1, context_size):
                _register(key[:size], key[size], weight)

    sentence = ['<BOS>']
    while len(sentence) < max_len:
        # Slicing past the start simply yields the whole (short) sentence.
        context = tuple(sentence[-context_size:])
        options = successors.get(context)
        if not options:
            break

        words = list(options)
        next_word = random.choices(words, weights=[options[word] for word in words])[0]
        sentence.append(next_word)

        if next_word == '<EOS>':
            break

    return sentence

## Δημιουργία νέων προτάσεων

In [55]:
# Sample one sentence from each of three models, in this order: the first
# bigram model, the first trigram model, then the second bigram model.
new_sentence_1, new_sentence_2, new_sentence_3 = (
    generate_sentence(model, n=order)
    for model, order in (
        (bigram_model_1, 2),
        (trigram_model_1, 3),
        (bigram_model_2, 2),
    )
)

print(f"New Sentence 1: {new_sentence_1}")
print(f"New Sentence 2: {new_sentence_2}")
print(f"New Sentence 3: {new_sentence_3}")

New Sentence 1: ['<BOS>', 'terms', 'were', 'no', 'mechanism', '*ich*-2', 'yesterday', "'s", '<UNK>', '<UNK>', 'million', '*u*', 'in', 'an', 'entirely', 'new', 'members', 'of', 'south', 'korea']
New Sentence 2: ['<BOS>', 'huge', 'new', 'era', 'when', 'every', 'month', 'urged', 'the', 'daily', 'average', 'forecasts', 'related', 'to', 'position', ',', '65', 'in', 'legal', ',']
New Sentence 3: ['<BOS>', '<UNK>', '%', '.', '<EOS>']
