<a href="https://colab.research.google.com/github/agrigoridou/Tokenization-Zipf-s-Law-N-gram-Models/blob/main/%CE%92_N_gram_Language_Models_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install nltk numpy



# Εισαγωγή βιβλιοθηκών και φόρτωση του Treebank από το NLTK

In [12]:
import nltk
from nltk.corpus import treebank
from nltk.util import ngrams
from nltk.probability import FreqDist
from nltk import bigrams, trigrams
import numpy as np
import math

# Λήψη των απαραίτητων δεδομένων από το NLTK

In [13]:
nltk.download('treebank')
nltk.download('punkt')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Λήψη των πρώτων 150 κειμένων για εκπαίδευση και των επόμενων 49 για αξιολόγηση

In [14]:
train_files = treebank.fileids()[:150]
test_files = treebank.fileids()[150:]

# Συνάρτηση για την προετοιμασία των δεδομένων με <BOS> και <EOS> tokens

In [15]:
def preprocess_text(files):
    sentences = []
    for file in files:
        for sent in treebank.sents(file):
            sentences.append(['<BOS>'] + sent + ['<EOS>'])
    return sentences

train_sentences = preprocess_text(train_files)
test_sentences = preprocess_text(test_files)

# Συνάρτηση για αντικατάσταση σπάνιων tokens (< 3 φορές) με <UNK>

In [16]:
def replace_rare_tokens(sentences):
    all_tokens = [token for sent in sentences for token in sent]
    fdist = FreqDist(all_tokens)

    # Εύρεση σπάνιων tokens
    rare_tokens = {token for token, count in fdist.items() if count < 3}

    # Αντικατάσταση σπάνιων tokens με <UNK>
    processed_sentences = []
    for sent in sentences:
        processed_sentences.append([token if token not in rare_tokens else '<UNK>' for token in sent])

    return processed_sentences

train_sentences = replace_rare_tokens(train_sentences)
test_sentences = replace_rare_tokens(test_sentences)

# Συνάρτηση για υπολογισμό της πιθανότητας Bigram με add-k smoothing

In [17]:
def bigram_probability(bigrams, k):
    bigram_fd = FreqDist(bigrams)
    unigram_fd = FreqDist([bigram[0] for bigram in bigrams])

    vocab_size = len(unigram_fd)

    def prob(w1, w2):
        count_bigram = bigram_fd[(w1, w2)]
        count_unigram = unigram_fd[w1]
        return (count_bigram + k) / (count_unigram + k * vocab_size)

    return prob

# Δημιουργία bigrams και υπολογισμός της πιθανότητας με add-k smoothing για k=1 και k=0.01

In [18]:
train_bigrams = list(bigrams([token for sent in train_sentences for token in sent]))
bigram_model_k1 = bigram_probability(train_bigrams, k=1)
bigram_model_k01 = bigram_probability(train_bigrams, k=0.01)

# Συνάρτηση για υπολογισμό της πιθανότητας Trigram με add-k smoothing

In [19]:
def trigram_probability(trigrams, k):
    trigram_fd = FreqDist(trigrams)
    bigram_fd = FreqDist([bigram[:2] for bigram in trigrams])

    vocab_size = len(bigram_fd)

    def prob(w1, w2, w3):
        count_trigram = trigram_fd[(w1, w2, w3)]
        count_bigram = bigram_fd[(w1, w2)]
        return (count_trigram + k) / (count_bigram + k * vocab_size)

    return prob

# Δημιουργία trigrams και υπολογισμός της πιθανότητας με add-k smoothing για k=1 και k=0.01

In [20]:
train_trigrams = list(trigrams([token for sent in train_sentences for token in sent]))
trigram_model_k1 = trigram_probability(train_trigrams, k=1)
trigram_model_k01 = trigram_probability(train_trigrams, k=0.01)

# 1. Υπολογισμός του Perplexity για τα bigrams και trigrams με k=1 και k=0.01 για το σύνολο αξιολόγησης

In [21]:
test_bigrams = list(bigrams([token for sent in test_sentences for token in sent]))
test_trigrams = list(trigrams([token for sent in test_sentences for token in sent]))

def calculate_perplexity(model, ngrams, k=1):
    N = len(ngrams)
    log_prob_sum = 0
    for ngram in ngrams:
        if len(ngram) == 2:  # Bigram
            w1, w2 = ngram
            prob = model(w1, w2)
        elif len(ngram) == 3:  # Trigram
            w1, w2, w3 = ngram
            prob = model(w1, w2, w3)
        log_prob_sum += math.log(prob)

    perplexity = math.exp(-log_prob_sum / N)
    return perplexity

perplexity_bigram_k1 = calculate_perplexity(bigram_model_k1, test_bigrams, k=1)
perplexity_bigram_k01 = calculate_perplexity(bigram_model_k01, test_bigrams, k=0.01)
perplexity_trigram_k1 = calculate_perplexity(trigram_model_k1, test_trigrams, k=1)
perplexity_trigram_k01 = calculate_perplexity(trigram_model_k01, test_trigrams, k=0.01)

print("Perplexity for Bigram (k=1):", perplexity_bigram_k1)
print("Perplexity for Bigram (k=0.01):", perplexity_bigram_k01)
print("Perplexity for Trigram (k=1):", perplexity_trigram_k1)
print("Perplexity for Trigram (k=0.01):", perplexity_trigram_k01)

Perplexity for Bigram (k=1): 248.22110679031928
Perplexity for Bigram (k=0.01): 101.11029624213403
Perplexity for Trigram (k=1): 8200.760713641386
Perplexity for Trigram (k=0.01): 1371.6775932858716


# 2. Συνάρτηση για μετατροπή κειμένων σε πεζά γράμματα

In [22]:
def lowercase_text(sentences):
    return [[token.lower() for token in sent] for sent in sentences]

train_sentences_lower = lowercase_text(train_sentences)
test_sentences_lower = lowercase_text(test_sentences)

# Δημιουργία bigrams και trigrams από τα πεζά κείμενα
train_bigrams_lower = list(bigrams([token for sent in train_sentences_lower for token in sent]))
train_trigrams_lower = list(trigrams([token for sent in train_sentences_lower for token in sent]))

# Επαναυπολογισμός perplexity με πεζά γράμματα
perplexity_bigram_k1_lower = calculate_perplexity(bigram_model_k1, test_bigrams_lower, k=1)
perplexity_bigram_k01_lower = calculate_perplexity(bigram_model_k01, test_bigrams_lower, k=0.01)
perplexity_trigram_k1_lower = calculate_perplexity(trigram_model_k1, test_trigrams_lower, k=1)
perplexity_trigram_k01_lower = calculate_perplexity(trigram_model_k01, test_trigrams_lower, k=0.01)

print("Perplexity for Bigram (k=1) with lowercase:", perplexity_bigram_k1_lower)
print("Perplexity for Bigram (k=0.01) with lowercase:", perplexity_bigram_k01_lower)
print("Perplexity for Trigram (k=1) with lowercase:", perplexity_trigram_k1_lower)
print("Perplexity for Trigram (k=0.01) with lowercase:", perplexity_trigram_k01_lower)


NameError: name 'test_bigrams_lower' is not defined

# 3. Συνάρτηση για αντικατάσταση αριθμών με το token <NUM>

In [23]:
import re

def replace_numbers(sentences):
    return [[re.sub(r'\d+', '<NUM>', token) for token in sent] for sent in sentences]

train_sentences_no_numbers = replace_numbers(train_sentences)
test_sentences_no_numbers = replace_numbers(test_sentences)

# Δημιουργία bigrams και trigrams από τα κείμενα χωρίς αριθμούς
train_bigrams_no_numbers = list(bigrams([token for sent in train_sentences_no_numbers for token in sent]))
train_trigrams_no_numbers = list(trigrams([token for sent in train_sentences_no_numbers for token in sent]))

# Επαναυπολογισμός perplexity με αντικατάσταση αριθμών
perplexity_bigram_k1_no_numbers = calculate_perplexity(bigram_model_k1, test_bigrams_no_numbers, k=1)
perplexity_bigram_k01_no_numbers = calculate_perplexity(bigram_model_k01, test_bigrams_no_numbers, k=0.01)
perplexity_trigram_k1_no_numbers = calculate_perplexity(trigram_model_k1, test_trigrams_no_numbers, k=1)
perplexity_trigram_k01_no_numbers = calculate_perplexity(trigram_model_k01, test_trigrams_no_numbers, k=0.01)

print("Perplexity for Bigram (k=1) without numbers:", perplexity_bigram_k1_no_numbers)
print("Perplexity for Bigram (k=0.01) without numbers:", perplexity_bigram_k01_no_numbers)
print("Perplexity for Trigram (k=1) without numbers:", perplexity_trigram_k1_no_numbers)
print("Perplexity for Trigram (k=0.01) without numbers:", perplexity_trigram_k01_no_numbers)


NameError: name 'test_bigrams_no_numbers' is not defined

# 4. Δημιουργία νέων προτάσεων με το καλύτερο μοντέλο

In [9]:
def generate_sentence(model, k=1, ngram_type='bigram'):
    sentence = ['<BOS>']

    while sentence[-1] != '<EOS>' and len(sentence) < 20:
        if ngram_type == 'bigram':
            next_word = max(set(train_bigrams), key=lambda bigram: model(sentence[-1], bigram[1]))
        elif ngram_type == 'trigram':
            next_word = max(set(train_trigrams), key=lambda trigram: model(sentence[-2], sentence[-1], trigram[2]))

        sentence.append(next_word[1] if ngram_type == 'bigram' else next_word[2])

    return ' '.join(sentence[1:])

# Δοκιμή γεννήτριας προτάσεων για το καλύτερο μοντέλο (π.χ. Trigram με k=1)
generated_sentence = generate_sentence(trigram_model_k1, k=1, ngram_type='trigram')
print("Generated Sentence:", generated_sentence)