## Packages

In [2]:
from collections import Counter
import random

## Dataset

In [8]:
# Read the entire corpus into memory as a single UTF-8 decoded string.
dataset_path = "dummy_amharic.txt"
with open(dataset_path, mode="r", encoding="utf8") as file:
    corpus_text = file.read()

## Create n-grams for n=1, 2, 3, 4.

Since the dataset is too large, we only use the first 10,000,000 characters of the corpus to create the n-grams for demonstration.

In [9]:
def create_ngrams(text, n, max_chars=10_000_000):
    """Split ``text`` on whitespace and return every run of ``n`` consecutive words.

    Args:
        text: Raw text; words are whatever ``str.split()`` yields.
        n: Number of words per n-gram; must be at least 1.
        max_chars: Only the first ``max_chars`` characters of ``text`` are
            tokenized, which bounds the work done on a large corpus. Pass
            ``None`` to use the whole text.

    Returns:
        A list of ``n``-tuples of words in order of appearance. The list is
        empty when the (possibly truncated) text has fewer than ``n`` words.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")

    truncated = max_chars is not None and len(text) > max_chars
    words = (text[:max_chars] if truncated else text).split()
    # A cut that lands inside a word leaves a fragment that never occurred in
    # the text; drop it so it cannot show up as a spurious token.
    if truncated and words and not text[max_chars - 1].isspace() and not text[max_chars].isspace():
        words.pop()
    return [tuple(words[i:i + n]) for i in range(len(words) - n + 1)]

### N = 1 (Unigrams)

In [10]:
unigrams = create_ngrams(corpus_text, 1)

print("N-grams for n=1: ")
# Slice instead of indexing 0..4 so that a corpus yielding fewer than five
# unigrams prints what it has rather than raising IndexError.
for unigram in unigrams[:5]:
    print(unigram[0])

N-grams for n=1: 
ምን
መሰላችሁ?
(አንባቢያን)
ኢትዮጵያ
በተደጋጋሚ


### N = 2 (Bigrams)

In [11]:
bigrams = create_ngrams(corpus_text, 2)

print("N-grams for n=2: ")
# Slice instead of indexing 0..4 so that a corpus yielding fewer than five
# bigrams prints what it has rather than raising IndexError.
for bigram in bigrams[:5]:
    print(bigram)

N-grams for n=2: 
('ምን', 'መሰላችሁ?')
('መሰላችሁ?', '(አንባቢያን)')
('(አንባቢያን)', 'ኢትዮጵያ')
('ኢትዮጵያ', 'በተደጋጋሚ')
('በተደጋጋሚ', 'ጥሪው')


### N = 3 (Trigrams)

In [12]:
trigrams = create_ngrams(corpus_text, 3)

print("N-grams for n=3: ")
# Slice instead of indexing 0..4 so that a corpus yielding fewer than five
# trigrams prints what it has rather than raising IndexError.
for trigram in trigrams[:5]:
    print(trigram)

N-grams for n=3: 
('ምን', 'መሰላችሁ?', '(አንባቢያን)')
('መሰላችሁ?', '(አንባቢያን)', 'ኢትዮጵያ')
('(አንባቢያን)', 'ኢትዮጵያ', 'በተደጋጋሚ')
('ኢትዮጵያ', 'በተደጋጋሚ', 'ጥሪው')
('በተደጋጋሚ', 'ጥሪው', 'ደርሷት')


### N = 4 (Quadgrams)

In [13]:
quadgrams = create_ngrams(corpus_text, 4)

print("N-grams for n=4: ")
# Slice instead of indexing 0..4 so that a corpus yielding fewer than five
# quadgrams prints what it has rather than raising IndexError.
for quadgram in quadgrams[:5]:
    print(quadgram)

N-grams for n=4: 
('ምን', 'መሰላችሁ?', '(አንባቢያን)', 'ኢትዮጵያ')
('መሰላችሁ?', '(አንባቢያን)', 'ኢትዮጵያ', 'በተደጋጋሚ')
('(አንባቢያን)', 'ኢትዮጵያ', 'በተደጋጋሚ', 'ጥሪው')
('ኢትዮጵያ', 'በተደጋጋሚ', 'ጥሪው', 'ደርሷት')
('በተደጋጋሚ', 'ጥሪው', 'ደርሷት', 'ልትታደመው')


### Probabilities of n-grams and the top 10 most likely n-grams for all n. 

In [14]:
# Tally every n-gram order once up front; the probability functions in the
# next cell look their counts up in these tables.
unigram_counts, bigram_counts, trigram_counts, quadgram_counts = map(
    Counter, (unigrams, bigrams, trigrams, quadgrams)
)

In [15]:
def calculate_unigram_probabilities(ngrams):
    """Return the relative frequency of every distinct unigram in ``ngrams``.

    Args:
        ngrams: Iterable of hashable unigrams (1-tuples of words).

    Returns:
        A dict mapping each distinct unigram to ``count / total``, keyed in
        order of first occurrence. Empty when ``ngrams`` is empty.
    """
    # Count the argument itself so the numerator and the denominator always
    # describe the same data, instead of mixing in a module-level table.
    counts = Counter(ngrams)
    total_ngrams = sum(counts.values())
    return {ngram: count / total_ngrams for ngram, count in counts.items()}
    
def calculate_bigram_probabilities(ngrams):
    """Estimate P(second word | first word) for every bigram in ``ngrams``.

    Counts are read from the module-level ``bigram_counts`` and
    ``unigram_counts`` tables; ``ngrams`` only decides which keys appear in
    the result and in what order (first occurrence).
    """
    probabilities = {}
    for ngram in ngrams:
        first_word = ngram[0]
        probabilities[ngram] = bigram_counts[ngram] / unigram_counts[(first_word,)]
    return probabilities
        
def calculate_trigram_probabilities(ngrams):
    """Estimate P(third word | first two words) for every trigram in ``ngrams``.

    Counts are read from the module-level ``trigram_counts`` and
    ``bigram_counts`` tables; ``ngrams`` only decides which keys appear in
    the result and in what order (first occurrence).
    """
    probabilities = {}
    for ngram in ngrams:
        history = ngram[:-1]
        probabilities[ngram] = trigram_counts[ngram] / bigram_counts[history]
    return probabilities

def calculate_quadgram_probabilities(ngrams):
    """Estimate P(fourth word | first three words) for every quadgram in ``ngrams``.

    Counts are read from the module-level ``quadgram_counts`` and
    ``trigram_counts`` tables; ``ngrams`` only decides which keys appear in
    the result and in what order (first occurrence).
    """
    probabilities = {}
    for ngram in ngrams:
        history = ngram[:-1]
        probabilities[ngram] = quadgram_counts[ngram] / trigram_counts[history]
    return probabilities

### Unigram Probabilities

In [16]:
# Score every unigram, then keep the ten with the highest probability.
unigram_probabilities = calculate_unigram_probabilities(unigrams)

ranked_unigrams = sorted(
    unigram_probabilities.items(), key=lambda item: item[1], reverse=True
)
top_unigrams = dict(ranked_unigrams[:10])

print("The top 10 Unigrams are: ")
for top, prob in top_unigrams.items():
    print(f'\n{top[0]} ==> {prob}')

The top 10 Unigrams are: 

ላይ ==> 0.008816936727776477

ነው፡፡ ==> 0.007893395073803164

ነው ==> 0.006644903578617018

ግን ==> 0.004631604151234221

ወደ ==> 0.00455784908859052

ውስጥ ==> 0.004460577919016941

እና ==> 0.004305585396070031

ነገር ==> 0.003673857250817452

ጋር ==> 0.003634841891592885

ጊዜ ==> 0.00308648903454623


### Bigram Probabilities

In [17]:
# Score every bigram, then keep the ten with the highest conditional
# probability. The sort is stable, so tied bigrams stay in corpus order.
bigram_probabilities = calculate_bigram_probabilities(bigrams)

ranked_bigrams = sorted(
    bigram_probabilities.items(), key=lambda item: item[1], reverse=True
)
top_bigrams = dict(ranked_bigrams[:10])

print("The top 10 Bigrams are: ")
for top, prob in top_bigrams.items():
    print(f'\n{top} ==> {prob}')

The top 10 Bigrams are: 

('ልትታደመው', 'ያልቻለችው') ==> 1.0

('ለ19ኛ', 'ጊዜ') ==> 1.0

('ባረረ', 'ልክ') ==> 1.0

('ልትታደም', 'ሁለት') ==> 1.0

('ላከች፡፡6ኛው', 'ቢግ') ==> 1.0

('የሚገጥሟቸውን', 'የተለያዩ') ==> 1.0

('በትእግስትና', 'በጥበብ') ==> 1.0

('ለ91', 'ቀናት') ==> 1.0

('እንደሚሸለሙም', 'ሲናገር') ==> 1.0

('ብታሰልፍም', 'ዳኒ') ==> 1.0


### Trigram Probabilities

In [18]:
# Score every trigram, then keep the ten with the highest conditional
# probability. The sort is stable, so tied trigrams stay in corpus order.
trigram_probabilities = calculate_trigram_probabilities(trigrams)

ranked_trigrams = sorted(
    trigram_probabilities.items(), key=lambda item: item[1], reverse=True
)
top_trigrams = dict(ranked_trigrams[:10])

print("The top 10 Trigrams are: ")
for top, prob in top_trigrams.items():
    print(f'\n{top} ==> {prob}')

The top 10 Trigrams are: 

('መሰላችሁ?', '(አንባቢያን)', 'ኢትዮጵያ') ==> 1.0

('(አንባቢያን)', 'ኢትዮጵያ', 'በተደጋጋሚ') ==> 1.0

('በተደጋጋሚ', 'ጥሪው', 'ደርሷት') ==> 1.0

('ጥሪው', 'ደርሷት', 'ልትታደመው') ==> 1.0

('ደርሷት', 'ልትታደመው', 'ያልቻለችው') ==> 1.0

('ልትታደመው', 'ያልቻለችው', 'የአለም') ==> 1.0

('ያልቻለችው', 'የአለም', 'የእግር') ==> 1.0

('የአለም', 'የእግር', 'ኳስ') ==> 1.0

('ኳስ', 'ዋ', 'ለ19ኛ') ==> 1.0

('ዋ', 'ለ19ኛ', 'ጊዜ') ==> 1.0


### Quadgram Probabilities

In [19]:
# Score every quadgram, then keep the ten with the highest conditional
# probability. The sort is stable, so tied quadgrams stay in corpus order.
quadgram_probabilities = calculate_quadgram_probabilities(quadgrams)

ranked_quadgrams = sorted(
    quadgram_probabilities.items(), key=lambda item: item[1], reverse=True
)
top_quadgrams = dict(ranked_quadgrams[:10])

print("The top 10 Quadgrams are: ")
for top, prob in top_quadgrams.items():
    print(f'\n{top} ==> {prob}')

The top 10 Quadgrams are: 

('ምን', 'መሰላችሁ?', '(አንባቢያን)', 'ኢትዮጵያ') ==> 1.0

('መሰላችሁ?', '(አንባቢያን)', 'ኢትዮጵያ', 'በተደጋጋሚ') ==> 1.0

('(አንባቢያን)', 'ኢትዮጵያ', 'በተደጋጋሚ', 'ጥሪው') ==> 1.0

('ኢትዮጵያ', 'በተደጋጋሚ', 'ጥሪው', 'ደርሷት') ==> 1.0

('በተደጋጋሚ', 'ጥሪው', 'ደርሷት', 'ልትታደመው') ==> 1.0

('ጥሪው', 'ደርሷት', 'ልትታደመው', 'ያልቻለችው') ==> 1.0

('ደርሷት', 'ልትታደመው', 'ያልቻለችው', 'የአለም') ==> 1.0

('ልትታደመው', 'ያልቻለችው', 'የአለም', 'የእግር') ==> 1.0

('ያልቻለችው', 'የአለም', 'የእግር', 'ኳስ') ==> 1.0

('የእግር', 'ኳስ', 'ዋ', 'ለ19ኛ') ==> 1.0


### Let's take a sentence and calculate its probability: "ኢትዮጵያ ታሪካዊ ሀገር ናት"

Let's calculate the probability of the sentence using different n-gram models: Unigram, Bigram, Trigram, and Quadgram.

#### Unigram Estimation

Finding the probability of the sentence using Unigram Estimation

In [20]:
def unigram_probability_estimation(sentence):
    """Score ``sentence`` as the product of its words' unigram probabilities.

    A unigram missing from ``unigram_probabilities`` contributes a floor of
    1e-10, so a single unseen word does not drive the product to zero.
    """
    probability = 1.0
    for unigram in create_ngrams(sentence, 1):
        probability = probability * unigram_probabilities.get(unigram, 1e-10)
    return probability

sentence = "ኢትዮጵያ ታሪካዊ ሀገር ናት"
sentence_probability = unigram_probability_estimation(sentence)
print(f"Probability of the sentence '{sentence}': {sentence_probability}\n")

Probability of the sentence 'ኢትዮጵያ ታሪካዊ ሀገር ናት': 4.200190248383804e-15



#### Bigram Estimation

Finding the probability of the sentence using Bigram Estimation

In [21]:
def bigram_probability_estimation(sentence):
    """Score ``sentence`` as the product of its bigrams' conditional probabilities.

    A bigram missing from ``bigram_probabilities`` contributes a floor of
    1e-10, so a single unseen bigram does not drive the product to zero.
    A sentence with fewer than two words has no bigrams and scores 1.0.
    """
    probability = 1.0
    for bigram in create_ngrams(sentence, 2):
        probability = probability * bigram_probabilities.get(bigram, 1e-10)
    return probability

sentence = "ኢትዮጵያ ታሪካዊ ሀገር ናት"
sentence_probability = bigram_probability_estimation(sentence)
print(f"Probability of the sentence '{sentence}': {sentence_probability}\n")

Probability of the sentence 'ኢትዮጵያ ታሪካዊ ሀገር ናት': 1.9796198140147185e-16



#### Trigram Estimation

Finding the probability of the sentence using Trigram Estimation

In [22]:
def trigram_probability_estimation(sentence):
    """Score ``sentence`` as the product of its trigrams' conditional probabilities.

    A trigram missing from ``trigram_probabilities`` contributes a floor of
    1e-10, so a single unseen trigram does not drive the product to zero.
    A sentence with fewer than three words has no trigrams and scores 1.0.
    """
    probability = 1.0
    for trigram in create_ngrams(sentence, 3):
        probability = probability * trigram_probabilities.get(trigram, 1e-10)
    return probability

sentence = "ኢትዮጵያ ታሪካዊ ሀገር ናት"
sentence_probability = trigram_probability_estimation(sentence)
print(f"Probability of {sentence} using Trigram Estimation is: ': {sentence_probability}\n")

Probability of ኢትዮጵያ ታሪካዊ ሀገር ናት using Trigram Estimation is: ': 1.0000000000000001e-20



#### Quadgram Estimation

Finding the probability of the sentence using Quadgram Estimation

In [23]:
def quadgram_probability_estimation(sentence):
    """Score ``sentence`` as the product of its quadgrams' conditional probabilities.

    A quadgram missing from ``quadgram_probabilities`` contributes a floor of
    1e-10, so a single unseen quadgram does not drive the product to zero.
    A sentence with fewer than four words has no quadgrams and scores 1.0.
    """
    probability = 1.0
    for quadgram in create_ngrams(sentence, 4):
        probability = probability * quadgram_probabilities.get(quadgram, 1e-10)
    return probability

sentence = "ኢትዮጵያ ታሪካዊ ሀገር ናት"
sentence_probability = quadgram_probability_estimation(sentence)
print(f"Probability of {sentence} using Quadgram Estimation is: ': {sentence_probability}\n")

Probability of ኢትዮጵያ ታሪካዊ ሀገር ናት using Quadgram Estimation is: ': 1e-10



#### Finding the probability of the sentence using the Chain Rule

In [24]:
def chain_rule_probability_estimation(sentence):
    """Score ``sentence`` with the chain rule, using up to three words of history.

    Each word is conditioned on the words before it:
    P(w1) * P(w2 | w1) * P(w3 | w1 w2) * P(w4 | w1 w2 w3) * ...
    From the fifth word on, the history is limited to the previous three
    words because the quadgram table is the highest order available.

    For a four-word sentence this multiplies exactly one unigram, one bigram,
    one trigram and one quadgram probability. Shorter sentences use only the
    orders they have words for, and an empty sentence scores 1.0.

    Args:
        sentence: Whitespace-separated text.

    Returns:
        The product of the conditional probabilities. An n-gram missing from
        its table contributes a floor of 1e-10 instead of zero.
    """
    words = sentence.split()
    # Conditional-probability table to consult for each n-gram length.
    tables = {
        1: unigram_probabilities,
        2: bigram_probabilities,
        3: trigram_probabilities,
        4: quadgram_probabilities,
    }
    max_order = max(tables)

    sentence_probability = 1.0
    for position in range(len(words)):
        # The current word plus as much preceding history as the tables support.
        start = max(0, position + 1 - max_order)
        ngram = tuple(words[start:position + 1])
        sentence_probability *= tables[len(ngram)].get(ngram, 1e-10)
    return sentence_probability


sentence = "ኢትዮጵያ ታሪካዊ ሀገር ናት"
sentence_probability = chain_rule_probability_estimation(sentence)
print(f"Probability of {sentence} using Chain Rule is: ': {sentence_probability}")

Probability of ኢትዮጵያ ታሪካዊ ሀገር ናት using Chain Rule is: ': 5.3445697567900095e-27


## Let's generate random sentences using n-grams and see what happens as n increases


In [25]:
def generate_random_sentence_for_unigrams(seed_word, ngram_probabilities, n, reps = 10):
    """Extend ``seed_word`` with ``reps`` words sampled from a unigram model.

    Args:
        seed_word: Iterable of words that starts the sentence.
        ngram_probabilities: Dict mapping unigram tuples to their probability.
        n: Unused; kept so the signature matches the higher-order generator.
        reps: Number of words to append.

    Returns:
        The seed words followed by the sampled words, joined by single
        spaces. Only the seed is returned when the table is empty or
        ``reps`` is not positive.
    """
    sentence = [*seed_word]
    choices = list(ngram_probabilities.keys())
    if choices and reps > 0:
        # Weight the draw by each unigram's probability so frequent words are
        # sampled more often; a uniform pick over the keys would ignore the model.
        weights = [ngram_probabilities[ngram] for ngram in choices]
        sampled = random.choices(choices, weights=weights, k=reps)
        sentence.extend(ngram[-1] for ngram in sampled)
    return " ".join(sentence)
        
def generate_random_sentence_for_ngrams(seed_word, ngram_probabilities, n, reps = 10):
    """Extend ``seed_word`` by sampling up to ``reps`` words from an n-gram model.

    Args:
        seed_word: Iterable of words that starts the sentence.
        ngram_probabilities: Dict mapping n-gram tuples to the conditional
            probability of their last word given the preceding words.
        n: Order of the model; the last ``n - 1`` words form the context.
        reps: Maximum number of words to append.

    Returns:
        The sentence as a single space-joined string. Generation stops early
        when no n-gram in the table starts with the current context.
    """
    sentence = [*seed_word]
    for _ in range(reps):
        # Build the context once per step. For n == 1 it must be empty:
        # sentence[-0:] would be the whole sentence and never match a key.
        context = tuple(sentence[-(n - 1):]) if n > 1 else ()
        candidates = [ngram for ngram in ngram_probabilities if ngram[:-1] == context]
        if not candidates:
            break  # Stop if there are no valid next words

        # Sample the continuation in proportion to its conditional probability.
        weights = [ngram_probabilities[ngram] for ngram in candidates]
        next_ngram = random.choices(candidates, weights=weights)[0]
        sentence.append(next_ngram[-1])

    return " ".join(sentence)



In [26]:
# For each model: seed with one of its own n-grams, generate a sentence, and
# print it. Models are visited from lowest to highest order.
generation_plan = (
    ("Unigrams", unigram_probabilities, 1, generate_random_sentence_for_unigrams),
    ("bigrams", bigram_probabilities, 2, generate_random_sentence_for_ngrams),
    ("trigrams", trigram_probabilities, 3, generate_random_sentence_for_ngrams),
    ("quadgrams", quadgram_probabilities, 4, generate_random_sentence_for_ngrams),
)
for label, table, order, generator in generation_plan:
    seed_word = random.choice(list(table.keys()))
    generated_sentence = generator(seed_word, table, order)
    print(f"Generated random sentence using {label}:", generated_sentence)


Generated random sentence using Unigrams: በአወዛጋቢ የፊልምንና ጠቀየው፡፡ ልውውጦችና ያሰኝ ይታወቃሉ፡፡ግንቦት ኮምፒዩተሮችም ሃዘኑን ነበር?(ይቀጥላል) በተመሳሠብ ያሸነፈችኝ?
Generated random sentence using bigrams: ሲገልፁ ኖረዋል። ይህ ባይሆን ድሪንክዬው በቀለም የመሙላት የከንፈር ወዳጅ፣ አንደበተ መልካም አሽከር
Generated random sentence using trigrams: የፓርቲውን ውሳኔ በግሌ በጣም ነው የተደሰትኩ ለሙዚቃ ያለው ፍቅር በጣም ይገርማል በፊት ከመጫወት
Generated random sentence using quadgrams: ደግሞ ጥሎብኝ ውበት አደንቃለሁ፡፡ ፍሬነገሩ ብቻ አይማርከኝም፡፡ውበት መፍጠርም ውድ ተሰጥዖ ነው፡፡ የተመኘ ሁሉ አያገኘውም፡፡


#### Explanation

As n increases, the model takes more context into account when generating text. 

<b>1. More Contextual Relevance:</b> With larger n, the generated sentences tend to be more contextually relevant and coherent. This is because the model considers a longer history of words when predicting the next word.

<b>2. Computational Complexity: </b> The models become more resource-intensive as n grows, because there are more distinct n-grams to store and to search when looking for the next word.


## Evaluating these Language Models Using Intrinsic Evaluation Method


In [27]:
import math

def calculate_probability(sentence, ngram_probabilities, n, probability_function):
    """Return ``probability_function(sentence)``.

    ``ngram_probabilities`` and ``n`` are accepted so existing callers keep
    working, but they are not used: the estimator that is passed in already
    decides which table and which order apply.
    """
    return probability_function(sentence)

def evaluate_language_model(test_set, ngram_probabilities, n, probability_function):
    """Return the geometric mean of ``1 / P(sentence)`` over ``test_set``.

    Note that the mean is taken over the number of sentences, not the number
    of words, so the value is a per-sentence figure.

    Args:
        test_set: Sized collection of sentences (strings).
        ngram_probabilities: Forwarded to ``calculate_probability``.
        n: Forwarded to ``calculate_probability``.
        probability_function: Callable mapping a sentence to its probability.

    Returns:
        The perplexity as a float, or ``math.inf`` when a sentence has zero
        probability or the result is too large for a float.

    Raises:
        ZeroDivisionError: If ``test_set`` is empty.
    """
    N = len(test_set)
    # Accumulate in log space: multiplying 1 / probability directly overflows
    # to infinity after a modest number of low-probability sentences.
    total_log_probability = 0.0
    for sentence in test_set:
        probability = calculate_probability(sentence, ngram_probabilities, n, probability_function)
        if probability <= 0:
            return math.inf
        total_log_probability += math.log(probability)

    mean_negative_log = -total_log_probability / N
    try:
        return math.exp(mean_negative_log)
    except OverflowError:
        return math.inf

# Score the same test sentences with each of the four n-gram models.
test_set = ["ኢትዮጵያ በተደጋጋሚ ጥሪው ደርሷት ልትታደመው ያልቻለችው", "ኢትዮጵያ ታሪካዊ ሀገር ናት", "ተወዳዳሪዎች እያንዳንዳቸው 200 ሺህ"]

evaluation_plan = (
    ("Unigrams", unigram_probabilities, 1, unigram_probability_estimation),
    ("Bigrams", bigram_probabilities, 2, bigram_probability_estimation),
    ("Trigrams", trigram_probabilities, 3, trigram_probability_estimation),
    ("Quadgrams", quadgram_probabilities, 4, quadgram_probability_estimation),
)
for label, table, order, estimator in evaluation_plan:
    intrinsic_evaluation_result = evaluate_language_model(test_set, table, order, estimator)
    print(f"Average Perplexity on Test Set using {label}: {intrinsic_evaluation_result}")


Average Perplexity on Test Set using Unigrams: 2.0785114808271317e+20
Average Perplexity on Test Set using Bigrams: 1693942611.7401881
Average Perplexity on Test Set using Trigrams: 8434326.653017484
Average Perplexity on Test Set using Quadgrams: 2154.434690031883


## Evaluating these Language Models Using Extrinsic Evaluation Method

We chose sentence completion as a task to evaluate these language models

We can reuse the sentence-generation functions defined earlier, this time to continue a given initial sentence instead of starting from a random seed.

In [55]:
# Prompt shared by all four models; the generators take a tuple of words.
initial_sentence = tuple("ደቡብ አፍሪካ ለስድስተኛ".split())

# 1-gram model: continuation words are drawn without looking at the prompt.
completed_word = generate_random_sentence_for_unigrams(initial_sentence, unigram_probabilities, 1)
print(f"1-gram Completed Sentence: {completed_word} \n")

# 2-gram model: each new word depends on the previous word.
completed_word = generate_random_sentence_for_ngrams(initial_sentence, bigram_probabilities, 2)
print(f"2-gram Completed Sentence: {completed_word} \n")

# 3-gram model: each new word depends on the previous two words.
completed_word = generate_random_sentence_for_ngrams(initial_sentence, trigram_probabilities, 3)
print(f"3-gram Completed Sentence: {completed_word}")
print("\n")

# 4-gram model: each new word depends on the previous three words.
completed_word = generate_random_sentence_for_ngrams(initial_sentence, quadgram_probabilities, 4)
print(f"4-gram Completed Sentence:  {completed_word}")

1-gram Completed Sentence: ደቡብ አፍሪካ ለስድስተኛ እንደቀለበት ይታዘዛል፡፡ እንደሚመለከቱ በይዘቱ እያለች፣ በሞኒተሪ ሲጠብቁ፣ ዲናርና አልገጠማችሁም?በህጋዊ በሲቪክ 

2-gram Completed Sentence: ደቡብ አፍሪካ ለስድስተኛ ጊዜ ለማጣራትና ምዝገባ በአዲሱ ጭማሪ ተደርጐባቸዋል የሚሉት ምንድን ነበር መሰለህ 

3-gram Completed Sentence: ደቡብ አፍሪካ ለስድስተኛ ጊዜ ካዘጋጀችው ቢግ ብራዘርስ ሶስት ወር አካባቢ ተገቢውን ምክር እየለገስናቸው


4-gram Completed Sentence:  ደቡብ አፍሪካ ለስድስተኛ ጊዜ ካዘጋጀችው ቢግ ብራዘርስ የመረረ ውድድር ለሺህ አመታት በተከታታይ ስታዘጋጅ


### Next Word Prediction

In [67]:
def generate_next_words(seed_word, n):
    """Return up to five candidate next words for ``seed_word`` under the ``n``-gram model.

    Args:
        seed_word: Iterable of words typed so far; only the last ``n - 1``
            are used as context.
        n: Order of the model to consult (1, 2, 3 or 4).

    Returns:
        A list of at most five words, most probable first. Candidates with
        equal probability keep the order in which they appear in the table.
        The list is empty when the context was never seen.

    Raises:
        ValueError: If ``n`` is not one of the supported orders.
    """
    tables = {
        1: unigram_probabilities,
        2: bigram_probabilities,
        3: trigram_probabilities,
        4: quadgram_probabilities,
    }
    if n not in tables:
        raise ValueError(f"n must be one of {sorted(tables)}, got {n!r}")
    ngram_probabilities = tables[n]

    sentence = [*seed_word]
    # For n == 1 the context must be empty: sentence[-0:] would be the whole
    # sentence and never match a unigram's (empty) history.
    context = tuple(sentence[-(n - 1):]) if n > 1 else ()

    candidates = [ngram for ngram in ngram_probabilities if ngram[:-1] == context]
    # Rank by probability so the five returned words are the most likely ones
    # rather than whichever five happen to come first in the table.
    candidates.sort(key=ngram_probabilities.get, reverse=True)

    return [ngram[-1] for ngram in candidates[:5]]

In [85]:
# Ask the bigram model which words can follow the last word of the prompt.
initial_sentence = tuple("ደቡብ መሮጫዎች".split())

next_words = generate_next_words(initial_sentence, 2)
next_words

['ተገጥሞላቸው']