**#1 N-gram language model**

**1.1 Create n-grams for n=1, 2, 3, 4. You can show sample prints**                                                        

*   In order to create n-grams, first I have to read the text file, which is the given Amharic corpus  





In [None]:
corpus = open('GPAC.txt', 'r', encoding='utf-8', errors = 'ignore').read()




*   Then let's start creating n-grams


In [None]:
#install natural language toolkit
!pip install nltk

#download punkt
import nltk
nltk.download('punkt')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
#import the necessary libraries
from nltk.util import ngrams
from nltk.tokenize import word_tokenize

#tokenize the text into words
words = word_tokenize(corpus)

#list of possible punctuations in the corpus
punctuations = {'፡', '።', '፣', '፤', '፥', '፦', '፧',  '፤፤', '.' ,'(', ')', '?', '::','!'}

#remove punctuations from the corpus
cleaned_words = []

for word in words:
    if any(char in punctuations for char in word):
        cleaned_word = ''.join(char for char in word if char not in punctuations)
        if cleaned_word:
            cleaned_words.append(cleaned_word)
    else:
        cleaned_words.append(word)

#define a function to create n-grams
def create_ngrams(n):
  ngram = list(ngrams(cleaned_words, n))
  return ngram

#call a function and show sample prints
unigrams = create_ngrams(1)
print("Unigrams-->", unigrams[:3])

bigrams = create_ngrams(2)
print("Bigrams-->", bigrams[:3])

trigrams = create_ngrams(3)
print("Trigrams-->", trigrams[:3])

four_grams = create_ngrams(4)
print("Fourgrams-->", four_grams[:3])


Unigrams--> [('ምን',), ('መሰላችሁ',), ('አንባቢያን',)]
Bigrams--> [('ምን', 'መሰላችሁ'), ('መሰላችሁ', 'አንባቢያን'), ('አንባቢያን', 'ኢትዮጵያ')]
Trigrams--> [('ምን', 'መሰላችሁ', 'አንባቢያን'), ('መሰላችሁ', 'አንባቢያን', 'ኢትዮጵያ'), ('አንባቢያን', 'ኢትዮጵያ', 'በተደጋጋሚ')]
Fourgrams--> [('ምን', 'መሰላችሁ', 'አንባቢያን', 'ኢትዮጵያ'), ('መሰላችሁ', 'አንባቢያን', 'ኢትዮጵያ', 'በተደጋጋሚ'), ('አንባቢያን', 'ኢትዮጵያ', 'በተደጋጋሚ', 'ጥሪው')]


**1.2 Calculate probabilities of n-grams and find the top 10 most likely n-grams for all n.**



In [None]:
#import frequency distribution class
from nltk import FreqDist

#define a function to calculate probabilities and find 10 most likely n-grams
def calculate(ngrams):
  freq_dist = FreqDist(ngrams)

  total = sum(freq_dist.values())
  probabilities = []

  for ngram, freq in freq_dist.items():
      probability = freq / total
      probabilities.append([ngram, probability])

  probabilities.sort(key=lambda x: x[1], reverse=True)

  return probabilities[:10]

#top 10 most likely unigrams
print("Top 10 Unigrams-->", calculate(unigrams))

#top 10 most likely bigrams
print("Top 10 Bigrams-->", calculate(bigrams))

#top  10 most likely trigrams
print("Top 10 Trigrams-->", calculate(trigrams))

#top 10 likely fourgrams
print("Top 10 Fourgrams-->", calculate(four_grams))

Top 10 Unigrams--> [[('ነው',), 0.016446757837111477], [('ላይ',), 0.008632521757771754], [('ወደ',), 0.004956772032930116], [('ነበር',), 0.00485219878329024], [('ግን',), 0.004792069164747312], [('እና',), 0.004609065977877529], [('ውስጥ',), 0.004342404191295846], [('ጋር',), 0.0038718245679164043], [('ነገር',), 0.00333588666351204], [('አንድ',), 0.003192098445257211]]
Top 10 Bigrams--> [[('ዓ', 'ም'), 0.0014169712370524906], [('ነገር', 'ግን'), 0.0007424720135108992], [('ብቻ', 'ሳይሆን'), 0.0005490109959059465], [('ማለት', 'ነው'), 0.0005359393055272335], [('ብቻ', 'ነው'), 0.0005124102628455502], [('አዲስ', 'አበባ'), 0.00034247828792228097], [('ምን', 'ያህል'), 0.00030587755486188455], [('ኤ', 'አ'), 0.0002980345406346567], [('እ', 'ኤ'), 0.0002771198360287159], [('ሚሊዮን', 'ዶላር'), 0.00024574777911980463]]
Top 10 Trigrams--> [[('እ', 'ኤ', 'አ'), 0.00027450621560502476], [('2004', 'ዓ', 'ም'), 0.00012025986588410609], [('በ2003', 'ዓ', 'ም'), 0.0001098024862420099], [('ነው', 'ነገር', 'ግን'), 8.10446922262454e-05], [('ቀን', '2004', 'ዓ'), 8.1044692






**1.3 What is the probability of the sentence. "ኢትዮጵያ ታሪካዊ ሀገር ናት ". You can
also try more sentences.**

In [None]:
sentence = "ኢትዮጵያ ታሪካዊ ሀገር ናት"

# Tokenize the sentence
words = word_tokenize(sentence)

# Calculate the probabilities using interpolation
def calculate(words, ngrams):
    # Set interpolation weights
    lambdas = [0.4, 0.3, 0.2, 0.1]

    # Initialize probability
    probability = 1.0

    # Loop through each token in the sentence
    for i in range(len(words)):
        # Get the n-gram context
        context = words[0:i+1]

        # Check if the n-gram context exists in the n-grams
        if tuple(context) in ngrams[i]:

            # Calculate the conditional probability using the n-gram
            conditional_probability = ngrams[i][tuple(context)] / sum(ngrams[i].values())

            # Update the overall probability using interpolation
            probability *= (lambdas[i] * conditional_probability)

    return probability

# define list of frequency distributions of the large corpus for the four ngrams
freqDistList = [FreqDist(unigrams), FreqDist(bigrams), FreqDist(trigrams), FreqDist(four_grams)]


print(f"The interpolated probability of the sentence '{sentence}' is: {calculate(words, freqDistList)}")


The interpolated probability of the sentence 'ኢትዮጵያ ታሪካዊ ሀገር ናት' is: 2.1980542190308178e-10


**1.4 Generate random sentences using n-grams; explain what happens as n increases, based on your output.**

In [None]:
import random

def generate_sentence(ngrams, n, length=6):
    sentence = []

    # Randomly choose a starting n-gram
    current_ngram = random.choice(ngrams[n])

    # Add the words from the starting n-gram to the sentence
    sentence.extend(current_ngram)

    # Generate the rest of the sentence
    for _ in range(length - n):
        # Use the last (n-1) words as the context
        context = tuple(sentence[-(n-1):])

        # Get the next word based on the context
        next_words = [gram[-1] for gram in ngrams[n] if gram[:-1] == context]
        next_word = random.choice(next_words) if next_words else None

        # Add the next word to the sentence
        if next_word:
            sentence.append(next_word)
        else:
            break  # Break if no next word is found based on the context

    return ' '.join(sentence)

# Example usage for generating sentences with different values of n
ngrams = {1: unigrams, 2: bigrams, 3: trigrams, 4: four_grams}
for n_value in range(1, 5):
    generated_sentence = generate_sentence(ngrams, n_value)
    print(f"Generated sentence with {ngrams[n_value]}:", generated_sentence)
    print()


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)





*   **From the output we can conclude, as the value of n increases, we can get a more meaningful and contextually coherent sentence.**



### **#2 Evaluate these Language Models Using Intrinsic Evaluation Method**

In [None]:
import nltk
from nltk.util import ngrams
from collections import Counter
import math
import string


nltk.download('punkt')


def preprocess_text(text):
    cleaned_text = text.replace('\n', ' ').replace('\r', '')
    cleaned_text = cleaned_text.translate(str.maketrans('', '', string.punctuation))
    return cleaned_text


def train_language_model(train_data, n, k):
    tokens = nltk.word_tokenize(train_data)
    ngrams_list = list(ngrams(tokens, n))
    ngrams_freq = Counter(ngrams_list)


    model = {}
    for ngram, count in ngrams_freq.items():
        context = tuple(ngram[:-1])
        word = ngram[-1]
        if context not in model:
            model[context] = {}
        model[context][word] = math.log((count + k) / (ngrams_freq[context] + k * len(ngrams_freq)))


    return model


def calculate_perplexity(model, test_data):
    test_tokens = nltk.word_tokenize(test_data)
    num_words = len(test_tokens)
    log_prob_sum = 0.0


    for i in range(len(test_tokens)):
        if i < n-1:
            context = tuple(test_tokens[:i])
        else:
            context = tuple(test_tokens[i-n+1:i])
        word = test_tokens[i]


        if context in model and word in model[context]:
            log_prob = model[context][word]
            log_prob_sum += log_prob


    perplexity = math.exp(-log_prob_sum / num_words)


    return perplexity


with open('/content/GPAC.txt', 'r', encoding='utf-8', errors='ignore') as file:
    text = file.read()


train_size = int(0.8 * len(text))
train_data = text[:train_size]
test_data = text[train_size:]


train_data = preprocess_text(train_data)
test_data = preprocess_text(test_data)


n_values = [1, 2, 3, 4]
k = 0.1


for n in n_values:
    model = train_language_model(train_data, n, k)
    perplexity = calculate_perplexity(model, test_data)

    print(f"Perplexity for n={n}: {perplexity}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Perplexity for n=1: 144.9949783208397
Perplexity for n=2: 14.43831658248446
Perplexity for n=3: 1.5596752088530137
Perplexity for n=4: 1.0804521313169733


### **#3 Evaluate these Language Models Using extrinsic Evaluation Method**

In [None]:
import random
import re
from collections import defaultdict

import codecs
# Read the text file
file_path = '/content/GPAC.txt'

# Read the corpus from the text file with explicit encoding
with codecs.open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()
import re



def splitData(text, train_ratio=0.8):
  # Split the preprocessed text into words using Amharic punctuation marks as separators
  words = re.split('[።፥፤፦]+', text)

  # Determine the index to split the data
  split_index = int(len(words) * train_ratio)

  # Split the data into training and testing sets
  train_data = words[:split_index]

  test_data = words[split_index:]
  return train_data, test_data
train_data , test_data = splitData(text)
s,e,step = 4,0,-1
d = {2:"trigrams",4:"unigrams",
3:"bigrams",1:"fourgrams"}
train_data = ' '.join(train_data)



def train_ngram_model(text, n):
    ngram_model = defaultdict(list)
    words = text.split()


    for i in range(len(words) - n):
        context = tuple(words[i:i+n])
        target_word = words[i+n]
        ngram_model[context].append(target_word)

    return ngram_model



def next_word_prediction(context, ngram_model):
    if context not in ngram_model:
        return None

    possible_words = ngram_model[context]
    word_counts = defaultdict(int)
    for word in possible_words:
        word_counts[word] += 1

    total_count = sum(word_counts.values())
    probabilities = {word: count / total_count for word, count in word_counts.items()}

    predicted_word = max(probabilities, key=probabilities.get)
    return predicted_word




def evaluate_next_word_prediction(test_data, ngram_model, n):
    sentences = test_data.split('።')
    total_predictions = 0
    correct_predictions = 0

    for sentence in sentences:
        words = sentence.split()
        for i in range(len(words) - n):  # Adjusted loop range
            context = tuple(words[i:i+n])
            target_word = words[i+n]
            predicted_word = next_word_prediction(context, ngram_model)
            if predicted_word == target_word:
                correct_predictions += 1
            total_predictions += 1

    accuracy = correct_predictions / total_predictions
    return accuracy
predict_test = ' '.join(test_data)
for n in range(s,e,step):
#predict_test = ' '.join(test_data)



  #In order to check the accuracy
  #start predicting for different n values
  ngram_model = train_ngram_model(train_data, n)
  accuracy = evaluate_next_word_prediction(predict_test, ngram_model, n)
  print(f"{d[n]} accuracy is: {accuracy}")




unigrams accuracy is: 0.0009821327390171118
bigrams accuracy is: 0.003298944841306483
trigrams accuracy is: 0.016108520559559136
fourgrams accuracy is: 0.052774280198102916
