# Week 37 - Language Models

## 1. Setup

### 1.1. Libraries

#### 1.1.1. New Libraries

In [7]:
# new libraries for Google Colab
!pip3 install nltk
!pip3 install datasets
!pip install bnlp-toolkit



#### 1.1.2. Load Libraries

In [8]:
from datasets import load_dataset                       # library to import data from huggingface
import pandas as pd                                     # library to transform to dataframe. helps for statistics
from bnlp import BasicTokenizer                         # library for bengali tokenizer
from nltk.tokenize import word_tokenize                 # library for tokenize arabic and indonesian
import numpy as np                                      # library for math operations and matrices
import seaborn as sns                                   # library for making plots
from scipy.stats.mstats import gmean                    # libreary for geometric mean and perplexity

### 1.2. Data

#### 1.2.1. Read Data

In [9]:
# load training dataset
datasets_train = load_dataset("copenlu/answerable_tydiqa", split='train')
# load validation dataset
datasets_val = load_dataset("copenlu/answerable_tydiqa", split='validation')

In [10]:
# Bengali Tolkenizer
bengali_tokenizer = BasicTokenizer()

#### 1.2.2. Transform Data

In [11]:
# define languages for the project
languages = ['arabic', 'bengali','indonesian']

# transform to pandas dataframe
pandas_datasets_train = pd.DataFrame(datasets_train)
pandas_datasets_val = pd.DataFrame(datasets_val)

# filter languaje
df_train_filter = pandas_datasets_train[pandas_datasets_train['language'].isin(languages)]
df_val_filter = pandas_datasets_val[pandas_datasets_val['language'].isin(languages)]

In [12]:
# tokens documents
df_train_filter['tokens_document'] = np.where(df_train_filter['language'].isin(['indonesian','arabic']),
         df_train_filter.document_plaintext.apply(lambda x: word_tokenize(x)),
         df_train_filter.document_plaintext.apply(lambda x: bengali_tokenizer(x)))
df_train_filter['tokens_document_len'] = df_train_filter['tokens_document'].apply(lambda x: len(x))

df_val_filter['tokens_document'] = np.where(df_val_filter['language'].isin(['indonesian','arabic']),
         df_val_filter.document_plaintext.apply(lambda x: word_tokenize(x)),
         df_val_filter.document_plaintext.apply(lambda x: bengali_tokenizer(x)))
df_val_filter['tokens_document_len'] = df_val_filter['tokens_document'].apply(lambda x: len(x))


# tokens question
df_train_filter['tokens_question'] = np.where(df_train_filter['language'].isin(['indonesian','arabic']),
         df_train_filter.question_text.apply(lambda x: word_tokenize(x)),
         df_train_filter.question_text.apply(lambda x: bengali_tokenizer(x)))
df_train_filter['tokens_question_len'] = df_train_filter['tokens_question'].apply(lambda x: len(x))

df_val_filter['tokens_question'] = np.where(df_val_filter['language'].isin(['indonesian','arabic']),
         df_val_filter.question_text.apply(lambda x: word_tokenize(x)),
         df_val_filter.question_text.apply(lambda x: bengali_tokenizer(x)))
df_val_filter['tokens_question_len'] = df_val_filter['tokens_question'].apply(lambda x: len(x))


# tokens answer
df_train_filter['tokens_answer'] = np.where(df_train_filter['language'].isin(['indonesian','arabic']),
         df_train_filter['annotations'].apply(lambda x: x['answer_text'][0]).apply(lambda x: word_tokenize(x)),
         df_train_filter['annotations'].apply(lambda x: x['answer_text'][0]).apply(lambda x: bengali_tokenizer(x)))
df_train_filter['tokens_answer_len'] = df_train_filter['tokens_answer'].apply(lambda x: len(x))

df_val_filter['tokens_answer'] = np.where(df_val_filter['language'].isin(['indonesian','arabic']),
         df_val_filter['annotations'].apply(lambda x: x['answer_text'][0]).apply(lambda x: word_tokenize(x)),
         df_val_filter['annotations'].apply(lambda x: x['answer_text'][0]).apply(lambda x: bengali_tokenizer(x)))
df_val_filter['tokens_answer_len'] = df_val_filter['tokens_answer'].apply(lambda x: len(x))


# Insert START-OF-SENTENCE [SOS] AND END-OF-SENTENCE [EOS] for the language model
df_train_filter['tokens_document_endsentence'] = df_train_filter['tokens_document'].apply(lambda x: ["[SOS]"] + x + ["[EOS]"])
df_train_filter['tokens_question_endsentence'] = df_train_filter['tokens_question'].apply(lambda x: ["[SOS]"] + x + ["[EOS]"])

df_val_filter['tokens_document_endsentence'] = df_val_filter['tokens_document'].apply(lambda x: ["[SOS]"] + x + ["[EOS]"])
df_val_filter['tokens_question_endsentence'] = df_val_filter['tokens_question'].apply(lambda x: ["[SOS]"] + x + ["[EOS]"])

## 2. Language Models

### 2.1. Uniform

In [None]:
def function_count_words(list_tokens = []):
  """
  Funtion that counts frecuency of words in a list
  """
  token_count = {}
  for token_i in list_tokens:
    if token_i in token_count.keys():
      token_count[token_i] += 1
    else:
      token_count[token_i] = 1
  return token_count

In [None]:
def replace_OOV(frequency_dict:{}, frequency_bottom =5 ):
  """
  Words Out Of Vocabulary
  Replace Out Of Vocabulary ('[OOV]') for frequencies less than frequency_bottom
  """
  updated_vocabulary_question = {}
  count_oov_times = 0

  for word in frequency_dict.keys():
    if frequency_dict[word] >= frequency_bottom:
      updated_vocabulary_question[word] = frequency_dict[word]
    else:
      count_oov_times += frequency_dict[word]
      updated_vocabulary_question['[OOV]'] = count_oov_times

  return updated_vocabulary_question

In [None]:
def model_uniform(dict_vocabulary = {}):
  """
  Define language uniform model
  """
  size_vocab_questions = len(dict_vocabulary.keys())
  probability = {}
  for word_ in dict_vocabulary.keys():
    # in an uniform language model, every token has the same probability
    probability[word_] = (1 /size_vocab_questions, dict_vocabulary[word_])
  return  probability

for language in languages:
  print("#"*50)
  print(language)
  print("#"*50)
  # documents
  sentences_document_train = df_train_filter[df_train_filter['language']==language]['tokens_document_endsentence']
  list_documents_tokens_train = sentences_document_train.to_list()
  flat_document_tokens_train = ([item for sublist in list_documents_tokens_train for item in sublist])
  print('Length Train Vocabulary Document',len(set(flat_document_tokens_train)))

  # questions
  sentences_questions_train = df_train_filter[df_train_filter['language']==language]['tokens_question_endsentence']
  list_questions_tokens_train = sentences_questions_train.to_list()
  flat_questions_tokens_train = ([item for sublist in list_questions_tokens_train for item in sublist])
  print('Length Train Vocabulary Questions',len(set(flat_questions_tokens_train)))

  # frequency
  counts_documents_train = function_count_words(flat_document_tokens_train)
  print('Length Train Vocabulary Document',len(counts_documents_train))

  counts_questions_train = function_count_words(flat_questions_tokens_train)
  print('Length Train Vocabulary Question',len(counts_questions_train))

  # replace low frequency with OOV
  updated_vocabulary_documents_train = replace_OOV(counts_documents_train)
  print('Length Train Vocabulary Document Using OOV',len(set(updated_vocabulary_documents_train)))


  updated_vocabulary_question_train = replace_OOV(counts_questions_train)
  print('Length Train Vocabulary Document Using OOV',len(set(updated_vocabulary_question_train)))

  # model (Train)
  word_distribution_uniform_documents = model_uniform(updated_vocabulary_documents_train)
  word_distribution_uniform_questions = model_uniform(updated_vocabulary_question_train)

  # Example of a sample uniform model
  sample_question_size_10 =[]
  sample_document_size_10 =[]
  for i in range(0,10):
    # random sample (10) according to the probability (Uniform)
    sample_document_i = np.random.choice(list(word_distribution_uniform_documents.keys()),
                  p=[x[0] for x in list(word_distribution_uniform_documents.values())])
    sample_question_i = np.random.choice(list(word_distribution_uniform_questions.keys()),
                  p=[x[0] for x in list(word_distribution_uniform_questions.values())])
    # append to the list
    sample_document_size_10.append(sample_document_i)
    sample_question_size_10.append(sample_question_i)


  print(f'Sample documents:',' '.join(sample_document_size_10))
  print(f'Sample questions:',' '.join(sample_question_size_10))
  print("-"*50)
  # documents
  sentences_document_val = df_val_filter[df_val_filter['language']==language]['tokens_document_endsentence']
  list_documents_tokens_val = sentences_document_val.to_list()
  flat_document_tokens_val = ([item for sublist in list_documents_tokens_val for item in sublist])
  print('Length Test Vocabulary Document',len(set(flat_document_tokens_val)))

  # questions
  sentences_questions_val = df_val_filter[df_val_filter['language']==language]['tokens_question_endsentence']
  list_questions_tokens_val = sentences_questions_val.to_list()
  flat_questions_tokens_val = ([item for sublist in list_questions_tokens_val for item in sublist])
  print('Length Test Vocabulary Question',len(set(flat_questions_tokens_val)))

  # Replace OOV
  updated_vocabulary_documents_val = ([word if word in word_distribution_uniform_documents.keys() else '[OOV]' for word in flat_document_tokens_val])
  print('Length Test Sequence Document Using OOV',len((updated_vocabulary_documents_val)))


  updated_vocabulary_questions_val = ([word if word in word_distribution_uniform_questions.keys() else '[OOV]' for word in flat_questions_tokens_val])
  print('Length Test Sequence Questions Using OOV',len((updated_vocabulary_questions_val)))

  # Perplexity
  prob_documents_val = []
  for word in updated_vocabulary_documents_val:
    touple_observed = (word, word_distribution_uniform_documents[word][0])
    prob_documents_val.append(touple_observed)
  # tokens apear many times
  #prob_documents_val = set(prob_documents_val)
  print('Perplexity documents:', np.round(gmean([1/x for x in [y for x,y in prob_documents_val] ])))

  prob_questions_val = []
  for word in updated_vocabulary_questions_val:
    touple_observed = (word, word_distribution_uniform_questions[word][0])
    prob_questions_val.append(touple_observed)
  # tokens apear many times
  #prob_documents_val = set(prob_documents_val)
  print('Perplexity questions:', np.round(gmean([1/x for x in [y for x,y in prob_questions_val] ])))

##################################################
arabic
##################################################
Length Train Vocabulary Document 225191
Length Train Vocabulary Questions 16361
Length Train Vocabulary Document 225191
Length Train Vocabulary Question 16361
Length Train Vocabulary Document Using OOV 46238
Length Train Vocabulary Document Using OOV 3733
Sample documents: المذكرات وحضور ومغامرات حرجة البورمان لتصبح ماريّ خروجه مترين رفض
Sample questions: غير مخترع حروب تعمل الاقتصادية؟ المتجمد الكيميائي ديكارت المحافظين الأب
--------------------------------------------------
Length Test Vocabulary Document 37483
Length Test Vocabulary Question 2274
Length Test Sequence Document Using OOV 155827
Length Test Sequence Questions Using OOV 14554
Perplexity documents: 46238.0
Perplexity questions: 3733.0
##################################################
bengali
##################################################
Length Train Vocabulary Document 50094
Length Train Vocabulary Questions

### 2.2. Unigram

In [None]:
k_smoothing = 1
# Define unigram model with Laplace Smoothing
def model_unigram(dict_vocabulary={}, k=k_smoothing):
    total_words = sum(dict_vocabulary.values())
    V = len(dict_vocabulary)  # Vocabulary size
    probability = {}
    for word_, count in dict_vocabulary.items():
        probability[word_] = (count + k) / (total_words + k*V)
    return probability

# Sample
def generate_sample_unigrams(distribution, n=10):
    words, probs = zip(*[(word, prob) for word, prob in distribution.items()])
    words = np.array(words)
    probs = np.array(probs) / sum(probs)
    sample_unigrams = np.random.choice(words, size=n, p=probs)
    return sample_unigrams

In [None]:
languages = ['arabic', 'bengali', 'indonesian']

for language in languages:
    print("#"*50)
    print(language)
    print("#"*50)

    # Training
    sentences_doc_train = df_train_filter[df_train_filter['language'] == language]['tokens_document_endsentence']
    sentences_ques_train = df_train_filter[df_train_filter['language'] == language]['tokens_question_endsentence']

    flat_doc_tokens_train = [item for sublist in sentences_doc_train for item in sublist]
    flat_ques_tokens_train = [item for sublist in sentences_ques_train for item in sublist]

    counts_documents_train = function_count_words(flat_doc_tokens_train)
    counts_questions_train = function_count_words(flat_ques_tokens_train)

    #updated_vocab_docs_train = replace_OOV(counts_documents_train)
    #updated_vocab_ques_train = replace_OOV(counts_questions_train)

    word_distribution_docs = model_unigram(counts_documents_train)
    word_distribution_ques = model_unigram(counts_questions_train)

    sample_unigram_docs = generate_sample_unigrams(word_distribution_docs)
    sample_unigram_ques = generate_sample_unigrams(word_distribution_ques)

    # Validation
    sentences_doc_val = df_val_filter[df_val_filter['language'] == language]['tokens_document_endsentence']
    sentences_ques_val = df_val_filter[df_val_filter['language'] == language]['tokens_question_endsentence']




    flat_doc_tokens_val = [item for sublist in sentences_doc_val for item in sublist]
    flat_ques_tokens_val = [item for sublist in sentences_ques_val for item in sublist]

    print(f'Total Words Document Test {language}',len(flat_doc_tokens_val))
    print(f'Total Vocabulary Document Test {language}',len(set(flat_doc_tokens_val)))

    print(f'Total Words Questions Test {language}',len(flat_ques_tokens_val))
    print(f'Total Vocabulary Questions Test {language}',len(set(flat_ques_tokens_val)))


    count = 0
    # smooth prob for tokens OOV (document)
    V_size_docs = len(word_distribution_docs)
    total_words_documents = np.sum(list(counts_documents_train.values()))
    min_prob_docs =(count + k_smoothing) / (total_words_documents + k_smoothing*V_size_docs)
    print(f'Total Words Document Training {language}',total_words_documents)
    print(f'Total Vocabulary Document Training {language}',V_size_docs)


    # smooth prob for tokens OOV (question)
    V_size_ques = len(word_distribution_ques)
    total_words_ques = np.sum(list(counts_questions_train.values()))
    min_prob_ques = (count + k_smoothing) / (total_words_ques + k_smoothing*V_size_ques)
    print(f'Total Words Question Training {language}',total_words_ques)
    print(f'Total Vocabulary Question Training {language}',V_size_ques)

    word_prob_documents_val = []
    for word_document_val_i in flat_doc_tokens_val:
        # If the token is in the training, assing prob of training
        if word_document_val_i in word_distribution_docs.keys():
          prob_i = word_distribution_docs.get(word_document_val_i)
        # if the token is not in the training, assing pro of smooth
        else:
          prob_i = min_prob_docs
        # save
        word_prob_documents_val.append(prob_i)

    word_prob_question_val = []
    for word_question_val_i in flat_ques_tokens_val:
        # If the token is in the training, assing prob of training
        if word_question_val_i in word_distribution_ques.keys():
          prob_i = word_distribution_ques.get(word_question_val_i)
        # if the token is not in the training, assing pro of smooth
        else:
          prob_i = min_prob_ques
        # save
        word_prob_question_val.append(prob_i)


    #prob_ques_val = [word_distribution_ques.get(word) for word in flat_ques_tokens_val]


    #print(f'Sample documents unigrams:',' | '.join(sample_unigram_docs))
    #print(f'Sample questions unigrams:',' | '.join(sample_unigram_ques))
    print('Perplexity for documents:', np.round(gmean([1/y for y in word_prob_documents_val])))
    print('Perplexity for questions:', np.round(gmean([1/y for y in word_prob_question_val])))
    print("-"*50)

##################################################
arabic
##################################################
Total Words Document Test arabic 155827
Total Vocabulary Document Test arabic 37483
Total Words Questions Test arabic 14554
Total Vocabulary Questions Test arabic 2274
Total Words Document Training arabic 2706703
Total Vocabulary Document Training arabic 225191
Total Words Question Training arabic 230005
Total Vocabulary Question Training arabic 16361
Perplexity for documents: 8509.0
Perplexity for questions: 538.0
--------------------------------------------------
##################################################
bengali
##################################################
Total Words Document Test bengali 22687
Total Vocabulary Document Test bengali 6727
Total Words Questions Test bengali 2368
Total Vocabulary Questions Test bengali 438
Total Words Document Training bengali 483302
Total Vocabulary Document Training bengali 50094
Total Words Question Training bengali 48674
Total

### 2.3. Bigram

In [None]:
# Bigram Counting Function
def function_count_bigrams(list_words=[], unigram_vocab={}):
    bigram_count = {}
    for i in range(1, len(list_words)):
        # Replace words not in our unigram training model with '[OOV]'
        # Different from replace_OOV() because it counts bigrams
        w1 = list_words[i-1] if list_words[i-1] in unigram_vocab.keys() else '[OOV]'
        w2 = list_words[i] if list_words[i] in unigram_vocab.keys() else '[OOV]'
        bigram_i = (w1, w2)
        if bigram_i in bigram_count.keys():
            bigram_count[bigram_i] += 1
        else:
            bigram_count[bigram_i] = 1
    return bigram_count

# Bigram Model Function with Laplace Smoothing
def model_bigram(dict_bigrams={}, dict_unigrams={}, k=1):
    bigram_probabilities = {}
    V = len(dict_unigrams)  # Vocabulary size
    for bigram, count in dict_bigrams.items():
        word1 = bigram[0]
        bigram_probabilities[bigram] = (count + k) / (dict_unigrams[word1] + k*V)
    return bigram_probabilities

In [None]:
languages = ['arabic', 'bengali', 'indonesian']

for language in languages:
    # Training
    sentences_doc_train = df_train_filter[df_train_filter['language'] == language]['tokens_document_endsentence']
    sentences_ques_train = df_train_filter[df_train_filter['language'] == language]['tokens_question_endsentence']

    flat_doc_tokens_train = [item for sublist in sentences_doc_train for item in sublist]
    flat_ques_tokens_train = [item for sublist in sentences_ques_train for item in sublist]

    counts_documents_train = function_count_words(flat_doc_tokens_train)
    counts_questions_train = function_count_words(flat_ques_tokens_train)

    updated_vocab_docs_train = replace_OOV(counts_documents_train)
    updated_vocab_ques_train = replace_OOV(counts_questions_train)

    bigram_counts_docs = function_count_bigrams(flat_doc_tokens_train, updated_vocab_docs_train)
    bigram_counts_ques = function_count_bigrams(flat_ques_tokens_train, updated_vocab_ques_train)

    bigram_distribution_docs = model_bigram(bigram_counts_docs, updated_vocab_docs_train)
    bigram_distribution_ques = model_bigram(bigram_counts_ques, updated_vocab_ques_train)

    # Example of a sample bigram model
    sample_doc_bigram_size_10 = []
    sample_ques_bigram_size_10 = []
    for i in range(0, 10):
       # Normalize the probabilities
        doc_probs = np.array(list(bigram_distribution_docs.values()))
        doc_probs /= doc_probs.sum()

        ques_probs = np.array(list(bigram_distribution_ques.values()))
        ques_probs /= ques_probs.sum()

        sample_doc_bigram_i = np.random.choice([' '.join(x) for x in bigram_distribution_docs.keys()],
                           p=doc_probs)
        sample_ques_bigram_i = np.random.choice([' '.join(x) for x in bigram_distribution_ques.keys()],
                            p=ques_probs)

        sample_doc_bigram_size_10.append(sample_doc_bigram_i)
        sample_ques_bigram_size_10.append(sample_ques_bigram_i)



    # Validation
    sentences_doc_val = df_val_filter[df_val_filter['language'] == language]['tokens_document_endsentence']
    sentences_ques_val = df_val_filter[df_val_filter['language'] == language]['tokens_question_endsentence']

    flat_doc_tokens_val = [item for sublist in sentences_doc_val for item in sublist]
    flat_ques_tokens_val = [item for sublist in sentences_ques_val for item in sublist]

    # Convert unseen unigrams to [OOV]
    updated_vocab_docs_val = [word if word in updated_vocab_docs_train.keys() else '[OOV]' for word in flat_doc_tokens_val]
    updated_vocab_ques_val = [word if word in updated_vocab_ques_train.keys() else '[OOV]' for word in flat_ques_tokens_val]

    # Convert unseen bigrams to use [OOV] or take unigram probabilities if one of the words in the bigram is unseen
    log_prob_docs_val = []
    for i in range(1, len(updated_vocab_docs_val)):
        bigram = (updated_vocab_docs_val[i-1], updated_vocab_docs_val[i])
        if bigram in bigram_distribution_docs:
            log_prob_docs_val.append(np.log(bigram_distribution_docs[bigram]))
        else:
            # If bigram not seen, fall back to unigram probability of the second word
            log_prob_docs_val.append(np.log(updated_vocab_docs_train.get(updated_vocab_docs_val[i], updated_vocab_docs_train['[OOV]'])))

    log_prob_ques_val = []
    for i in range(1, len(updated_vocab_ques_val)):
        bigram = (updated_vocab_ques_val[i-1], updated_vocab_ques_val[i])
        if bigram in bigram_distribution_ques:
            log_prob_ques_val.append(np.log(bigram_distribution_ques[bigram]))
        else:
            # If bigram not seen, fall back to unigram probability of the second word
            log_prob_ques_val.append(np.log(updated_vocab_ques_train.get(updated_vocab_ques_val[i], updated_vocab_ques_train['[OOV]'])))

    # Compute perplexity based on the log bigram probabilities
    perplexity_docs_val = np.exp(-np.mean(log_prob_docs_val))
    perplexity_ques_val = np.exp(-np.mean(log_prob_ques_val))

    print("#"*50)
    print(language)
    print("#"*50)
    print(f'Sample documents bigrams:',' | '.join(sample_doc_bigram_size_10))
    print(f'Sample questions bigrams:',' | '.join(sample_ques_bigram_size_10))
    print('Perplexity for documents:', np.round(perplexity_docs_val, 2))
    print('Perplexity for questions:', np.round(perplexity_ques_val, 2))

    print("-"*50)

##################################################
arabic
##################################################
Sample documents bigrams: من أي | كما يلعب | ولد نيفيل | و شكلوا | جميع الأطفال | ماري كوري | سم عرضاً | كوسوفو . | منذ ذلك | النظير 235U
Sample questions bigrams: ظهرت الديانة | هو مبتكر | ماري [OOV] | [EOS] [SOS] | الانجليزي ؟ | جنسية العالمة | كانت مذبحة | عمر وحيد | البحر؟ [EOS] | العزيز آل
Perplexity for documents: 64.08
Perplexity for questions: 11.19
--------------------------------------------------
##################################################
bengali
##################################################
Sample documents bigrams: অধ্যাপক নিযুক্ত | ফলে পশ্চিম | ছিলেন একজন | [OOV] ভূমিকায় | ( এপ্রিল | , [OOV] | ছাড়া অন্য | তাঁদের [OOV] | অত্যধিক [OOV] | একটি সর্বেশ্বরবাদী
Sample questions bigrams: রচনা করেন | আল [OOV] | ভারতের পশ্চিমবঙ্গ | কী ? | স্ত্রীর নাম | জে. কে. | [EOS] [SOS] | অস্ট্রেলীয় আন্তর্জাতিক | মায়ের নাম | দেশ জয়ী
Perplexity for documents: 34.67
Perplex

Updated last cell for Bigram

In [None]:
k =1
languages = ['arabic', 'bengali', 'indonesian']

for language in languages:
    # Training
    sentences_doc_train = df_train_filter[df_train_filter['language'] == language]['tokens_document_endsentence']
    sentences_ques_train = df_train_filter[df_train_filter['language'] == language]['tokens_question_endsentence']

    flat_doc_tokens_train = [item for sublist in sentences_doc_train for item in sublist]
    flat_ques_tokens_train = [item for sublist in sentences_ques_train for item in sublist]

    counts_documents_train = function_count_words(flat_doc_tokens_train)
    counts_questions_train = function_count_words(flat_ques_tokens_train)

    updated_vocab_docs_train = replace_OOV(counts_documents_train)
    updated_vocab_ques_train = replace_OOV(counts_questions_train)

    bigram_counts_docs = function_count_bigrams(flat_doc_tokens_train, updated_vocab_docs_train)
    bigram_counts_ques = function_count_bigrams(flat_ques_tokens_train, updated_vocab_ques_train)

    bigram_distribution_docs = model_bigram(bigram_counts_docs, updated_vocab_docs_train)
    bigram_distribution_ques = model_bigram(bigram_counts_ques, updated_vocab_ques_train)

    # Example of a sample bigram model
    sample_doc_bigram_size_10 = []
    sample_ques_bigram_size_10 = []
    for i in range(0, 10):
       # Normalize the probabilities
        doc_probs = np.array(list(bigram_distribution_docs.values()))
        doc_probs /= doc_probs.sum()

        ques_probs = np.array(list(bigram_distribution_ques.values()))
        ques_probs /= ques_probs.sum()

        sample_doc_bigram_i = np.random.choice([' '.join(x) for x in bigram_distribution_docs.keys()],
                           p=doc_probs)
        sample_ques_bigram_i = np.random.choice([' '.join(x) for x in bigram_distribution_ques.keys()],
                            p=ques_probs)

        sample_doc_bigram_size_10.append(sample_doc_bigram_i)
        sample_ques_bigram_size_10.append(sample_ques_bigram_i)



    # Validation
    sentences_doc_val = df_val_filter[df_val_filter['language'] == language]['tokens_document_endsentence']
    sentences_ques_val = df_val_filter[df_val_filter['language'] == language]['tokens_question_endsentence']

    flat_doc_tokens_val = [item for sublist in sentences_doc_val for item in sublist]
    flat_ques_tokens_val = [item for sublist in sentences_ques_val for item in sublist]

    # Convert unseen unigrams to [OOV]
    updated_vocab_docs_val = [word if word in updated_vocab_docs_train.keys() else '[OOV]' for word in flat_doc_tokens_val]
    updated_vocab_ques_val = [word if word in updated_vocab_ques_train.keys() else '[OOV]' for word in flat_ques_tokens_val]

    # Define the vocabulary size including the OOV token
    V_docs = len(updated_vocab_docs_train) + 1  # Adding 1 for the OOV token
    V_ques = len(updated_vocab_ques_train) + 1

    # Convert unseen bigrams to use [OOV] or take unigram probabilities if one of the words in the bigram is unseen
    log_prob_docs_val = [np.log(updated_vocab_docs_train.get(flat_doc_tokens_val[0], updated_vocab_docs_train['[OOV]']) + k) - np.log(sum(updated_vocab_docs_train.values()) + k*V_docs)]

    # Then proceed with the rest of the tokens
    for i in range(1, len(flat_doc_tokens_val)):
        bigram = (updated_vocab_docs_val[i-1], updated_vocab_docs_val[i])
        if bigram in bigram_distribution_docs:
           log_prob_docs_val.append(np.log(bigram_distribution_docs[bigram]))
        else:
           # If bigram not seen, fall back to Laplace-smoothed unigram probability of the second word
           log_prob_docs_val.append(np.log(updated_vocab_docs_train.get(updated_vocab_docs_val[i], k) + k) - np.log(sum(updated_vocab_docs_train.values()) + k*V_docs))

    # Do the same for ques
    log_prob_ques_val = [np.log(updated_vocab_ques_train.get(flat_ques_tokens_val[0], updated_vocab_ques_train['[OOV]']) + k) - np.log(sum(updated_vocab_ques_train.values()) + k*V_ques)]

    # Then proceed with the rest of the tokens
    for i in range(1, len(flat_ques_tokens_val)):
        bigram = (updated_vocab_ques_val[i-1], updated_vocab_ques_val[i])
        if bigram in bigram_distribution_ques:
           log_prob_ques_val.append(np.log(bigram_distribution_ques[bigram]))
        else:
           # If bigram not seen, fall back to Laplace-smoothed unigram probability of the second word
           log_prob_ques_val.append(np.log(updated_vocab_ques_train.get(updated_vocab_ques_val[i], k) + k) - np.log(sum(updated_vocab_ques_train.values()) + k*V_ques))

    # Compute perplexity based on the log bigram probabilities
    perplexity_docs_val = np.exp(-sum(log_prob_docs_val) / (len(flat_doc_tokens_val) - 1))
    perplexity_ques_val = np.exp(-sum(log_prob_ques_val) / (len(flat_ques_tokens_val) - 1))

    print("#"*50)
    print(language)
    print("#"*50)
    print(f'Sample documents bigrams:',' | '.join(sample_doc_bigram_size_10))
    print(f'Sample questions bigrams:',' | '.join(sample_ques_bigram_size_10))
    print('Perplexity for documents:', np.round(perplexity_docs_val, 2))
    print('Perplexity for questions:', np.round(perplexity_ques_val, 2))

    print("-"*50)

##################################################
arabic
##################################################
Sample documents bigrams: هيغز ، | تحليل لتاريخ | من بدء | إبراهيموفيتش عدة | الجزيرة الأيبيرية، | المعدن الملكي | اختبار نسبة | العليا للإمبراطورية | عام 2005 | يصل إلى
Sample questions bigrams: [EOS] [SOS] | عاصمة ألمانيا | في العراق؟ | الهيثم؟ [EOS] | هي الالعاب | استلم محمود | [OOV] [EOS] | العشرين ؟ | [EOS] [SOS] | [OOV] بيكاسو؟
Perplexity for documents: 2374.59
Perplexity for questions: 30.45
--------------------------------------------------
##################################################
bengali
##################################################
Sample documents bigrams: গ্রামে বেড়ে | শহরের জনসংখ্যা | নির্বাচিত হয়েছিলেন | - [OOV] | খলিফা রাসুলুল্লাহ | পৌত্র [OOV] | চাকরি নাই | আরএনএ ) | মার্চ [OOV] | চলচ্চিত্রে সুপারহিরো
Sample questions bigrams: ? [EOS] | কোন [OOV] | তৈরী হয় | অন্তর্গত ? | ? [EOS] | প্রথম [OOV] | করেছিলেন ? | মোট কয়টি | কাব্যগ্রন্থটি কত | [SOS] আলী