<a href="https://colab.research.google.com/github/andbusch/n-gram-language-prediction/blob/main/N_gram_language_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [88]:
import collections
import numpy as np

In [89]:
!wget -q --show-progress --no-check-certificate 'https://docs.google.com/uc?export=download&id=1ghGrMxU03QxEBcqt292bcC6Ej-7Oso0h' -O czech.txt



In [90]:
!wget -q --show-progress --no-check-certificate 'https://docs.google.com/uc?export=download&id=1HYbaf-zECkGPJIq2su13AkLHBOapgLHh' -O english.txt



In [91]:
!wget -q --show-progress --no-check-certificate 'https://docs.google.com/uc?export=download&id=18mBhXOWPNX1WtTBFnZyFG_ayGFJIlsaS' -O japanese.txt



In [92]:
!wget -q --show-progress --no-check-certificate 'https://docs.google.com/uc?export=download&id=1WQyJlNtqEmMTwgLKcOVKuIRCha5JVLAk' -O german.txt



In [93]:
class N_Gram():
  """Character n-gram model of a single language.

  Every word is padded with '#' on both sides and cut into all windows of
  length ``n``. Training counts those windows; scoring sums the log relative
  frequency of each window of the word being scored.

  Attributes:
    lang: Language label passed to the constructor.
    bigram_dict: Maps an n-gram string to its training count. The name is
      historical; keys have length ``n``, which need not be 2.
    bigram_counter: Total number of n-gram occurrences counted so far.
    word_list: Every non-blank word read by ``train_data`` (only set once
      ``train_data`` has run). The first 60% are used for training.
  """

  def __init__(self, n, lang):
    # defaultdict(int) behaves like the previous `lambda: 0` factory but keeps
    # the model picklable.
    self.bigram_dict = collections.defaultdict(int)
    self.__n = n
    self.lang = lang
    self.bigram_counter = 0

  def _ngrams(self, wd):
    """Yield the length-n windows of ``wd`` after trimming and '#'-padding it."""
    # Trim first: a trailing newline would otherwise sit between the last
    # letter and the closing '#', so the end-of-word n-gram would never match.
    padded = '#' + wd.strip() + '#'
    for start in range(len(padded) - self.__n + 1):
      yield padded[start:start + self.__n]

  def train_data(self, filename):
    """Read one word per line from ``filename`` and train on the first 60%.

    All non-blank words (whitespace-trimmed) are kept in ``self.word_list``.

    Args:
      filename: Path of a text file with one word per line, read as UTF-8.
    """
    print("Training " + self.lang + " Model")
    # Explicit encoding: the default depends on the platform locale.
    with open(filename, encoding="utf-8") as f0:
      stripped = (line.strip() for line in f0)
      # Blank lines carry no word; training on them would only count "##".
      self.word_list = [wd for wd in stripped if wd]
    for wd in self.word_list[0:int(0.60 * len(self.word_list))]:
      self.process_word(wd)

  def process_word(self, wd):
    """Add the n-grams of ``wd`` to the counts."""
    for gram in self._ngrams(wd):
      self.bigram_dict[gram] += 1
      self.bigram_counter += 1

  def compute_probability(self, wd) -> float:
    """Return the log-likelihood of ``wd`` under this model.

    The result is the sum over the word's n-grams of
    ``log(count / bigram_counter)``; an n-gram that was never seen is scored
    as if it had been seen once. The value is therefore <= 0, not a
    probability in [0, 1].

    Args:
      wd: Word to score; surrounding whitespace is ignored.

    Returns:
      The summed log relative frequencies, or ``-inf`` when the model has not
      counted any n-gram yet (nothing to normalise by).
    """
    if self.bigram_counter == 0:
      return float("-inf")
    prob = 0.0
    for gram in self._ngrams(wd):
      # .get() instead of indexing: indexing a defaultdict would insert a zero
      # entry for every unseen n-gram and grow the model while scoring.
      count = self.bigram_dict.get(gram, 0)
      prob += np.log(max(count, 1) / self.bigram_counter)
    return prob




In [114]:
# Order of the character n-gram models (2 = bigrams). Read as a global by
# test_data (for the output file name) and by prob_word (as the window length),
# and passed to every N_Gram constructed further down.
bigram_size = 2

def test_data(langs, i):
    num_correct = 0
    total_counted = 0
    with open(f'predictions_{langs[i].lang}_{bigram_size}.txt', 'w') as file:
      for word in langs[i].word_list[int(0.60 * len(langs[i].word_list)):]:  # iterate through remaining 40% of word list
        predicted = [-1000.0, "language"]  # Re-initialize for each word
        for lang in langs:
          tot_prob = total_prob(word, lang, langs)
          if tot_prob > predicted[0]:
            predicted = [tot_prob, lang.lang]
          file.write(f"Word: {word} predicted to language {predicted[1]} with probability {predicted[0]}\n")
          if predicted[1] == langs[i].lang:
            num_correct+=1
          total_counted+=1
      file.write(f"Total Correct: {num_correct} out of {total_counted} success % = {num_correct / total_counted}")

def total_prob(word, lang, langs) -> float:
    """Return the log posterior ``log P(lang | word)`` under a uniform prior.

    Bayes' rule is applied in log space, because ``compute_probability``
    returns a log-likelihood:

        log P(lang | word) = log P(word | lang) + log P(lang) - log P(word)
        P(word) = sum over models in langs of P(word | model) * P(model)

    with ``P(model) = 1 / len(langs)``. The ranking of languages for a given
    word is the same as ranking by log-likelihood alone; normalising makes the
    score a real log-probability (<= 0 when ``lang`` is one of ``langs``).

    Args:
        word: Word to classify.
        lang: Model whose posterior is wanted; expected to be one of ``langs``.
        langs: All candidate models; each must provide
            ``compute_probability(word)`` returning a log-likelihood.

    Returns:
        The log posterior as a float; ``-inf`` when no model gives the word
        any probability mass.

    Raises:
        ValueError: If ``langs`` is empty.
    """
    if not langs:
        raise ValueError("langs must contain at least one language model")

    log_prior = -np.log(len(langs))
    log_joint = [model.compute_probability(word) + log_prior for model in langs]

    peak = max(log_joint)
    if peak == float("-inf"):
        return float("-inf")

    # Log-sum-exp with the maximum factored out, so long words with very small
    # likelihoods do not underflow to zero.
    log_evidence = peak + np.log(sum(np.exp(value - peak) for value in log_joint))
    return float(lang.compute_probability(word) + log_prior - log_evidence)

def prob_word(word, langs) -> float:
    """Return how often the n-grams of ``word`` occur across all models.

    The word is padded with '#' on both sides and cut into windows of length
    ``bigram_size`` (module-level). For each window the counts from every
    model's ``bigram_dict`` are added up, and the grand total is divided by
    the sum of the models' ``bigram_counter`` values.

    Note that each window contributes its full count, so the result is a
    relative-frequency score that can exceed 1 for words with several common
    n-grams; it is not a normalised probability.

    Args:
        word: Word to look up, used exactly as given (no trimming).
        langs: Models providing ``bigram_dict`` and ``bigram_counter``.

    Returns:
        The score as a float; ``0.0`` when the models hold no n-grams at all.
    """
    total_ngrams = sum(model.bigram_counter for model in langs)
    if total_ngrams == 0:
        return 0.0

    padded = '#' + word + '#'
    matches = 0
    for start in range(len(padded) - bigram_size + 1):
        gram = padded[start:start + bigram_size]
        # .get() so an unseen n-gram is never inserted into a model's counts.
        matches += sum(model.bigram_dict.get(gram, 0) for model in langs)

    return matches / total_ngrams


In [115]:
# One model per language, all of order `bigram_size`.
english, czech, german, japanese = (
    N_Gram(bigram_size, name)
    for name in ("english", "czech", "german", "japanese")
)

langs = [english, czech, german, japanese]

# Each model trains on "<language>.txt", in the same order as the list above.
for model in langs:
  model.train_data(f"{model.lang}.txt")

# Classify every language's held-out words against all of the models; each
# call writes its own predictions file.
for i in range(len(langs)):
  test_data(langs, i)

Training english Model
Training czech Model
Training german Model
Training japanese Model
