In [2]:
import re
import os
import sys
import numpy as np
import time
import math
from datetime import datetime

In [19]:
class DictionaryFunctions:
  def __init__(self) -> None:
    pass
  def addVal_to_dictKey(self, D, k, val=1): # Inputs: Dictionary, Key, Value; Ouput: Dictionary (Updated)
    if k in D:
      if type(val)==int:
        D[k] += val
      elif (type(val)==str):
        D[k].append(val)
    else:
      if type(val)==int:
        D[k] = val
      elif (type(val)==str):
        D[k] = [val]
    return D

  def find_probs_of_dict(self, D):
    total = 0
    for key in D.keys():
      total += D[key]
    for key in D.keys():
      D[key] /= total
    return D 

  def get_dict_from_list(self, L):
    D = {}
    for char in L:
      if char in D:
        D[char] += 1
      else:
        D[char] = 1
    return D

  def max_of_dict(self, D):  
    maxProb = 0
    maxKey = ''
    for key, probVal in D.items():
      if probVal>maxProb:
        maxProb = probVal
        maxKey = key
    return (maxKey, maxProb)

In [20]:
class Language_Model:
  def __init__(self, n, lang="") -> None:
    self.n = n
    self.lang = lang
    self.DF = DictionaryFunctions()
    self.ASCII_Chars = ' !"\'(),-.0123456789:;?abcdefghijklmnopqrstuvwxyz'
    self.sentence_EndChars = '.!?'
    self.ngrams = {}
    self.ngrams_nextChars = {}
    self._ngrams_List = []
    self._ngrams_nextChars_List = []
    self._ngrams_possChars = {}
    self.startPad = ['']
    self.endPad = ['']
    self._ngrams_Available = []
    self._ngrams_nextChars_Available = []
    self._ngrams_notAvailable = []
    self._ngrams_nextChars_notAvailable = []
    self.train_chars_len = None
    self.train_char_prob = None
  
  def _char_prob(self, chars_List):
    print(chars_List)
    char_prob = self.DF.get_dict_from_list(chars_List)
    char_prob = self.DF.find_probs_of_dict(char_prob)
    return char_prob

  def _get_ngrams(self, text):
    n = self.n
    chars_tokens = (n)*self.startPad + text + (n)*self.endPad
    ngrams_nextChars = [(tuple(chars_tokens[i:i+n]),chars_tokens[i+n]) for i in range(len(chars_tokens)-n)]
    return ngrams_nextChars

  def clean_data(self, dataset: str) -> list:  
    ASCII_Chars = self.ASCII_Chars
    all_characters_in_data = re.findall(r"[%s]"%ASCII_Chars, dataset)
    # data_clean = "".join(all_characters_in_data)
    self.train_chars_len = len(all_characters_in_data)
    return all_characters_in_data
    
  def get_Sentences_from_Text(self, text):
    if type(text)==list:
      all_characters_in_data = text
    elif type(text) == str:
      all_characters_in_data = list(text)
    else:
      return "Please pass input either 'List_of_Characters' or 'Text_String'."
    endChars = self.sentence_EndChars # endChars=".!?" ; where '.' -> (period), '!' -> (exclamation mark) and '?' -> (question mark)
    sentences = []
    sentence = []
    for c in all_characters_in_data:
      if c not in endChars:
        sentence.append(c)
      else:
        sentence.append(c)
        sentences.append(sentence)
        sentence = []
    return sentences
  
  def store_ngrams(self, sentences):
    for sent in sentences:
      self.train_char_prob = self._char_prob(sent)
      ngrams_nextChars = self._get_ngrams(sent)

      for ngram_nextChar in ngrams_nextChars: # ((ngram, nextchar), )
        self._ngrams_nextChars_List.append(ngram_nextChar)
        self._ngrams_List.append(ngram_nextChar[0])
        self._ngrams_possChars = self.DF.addVal_to_dictKey(self._ngrams_possChars, ngram_nextChar[0], str(ngram_nextChar[1]))

    self.ngrams_nextChars = self.DF.get_dict_from_list(self._ngrams_nextChars_List)
    self.ngrams = self.DF.get_dict_from_list(self._ngrams_List)
    
    # conProb((ngram1, 'c')) = ngrams_nextChars[(ngram1, 'c')]/ngrams[ngram1]
    # "ngrams_nextChars are stored successfully"

    return "Success: ngrams & ngrams_nextChars are stored successfully"
  
  def fit(self, data):
    all_characters_in_data = self.clean_data(data)
    # sentences = self.get_Sentences_from_Text(all_characters_in_data)
    statusMessage = self.store_ngrams([all_characters_in_data])
    print("==== TRAINING IS COMPLETED ====")
    print(statusMessage)
  
  def get_prob(self, ngram, char):
    if ngram in self.ngrams:
      A = self.ngrams[ngram]
      self._ngrams_Available.append(ngram)
    else:
      A = 0
      self._ngrams_notAvailable.append(ngram)
    
    ngram_nextChar = tuple((ngram, char))
    if ngram_nextChar in self.ngrams_nextChars:
      B = self.ngrams_nextChars[ngram_nextChar]
      self._ngrams_nextChars_Available.append(ngram_nextChar)
    else:
      B = 0
      self._ngrams_nextChars_notAvailable.append(ngram_nextChar)
    
    #if A!=0:
    #  result = float(B/A)
    #else:
    #  result = 0
    result = float((B+1)/(A+len(ngram))) 
    return result
  
  def evaluate(self, text: str):
    text = self.clean_data(text)
    self._ngrams_Available = []
    self._ngrams_nextChars_Available = []
    self._ngrams_notAvailable = []
    self._ngrams_nextChars_notAvailable = []
    
    total_log2loss = 0
    ngram = self.n * self.startPad
    inputList = list(text)
    for char in text:
      result = self.get_prob(tuple(ngram), char)
      if result != 0:
        total_log2loss -= np.log2(result)
      
      ngram = ngram[1:]+[char]
    return total_log2loss/len(inputList)
  
  def evaluation_Status(self):
    print(f"Not available 'ngrams' are : {'-'*10}")
    print(self._ngrams_notAvailable)
    print(f"Not available 'ngrams_nextChars' are : {'-'*10}")
    print(self._ngrams_nextChars_notAvailable)

In [21]:
def load_data(filePath):
  data_cwe = open(filePath, 'r').read().lower()
  return data_cwe

In [22]:
filePath = "C:/Users/14086/Downloads/train-04/sw-train.txt"
train_data = load_data(filePath)

In [23]:
filePath = "C:/Users/14086/Downloads/test04/sw-test.txt"
test_data = load_data(filePath)

In [24]:
n=15
LM_15 = Language_Model(n)
LM_15.fit(train_data)
print(f"Training Log2Loss : {LM_15.evaluate(train_data)}") 

#print("_ngrams_Available : ", len(LM_15._ngrams_Available))
#print("_ngrams_nextChars_Available : ", len(LM_15._ngrams_nextChars_Available))

#print("_ngrams_notAvailable : ", len(LM_15._ngrams_notAvailable))
#print("_ngrams_nextChars_notAvailable : ", len(LM_15._ngrams_nextChars_notAvailable))

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



==== TRAINING IS COMPLETED ====
Success: ngrams & ngrams_nextChars are stored successfully
Training Log2Loss : 2.7039773194184282


In [25]:
print(f"Testing Log2Loss : {LM_15.evaluate(test_data)}") 
"""
print("_ngrams_Available : ", len(LM_15._ngrams_Available))
print("_ngrams_nextChars_Available : ", len(LM_15._ngrams_nextChars_Available))

print("_ngrams_notAvailable : ", len(LM_15._ngrams_notAvailable))
print("_ngrams_nextChars_notAvailable : ", len(LM_15._ngrams_nextChars_notAvailable))"""

Testing Log2Loss : 3.4341225967230136


'\nprint("_ngrams_Available : ", len(LM_15._ngrams_Available))\nprint("_ngrams_nextChars_Available : ", len(LM_15._ngrams_nextChars_Available))\n\nprint("_ngrams_notAvailable : ", len(LM_15._ngrams_notAvailable))\nprint("_ngrams_nextChars_notAvailable : ", len(LM_15._ngrams_nextChars_notAvailable))'