In [1]:

# imports
import os
import sys
import types
import json
import base64

# figure size/format
fig_width = 7
fig_height = 5
fig_format = 'retina'
fig_dpi = 96
interactivity = ''
is_shiny = False
is_dashboard = False
plotly_connected = True

# matplotlib defaults / format
try:
  import matplotlib.pyplot as plt
  plt.rcParams['figure.figsize'] = (fig_width, fig_height)
  plt.rcParams['figure.dpi'] = fig_dpi
  plt.rcParams['savefig.dpi'] = "figure"
  from IPython.display import set_matplotlib_formats
  set_matplotlib_formats(fig_format)
except Exception:
  pass

# plotly use connected mode
try:
  import plotly.io as pio
  if plotly_connected:
    pio.renderers.default = "notebook_connected"
  else:
    pio.renderers.default = "notebook"
  for template in pio.templates.keys():
    pio.templates[template].layout.margin = dict(t=30,r=0,b=0,l=0)
except Exception:
  pass

# disable itables paging for dashboards
if is_dashboard:
  try:
    from itables import options
    options.dom = 'fiBrtlp'
    options.maxBytes = 1024 * 1024
    options.language = dict(info = "Showing _TOTAL_ entries")
    options.classes = "display nowrap compact"
    options.paging = False
    options.searching = True
    options.ordering = True
    options.info = True
    options.lengthChange = False
    options.autoWidth = False
    options.responsive = True
    options.keys = True
    options.buttons = []
  except Exception:
    pass
  
  try:
    import altair as alt
    # By default, dashboards will have container sized
    # vega visualizations which allows them to flow reasonably
    theme_sentinel = '_quarto-dashboard-internal'
    def make_theme(name):
        nonTheme = alt.themes._plugins[name]    
        def patch_theme(*args, **kwargs):
            existingTheme = nonTheme()
            if 'height' not in existingTheme:
              existingTheme['height'] = 'container'
            if 'width' not in existingTheme:
              existingTheme['width'] = 'container'

            if 'config' not in existingTheme:
              existingTheme['config'] = dict()
            
            # Configure the default font sizes
            title_font_size = 15
            header_font_size = 13
            axis_font_size = 12
            legend_font_size = 12
            mark_font_size = 12
            tooltip = False

            config = existingTheme['config']

            # The Axis
            if 'axis' not in config:
              config['axis'] = dict()
            axis = config['axis']
            if 'labelFontSize' not in axis:
              axis['labelFontSize'] = axis_font_size
            if 'titleFontSize' not in axis:
              axis['titleFontSize'] = axis_font_size  

            # The legend
            if 'legend' not in config:
              config['legend'] = dict()
            legend = config['legend']
            if 'labelFontSize' not in legend:
              legend['labelFontSize'] = legend_font_size
            if 'titleFontSize' not in legend:
              legend['titleFontSize'] = legend_font_size  

            # The header
            if 'header' not in config:
              config['header'] = dict()
            header = config['header']
            if 'labelFontSize' not in header:
              header['labelFontSize'] = header_font_size
            if 'titleFontSize' not in header:
              header['titleFontSize'] = header_font_size    

            # Title
            if 'title' not in config:
              config['title'] = dict()
            title = config['title']
            if 'fontSize' not in title:
              title['fontSize'] = title_font_size

            # Marks
            if 'mark' not in config:
              config['mark'] = dict()
            mark = config['mark']
            if 'fontSize' not in mark:
              mark['fontSize'] = mark_font_size

            # Mark tooltips
            if tooltip and 'tooltip' not in mark:
              mark['tooltip'] = dict(content="encoding")

            return existingTheme
            
        return patch_theme

    # We can only do this once per session
    if theme_sentinel not in alt.themes.names():
      for name in alt.themes.names():
        alt.themes.register(name, make_theme(name))
      
      # register a sentinel theme so we only do this once
      alt.themes.register(theme_sentinel, make_theme('default'))
      alt.themes.enable('default')

  except Exception:
    pass

# enable pandas latex repr when targeting pdfs
try:
  import pandas as pd
  if fig_format == 'pdf':
    pd.set_option('display.latex.repr', True)
except Exception:
  pass

# interactivity
if interactivity:
  from IPython.core.interactiveshell import InteractiveShell
  InteractiveShell.ast_node_interactivity = interactivity

# NOTE: the kernel_deps code is repeated in the cleanup.py file
# (we can't easily share this code b/c of the way it is run).
# If you edit this code also edit the same code in cleanup.py!

# output kernel dependencies
kernel_deps = dict()
for module in list(sys.modules.values()):
  # Some modules play games with sys.modules (e.g. email/__init__.py
  # in the standard library), and occasionally this can cause strange
  # failures in getattr.  Just ignore anything that's not an ordinary
  # module.
  if not isinstance(module, types.ModuleType):
    continue
  path = getattr(module, "__file__", None)
  if not path:
    continue
  if path.endswith(".pyc") or path.endswith(".pyo"):
    path = path[:-1]
  if not os.path.exists(path):
    continue
  kernel_deps[path] = os.stat(path).st_mtime
print(json.dumps(kernel_deps))

# set run_path if requested
run_path = 'L2hvbWUvYWJoaWppdC9wcm9qZWN0cy9hYmhpaml0LWRhcmVrYXIvcG9zdHM='
if run_path:
  # hex-decode the path
  run_path = base64.b64decode(run_path.encode("utf-8")).decode("utf-8")
  os.chdir(run_path)

# reset state
%reset

# shiny
# Checking for shiny by using False directly because we're after the %reset. We don't want
# to set a variable that stays in global scope.
if False:
  try:
    import htmltools as _htmltools
    import ast as _ast

    _htmltools.html_dependency_render_mode = "json"

    # This decorator will be added to all function definitions
    def _display_if_has_repr_html(x):
      try:
        # IPython 7.14 preferred import
        from IPython.display import display, HTML
      except:
        from IPython.core.display import display, HTML

      if hasattr(x, '_repr_html_'):
        display(HTML(x._repr_html_()))
      return x

    # ideally we would undo the call to ast_transformers.append
    # at the end of this block whenver an error occurs, we do 
    # this for now as it will only be a problem if the user 
    # switches from shiny to not-shiny mode (and even then likely
    # won't matter)
    import builtins
    builtins._display_if_has_repr_html = _display_if_has_repr_html

    class _FunctionDefReprHtml(_ast.NodeTransformer):
      def visit_FunctionDef(self, node):
        node.decorator_list.insert(
          0,
          _ast.Name(id="_display_if_has_repr_html", ctx=_ast.Load())
        )
        return node

      def visit_AsyncFunctionDef(self, node):
        node.decorator_list.insert(
          0,
          _ast.Name(id="_display_if_has_repr_html", ctx=_ast.Load())
        )
        return node

    ip = get_ipython()
    ip.ast_transformers.append(_FunctionDefReprHtml())

  except:
    pass

def ojs_define(**kwargs):
  import json
  try:
    # IPython 7.14 preferred import
    from IPython.display import display, HTML
  except:
    from IPython.core.display import display, HTML

  # do some minor magic for convenience when handling pandas
  # dataframes
  def convert(v):
    try:
      import pandas as pd
    except ModuleNotFoundError: # don't do the magic when pandas is not available
      return v
    if type(v) == pd.Series:
      v = pd.DataFrame(v)
    if type(v) == pd.DataFrame:
      j = json.loads(v.T.to_json(orient='split'))
      return dict((k,v) for (k,v) in zip(j["index"], j["data"]))
    else:
      return v

  v = dict(contents=list(dict(name=key, value=convert(value)) for (key, value) in kwargs.items()))
  display(HTML('<script type="ojs-define">' + json.dumps(v) + '</script>'), metadata=dict(ojs_define = True))
globals()["ojs_define"] = ojs_define
# globals()["__spec__"] = None



In [2]:
import string
from nltk.util import ngrams
from collections import OrderedDict, defaultdict, namedtuple
from datetime import datetime
from tqdm import tqdm

In [3]:
def loadCorpus(file_path, bi_dict, tri_dict, quad_dict, vocab_dict):
    token = []
    word_len = 0

    with open(file_path,'r') as file:
        lines  = [ x.strip() for x in file.readlines()]
    lines = ['<start> '+x+' <end>' for x in lines]
    for line in lines:
        temp_l = line.split()
        # print(temp_l)
        i = 0
        j = 0
        
        for word in temp_l :
            j = 0
            for l in word :
                if l in '!"#$%&\'()*+,-./:;=?@[\\]^_`{|}~':
                    if l == "'":
                        if j+1<len(word) and word[j+1] == 's':
                            j = j + 1
                            continue
                    word = word.replace(l," ")
                    #print(j,word[j])
                j += 1

            temp_l[i] = word.lower()
            i=i+1   

        content = " ".join(temp_l)

        token = content.split()
        word_len = word_len + len(token)  

        if not token:
            continue

        temp0 = list(ngrams(token,2))
       
        temp1 = list(ngrams(token,3))

        for word in token:
            if word not in vocab_dict:
                vocab_dict[word] = 1
            else:
                vocab_dict[word]+= 1
                
        temp2 = list(ngrams(token,4))

        for t in temp0:
            sen = ' '.join(t)
            bi_dict[sen] += 1

        for t in temp1:
            sen = ' '.join(t)
            tri_dict[sen] += 1

        for t in temp2:
            sen = ' '.join(t)
            quad_dict[sen] += 1

        n = len(token)
           
    return word_len

In [4]:
def createKNDict(ngram_dict, n):

    i = 0
    d = 0.75

    first_dict = {}
    
    sec_dict = {}
    
    for key in ngram_dict:
        
        ngram_token = key.split()
       
        n_1gram_sen = ' '.join(ngram_token[:n-1])
         
        if n_1gram_sen not in sec_dict:
            sec_dict[ n_1gram_sen ] = 1
        else:
            sec_dict[ n_1gram_sen ] += 1
            
        if ngram_token[-1] not in first_dict:
            first_dict[ ngram_token[-1] ] = 1
        else:
            first_dict[ ngram_token[-1] ] += 1
    
    return first_dict, sec_dict

In [5]:
def computeKnesserNeyProbablity(vocab_dict, ngram_dicts, prob_dict):
    d = 0.75
    interpolation = 0.4  # Adjust as needed

    for order in range(2, len(ngram_dicts) + 2):
        current_dict = ngram_dicts[order - 2]

        first_dict, sec_dict = createKNDict(current_dict, order)

        for ngram in tqdm(current_dict):
            ngram_tokens = ngram.split()
            prefix = ' '.join(ngram_tokens[:-1])

            prob1 = max(current_dict[ngram] - d, 0) / sec_dict[prefix] if prefix in sec_dict else 0
            prob2 = d / sec_dict[prefix] * (first_dict[ngram_tokens[-1]] if ngram_tokens[-1] in first_dict else 0)

            for i in range(order - 2, 0, -1):
                ngram_prefix = ' '.join(ngram_tokens[i:-1])
                prob2 *= d / len(ngram_dicts[i - 1]) * (sec_dict[ngram_prefix] if ngram_prefix in sec_dict else 0)

            prob_dict[prefix] = prob_dict.get(prefix, [])
            prob_dict[prefix].append([(1 - interpolation) * (prob1 + prob2) + interpolation * vocab_dict[ngram_tokens[-1]] / sum(vocab_dict.values()), ngram_tokens[-1]])

    print("Completed")

In [6]:
def sortProbWordDict(prob_dict):
    for key in prob_dict:
        if len(prob_dict[key])>0:
            prob_dict[key] = sorted(prob_dict[key],reverse = True)[:2]


def removePunctuations(sen):
    """
    Funtion to remove punctuations from the given input sentence and covert them to lowercase.
    arg: string
    returns: string
    """
    temp_l = sen.split()
    i = 0
    j = 0
    
    for word in temp_l :
        j = 0
        #print(len(word))
        for l in word :
            if l in string.punctuation:
                if l == "'":
                    if j+1<len(word) and word[j+1] == 's':
                        j = j + 1
                        continue
                word = word.replace(l," ")
            j += 1

        temp_l[i] = word.lower()
        i=i+1   
    content = " ".join(temp_l)
    return content
def doPrediction(sen, prob_dict):
    if sen in prob_dict:
        return prob_dict[sen]
    else:
        return ""

In [7]:
first = True
if first:
    bi_dict = defaultdict(int)
    tri_dict = defaultdict(int)            
    quad_dict = defaultdict(int)   
    vocab_dict = defaultdict(int)       
    prob_dict = OrderedDict()         

    quad_dict = defaultdict(int)   

    token_len = loadCorpus("last.txt",bi_dict,tri_dict,quad_dict,vocab_dict)

    computeKnesserNeyProbablity(vocab_dict, [bi_dict, tri_dict, quad_dict] ,prob_dict )
    sortProbWordDict(prob_dict)
    first = False
    
def get_words(text):
    inp_time = datetime.now()
    if text.split() == [] and len(text.split())>0:
        print("Input Text Found to be Empty.")
    text = removePunctuations(text)
    text = "<start> "+text
    if len(text.split())>3:
        text = text.split()
        text = " ".join(text[-3:])
        
    final_words = doPrediction(text.lower(),prob_dict)

    print('Word Prediction:',final_words)
    inp_proc_time = datetime.now()
    print('----------------------------Prediction Time :',inp_proc_time-inp_time)

  0%|          | 0/9845 [00:00<?, ?it/s]

 60%|█████▉    | 5887/9845 [00:00<00:00, 58866.27it/s]

100%|██████████| 9845/9845 [00:00<00:00, 56205.01it/s]




  0%|          | 0/19829 [00:00<?, ?it/s]

 22%|██▏       | 4443/19829 [00:00<00:00, 44420.09it/s]

 45%|████▌     | 9021/19829 [00:00<00:00, 45203.89it/s]

 72%|███████▏  | 14368/19829 [00:00<00:00, 48968.84it/s]

 97%|█████████▋| 19265/19829 [00:00<00:00, 47590.79it/s]

100%|██████████| 19829/19829 [00:00<00:00, 47156.27it/s]




  0%|          | 0/23911 [00:00<?, ?it/s]

 19%|█▉        | 4536/23911 [00:00<00:00, 45354.96it/s]

 40%|████      | 9571/23911 [00:00<00:00, 48291.62it/s]

 61%|██████▏   | 14654/23911 [00:00<00:00, 49449.27it/s]

 83%|████████▎ | 19747/23911 [00:00<00:00, 50032.81it/s]

100%|██████████| 23911/23911 [00:00<00:00, 49565.92it/s]




Completed


In [8]:
get_words("Hi")

Word Prediction: [[0.9761837911670966, 'how'], [0.6773237382168934, 'what']]
----------------------------Prediction Time : 0:00:00.000336


In [9]:
get_words("good Morning")

Word Prediction: [[0.15004384411729987, 'team']]
----------------------------Prediction Time : 0:00:00.000279


In [10]:
get_words("What are")

Word Prediction: [[1.719622097435119, 'you'], [0.5116390954621338, 'we']]
----------------------------Prediction Time : 0:00:00.000390
