<a href="https://colab.research.google.com/github/anjali-rgpt/Autocomplete/blob/master/Matrix_Based_N_gram_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# import the necessary libraries

import math
import numpy as np
import requests
import time

In [None]:
# download and read the shakespeare dataset

path = 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt'
# Bound the wait and fail loudly on an HTTP error status; without the check an
# error page body would silently be used as the training corpus.
r = requests.get(path, timeout=30)
r.raise_for_status()
text = r.text

In [None]:
# generate n arrays of frequency matrices

def matrix_builder(text, window):
  """Build one next-character frequency matrix per context length.

  Layer ``k`` (``0 <= k < window``) counts, for every run of ``k + 1``
  consecutive characters in ``text``, how often each character follows it.

  Args:
    text: training corpus as a string.
    window: number of layers to build, i.e. the N of the N-gram model.

  Returns:
    A 1-D numpy object array of length ``window``. Element ``k`` is a float
    matrix with one row per distinct (k+1)-gram (numbered by the encoder that
    ``n_look(text, k)`` returns) and one column per distinct character
    (numbered by ``enumerate(set(text))``).
  """
  print("Generating matrices for text: ", text[:10], '...')

  print("Building target encoder...")
  # Column index for every distinct character. The numbering is not returned,
  # so it is kept as plain enumerate(set(text)) to stay reproducible from the
  # corpus alone.
  target_encoder = {char: i for i, char in enumerate(set(text))}

  print("Building matrices...")                # create N layers
  layers = []
  for layer in range(window):
    # encoder maps each n-gram to its row; ngrams[i] is text[i:i+layer+1]
    encoder, _, ngrams = n_look(text, layer, True)
    counts = np.zeros((len(encoder), len(target_encoder)))

    for start, ngram in enumerate(ngrams):
      # the character immediately after the n-gram is the prediction target
      target = target_encoder[text[start + layer + 1]]
      counts[encoder[ngram], target] += 1

    layers.append(counts)
    print("Built layer", layer)

  # The layers generally have different row counts, so they cannot form a
  # regular ndarray; np.asarray on such a ragged list raises ValueError on
  # NumPy >= 1.24. Fill an explicit 1-D object array instead, which callers
  # index exactly the same way (matrix[layer]).
  matrix = np.empty(len(layers), dtype=object)
  for layer, counts in enumerate(layers):
    matrix[layer] = counts

  print("Generation complete!\n")
  return matrix





In [None]:
# create encoder, decoder for nth layer
def n_look(text, layer, flag=False):
  """Index the (layer + 1)-character n-grams of ``text``.

  N-grams are taken at every start position in ``range(len(text) - layer - 1)``,
  so the last full-length window of ``text`` is not included.

  Args:
    text: corpus string to slice.
    layer: zero-based layer number; n-grams are ``layer + 1`` characters long.
    flag: when true, also return the n-gram list in corpus order.

  Returns:
    ``(encoder, decoder)`` or, with ``flag``, ``(encoder, decoder, dlist)``.
    ``encoder`` maps n-gram -> index, ``decoder`` maps index -> n-gram, and
    ``dlist`` holds the n-gram found at each start position.
  """
  width = layer + 1
  dlist = [text[start:start + width] for start in range(len(text) - width)]

  # Number the distinct n-grams in set-iteration order.
  vocab = set(dlist)
  encoder = {gram: idx for idx, gram in enumerate(vocab)}
  decoder = {idx: gram for idx, gram in enumerate(vocab)}

  if not flag:
    return encoder, decoder
  return encoder, decoder, dlist


In [None]:
# test nth layer
def n_test(matrix, seed, layer=0, length=50):
  """Greedily generate text using a single layer of the model.

  Reads the module-level ``text`` to rebuild the n-gram encoder (via
  ``n_look``) and the column decoder, so ``matrix`` must have been built from
  that same corpus.

  Args:
    matrix: per-layer frequency matrices, indexed as ``matrix[layer]``.
    seed: starting string; only its last ``layer + 1`` characters are used.
    layer: zero-based layer to sample from.
    length: number of generation steps. The first step emits the seed window
      itself, every later step emits one predicted character.

  Returns:
    The seed window followed by up to ``length - 1`` predicted characters
    ('' when ``length`` is not positive). Generation stops early if the
    current window never occurs as a context in ``text``.

  Raises:
    KeyError: if the seed window never occurs as a context in ``text``.
  """
  frame = matrix[layer]
  width = layer + 1
  encoder, _ = n_look(text, layer)
  # Columns of `frame` are character indices (enumerate(set(text)) in
  # matrix_builder), not n-gram indices, so they need their own decoder.
  target_decoder = dict(enumerate(set(text)))

  context = seed[-width:]
  row = encoder[context]          # KeyError here means the seed is unknown
  if length <= 0:
    return ''

  gen = context
  for _ in range(length - 1):
    gen += target_decoder[int(frame[row].argmax())]
    row = encoder.get(gen[-width:])   # move the window
    if row is None:
      # window not seen as a context during training: nothing to predict from
      break
  return gen


In [None]:
def analyse(model, window):
  """Print how often each layer's most frequent successor matches the corpus.

  For every layer and every position in the module-level ``text``, the
  context of ``layer + 1`` characters is looked up and the character with
  the highest count in that row is compared with the character that actually
  follows the context.

  Args:
    model: per-layer frequency matrices, indexed as ``model[layer]``; must
      have been built from the module-level ``text``.
    window: number of layers to evaluate.

  Returns:
    None. Per-layer hit counts, the overall accuracy and the elapsed time are
    printed.
  """
  print('Analysing model...')
  start = time.time()

  # Columns of each matrix are character indices numbered by
  # enumerate(set(text)) in matrix_builder; rebuild that mapping to decode them.
  target_decoder = dict(enumerate(set(text)))

  # number of correct predictions per layer
  weights = [0] * window
  total = 0   # predictions made across all layers

  for layer in range(window):
    print("Working on layer:", layer)
    frame = model[layer]
    encoder, _ = n_look(text, layer)

    # Most frequent successor for every context row, computed once per layer
    # instead of once per corpus position.
    if frame.size:
      best = [target_decoder[col] for col in frame.argmax(axis=1).tolist()]
    else:
      best = []

    for ptr in range(len(text) - layer - 1):
      given = text[ptr:ptr + layer + 1]
      total += 1
      # the target is the character right after the context window
      if best[encoder[given]] == text[ptr + layer + 1]:
        weights[layer] += 1

  correct = sum(weights)
  print("Final weights:", weights)
  print(correct, 'correct out of', total, 'predictions.')
  # guard against an empty corpus or window == 0
  accuracy = (correct / total) * 100 if total else 0.0
  print('Accuracy:', accuracy, '%')
  end = time.time()
  print('Time taken:', end - start)

In [None]:
# cycle model text gen

def generate(matrix, window, gen, length=100):
  """Extend a seed string by cycling through the layers of the model.

  Step ``i`` uses layer ``i % window``: the last ``layer + 1`` characters of
  the text generated so far are looked up and the most frequent successor is
  appended. A layer whose context is unknown is skipped; once every layer has
  been skipped in a row, generation gives up.

  Reads the module-level ``text`` to rebuild the lookups, so ``matrix`` must
  have been built from that same corpus.

  Args:
    matrix: per-layer frequency matrices, indexed as ``matrix[layer]``.
    window: number of layers to cycle through.
    gen: seed string to extend.
    length: generation continues while ``len(gen) <= length``.

  Returns:
    The extended string. On success it is ``length + 1`` characters long
    (or the unchanged seed if that was already longer); on failure it is
    whatever had been generated when every layer failed.
  """
  print('\nGenerating with seed:', gen)
  # for w layers, create lookups
  encoders = {layer: n_look(text, layer)[0] for layer in range(window)}
  # Matrix columns are character indices numbered by enumerate(set(text)) in
  # matrix_builder, so predictions are decoded with that mapping rather than
  # with the n-gram decoder.
  target_decoder = dict(enumerate(set(text)))

  # generation
  step = 0
  errors = 0   # consecutive layers that could not predict
  while len(gen) <= length and errors < window:
    layer = step % window
    step += 1
    row = encoders[layer].get(gen[-(layer + 1):])
    if row is None:
      # context unseen in training (or gen shorter than this layer's window)
      errors += 1
      continue
    gen += target_decoder[int(matrix[layer][row].argmax())]
    errors = 0

  # The loop ends either because gen is long enough or because every layer
  # failed in a row; only the latter leaves gen short.
  if len(gen) <= length:
    print('Model failed')
  return gen

In [None]:
def driver(text, window):
  """Train, evaluate and sample from the model in one call.

  Builds the frequency matrices from ``text``, prints the accuracy report,
  then prints text generated from the first ``window`` characters of ``text``.

  Args:
    text: training corpus as a string.
    window: number of layers (N-gram sizes 1..window) to build and use.
  """
  seed = text[:window]
  trained = matrix_builder(text, window)
  analyse(trained, window)
  print(generate(trained, window, seed))

In [None]:
# train, evaluate and sample from the model on the downloaded corpus

data = text
window = 5  # number of layers to build; contexts range from 1 to 5 characters

driver(text=data, window=window)

Generating matrices for text:  First Citi ...
Building target encoder...
Building matrices...
Built layer 0
Built layer 1
Built layer 2
Built layer 3
Built layer 4
Generation complete!

Analysing model...
Working on layer: 0
Working on layer: 1
Working on layer: 2
Working on layer: 3
Working on layer: 4
Final weights: [75235, 0, 0, 0, 0]
75235 correct out of 1115389 predictions.
Accuracy: 6.745180381015054 %
Time taken: 10.380549192428589

Generating with seed: First
FirsthPHeo?ub.
RWhoepollo?ub.
RWhoepollo?ub.
RWhoepollo?ub.
RWhoepollo?ub.
RWhoepollo?ub.
RWhoepollo?
