<a href="https://colab.research.google.com/github/anjali-rgpt/Autocomplete/blob/master/Data_Structure_Model_Generic_Trie.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# import libraries here

%tensorflow_version 2.x
import tensorflow as tf
import nltk
from nltk.tokenize import word_tokenize
from functools import reduce
import requests

In [2]:
# nltk setup
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
# import dataset here

r = requests.get('https://raw.githubusercontent.com/dwyl/english-words/master/words_alpha.txt')
text = r.text

In [4]:
# define a Trie class

class Trie:
    def __init__(self):
      self.children = {}
      self.flag = False   # Flag to represent that a word ends at this node
      self.count = 1
      self.word_list = set()

    def add(self, char):
      self.children[char] = Trie()

    def insert(self, word):
      node = self
      for char in word:
        if char not in node.children:
            node.add(char)
        else:
            node.count += 1
        node = node.children[char]
      node.flag = True

    def contains(self, word):
      node = self
      for char in word:
        if char not in node.children:
          return False
        node = node.children[char]
      return node.flag

    # Inefficient recursive implementation
    def search(self, node, word):
      if node.flag: 
        self.word_list.add(word) 
      for a, n in node.children.items(): 
        self.search(n, word + a) 

    def predict(self, prefix):
      node = self
      self.word_list = set()
      for char in prefix:
        if char not in node.children:
          return set()
        node = node.children[char]
      self.search(node, prefix)
      return self.word_list
    
    # GitHub implementation - please message with link to your original implementation, if you feel very strongly about this
    def all_suffixes(self, prefix):
      results = set()
      if self.flag:
        results.add(prefix)
      if not self.children: return results
      return reduce(lambda a, b: a | b, [node.all_suffixes(prefix + char) for (char, node) in self.children.items()]) | results

    def autocomplete(self, prefix):
      node = self
      for char in prefix:
        if char not in node.children:
          return set()
        node = node.children[char]
      return list(node.all_suffixes(prefix))

In [5]:
# function to generate a Trie from given text document

def TrieGenerator(text):
  #extracts tokens
  tokens=word_tokenize(text)

  #lowercases all words and drops tokens with non alphanumeric characters
  words=[w.lower() for w in tokens if w.isalnum()]

  #generate symbols list
  symbols=set(text)
  
  trie = Trie()
  for word in words:
    trie.insert(word)

  return trie

In [6]:
# generate Trie from dataset

# smolTrie = TrieGenerator("the quick brown fox jumps over the lazy dog")
bigTrie = TrieGenerator(text)

In [None]:
test = 'app'

print(bigTrie.predict(test))

{'apparelling', 'apprentices', 'appendices', 'appliableness', 'applosion', 'appeacher', 'appetized', 'appellees', 'appanage', 'appliant', 'apperceive', 'appraisers', 'apparentement', 'appropriateness', 'appropriates', 'appendages', 'apparitor', 'appetising', 'appetizing', 'apposite', 'apposer', 'appendicate', 'applauds', 'appendaged', 'appellor', 'appd', 'appetizers', 'appositely', 'apprehended', 'appels', 'appliably', 'apposing', 'appendicectasis', 'appalachian', 'approof', 'appertain', 'applicable', 'appendiculariidae', 'applauder', 'append', 'appositeness', 'apportions', 'appulses', 'appetitional', 'apprense', 'approbator', 'appearanced', 'approvement', 'appellants', 'appendixed', 'appliqued', 'approached', 'appetibleness', 'apprehensible', 'appeasable', 'appl', 'appeasably', 'appertaining', 'appreciatively', 'applauses', 'appropriation', 'appaloosa', 'appellatively', 'appellors', 'appalled', 'apprised', 'applyingly', 'appreteur', 'apparats', 'apprentice', 'applegrower', 'appendent'

In [7]:
import sys

def get_size(obj, seen=None):
    size = sys.getsizeof(obj)
    if seen is None:
        seen = set()
    obj_id = id(obj)
    if obj_id in seen:
        return 0
    seen.add(obj_id)
    if isinstance(obj, dict):
        size += sum([get_size(v, seen) for v in obj.values()])
        size += sum([get_size(k, seen) for k in obj.keys()])
    elif hasattr(obj, '__dict__'):
        size += get_size(obj.__dict__, seen)
    elif hasattr(obj, '__iter__') and not isinstance(obj, (str, bytes, bytearray)):
        size += sum([get_size(i, seen) for i in obj])

    return size

print(get_size(bigTrie), 'bytes')

651596674 bytes
