# Trie
## Use a prefix tree (one node per character, any number of children per node) to store characters. Can be used to quickly search for a word in the trie.
## Practical application: Autocomplete

In [11]:
class Node:
    """Singly linked list node; the list is used to store siblings.

    ``val`` holds this node's value and ``next`` references the following
    node, or ``None`` when this node is the tail.
    """

    def __init__(self, val: str):
        self.val = val
        self.next = None

    def push(self, val: str) -> None:
        """Append a new node holding ``val`` to the end of the list."""
        tail = self
        # Walk to the last node; ``is not None`` is the idiomatic identity test.
        while tail.next is not None:
            tail = tail.next
        tail.next = Node(val)

    def search(self, val: str):
        """Return the first node, starting at this one, whose value equals ``val``.

        Returns ``None`` when no node in the list holds ``val``.
        """
        node = self
        while node is not None:
            if node.val == val:
                return node
            node = node.next
        return None
            
        


In [134]:
# class Node:
#     # We'll use a linked list to store siblings
#     def __init__(self, val:str):
#         self.item = TrieNode(val)
#         self.next = None
        
#     def push(self, val:str) -> None:
#         head = self
#         while head.next != None:
#             head = head.next
#         head.next = Node(val)
                
#     def search(self, val:str):
#         head = self
#         while head:
#             if head.val == val:
#                 return head
#             else:
#                 head = head.next
#         return None
            

class TrieNode:
    """A node of a character trie; the root node also serves as the trie itself.

    Each node stores its own character in ``val`` (the root is created with
    ``None``), its children keyed by character in ``children``, and whether
    the path from the root to this node spells a complete inserted word in
    ``is_word``.
    """

    def __init__(self, val: "str | None") -> None:
        self.val = val
        self.children = {}  # maps a single character to its child TrieNode
        self.is_word = False

    def _descend(self, text: str) -> "TrieNode | None":
        """Follow ``text`` character by character from this node.

        Returns the node reached after the last character, or ``None`` if the
        path does not exist. An empty ``text`` returns this node.
        """
        node = self
        for ch in text:
            node = node.children.get(ch)
            if node is None:
                return None
        return node

    def insert(self, word: str) -> None:
        """Insert ``word``, creating missing nodes and marking the last one complete.

        Prints ``'Adding <char>'`` for every node that had to be created.
        """
        node = self
        for ch in word:
            if ch not in node.children:
                print('Adding ' + ch)
                node.children[ch] = TrieNode(ch)
            node = node.children[ch]
        # mark word as being complete
        node.is_word = True

    def find_word(self, word: str) -> bool:
        """Return ``True`` only if ``word`` was inserted as a complete word.

        Prints a diagnostic when the path is missing or when the path exists
        but is only a prefix of inserted words.
        """
        node = self._descend(word)
        if node is None:
            print('Not even close.')
            return False
        if not node.is_word:
            print('Exists in the trie, but not marked as a complete word')
        return node.is_word

    def dfs(self, node, prefix):
        """Collect every complete word in the subtree rooted at ``node``.

        ``prefix`` is the text spelled by the ancestors of ``node`` (it does
        NOT include ``node``'s own character). Words are returned depth-first,
        visiting children in insertion order.
        """
        # The root carries val=None; treat it as contributing no character so
        # that a traversal starting at the root does not raise TypeError.
        current = prefix + (node.val or '')
        result = [current] if node.is_word else []
        for child in node.children.values():
            result.extend(self.dfs(child, current))
        return result

    def autocomplete(self, stub: str) -> "list[str] | None":
        '''
        Given a stub, return a list of all potential autocompleted words (must be complete)

        Returns ``None`` (after printing a message) when no inserted word
        starts with ``stub``. An empty ``stub`` returns every inserted word.
        '''
        node = self._descend(stub)
        if node is None:
            print('stub does not exist.')
            return None

        # Retrieve all descendant words with a depth-first traversal. dfs()
        # re-appends the start node's own character, so pass the stub without
        # its last character as the prefix.
        return self.dfs(node, stub[:-1])
        
        
        

In [135]:
t = TrieNode(None)

In [136]:
t.insert('boast')

Adding b
Adding o
Adding a
Adding s
Adding t


In [137]:
t.insert('boo')

Adding o


In [138]:
t.insert('boom')

Adding m


In [139]:
t.insert('boomer')

Adding e
Adding r


In [140]:
t.insert('boast')

In [141]:
t.find_word('boo')

True

In [142]:
t.autocomplete('bo')

['boast', 'boo', 'boom', 'boomer']

### We can make a slight improvement to the autocomplete feature: we can rank-order the autocompletes using some counter dictionary.

# So that's pretty cool. But right now, autocomplete just gives us a laundry list of the items we've marked as being real words (via insert)

## To really make this interesting, can we build a SMART recommendation tool, that provides autocomplete suggestions based on the previous words in the current sentence?

Propose a simple n-gram model.
Load up the trie using the training corpus.
Build a multinomial naive Bayes classifier, where every complete word has some probability of being the word in question.
Provide the most likely class (i.e. complete word).
 What's the point of the trie?
 Right now, the trie only stores individual strings and whether or not they're complete words.
 Given a sentence and a partial word, a trie will give us all possible words that partial word could be, i.e. it defines the set of possible classes.
 We can then more narrowly define what the word should be.

### t = Trie()
### data = some corpus
### t.load(data)
### model = Pipeline(data -> ngram -> tfidf -> multinomialNB)
#### Model will be fed sentences. sentences will be broken down and processed. model will try to predict  what the n+1th word is.

In [143]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

In [144]:
import nltk

In [145]:
from nltk.corpus.reader.nps_chat import NPSChatCorpusReader

In [146]:
# NOTE(review): this binds the NPSChatCorpusReader class itself, not an
# instance, so calling its methods through `reader` has no `self`
# (the recorded `reader.fileids()` call fails with a TypeError for that reason).
reader = NPSChatCorpusReader

In [159]:
words = reader.words

In [165]:
reader.fileids()


TypeError: fileids() missing 1 required positional argument: 'self'

In [158]:
posts

<function nltk.corpus.reader.nps_chat.NPSChatCorpusReader.posts(self, fileids=None)>

In [166]:
model = MultinomialNB()

In [167]:
vectorizer = TfidfVectorizer(ngram_range=(1,3))

In [169]:
from nltk.corpus import brown

In [174]:
len(brown.sents())

57340

In [None]:
# Build a dataset from these corpora.
# The way to do this is to take random sentences and split each one at a random position into (preceding words, next word) samples.

In [201]:
from random import randint
def gen_samples_from_sentence(sent):
    """Draw random (context, next_word) training samples from one sentence.

    ``sent`` is a sequence of tokens. ``len(sent) // 2`` split positions are
    drawn uniformly from ``1 .. len(sent) - 1`` (repeats are possible); each
    yields a pair of the tokens before the position and the token at it.
    """
    sample_count = len(sent) // 2
    last_index = len(sent) - 1
    # One randint call per sample, in order, before each slice is taken.
    split_points = (randint(1, last_index) for _ in range(sample_count))
    return [(sent[:cut], sent[cut]) for cut in split_points]

In [204]:
gen_samples_from_sentence('this is the best'.split(' '))

[(['this', 'is'], 'the'), (['this'], 'is')]

In [205]:
brown.sents()[0]

['The',
 'Fulton',
 'County',
 'Grand',
 'Jury',
 'said',
 'Friday',
 'an',
 'investigation',
 'of',
 "Atlanta's",
 'recent',
 'primary',
 'election',
 'produced',
 '``',
 'no',
 'evidence',
 "''",
 'that',
 'any',
 'irregularities',
 'took',
 'place',
 '.']

In [None]:
# Build the dataset: one list of (context, next_word) samples per Brown sentence.
data = []
for sent in brown.sents():
    # Sample from the current sentence, not from the accumulator being built.
    data.append(gen_samples_from_sentence(sent))