# Problem 9
Implement an autocomplete system. That is, given a query string s and a set of all possible query strings, return all strings in the set that have s as a prefix.

For example, given the query string `de` and the set of strings `[dog, deer, deal]`, return `[deer, deal]`.

Hint: Try preprocessing the dictionary into a more efficient data structure to speed up queries.

---
## Test Cases

In [7]:
# test cases
from english_words import get_english_words_set
# Load the lower-cased 'web2' word set and keep it as an alphabetically sorted list.
test_dict = sorted(get_english_words_set(['web2'], lower=True))
print(
    "Number of words in 'english_words' python library for test dictionary:",
    len(test_dict),
    "words",
)

Number of words in 'english_words' python library for test dictionary: 234450 words


---
## Solution

In [8]:
# create column database for word list
import pandas as pd
import numpy as np
import itertools
from tqdm.notebook import tqdm


def headers(n = 5):
    """Return every lowercase a-z string of length 1 through ``n``.

    These strings are the candidate prefixes (column names) for the
    prefix-column database. The result is ordered by length first and
    alphabetically within each length, so it is identical on every run.

    Args:
        n: Maximum prefix length. Values below 1 yield an empty list.

    Returns:
        A list of ``26 + 26**2 + ... + 26**n`` unique strings.
    """
    letters = 'abcdefghijklmnopqrstuvwxyz'
    columns = []
    # Generate each length directly instead of enumerating 27**n tuples with
    # a "blank" placeholder and de-duplicating the results afterwards.
    for length in range(1, n + 1):
        columns.extend(
            ''.join(combo) for combo in itertools.product(letters, repeat=length)
        )
    return columns

def column_df(word_list, header_list):
    """Build a prefix-column database from a word list.

    Each header that is a prefix of at least one word becomes a column
    holding those words in ``word_list`` order. Every column is padded with
    empty strings to a common length so the columns fit in one DataFrame.

    The words are bucketed in a single pass over ``word_list`` (one slice per
    distinct header length) rather than rescanning the whole list once per
    header, so no progress bar is needed.

    Args:
        word_list: Iterable of words (strings).
        header_list: Iterable of candidate prefixes (strings).

    Returns:
        A ``pandas.DataFrame`` with one column per matched header, in
        ``header_list`` order. It has at least 80000 rows, and more if some
        prefix matches more words than that. An empty DataFrame is returned
        when no header matches any word.
    """
    null_filler = 80000  # minimum number of rows every column is padded to

    # dicts keep insertion order, so columns come out in header_list order.
    buckets = {header: [] for header in header_list}
    prefix_lengths = sorted({len(header) for header in buckets})

    for word in word_list:
        for length in prefix_lengths:
            if length > len(word):
                # A word shorter than the header can never match it; without
                # this guard word[:length] would wrongly equal a shorter header.
                break
            bucket = buckets.get(word[:length])
            if bucket is not None:
                bucket.append(word)

    matched = {header: words for header, words in buckets.items() if words}
    if not matched:
        return pd.DataFrame()

    # Grow past null_filler when needed so oversized columns do not raise a
    # length-mismatch error.
    n_rows = max(null_filler, max(len(words) for words in matched.values()))
    padded = {
        header: words + [''] * (n_rows - len(words))
        for header, words in matched.items()
    }
    # Build the frame in one go; the padding is '' (never NaN), so there are
    # no all-NaN rows to drop afterwards.
    return pd.DataFrame(padded)

In [9]:
# Build the column database from the test dictionary, using every prefix of
# up to `max_autocomplete_word_length` letters as a candidate column.
max_autocomplete_word_length = 3
column_database = column_df(test_dict,headers(max_autocomplete_word_length))

  0%|          | 0/18278 [00:00<?, ?it/s]

In [58]:
import time

# solution based on data structure
def autocomplete(user_input, dataframe):
    """Return the words in the prefix-column database that start with ``user_input``.

    Args:
        user_input: The query prefix typed by the user.
        dataframe: Prefix-column database; each column name is a prefix and
            the column holds the words starting with it, padded with ``''``.

    Returns:
        A list of matching words (possibly empty), or the string
        ``"No results found"`` when neither the query nor any shorter prefix
        of it is a column of ``dataframe``.
    """
    columns = dataframe.columns

    if user_input in columns:
        matches = dataframe[user_input].to_list()
        # Do not suggest the query itself when it is a word.
        # NOTE(review): the fallback below keeps an exact match; confirm
        # which of the two behaviours is intended.
        try:
            matches.remove(user_input)
        except ValueError:
            pass
        return [word for word in matches if word != '']

    # The query is not a column: back off to the longest shorter prefix that
    # is one, then keep only the words that still start with the full query.
    # Padding strings and the shorter prefix itself can never start with the
    # longer query, so this single linear filter is all that is needed.
    for end in range(len(user_input) - 1, 0, -1):
        prefix = user_input[:end]
        if prefix in columns:
            return [
                word
                for word in dataframe[prefix].to_list()
                if word.startswith(user_input)
            ]

    return "No results found"

def pretty_print(user_input, column_database):
    """Run an autocomplete query and print a short report with its timing.

    Args:
        user_input: The query prefix to look up.
        column_database: Prefix-column database passed to ``autocomplete``.

    Prints the query, the number of results with the first five of them (or
    a "no results" line when ``autocomplete`` does not return a list), and
    the elapsed query time in milliseconds.
    """
    # perf_counter is monotonic and high resolution, so short intervals are
    # measured more reliably than with wall-clock time.time().
    start = time.perf_counter()
    results = autocomplete(user_input, column_database)
    elapsed_ms = (time.perf_counter() - start) * 1000

    print("User inputed:", user_input)
    print("-" * 20)
    # autocomplete returns a list on success and a message string otherwise.
    if isinstance(results, list):
        print(f"{len(results)} autocomplete results found.")
        print(f"First 5 autocomplete results are: {results[:5]}")
    else:
        print("No autocomplete results found.")
    print(f"This autocomplete took {round(elapsed_ms, 2)} milliseconds to query.")

In [59]:
# Example query: a three-letter prefix.
user_input = "dea"
pretty_print(user_input, column_database)

User inputed: dea
--------------------
158 autocomplete results found.
First 5 autocomplete results are: ['deacetylate', 'deacetylation', 'deacidification', 'deacidify', 'deacon']
This autocomplete took 4.64 milliseconds to query.


In [64]:
# Example query: a five-letter prefix, longer than the three-letter maximum
# the column database was built with.
user_input = "xyloc"
pretty_print(user_input, column_database)

User inputed: xyloc
--------------------
5 autocomplete results found.
First 5 autocomplete results are: ['xylocarp', 'xylocarpous', 'xylocopa', 'xylocopid', 'xylocopidae']
This autocomplete took 2.01 milliseconds to query.


In [65]:
# Example query: a single-letter prefix.
user_input = "z"
pretty_print(user_input, column_database)

User inputed: z
--------------------
941 autocomplete results found.
First 5 autocomplete results are: ['za', 'zabaean', 'zabaglione', 'zabaism', 'zaberma']
This autocomplete took 3.0 milliseconds to query.


In [66]:
# Example query: a prefix that no word in the test dictionary starts with.
user_input = "wsq"
pretty_print(user_input, column_database)

User inputed: wsq
--------------------
0 autocomplete results found.
First 5 autocomplete results are: []
This autocomplete took 94.08 milliseconds to query.


---
## Solution Explained

### function solution
function solution explained