In [26]:
from tqdm.notebook import tqdm
from pprint import pprint
from glob import glob
import random
import spacy
import nltk
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cosine
import requests
from sklearn.neighbors import KDTree

In [None]:
! unzip wiki_data

In [4]:
# Read every article file into memory, one string per document.
documents = []

for fname in tqdm(glob('wiki_data/texts*.txt')):
    # The articles contain non-ASCII characters (IPA symbols, en dashes, thin
    # spaces), so decode explicitly instead of relying on the platform default.
    # NOTE(review): assumes the dump is UTF-8 encoded — confirm.
    with open(fname, encoding='utf-8') as f:
        # Strip each line and concatenate the pieces; str.join builds the
        # string once instead of re-copying it for every line.
        # NOTE(review): lines are joined without a separator, so the words on
        # either side of a line break end up fused (e.g. "AwardIn") — confirm
        # this is intended before it reaches the tokenizer.
        documents.append("".join(line.strip() for line in f))

# Sanity check: number of documents loaded and the content of the first one.
len(documents), documents[0]

  0%|          | 0/1047 [00:00<?, ?it/s]

(1047,
 '= SFINKS =Sfinks (Polish for "Sphynx") was also the initial name of the Janusz A. Zajdel AwardIn cryptography, SFINKS is a stream cypher algorithm developed by An Braeken, Joseph Lano, Nele Mentens, Bart Preneel, and Ingrid Verbauwhede. It includes a message authentication code. It has been submitted to the eSTREAM Project of the eCRYPT network.')

Training the model

In [32]:
# Load the spaCy English pipeline with the parser, NER and tagger disabled.
# NOTE(review): the "en" shortcut name is presumably only resolvable on
# spaCy 2.x installations — confirm against the installed version.
nlp = spacy.load("en", disable=["parser", "ner", "tagger"])

def spacy_tokenizer(text):
    '''
    Tokenizes `text` with the spaCy pipeline `nlp` and returns the list of
    lemmas, one per token.
    '''
    return [t.lemma_ for t in nlp(text)]

# TF-IDF model that uses the spaCy lemmas above as its tokens.
vec = TfidfVectorizer(tokenizer=spacy_tokenizer)

# toarray() gives a plain 2-D ndarray. The previous todense() returned an
# np.matrix, which recent scikit-learn input validation refuses to accept.
trained_vectors = vec.fit_transform(documents).toarray()

# Spatial index over the document vectors, used for nearest-neighbour search.
tree = KDTree(trained_vectors)

# [document, vector] pairs; each vector is a 1-D row of `trained_vectors`.
texts = [[document, vector] for document, vector in zip(documents, trained_vectors)]



In [29]:
# Vocabulary learned by the fitted TF-IDF vectorizer. The search code uses it
# for membership tests and iterates over its terms when correcting typos.
all_words = vec.vocabulary_

Initializing the cache

In [53]:
# Search cache: maps a query term to the list of documents found for it.
answer = {}

Searching

In [54]:
def lcp(a: str, b:str):
    '''
    Returns the length of the longest common prefix shared by `a` and `b`,
    i.e. the number of leading positions at which both strings agree.
    '''
    matched = 0
    # zip stops at the shorter string, so no explicit bounds checks are needed.
    for left, right in zip(a, b):
        if left != right:
            break
        matched += 1
    return matched

def most_similar(text: str):
    '''
    Picks the vocabulary word that is closest to `text`.

    Every word in `all_words` is scored with

        ED(word, text) / exp(LCP(word, text))

    where ED is the edit (Levenshtein) distance and LCP is the length of the
    longest common prefix. The word with the smallest score is returned.

    Rationale:
    - a smaller ED means the words are more alike;
    - a longer LCP means the words are more alike; it is weighted
      exponentially because typos are assumed to be more likely in the middle
      or at the end of a word than at its start.

    If several words have the same score, the lexicographically smallest one
    is returned.

    The scoring formula is provisional and may change later.
    '''

    def score(word):
        # Lower is better: few edits and a long shared prefix.
        return nltk.edit_distance(word, text) / np.exp(lcp(word, text))

    # The (score, word) key orders candidates by score, then alphabetically.
    return min(all_words, key=lambda word: (score(word), word))

def do_search(text: str):
    '''
    Returns the 10 documents nearest to `text` in the KD-tree built over the
    TF-IDF document vectors.

    The query is lower-cased first. If it is not a vocabulary term, it is
    replaced by the closest vocabulary word (see `most_similar`) before being
    vectorized.

    Results are cached in `answer`, both under the term that was actually
    searched and under the lower-cased query as typed. A repeated misspelled
    query therefore skips the scan over the whole vocabulary.
    '''
    query = text.lower()
    if query in answer: # seen before, possibly as a typo of a known term
        return answer[query]

    term = query if query in all_words else most_similar(query)
    if term not in answer: # caching the result for this term
        # toarray() yields a plain ndarray; the np.matrix produced by
        # todense() is refused by recent scikit-learn input validation.
        new_vector = vec.transform([term]).toarray()
        _, ind = tree.query(new_vector, k = 10)
        answer[term] = [documents[i] for i in ind[0]]

    # Remember the raw query as well so the correction step is not repeated.
    answer[query] = answer[term]
    return answer[term]

In [55]:
# Query with mixed casing; do_search lower-cases the text before the lookup.
do_search('Dijkstra')



['= Dijkstra\'s algorithm =Dijkstra\'s algorithm ( DYKE-strəz) is an algorithm for finding the shortest paths between nodes in a graph, which may represent, for example, road networks.  It was conceived by computer scientist Edsger W. Dijkstra in 1956 and published three years later.The algorithm exists in many variants. Dijkstra\'s original algorithm found the shortest path between two given nodes, but a more common variant fixes a single node as the "source" node and finds shortest paths from the source to all other nodes in the graph, producing a shortest-path tree.For a given source node in the graph, the algorithm finds the shortest path between that node and every other.:\u200a196–206\u200a It can also be used for finding the shortest paths from a single node to a single destination node by stopping the algorithm once the shortest path to the destination node has been determined. For example, if the nodes of the graph represent cities and edge path costs represent driving distanc

In [56]:
# Misspelled query: 'dijtra' is expected to be missing from the vocabulary, so
# do_search should fall back to most_similar before querying the tree.
do_search('Dijtra') # typo on purpose (Dijtra instead of Dijkstra)

['= Dijkstra\'s algorithm =Dijkstra\'s algorithm ( DYKE-strəz) is an algorithm for finding the shortest paths between nodes in a graph, which may represent, for example, road networks.  It was conceived by computer scientist Edsger W. Dijkstra in 1956 and published three years later.The algorithm exists in many variants. Dijkstra\'s original algorithm found the shortest path between two given nodes, but a more common variant fixes a single node as the "source" node and finds shortest paths from the source to all other nodes in the graph, producing a shortest-path tree.For a given source node in the graph, the algorithm finds the shortest path between that node and every other.:\u200a196–206\u200a It can also be used for finding the shortest paths from a single node to a single destination node by stopping the algorithm once the shortest path to the destination node has been determined. For example, if the nodes of the graph represent cities and edge path costs represent driving distanc

In [57]:
# NOTE(review): do_search treats the whole lower-cased query as one term. If
# 'bellman-ford' is not a single vocabulary entry, it is replaced by the
# closest vocabulary word before the search — confirm that is intended for
# multi-word queries.
do_search('Bellman-Ford')



['= Bellman–Ford algorithm =The Bellman–Ford algorithm is an algorithm that computes shortest paths from a single source vertex to all of the other vertices in a weighted digraph.It is slower than Dijkstra\'s algorithm for the same problem, but more versatile, as it is capable of handling graphs in which some of the edge weights are negative numbers.The algorithm was first proposed by Alfonso Shimbel (1955), but is instead named after Richard Bellman and Lester Ford Jr., who published it in 1958 and 1956, respectively. Edward F. Moore also published a variation of the algorithm in 1959, and for this reason it is also sometimes called the Bellman–Ford–Moore algorithm.Negative edge weights are found in various applications of graphs, hence the usefulness of this algorithm.If a graph contains a "negative cycle" (i.e. a cycle whose edges sum to a negative value) that is reachable from the source, then there is no cheapest path: any path that has a point on the negative cycle can be made ch