In [1]:
import wikisearch
import wikipedia
import string
import sys
import numpy as np

In [2]:
def check_links_subjects(model,current,subs,visited,threshold=0.08):
    """Pick the next page title to follow from the links of ``current``.

    Each link title is split on spaces and every lower-cased word is compared,
    via ``model.wv.distance``, with every lower-cased entry of ``subs``. The
    unvisited link with the smallest distance strictly below ``threshold`` is
    returned. If no link qualifies, a random link is returned instead.

    Args:
        model: object exposing ``wv.distance(word_a, word_b)``; a ``KeyError``
            from that call is treated as "word not in the vocabulary".
        current: page object exposing ``links``, a sequence of title strings.
        subs: iterable of target subject words.
        visited: container of titles already followed. Visited titles are
            never returned as a related link, and the random fallback avoids
            them whenever an unvisited alternative exists.
        threshold: a distance must be strictly below this value for the link
            to count as related (default 0.08).

    Returns:
        The chosen link title.

    Raises:
        ValueError: if ``current`` offers nothing to follow, i.e. it has no
            links or every link contains 'Wikipedia' or 'Category'.
    """
    links = current.links
    targets = [sub.lower() for sub in subs]
    related = []

    # Score every unvisited link against every target subject.
    for link in links:
        if link in visited:
            continue
        for word in link.split(' '):
            word = word.lower()
            for target in targets:
                try:
                    dist = model.wv.distance(word, target)
                except KeyError:
                    # One out-of-vocabulary pair must not hide the other
                    # targets for this word, so keep going.
                    continue
                if dist < threshold:
                    related.append((dist, link))

    # Most related link wins; min() keeps the first one on ties.
    if related:
        return min(related, key=lambda pair: pair[0])[1]

    # Nothing related: fall back to a random link, skipping meta pages.
    skiplist = ['Wikipedia', 'Category']
    allowed = [link for link in links
               if not any(skip in link for skip in skiplist)]
    if not allowed:
        # Rejection sampling over an empty candidate set would never end.
        raise ValueError("no followable links on the current page")
    unvisited = [link for link in allowed if link not in visited]
    # Revisiting is allowed only when every candidate was already followed.
    pool = unvisited or allowed
    return pool[np.random.randint(0, len(pool))]

In [3]:
def search_subject(start,subs,max_steps=20,threshold=0.05):
    """Follow links from the ``start`` article looking for a page about ``subs``.

    Each step loads the current title, rebuilds the model from every page
    loaded so far (``wikisearch.make_model_new``) and, for any page other than
    the start page, tests it against ``subs``. If the page does not match, the
    next title is chosen by ``check_links_subjects``. Progress and the outcome
    are printed; nothing is returned.

    Args:
        start: title of the article to start from.
        subs: re-iterable collection of subject words that must all match.
        max_steps: maximum number of pages to try (default 20).
        threshold: a subject matches a word when ``model.wv.distance`` is
            strictly below this value (default 0.05).

    Raises:
        Whatever ``wikipedia.page(start)`` raises if the start article cannot
        be loaded. Load failures for later titles are tolerated: the search
        continues from the last page that did load.
    """
    start_page = wikipedia.page(start)
    print("0 : " + start_page.title)
    path = [start_page]
    visited = {start}
    title = start
    # Always bound, so a failed fetch can fall back to the last good page.
    current = start_page
    model = None
    found = False
    step = 0

    for step in range(max_steps):
        if step == 0:
            # The start page is already loaded; do not fetch it again.
            page = start_page
        else:
            try:
                page = wikipedia.page(title)
            except (wikipedia.exceptions.DisambiguationError,
                    wikipedia.exceptions.PageError):
                # Unresolvable title: it is already in `visited`, so the link
                # picker below will choose something else from `current`.
                page = None

        if page is not None:
            current = page
            # Uses page equality, as the start-page comparison below does, so
            # no page is fed to the model twice.
            if current not in path:
                path.append(current)
            model = wikisearch.make_model_new(path)
            if current != start_page and _subjects_match(model, current, subs, threshold):
                print("Match Found")
                found = True
                break

        #get next link
        title = check_links_subjects(model,current,subs,visited)
        visited.add(title)
        if step % 10 == 0:
            print(".",end="")

    if found:
        print("Page Found!")
        print(str(step) + ": Matching page: " + title)
    else:
        print("Not Found")
        # `title` is only the next candidate here and was never loaded, so
        # report the last page that was actually visited instead.
        print(str(step) + ": Last page visited: " + current.title)


def _subjects_match(model, page, subs, threshold):
    """Return True when every subject is close to one of the page's top-20 words."""
    top_20 = wikisearch.get_50_most_common(page.content)[:20]
    matched = {sub: False for sub in subs}
    for word, _freq in top_20:
        for sub in subs:
            try:
                # Lower-cased to agree with check_links_subjects.
                # NOTE(review): assumes the model vocabulary is lower-case - confirm.
                dist = model.wv.distance(word, sub.lower())
            except KeyError:
                # Out-of-vocabulary pair; the other subjects may still match.
                continue
            if dist < threshold:
                matched[sub] = True
    return all(matched.values())

In [None]:
# Run the search from the 'Harley Quinn' article for a page matching both subjects.
# Each step fetches a page through `wikipedia.page` and rebuilds the model.
search_subject('Harley Quinn',['woman','supervillain'])

0 : Harley Quinn
.

In [4]:
# Load one article and build a model from it alone; the cells below inspect
# `ww` and `model` directly.
ww = wikipedia.page('Jurassic Park')
model = wikisearch.make_model_new([ww])

In [3]:
# Number of entries in the single-page model's vocabulary.
# NOTE(review): `wv.vocab` looks like the gensim 3.x KeyedVectors API; gensim 4
# replaced it with `wv.key_to_index` - confirm which version wikisearch targets.
len(model.wv.vocab)

1389

In [28]:
# Split the article text on single spaces. `text` is not read by any other
# cell shown in this notebook.
text = ww.content.split(' ')

In [5]:
# Inspect the (word, count) pairs; search_subject keeps the first 20 of these.
wikisearch.get_50_most_common(ww.content)

[('jurassic', 126),
 ('film', 104),
 ('world', 85),
 ('park', 77),
 ('series', 44),
 ('would', 40),
 ('released', 30),
 ('trevorrow', 30),
 ('dinosaurs', 28),
 ('also', 23),
 ('first', 21),
 ('based', 19),
 ('universal', 17),
 ('spielberg', 17),
 ('lego', 16),
 ('animated', 16),
 ('said', 16),
 ('novel', 15),
 ('june', 15),
 ('isla', 15),
 ('time', 14),
 ('written', 14),
 ('films', 13),
 ('theme', 12),
 ('release', 12),
 ('kingdom', 12),
 ('dinosaur', 12),
 ('escape', 12),
 ('crichton', 11),
 ('lost', 11),
 ('several', 11),
 ('game', 11),
 ('adaptation', 10),
 ('including', 10),
 ('fallen', 10),
 ('nublar', 10),
 ('island', 10),
 ('announced', 10),
 ('michael', 9),
 ('published', 9),
 ('sequel', 9),
 ('games', 9),
 ('characters', 9),
 ('studios', 9),
 ('franchise', 8),
 ('video', 8),
 ('highest', 8),
 ('grossing', 8),
 ('produced', 8),
 ('animation', 8)]

In [35]:
# Scratch check: len() of a dict counts its keys.
test = {'a':1,'b':2,'c':3}
len(test)

3

In [59]:
# Distance between two frequent words in the single-page model, for comparison
# with the 0.05 (page match) and 0.08 (link match) thresholds used above.
model.wv.distance('jurassic','dinosaur')

0.0018458962440490723

In [41]:
# Scratch check: calls the dict's __repr__ method directly.
test.__repr__()

"{'a': 1, 'b': 2, 'c': 3}"

In [None]:
# Scratch cell: a bare string literal with no effect beyond being displayed.
"{'a': 1, 'b': 2, 'c': 3}"