In [14]:
import wikisearch
import wikipedia
import string
import sys
import numpy as np

In [6]:
def check_links_subjects(model, current, subs, visited, threshold=0.08):
    """Choose the title of the next page to follow from ``current``.

    Every unvisited link title is split on spaces and each lower-cased word is
    compared with each lower-cased subject word using
    ``model.wv.distance``. The link with the smallest distance below
    ``threshold`` is returned. If no link qualifies, a random link is returned
    instead, skipping titles that contain "Wikipedia" or "Category" and
    preferring links that have not been visited yet.

    Parameters
    ----------
    model : object exposing ``model.wv.distance(word_a, word_b)``; the call is
        expected to raise ``KeyError`` for out-of-vocabulary words.
    current : object exposing ``links``, a sequence of link title strings.
    subs : iterable of str
        Subject words the links are compared against.
    visited : container of str
        Titles already followed; these are never returned as a model match.
    threshold : float, optional
        Maximum distance for a link to count as related (default 0.08).

    Returns
    -------
    str
        The title of the link to follow next.

    Raises
    ------
    ValueError
        If the page offers no followable link (no links at all, or every link
        is a "Wikipedia"/"Category" page).
    """
    links = current.links
    targets = [target.lower() for target in subs]

    # Check links against the model for relevance to the target subjects.
    best_title = None
    best_dist = None
    for link in links:
        if link in visited:
            continue
        for word in link.split(' '):
            word = word.lower()
            for target in targets:
                # Guard each pair separately so that one out-of-vocabulary
                # target does not hide the remaining targets for this word.
                try:
                    dist = model.wv.distance(word, target)
                except KeyError:
                    continue
                # Strict "<" keeps the first link seen when distances tie.
                if dist < threshold and (best_dist is None or dist < best_dist):
                    best_title, best_dist = link, dist

    if best_title is not None:
        return best_title

    # No related link: fall back to a random one. Filtering up front (instead
    # of re-drawing until a good title appears) guarantees termination.
    skiplist = ('Wikipedia', 'Category')
    candidates = [t for t in links if not any(s in t for s in skiplist)]
    unvisited = [t for t in candidates if t not in visited]
    pool = unvisited or candidates  # allow a revisit only when nothing is new
    if not pool:
        raise ValueError("current page has no followable links")
    return pool[np.random.randint(0, len(pool))]

In [44]:
def _page_matches_subjects(model, top_words, subs, threshold):
    """Return True once every subject is within ``threshold`` of some top word.

    ``top_words`` is a sequence of ``(word, frequency)`` pairs. Word/subject
    pairs for which ``model.wv.distance`` raises ``KeyError`` are skipped
    individually.
    """
    matched = {sub: False for sub in subs}
    for word, _freq in top_words:
        for sub in subs:
            if matched[sub]:
                continue
            try:
                # Lower-case the subject, as check_links_subjects does.
                if model.wv.distance(word, sub.lower()) < threshold:
                    matched[sub] = True
            except KeyError:
                continue
        if all(matched.values()):
            return True
    return False


def search_subject(start, subs, max_steps=20, threshold=0.06, top_n=20):
    """Walk Wikipedia links from ``start`` looking for a page about ``subs``.

    Each step fetches the page for the current title, rebuilds the model from
    all pages fetched so far with ``wikisearch.make_model_new``, and compares
    the page's most common words with every subject word. The walk stops at
    the first page (other than the start page) on which every subject is
    within ``threshold`` of at least one of the ``top_n`` most common words.
    Otherwise the next title is chosen by ``check_links_subjects``.

    Progress and the outcome are printed.

    Parameters
    ----------
    start : str
        Title of the page to start from.
    subs : list of str
        Subject words that must all be matched.
    max_steps : int, optional
        Maximum number of pages to try (default 20).
    threshold : float, optional
        Maximum model distance for a word to match a subject (default 0.06).
    top_n : int, optional
        How many of the most common words of a page are tested (default 20).

    Returns
    -------
    str or None
        The title of the matching page, or ``None`` if no match was found.
    """
    start_page = wikipedia.page(start)
    print("0 : " + start_page.title)

    # NOTE(review): the start page is fetched again on the first step and so
    # appears twice in ``path``; kept as in the original because it affects
    # the model input -- confirm whether that is intended.
    path = [start_page]
    visited = {start}
    title = start
    current = start_page  # valid fallback if the first fetch in the loop fails
    model = None
    found = False
    step = 0

    for step in range(max_steps):
        try:
            page = wikipedia.page(title)
        except (wikipedia.exceptions.DisambiguationError,
                wikipedia.exceptions.PageError):
            # Unusable title: keep the previous page and pick another link.
            page = None

        if page is not None:
            current = page
            path.append(current)
            model = wikisearch.make_model_new(path)
            if current != start_page:
                top_words = wikisearch.get_50_most_common(current.content)[:top_n]
                if _page_matches_subjects(model, top_words, subs, threshold):
                    print("Match Found")
                    found = True
                    break
        elif model is None:
            # No successful fetch yet: build a model from the start page so
            # link selection below still has something to work with.
            model = wikisearch.make_model_new(path)

        # Get the next link to follow.
        try:
            title = check_links_subjects(model, current, subs, visited)
        except ValueError:
            # Raised when the page has no link that can be followed: dead end.
            break
        visited.add(title)
        if step % 10 == 0:
            print(".", end="")

    if found:
        print("Page Found!")
        print(str(step) + ": Matching page: " + title)
        return title
    print("Not Found")
    return None

In [55]:
# Demo run: start at 'Jurassic Park' and look for a page whose common words
# match both 'dinosaur' and 'movie'. Performs live Wikipedia requests.
search_subject('Jurassic Park',['dinosaur','movie'])

0 : Jurassic Park
.Match Found
Match Found
Match Found
Match Found
Match Found
Page Found!
1: Matching page: Dinosaur


In [52]:
# Fetch a single page and build a model from it alone. `ww` and `model` are
# module-level names that the inspection cells below read.
ww = wikipedia.page('Jurassic Park')
model = wikisearch.make_model_new([ww])

In [3]:
# Number of entries in the model's vocabulary.
# NOTE(review): this cell's execution count (3) predates the cell that defines
# `model` (52), so the recorded output comes from an earlier kernel state.
# NOTE(review): presumably a gensim model; `wv.vocab` is not available in
# gensim >= 4 -- confirm the pinned version.
len(model.wv.vocab)

1389

In [28]:
# Split the raw page content on single spaces. `text` is not read by any
# other cell shown in this notebook.
text = ww.content.split(' ')

In [53]:
# Inspect the (word, count) pairs that search_subject slices to get its top words.
# The recorded output shows stemmed, lower-cased words (e.g. 'jurass', 'releas').
wikisearch.get_50_most_common(ww.content)

[('jurass', 126),
 ('film', 126),
 ('world', 85),
 ('park', 82),
 ('seri', 44),
 ('releas', 43),
 ('dinosaur', 40),
 ('would', 40),
 ('trevorrow', 30),
 ('anim', 29),
 ('spielberg', 24),
 ('write', 24),
 ('also', 23),
 ('first', 21),
 ('univers', 20),
 ('game', 20),
 ('novel', 19),
 ('base', 19),
 ('lego', 16),
 ('escap', 16),
 ('say', 16),
 ('time', 15),
 ('june', 15),
 ('produc', 15),
 ('isla', 15),
 ('adapt', 14),
 ('theme', 13),
 ('publish', 13),
 ('gross', 13),
 ('island', 13),
 ('charact', 13),
 ('featur', 13),
 ('issu', 13),
 ('open', 12),
 ('make', 12),
 ('kingdom', 12),
 ('role', 12),
 ('cover', 12),
 ('crichton', 11),
 ('lose', 11),
 ('includ', 11),
 ('comic', 11),
 ('sever', 11),
 ('nublar', 11),
 ('direct', 11),
 ('sequel', 10),
 ('fall', 10),
 ('stori', 10),
 ('work', 10),
 ('develop', 10)]

In [35]:
# Scratch cell: a throwaway dict, unrelated to the search code above.
test = {'a':1,'b':2,'c':3}
len(test)

3

In [50]:
# Spot-check a model distance. The recorded value (~0.0008) is far below the
# 0.06 and 0.08 thresholds used by search_subject and check_links_subjects.
model.wv.distance('catwoman','woman')

0.0008208751678466797

In [41]:
# Scratch cell: shows the repr string of the throwaway dict `test`.
test.__repr__()

"{'a': 1, 'b': 2, 'c': 3}"

In [None]:
"{'a': 1, 'b': 2, 'c': 3}"