## WikiGolf

Goal: To reach a target page in as few links as possible

Description:
WikiGolf is a program intended to browse Wikipedia with some degree of intelligence. It accepts two parameters: a starting page and a target page. Beginning at the starting page, it checks that page's links for anything related to the target; if no related link is found, it chooses one at random. It then examines the chosen page and checks its links for pages related to the target, and so forth, until it reaches a page that is within the threshold for relevance to the target.

In [None]:
import numpy as np
import requests
import wikipedia
import gensim
from gensim import corpora, models
from nltk.stem import WordNetLemmatizer, SnowballStemmer
import wikisearch
import nltk
from nltk.corpus import stopwords
import nltk.collocations
from nltk import FreqDist, word_tokenize
import string
import re
import sys

### Manual interaction with the Wikipedia API

It is possible to find all links on a page by interacting directly with the Wikipedia API, but extracting the relevant information from the query result is somewhat complex.

In [None]:
# One HTTP session, reused by every API request made in this notebook.
S = requests.Session()

# Endpoint of the English Wikipedia API.
URL = "https://en.wikipedia.org/w/api.php"

# Page whose outgoing links are requested.
TITLE = 'Jurassic Park (novel)'

# Query parameters: ask for the 'links' property of TITLE, answered as JSON.
# NOTE(review): 'pllimit': "max" presumably requests the largest batch of
# links the API allows per call -- confirm against the MediaWiki docs. Any
# continuation of a longer link list is not followed here.
PARAMS = {
    'action': "query",
    'titles': TITLE,
    'prop': "links",
    'pllimit': "max",
    'format': "json",
}

# Send the request and decode the JSON body into `data`, which the
# following cells index into.
R = S.get(url=URL, params=PARAMS)
data = R.json()
#print(data['query']['title'])

In [None]:
data['query']['pages'][list(data['query']['pages'])[0]]['links'][:20]

In [None]:
links = data['query']['pages'][list(data['query']['pages'])[0]]['links']

In [None]:
n = np.random.randint(0,len(links))
links[n]

In [None]:
# Pick one link at random from the `links` list built in an earlier cell
# and request the links of the page it points to.
n = np.random.randint(0,len(links))
NEXT = links[n]['title']

# Same query as before, but for the randomly chosen title.
PARAMS = {
    'action': "query",
    'titles': NEXT,
    'prop': "links",
    'pllimit': "max",
    'format': "json",
}

R = S.get(url=URL, params=PARAMS)
next_data = R.json()
# Print the title that was requested, then the title the API reports for
# the page it returned, so the two can be compared by eye.
print(links[n]['title'])
print(next_data['query']['pages'][list(next_data['query']['pages'])[0]]['title'])

In [None]:
def get_links(TITLE, timeout=10):
    """Query the Wikipedia API for the links on a page.

    Uses the module-level session ``S`` and endpoint ``URL``.

    Args:
        TITLE: Title of the page whose outgoing links are requested.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout a stalled connection would block the notebook
            indefinitely.

    Returns:
        The decoded JSON response as a dict. Callers in this notebook read
        the links from ``response['query']['pages'][<page id>]['links']``.

    Raises:
        requests.exceptions.RequestException: If the request times out,
            fails, or the server answers with an HTTP error status.
    """
    params = {
        'action': "query",
        'titles': TITLE,
        'prop': "links",
        'pllimit': "max",
        'format': "json",
    }

    response = S.get(url=URL, params=params, timeout=timeout)
    # Surface HTTP errors here instead of as a confusing JSON decode error.
    response.raise_for_status()
    return response.json()

In [None]:
# Random walk: read a start title and a hop count, then follow one randomly
# chosen link per hop, printing each page title that is visited.
start = input()
hops = input()

print("0 : " + start)
title = start
for i in range(int(hops)):
    response = get_links(title)
    # 'pages' is keyed by page id; one title was requested, so take the
    # single entry regardless of its key.
    page_info = next(iter(response['query']['pages'].values()), {})
    # Pages the API could not find carry no 'links' key at all.
    links = page_info.get('links', [])
    if not links:
        print("Dead end: no links found on " + title)
        break
    n = np.random.randint(0, len(links))
    title = links[n]['title']
    print(str(i+1) + " : " + title)

By using the `wikipedia` wrapper for the API we can retrieve the same information with a much simpler call.

In [None]:
jp = wikipedia.page('Jurassic Park (novel)')

Now I have created a function in `wikisearch` which uses a FreqDist to find the 50 most common words in a page.

In [None]:
wikisearch.get_50_most_common(wikipedia.page("Batman: No Man's Land").content)[:10]

In [None]:
wikisearch.get_50_most_common(wikipedia.page('Catwoman').content)[:10]

In [None]:
wikisearch.get_50_most_common(wikipedia.page('Bread').content)[:20]

In [None]:
wikisearch.get_50_most_common(wikipedia.page('Cake').content)[:10]

In [None]:
wikisearch.normalized_top_50(wikipedia.page("Spider-man").content)[:20]

In [None]:
c = wikipedia.page('Trickster (comics)')
t = wikipedia.page('Catwoman')

In [None]:
# Build a Word2Vec model from the two pages loaded in the previous cell
# (`c` and `t`) and probe a few word relationships in it.
# NOTE: `make_model` is defined in the "Main Program" section further down,
# so that cell has to be run before this one.
model = make_model(c,t)

# Distance the model reports between a common filler word and the target name.
dist = model.wv.distance('also'.lower(),'Catwoman'.lower())
# Earlier experiment, kept for reference: compare each word of the current
# title against each word of the target title.
# for word in c.title.split(' '):
#             word = word.lower()
#             try:
#                 for targetword in t.title.split(' '):
#                     dist = model.wv.distance(word,targetword.lower())
#                     if dist < 0.05:
#                         print("True")
#                         break
#             except KeyError:
#                 print("KeyError: " + c.title)
# if dist < 0.009:
#         print("True")

# Words the model considers most similar to the target name.
model.wv.most_similar(['catwoman'])

In [None]:
model.wv.distance('catwoman', 'elseworlds')

### Main Program

Test functions and main program, for ease of testing and making adjustments.

In [None]:
def make_model(current, target):
    """Train a skip-gram Word2Vec model on the text of two Wikipedia pages.

    Args:
        current: Page object for the page being examined; only its
            ``content`` attribute is read.
        target: Page object for the target page; only its ``content``
            attribute is read.

    Returns:
        The trained ``gensim.models.Word2Vec`` model.
    """
    # One translation table removes all ASCII punctuation in a single pass.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    # Treat every period-delimited chunk of either page as one sentence.
    page_text = current.content.split(".") + target.content.split(".")

    # Format for word2vec: a list of lower-cased token lists.
    text = []
    for clue in page_text:
        # split() with no argument splits on any run of whitespace, so
        # newlines separate words and no empty-string tokens are produced
        # (split(' ') yielded '' for every leading or doubled space).
        words = clue.translate(strip_punctuation).lower().split()
        if words:
            text.append(words)

    # sg=1 selects the skip-gram architecture.
    model = gensim.models.Word2Vec(text, sg=1)
    # NOTE(review): per the gensim docs the constructor already trains when
    # it is handed a corpus, so this looks like a second training pass. It
    # is kept so the distance thresholds used elsewhere stay comparable.
    model.train(text, total_examples=model.corpus_count, epochs=model.epochs)
    return model

def check_links(model, current, target, visited, threshold=0.02,
                skiplist=('Wikipedia', 'Category')):
    """Choose the next page to visit from the links on ``current``.

    Every word of every unvisited link title is compared with every word
    of the target title. The link holding the closest word pair under
    ``threshold`` wins. If no link qualifies, a random link is chosen,
    preferring pages that have not been visited yet.

    Args:
        model: Word2Vec-style model; only ``model.wv.distance(a, b)`` is
            used, and it is expected to raise ``KeyError`` for words
            outside its vocabulary.
        current: Page object; only its ``links`` attribute (a list of link
            titles) is read.
        target: Title of the target page, as a string.
        visited: Collection of titles already visited.
        threshold: Maximum distance for a link to count as related.
        skiplist: Substrings marking titles that must never be chosen at
            random.

    Returns:
        The title of the link to follow next.

    Raises:
        ValueError: If the random fallback has nothing to choose from
            (no links at all, or every link matches ``skiplist``).
    """
    links = current.links
    target_words = [word.lower() for word in target.split(' ')]

    # Look for the unvisited link whose title is closest to the target.
    best_title = None
    best_dist = None
    for link in links:
        if link in visited:
            continue
        for word in link.lower().split(' '):
            for target_word in target_words:
                try:
                    dist = model.wv.distance(word, target_word)
                except KeyError:
                    # Only this pair is unusable; later target words may
                    # still be in the vocabulary.
                    continue
                if dist < threshold and (best_dist is None or dist < best_dist):
                    best_title, best_dist = link, dist
    if best_title is not None:
        return best_title

    # No related link: fall back to a random one. Filtering first (rather
    # than re-drawing until a valid title appears) cannot loop forever.
    candidates = [link for link in links
                  if not any(sub in link for sub in skiplist)]
    if not candidates:
        raise ValueError("no followable links on the current page")
    unvisited = [link for link in candidates if link not in visited]
    pool = unvisited or candidates
    title = pool[np.random.randint(0, len(pool))]
    print(title)
    return title

In [None]:
# Main search: starting from one page, hop from link to link until the
# target page is reached or the hop limit runs out.

#get input
start = input()
target = input()

# Upper bound on hops, so an unreachable target cannot loop forever and
# the "Not Found" report below can actually be reached.
MAX_HOPS = 1000
# A page only counts as a "closest" candidate below this word distance.
CLOSE_THRESHOLD = 0.0016

#set up initial variables
target_page = wikipedia.page(target)
# Accept either the raw user input or the title the library resolved it to.
target_titles = {target.lower(), target_page.title.lower()}
target_words = list(wikisearch.preprocess(target_page.title))
closest = ("none", sys.maxsize)
print("0 : " + wikipedia.page(start).title)
path = [start]
# A set literal: set(start) would have produced a set of single characters.
visited = {start}
title = start
found = False
current = None
model = None
i = 0

while i < MAX_HOPS:
    #test for target page
    if title.lower() in target_titles:
        found = True
        break

    try:
        next_page = wikipedia.page(title)
    except (wikipedia.exceptions.DisambiguationError,
            wikipedia.exceptions.PageError):
        if current is None:
            # Nothing to fall back on: the very first page failed to load.
            print("Could not load page: " + title)
            break
        # Otherwise stay on the last page that loaded and pick another
        # link from it below.
    else:
        current = next_page
        #create model
        model = make_model(current, target_page)
        #get common words in current page
        top_20 = wikisearch.get_50_most_common(current.content)[:20]
        #test for similarity to target page
        for word, _freq in top_20:
            for target_word in target_words:
                try:
                    dist = model.wv.distance(word, target_word)
                except KeyError:
                    # One of the two words is not in the model's vocabulary.
                    continue
                # Remember the page with the smallest distance seen so far.
                if dist < CLOSE_THRESHOLD and dist < closest[1]:
                    closest = (title, dist)

    #get next link
    try:
        title = check_links(model, current, target, visited)
    except ValueError:
        # Raised when the page offers no link that can be chosen.
        print("Dead end: no links to follow from " + current.title)
        break
    visited.add(title)
    path.append(title)
    if i % 100 == 0:
        print(".", end="")
    i += 1

if found:
    print("Page Found!")
else:
    print("Not Found, closest page was: " + closest[0])
print(str(i) + " Hop(s), Finish at : " + title)

In [None]:
vec = make_model(wikipedia.page('Batman'),wikipedia.page('Superman'))

In [None]:
vec.wv.distance('luthor','superman')

### Analyzing with Word2Vec

In [None]:

# Build a Word2Vec training corpus by hand from two unrelated pages, so the
# tokenised sentences can be inspected in the cells below.
page1 = wikipedia.page('Bread')
page2 = wikipedia.page('Batman')
# Treat every period-delimited chunk of either page as one sentence.
page_text = page1.content.split(".") + page2.content.split(".")

punctuation_table = str.maketrans('', '', string.punctuation)

text = []
for clue in page_text:
    # split() with no argument splits on any run of whitespace, so no
    # empty-string tokens or newline-glued words end up in the corpus.
    new_sent = clue.translate(punctuation_table).lower().split()
    if new_sent:
        text.append(new_sent)

In [None]:
text[0]

In [None]:
0.0020514726638793945 < 0.002

In [None]:
model.wv.distance('also','catwoman') < 0.002

In [None]:
"Jurassic Park".lower()