## WikiGolf

Goal: To reach a target page in as few links as possible

Description:
WikiGolf is a program intended to browse Wikipedia with some degree of intelligence. It will accept two parameters, a starting page and a target. It will begin at the starting page and check the links for anything related to the target, and if no relations are found it will choose one at random. It will then examine that page and check any of its links for pages related to the target, and so forth until it reaches a page that is within the threshold for relevence to the target. 

In [168]:
import numpy as np
import requests
import wikipedia
import gensim
from gensim import corpora, models
from nltk.stem import WordNetLemmatizer, SnowballStemmer
import wikisearch
import nltk
from nltk.corpus import stopwords
import nltk.collocations
from nltk import FreqDist, word_tokenize
import string
import re
import sys

### Manual interaction with the Wikipedia API

Experimenting with obtaining all the links in a page through direct API calls.

In [16]:
S = requests.Session()

URL = "https://en.wikipedia.org/w/api.php"

TITLE = 'Jurassic Park (novel)'

PARAMS = {
    'action': "query",
    'titles': TITLE,
    'prop': "links",
    'pllimit': "max",
    'format': "json",
}

R = S.get(url=URL, params=PARAMS)
data = R.json()
#print(data['query']['title'])

In [29]:
data['query']['pages'][list(data['query']['pages'])[0]]['links'][:20]

[{'ns': 0, 'title': '2000 AD (comics)'},
 {'ns': 0, 'title': 'A Case of Need'},
 {'ns': 0, 'title': 'Airframe (novel)'},
 {'ns': 0, 'title': 'Alfred A. Knopf'},
 {'ns': 0, 'title': 'Amazon (video game)'},
 {'ns': 0, 'title': 'Amber'},
 {'ns': 0, 'title': 'Amphibian'},
 {'ns': 0, 'title': 'Amusement park'},
 {'ns': 0, 'title': 'Ancient DNA'},
 {'ns': 0, 'title': 'Andrew Ferguson'},
 {'ns': 0, 'title': 'Auxotrophy'},
 {'ns': 0, 'title': 'BILBY Award'},
 {'ns': 0, 'title': 'Backdoor (computing)'},
 {'ns': 0, 'title': 'Barnes & Noble'},
 {'ns': 0, 'title': 'Battle at Big Rock'},
 {'ns': 0, 'title': 'Beyond Westworld'},
 {'ns': 0, 'title': 'Binary (novel)'},
 {'ns': 0, 'title': 'Biotechnology'},
 {'ns': 0, 'title': 'Bird'},
 {'ns': 0, 'title': 'Canopy Flyer'}]

In [30]:
links = data['query']['pages'][list(data['query']['pages'])[0]]['links']

In [38]:
n = np.random.randint(0,len(links))
links[n]

{'ns': 0, 'title': 'DNA'}

In [46]:
n = np.random.randint(0,len(links))
NEXT = links[n]['title']

PARAMS = {
    'action': "query",
    'titles': NEXT,
    'prop': "links",
    'pllimit': "max",
    'format': "json",
}

R = S.get(url=URL, params=PARAMS)
next_data = R.json()
print(links[n]['title'])
print(next_data['query']['pages'][list(next_data['query']['pages'])[0]]['title'])

Jurassic World: Original Motion Picture Soundtrack
Jurassic World: Original Motion Picture Soundtrack


In [48]:
def get_links(TITLE):
    PARAMS = {
        'action': "query",
        'titles': TITLE,
        'prop': "links",
        'pllimit': "max",
        'format': "json",
    }

    R = S.get(url=URL, params=PARAMS)
    return R.json()

In [52]:
start = input()
hops = input()

print("0 : " + start)
title = start
for i in range(int(hops)):
    response = get_links(title)
    links = response['query']['pages'][list(response['query']['pages'])[0]]['links']
    n = np.random.randint(0,len(links))
    title = links[n]['title']
    print(str(i+1) + " : " + title)

 Jurassic Park (novel)
 5


0 : Jurassic Park (novel)
1 : Dragon curve
2 : Blancmange curve
3 : Archimedes
4 : Menander
5 : Akrai


In [54]:
jp = wikipedia.page('Jurassic Park (novel)')

Experimenting with custom built functinos using FreqDist calls from the NLTK package. 

In [147]:
wikisearch.get_50_most_common(wikipedia.page("Batman: No Man's Land").content)[:10]

[('gotham', 41),
 ('citi', 38),
 ('batman', 33),
 ('land', 24),
 ('man', 21),
 ('gordon', 20),
 ('comic', 13),
 ('storylin', 11),
 ('stori', 11),
 ('gang', 11)]

In [76]:
wikisearch.get_50_most_common(wikipedia.page('Catwoman').content)[:10]

[('batman', 179),
 ('catwoman', 155),
 ('selina', 142),
 ('bruce', 50),
 ('two', 32),
 ('vol', 30),
 ('series', 29),
 ('story', 29),
 ('kyle', 28),
 ('one', 28)]

In [79]:
wikisearch.get_50_most_common(wikipedia.page('Bread').content)[:20]

[('bread', 100),
 ('dough', 39),
 ('flour', 30),
 ('yeast', 25),
 ('leavening', 20),
 ('water', 18),
 ('breads', 18),
 ('used', 17),
 ('gluten', 14),
 ('wheat', 13),
 ('baking', 12),
 ('time', 12),
 ('made', 11),
 ('may', 11),
 ('protein', 11),
 ('process', 10),
 ('sourdough', 9),
 ('use', 9),
 ('also', 9),
 ('baked', 9)]

In [80]:
wikisearch.get_50_most_common(wikipedia.page('Cake').content)[:10]

[('cake', 75),
 ('cakes', 60),
 ('flour', 19),
 ('made', 14),
 ('sugar', 13),
 ('butter', 13),
 ('baking', 13),
 ('bread', 12),
 ('sponge', 12),
 ('baked', 11)]

In [40]:
wikisearch.normalized_top_50(wikipedia.page("Spider-man").content)[:20]

[('spider', '0.06055'),
 ('parker', '0.01435'),
 ('comic', '0.01396'),
 ('peter', '0.01376'),
 ('charact', '0.01121'),
 ('amaz', '0.009632'),
 ('marvel', '0.008846'),
 ('issu', '0.008649'),
 ('superhero', '0.007273'),
 ('stori', '0.007273'),
 ('book', '0.007077'),
 ('seri', '0.006487'),
 ('origin', '0.005308'),
 ('time', '0.005308'),
 ('ditko', '0.005111'),
 ('also', '0.004914'),
 ('becom', '0.004718'),
 ('power', '0.004521'),
 ('best', '0.004325'),
 ('first', '0.004128')]

In [120]:
c = wikipedia.page('Trickster (comics)')
t = wikipedia.page('Catwoman')

In [123]:
model = make_model(c,t)

dist = model.wv.distance('also'.lower(),'Catwoman'.lower())
# for word in c.title.split(' '):
#             word = word.lower()
#             try:
#                 for targetword in t.title.split(' '):
#                     dist = model.wv.distance(word,targetword.lower())
#                     if dist < 0.05:
#                         print("True")
#                         break
#             except KeyError:
#                 print("KeyError: " + c.title)
# if dist < 0.009:
#         print("True")

model.wv.most_similar(['catwoman'])

[('2', 0.998737633228302),
 ('1', 0.9985352158546448),
 ('becomes', 0.998490035533905),
 ('annual', 0.998481273651123),
 ('elseworlds', 0.9984744787216187),
 ('batman', 0.9984369874000549),
 ('vol', 0.9984265565872192),
 ('2011', 0.9983789920806885),
 ('age', 0.9983401894569397),
 ('continuity', 0.9982714653015137)]

In [125]:
model.wv.distance('catwoman', 'elseworlds')

0.0015254616737365723

In [101]:
def make_model(current,target):
    #combine into list
    page_text = current.content.split(".") + target.content.split(".")

    text = []

     #format for word2vec
    for clue in page_text:
        sentence = clue.translate(str.maketrans('','',string.punctuation)).split(' ')
        new_sent = [word.lower() for word in sentence]   
        text.append(new_sent)
    
    #create model
    model = gensim.models.Word2Vec(text,sg=1)
    model.train(text, total_examples=model.corpus_count, epochs=model.epochs)
    return model

def check_links(model,current,target,visited):

    #model = make_model(current,target)
    
    #get links from current
    #check links against model for relevence to target subject
    links = current.links
    success = []
    errors = []
    for l in links:
        for word in l.split(' '):
            word = word.lower()
            try:
                for targetword in target.split(' '):
                    dist = model.wv.distance(word,targetword.lower())
                    if dist < 0.02 and l not in visited:
                        success.append((l,dist))
                        break
            except KeyError:
                errors.append(word)
    
    if len(success) > 0:
        success.sort(key=lambda tup: tup[1])
        return success[0][0]
    else:
        skiplist = ['Wikipedia', 'Category']
        title = links[np.random.randint(0,len(links))]
        while any(sub in title for sub in skiplist):
            #print(title)
            title = links[np.random.randint(0,len(links))]
        print(title)
        return title

In [172]:
start = input()
target = input()
target_page = wikipedia.page(target)
closest = ("none",sys.maxsize)
print("0 : " + wikipedia.page(start).title)
path = [start]
visited = set(start)
title = start
exit = False
for i in range(100):
    try:
        if title.lower() == target.lower():
            exit = True
            break
        current = wikipedia.page(title)
        model = make_model(current,target_page)
        top_20 = wikisearch.get_50_most_common(page.content)[:20]
#         for word in top_20:
#             if word == target.lower():
# #             for t in target.split(' '):
# #                 if model.wv.distance(t,word):
        for word,freq in top_20:
            try:
                for target_word in wikisearch.preprocess(target_page.title):
                    dist = model.wv.distance(word,target_word)
                    if dist < 0.0016 and closest[1] < dist:
                            closest = (title, dist)
#                         exit = True
#                         break
            except KeyError:
                pass
#             if exit:
#                 break
#         if exit:
#             break
        search_success = True
    except wikipedia.exceptions.DisambiguationError:
        search_success = False
    except wikipedia.exceptions.PageError:
        search_success = False
        
    title = check_links(model,current,target,visited)
    visited.add(title)
    path.append(title)
    if i % 100 == 0:
        print(".",end="")

print("Page Found!") if exit else print("Not Found, closest page was: " + closest[0])
print(str(i) + " Hop(s), Finish at : " + title)
print("Word: " + word + " with dist: " + str(dist))

0 : Jurassic Park
.Page Found!
8 Hop(s), Finish at : Batman
Word: also with dist: 0.0019004344940185547


In [51]:
vec = make_model(wikipedia.page('Batman'),wikipedia.page('Superman'))

In [59]:
vec.wv.distance('luthor','superman')

0.02975165843963623

### Analyzing with Word2Vec

In [23]:

page1 = wikipedia.page('Bread')
page2 = wikipedia.page('Batman')
page_text = page1.content.split(".") + page2.content.split(".")

text = []

for clue in page_text:
    sentence = clue.translate(str.maketrans('','',string.punctuation)).split(' ')
    new_sent = [word.lower() for word in sentence]   
    text.append(new_sent)

In [131]:
text[0]

NameError: name 'text' is not defined

In [133]:
0.0020514726638793945 < 0.002

False

In [135]:
model.wv.distance('also','catwoman') < 0.002

False

In [170]:
"Jurassic Park".lower()

'jurassic park'