# Crossword Solver Project

In [5]:
## 1
from lxml import html
from utils import *
import requests
import calendar
import json
import string
import re

## 2
from __future__ import print_function
from PIL import Image
import pytesseract
import sys

## 3

## 4
import urllib
import functools
import itertools
import numpy as np
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords

## 5
import pandas as pd

## 1. Scrape

#### retreive ####
* IN
    * url (str): URL of the page at www.nytimescrossword.com to scrape
* OUT 
    * clues (dict): props Across and Down are lists of dicts with props location, clue, answer

In [3]:
def retrieve(url):
page = requests.get(url)
tree = html.fromstring(page.content)
raw = [x for x in tree.xpath('//i/text()') if x != '\n']
clues = {
"Across": list(),
"Down": list()
}
direction = "Across"
lastLocNum = 0
for d in raw:
try:
dotPos = d.index(".")
locNum = d[1:dotPos]
if direction == "Across":
if int(locNum) < lastLocNum:
direction = "Down"
else:
lastLocNum = int(locNum)
location = locNum+"-"+direction
clue, answer = re.split('\s\:\s+(?=[A-Z\‘\-\.\…]+)',d[dotPos+1:])
clues[direction].append({
"location": location,
"clue": clue.strip('[ \-\.]'),
"answer": re.sub('['+string.punctuation+']','',answer)
})
except:
try:
clue, answer = re.split('\s+(?=[A-Z]{2,})',d[dotPos+1:],1)
clues[direction].append({
"location": location,
"clue": clue.strip('[ \-\.]'),
"answer": re.sub('['+string.punctuation+']','',answer)
})
except:
print((url, d))
errors.append((url, d))
return clues

## 4. Generate Candidates

In [61]:
import spacy
nlp = spacy.load('en')

#### sortVocab ####
* IN
    * maxlen (int): longest word we want to store
* OUT
    * sortedvocab (dict): NLTK vocabulary list sorted by length
    
#### cosine ####
* IN
    * v1 (list): vector
    * v2 (list): vector
* OUT
    * ???

In [77]:
def sortVocab(maxlen):
    sortedvocab = {}
    keys = []
    for i in [w for w in nlp.vocab if w.has_vector and w.orth_.islower() and len(w.orth_) <= maxlen]:
        k = len(i.orth_)
        if k not in keys:
            sortedvocab[k] = []
            keys.append(k)
        sortedvocab[k].append(i)
    return sortedvocab

cosine = lambda v1, v2: np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

#### getKnowledgeGraphCandidates ####
Uses Google Knowledge Graph Search API to return articles
* IN
    * clue (str): clue to pass as API query
    * length (int): word length to filter response
* OUT
    * (set) words from API search of correct length

#### getTokensForKGSearch
* IN
	* clue (str): clue to get pos_
	* properonly (bool): include only proper nouns
* OUT
    * Tokenize clue return words with specific POS

#### getWikiCandidates ####
Uses Wikipedia API to return search results
* IN
    * clue (str): clue to pass as API search query
    * length (int): word length to filter response
* OUT
    * (set) words from API search of correct length

#### getWordnetCandidates ####
* IN
    * clue (str): one-word clue for which to find syno-,hypo-,hyper-nyms. 
    * length (int): length of expected answer
* OUT
    * Uses NLTK WordNet
        
#### getSpacyCandidates ####
* IN
    * clue (list): tokenized list of words whose vectors to sum
    * length (int): length of expected answer
    * ret_count (int): number of candidates to return
* OUT
    * Uses spaCy word vectors

In [192]:
def getKnowledgeGraphCandidates(clue,length):
    api_key = 'AIzaSyCz3EetlDMLlyU7LLWUH2n1U7mhUfqyxRk'
    service_url = 'https://kgsearch.googleapis.com/v1/entities:search'
    params = {
            'query': clue,
            'limit': 10,
            'indent': True,
            'key': api_key,
    }
    url = service_url + '?' + urllib.parse.urlencode(params)
    candidates = set()
    response = json.loads(urllib.request.urlopen(url).read())
    for element in response['itemListElement']:
        if element["result"].get("detailedDescription",{}).get("articleBody",None) is not None:
            ## TODO use helper function removePunctAndTokenize
            article = re.split(", |\. |\s",element["result"]["detailedDescription"]["articleBody"])
            candidates.update(set(filter(lambda x: len(x) == length,article)))
    return candidates

def getTokensForKGSearch(clue,properonly = True):
    clue = clue[:1].lower() + clue[1:]
    tokens = nltk.pos_tag(nltk.word_tokenize(clue))
    lookup = ["NNP","NNPS"] if properonly else ["NNP","NNPS","NN","NNS"]
    ret = " ".join([i for (i,j) in tokens if j in lookup])
    return ret

def getWikiCandidates(clue,length):
    service_url = 'https://en.wikipedia.org/w/api.php?action=query&list=search&format=json'
    search_string = urllib.parse.urlencode({"srsearch":clue})
    url = service_url+'&'+search_string
    response = json.loads(urllib.request.urlopen(url).read())
    candidates = set()
    for entry in response['query']['search']:
        text = removePunctAndTokenize(entry['title'])
        candidates.update(set(matchLengthNoNumbers(text,length)))
        text = removePunctAndTokenize(entry['snippet'])
        candidates.update(set(matchLengthNoNumbers(text,length)))
    return candidates

def getWordnetCandidates(clue,length):
    candidates = set()
    clue = re.sub('['+string.punctuation+']','',clue.lower())
    if clue is not wn.morphy(clue):
        morphedclue = wn.morphy(clue)
        morph = clue.replace(morphedclue,'')
        clue = morphedclue
    synsets = wn.synsets(clue)
    candidates.update({y for x in synsets for y in x.lemma_names() if len(y) == length})
    if 'morph' in locals():
        candidates.update({y+morph for x in synsets for y in x.lemma_names() if len(y) == length - len(morph)})
    for syn in synsets:
        if syn.hyponyms():
            for hyposet in syn.hyponyms():
                candidates.update({lemma.name() for lemma in hyposet.lemmas() if len(lemma.name()) == length})
        if syn.hypernyms():
            for hyperset in syn.hypernyms():
                candidates.update({lemma.name() for lemma in hyperset.lemmas() if len(lemma.name()) == length})
    return candidates

def getSpacyCandidates(clue,length,vocab,ret_count):
    vecs = [x.vector for x in clue]
    vecsum = functools.reduce(lambda x,y: np.add(x,y),vecs)
    vocab = [w for w in vocab if w not in clue]
    vocab.sort(key=lambda w: cosine(w.vector, vecsum))
    return {w.orth_ for w in vocab[-1*ret_count:]}

def removePunctAndTokenize(text):
    text = re.sub(":|\(|\)","",text)
    return re.split(", |\. |\s",text)

def matchLengthNoNumbers(text,length):
    return filter(lambda x: len(x) == length and re.search('[0-9]',x) is None, text)

#### getCandidates ####
Takes array of clues and appends candidate answers

In [176]:
def getCandidates(clues):
    for i,v in enumerate(clues):
        clue = v['clue']
        length = v['length']
        print(clue,length)
        if clue.find(' ') == -1:
            v["cand_wn"] = getWordnetCandidates(clue,length)
            v["cand_wn"] = list(v["cand_wn"])
#             print('Wordnet: ',v["cand_wn"])
        elif "___" in clue:
            v["cand_wk"] = set()
            v["cand_wk"].update(getWikiCandidates(re.sub("___","",clue),length))
            v["cand_wk"] = list(v["cand_wk"])
        elif re.search('([0-9]+)(\-)(Across|Down)',clue) is not None:
            print("x-A/y-D",clue)
        else:
            formulations = []
            formulations.append([nlp.vocab[x.lower_] for x in nlp(clue) if x.pos_ == "NOUN" or x.pos_ == "PROPN"])
            formulations.append([nlp.vocab[x] for x in clue.split() if x not in stopwords.words('english')])
            if len(formulations[0]) == 0 and len(formulations[1]) == 0:
                formulations = list([nlp.vocab[x.lower_] for x in nlp(clue) if x.pos_ is not "PART"])
            v["cand_vec"] = set()
            for clue_tokens in formulations:
                if len(clue_tokens) != 0:
                    v["cand_vec"].update(getSpacyCandidates(clue_tokens,length,vocab[length],5))
#             print('Word Vectors: ', v["cand_vec"])

            v["cand_kg"] = set()
            clue_tokens = getTokensForKGSearch(clue,True)
            if clue_tokens != '':
                v["cand_kg"].update(getKnowledgeGraphCandidates(clue_tokens,length))
            clue_tokens = getTokensForKGSearch(clue,False)
            if clue_tokens != '':
                v["cand_kg"].update(getKnowledgeGraphCandidates(clue_tokens,length))
#             print('Knowledge Graph: ', v["cand_kg"])

            v["cand_wk"] = set()
            v["cand_wk"].update(getWikiCandidates(clue,length))

            v["cand_vec"] = list(v["cand_vec"])
            v["cand_kg"] = list(v["cand_kg"])
            v["cand_wk"] = list(v["cand_wk"])
        clues[i] = v
    return clues

#### loadPuzzle ####
* IN
	* fileloc (str): path to JSON file
* OUT
    * writes file with clues+candidates appended

In [80]:
def loadPuzzle(fileloc):
    with open(fileloc,'r') as fr:
        puzzle = json.loads(fr.read())[0]
    puzzle['clues'] = getCandidates(puzzle['clues'])
    if input("print to file? y/n: ") == 'y':
        with open(fileloc[:-5]+"_cands.json",'w') as fw:
            fw.write(json.dumps(puzzle,indent=1))

In [81]:
vocab = sortVocab(15)

In [177]:
loadPuzzle('./data/merge_0102-17.json')

Leatherworker's tool 3
Wrath 3
Sauce often used in a Bloody Mary 7
Port-au-Prince resident 7
"Um-hmm, O.K." 7




Call from a football referee 7
"Please! Anything but!" 7
Onion relative used in soups 4
Little troublemakers 4
Charged particles 4
M.R.I. orderers 3
Versatile bean 4
Texas site of a 1993 siege 4
Itsy-bitsy branch 4
Some DVD players 4
Caustic agent 3
Japan's largest company by revenue 6
Crops used in making cigarettes 8
Ready, willing and ___ 4
Classic Eric Clapton song about unrequited love 5
Statutes 4
Loses one's hair 8
Hold back, as a yawn 6
Moment, informally 3
World's fair, e.g 4
Wish 4
Like the water in a baptism 4
Get bent out of shape 4
___ talks (lecture series) 3
Busy time at the drive-thru 4
Nay voter 4
It's made up of DNA 4
Message that might end "R.I.P." 7
Invaded in large numbers 7
17-year insects 7
Standards by which things are measured 7
Follows, as a schedule 7
Monterrey Mrs 3
Consumed 3
Get ___ of (grasp) 5
Communion tidbit 5
Vegetarianism or bohemianism 15
Three on a grandfather clock 3
Source of faraway X-rays 11
Foe 5
Pantry containers 4
Long, long ___ 3
Web crawle

## 5. Match Candidates

### Run some analytics on my candidates

In [185]:
import pandas as pd
import numpy as np
import json
import re

cand_methods = ['cand_vec','cand_kg','cand_wn','cand_wk']

def getHitCount(clues):
    success = []
    tally = []
    for i in clues:
        score = [0 for x in cand_methods]
        add = False
        for index,j in enumerate(cand_methods):
            if i.get(j) and re.sub(" ","",i['answer'].lower()) in [x.lower() for x in i[j]]:
#                 print(i['answer'],i[j],j)
                add = True
                score[index] = 1
        if add:
            success.append(i)
            tally.append(score)
    print(len(success),len(clues),len(success)/len(clues))
    return success,tally

### What is my hit rate?

In [186]:
puzzle = json.load(open('./data/merge_0102-17_cands.json','r'))
clues = puzzle['clues']
success,tally = getHitCount(clues)

27 74 0.36486486486486486


### Which methods did the hits come from?

In [187]:
tally = pd.DataFrame(tally,index=map(lambda x: x['answer'],success),columns=cand_methods)
print(tally)

         cand_vec  cand_kg  cand_wn  cand_wk
AWL             1        0        0        1
IRE             0        0        1        0
TABASCO         0        1        0        1
HAITIAN         1        0        0        1
LEEK            0        0        0        1
IONS            1        1        0        0
SOYA            0        1        0        0
WACO            0        0        0        1
TWIG            1        0        0        0
LYE             1        0        0        1
ABLE            0        0        0        1
LAYLA           0        0        0        1
HOPE            0        0        1        0
HOLY            1        0        0        1
GENE            1        0        0        0
CICADAS         1        1        0        1
ENEMY           0        0        1        0
AGO             0        0        0        1
BOT             1        0        0        1
CHAN            0        1        0        1
OATS            0        0        0        1
OBOE      

### What do these hits actually look like?

In [93]:
pd.set_option("display.max_colwidth",200)
df = pd.DataFrame(success)
print(df.T)

                                                                                        0   \
answer                                                                                 AWL   
cand_kg                                                                                 []   
cand_vec                                [use, app, can, nwn, awl, key, cnc, jig, web, bot]   
cand_wk   [use, The, and, all, now, Pin, awl, its, for, Its, Ann, are, can, Age, the, his]   
cand_wn                                                                                NaN   
clue                                                                  Leatherworker's tool   
coords                                                            [[0, 0], [0, 1], [0, 2]]   
length                                                                                   3   
location                                                                          1-Across   

                                1   \
answer               

### Cross-Reference the Candidates

In [190]:
def getCoordsLookup(puzzle):
    clues = puzzle['clues']
    clues = concatCands(clues)
    grid = puzzle['grid']
    lookup = dict()
    for i,row in enumerate(grid):
        for j,cell in enumerate(row):
            if cell != "  ":
                entry = dict()
                relevant_clues = [x for x in clues if [i,j] in x['coords']]
                for clue in relevant_clues:
                    position = clue['coords'].index([i,j])
                    ## TODO punctuation check should happen upstream
                    cands = [(x.lower()[position],x.lower()) for x in clue['cands'] if x.lower()[position] not in string.punctuation]
                    for cand in cands:
                        if not entry.get(cand[0]):
                            entry[cand[0]] = {"Across":[],"Down":[]}
                        entry[cand[0]][clue['location'].split("-")[1]].append(cand[1])
                lookup[str(i)+"_"+str(j)] = entry
    return lookup

def concatCands(clues):
    for index,clue in enumerate(clues):
        clue['cands'] = set()
        for i in cand_methods:
            if clue.get(i):
                clue['cands'].update(clue[i])
        clue['cands'] = list(clue['cands'])
        clues[index] = clue
    return clues

### In each cell, what letters appear for candidates in both directions?

In [191]:
coordslookup = getCoordsLookup(puzzle)
for k,v in coordslookup.items():
    print(k)
    print(pd.DataFrame(v).T)

0_0
                                Across                          Down
a  [app, and, all, awl, ann, are, age]                [audio, about]
b                                [bot]                       [back?]
c                           [can, cnc]                [chart, coins]
d                                   []                       [drama]
e                                   []                       [every]
f                                [for]                       [field]
g                                   []                [grabs, grasp]
h                                [his]                       [hymie]
i                           [its, its]                            []
j                                [jig]                            []
k                                [key]                       [krazy]
l                                   []                       [least]
m                                   []                [mouse, mouse]
n                           [n