# Crossword Solver Project

In [1]:
## 1
from lxml import html
from utils import *
import requests
import calendar
import json
import string
import re

## 2
from __future__ import print_function
from PIL import Image
#brew install tesseract
import pytesseract
import sys

## 3

## 4
import spacy
nlp = spacy.load('en')

import urllib
import functools
import itertools
import numpy as np
import nltk
#nltk.download()
from nltk.stem.porter import *
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
stemmer = PorterStemmer()

from PyDictionary import PyDictionary
dictionary=PyDictionary()

## 5
import pandas as pd

## 1. Scrape

#### zero #### 
(utility function)
* IN
* OUT

#### scrape ####
* IN
* OUT

#### retreive ####
* IN
    * url (str): URL of the page at www.nytimescrossword.com to scrape
* OUT 
    * clues (dict): props Across and Down are lists of dicts with props location, clue, answer

In [2]:
def zero(n):
    if len(str(n)) == 1:
        return "0"+str(n)
    else:
        return str(n)

def stripPunct(word):
    return re.sub('\W','',word)
    
def file_date(year,month,day):
    return zero(month)+zero(day)+"-"+year[2:]
    
def url_builder(year,month,day):
#     return "http://www.nytcrossword.com/"+year+"/"+zero(month)+"/"+file_date(year,month,day)+"-new-york-times-crossword.html"
    return "http://www.nytcrossword.com/"+year+"/"+zero(month)+"/"+file_date(year,month,day)+"-ny-times-crossword-answers"
    
def scrape(year,month,**kwargs):
    print('stage 1: scraping clues and answers')
    collection = []
    errors = list()
    for i in month:
        day = kwargs['day'] if 'day' in kwargs else calendar.monthrange(int(year),i)[1]
        for j in range(day,day+1):
            url = url_builder(year, i, j)
            collection.append({
                "date": year+"-"+zero(i)+'-'+zero(j),
                "clues": retrieve(url)
            })

    errorfile = open('./errors.txt','w')
    errorfile.write('\n'.join(map(lambda x: x[0]+";"+x[1],errors)))
    errorfile.close()

    datafile = open('./data/data_'+year+'.json','w')
    datafile.write(json.dumps(collection,indent=1))
    datafile.close()

def retrieve(url):
    page = requests.get(url)
    tree = html.fromstring(page.content)
    #raw = [x for x in tree.xpath('//i/text()') if x != '\n']
    raw = [x for x in tree.xpath('//h2[@id="all-clues"]/following-sibling::p[@class="no_bottom_margin"]/em/text()')]
    clues = {
        "Across": list(),
        "Down": list()
    }
    direction = "Across"
    lastLocNum = 0
    for d in raw:
        try:
            dotPos = d.index(".")
            locNum = d[1:dotPos]
            if direction == "Across":
                if int(locNum) < lastLocNum:
                    direction = "Down"
                else:
                    lastLocNum = int(locNum)
            location = locNum+"-"+direction
            clue, answer = re.split('\s\:\s+(?=[A-Z\‘\-\.\…]+)',d[dotPos+1:])
            clues[direction].append({
                "location": location,
                "clue": re.sub(u"(\u2018|\u2019|\u201c|\u201d)","'",clue.strip('[ \-\.]')),
                "answer": re.sub('['+string.punctuation+',\\t]','',answer)
            })
        except:
            try:
                clue, answer = re.split('\s+(?=[A-Z]{2,})',d[dotPos+1:],1)
                clues[direction].append({
                    "location": location,
                    "clue": re.sub(u"(\u2018|\u2019|\u201c|\u201d)","'",clue.strip('[ \-\.]')),
                    "answer": re.sub('['+string.punctuation+',\\t]','',answer)
                })
            except:
                print((url, d))
                errors.append((url, d))
    return clues

## 2. Process Image

#### saveImage ####
* IN
* OUT

#### parseAndPrintImage ####
* IN
* OUT

#### saveMultipleImages ####
* IN
* OUT

In [3]:
def saveImage(url):
    page = requests.get(url)
    tree = html.fromstring(page.content)
    path = '//a[@href="#top"]/img/@src' #'+url+'
    imgUrl = tree.xpath(path)[0]
    with open('./images/'+url[36:43]+'.png','wb') as file:
        img = requests.get(imgUrl,stream=True)
        for chunk in img:
            file.write(chunk)

def parseAndPrintImage(year,month,day):
    print('stage 2: scanning and rendering grid images')
    nSquares = 21 if calendar.weekday(int(year), month, day) == 6 else 15
    imgLoc = './images/'+file_date(year,month,day)+'.png'
    im = Image.open(imgLoc)
    im = Image.composite(im, Image.new('RGB', im.size, 'white'), im)
    pps = im.size[0]/nSquares
    # im.show()
    for i in range(nSquares):
        row = list()
        for j in range(nSquares):
            boxOuter = (j*pps,i*pps,j*pps+pps,i*pps+pps)
            tile = im.crop(boxOuter)
            #tile.save('./images/tile_'+str(i)+'_'+str(j)+'.jpg')
            if not tile.getbbox():
                tileText = " "
            else:
                boxInner = (7,9,27,27)
                tileInner = tile.crop(boxInner)
                tileText = pytesseract.image_to_string(tileInner,config="-psm 10 -l eng -c tessedit_char_whitelist="+string.ascii_uppercase)
                ## this is hacky - find a better way
                tileText = "I" if tileText == "" else tileText
            row.append(tileText)
        print(" ".join(row))

def saveMultipleImages(year,month,**kwargs):
    print('stage 2: downloading and saving grid images')
    for i in month:
        day = kwargs['day'] if 'day' in kwargs else calendar.monthrange(int(year),i)[1]
        for j in range(day,day+1):
            url = url_builder(year, i, j)
            saveImage(url)


## 3. Format Grid

#### Clue ####
* METHODS

#### getBlackSquares ####
* IN
* OUT

#### getNewClueWithLength ####
* IN
* OUT

#### placeClues ####
* IN
* OUT

#### mergeWithClues ####
* IN
* OUT

In [4]:
class Clue(object):
    def __init__(self,direction,locNum,coords=(0,0),length=0):
        self.location = str(locNum)+"-"+direction
        self.length = length
        self.coords = self.getcoords(coords,length,direction)
        self.answer = ""
    def getcoords(self,start,length,direction):
        coords = [start]
        if direction == "Across":
            for j in range(1,length):
                coords.append((start[0],start[1]+j))
        else:
            for i in range(1,length):
                coords.append((start[0]+i,start[1]))
        return coords
    def __repr__(self):
        return str({'location': self.location, 'coords': self.coords, 'length': self.length})

def getBlackSquares(imgLoc,nSquares):
    im = Image.open(imgLoc)
    im = Image.composite(im, Image.new('RGB', im.size, 'white'), im)
    pps = im.size[0]/nSquares
    return [(i,j) for i in range(nSquares) for j in range(nSquares) if not im.crop((j*pps,i*pps,j*pps+pps,i*pps+pps)).getbbox()]

def getNewClueWithLength(i,j,n,blackSquares,direction,nSquares):
    if direction == "Across":
        nextBlack = [q for (p,q) in blackSquares if i == p]
        len = (nextBlack[0] if nextBlack else nSquares) - j
    elif direction == "Down":
        nextBlack = [p for (p,q) in blackSquares if j == q]
        len = (nextBlack[0] if nextBlack else nSquares) - i
    return Clue(direction,n,(i,j),len)

def placeClues(blackSquares,nSquares):
    black = "  "
    n = 1
    grid = list()
    clues = {
        "Across": list(),
        "Down": list()
    }
    for i in range(nSquares):
        grid.append(list())
        for j in range(nSquares):
            if len(blackSquares) > 0 and blackSquares[0] == (i,j):
                grid[i].append(black)
                blackSquares.pop(0)
            else:
                if i == 0:
                    if j == 0 or grid[i][j-1] == black:
                        clues["Across"].append(getNewClueWithLength(i,j,n,blackSquares,"Across",nSquares))
                    grid[i].append(zero(n))
                    clues["Down"].append(getNewClueWithLength(i,j,n,blackSquares,"Down",nSquares))
                    n+=1
                elif j == 0:
                    if grid[i-1][j] == black:
                        clues["Down"].append(getNewClueWithLength(i,j,n,blackSquares,"Down",nSquares))
                    grid[i].append(zero(n))
                    clues["Across"].append(getNewClueWithLength(i,j,n,blackSquares,"Across",nSquares))                        
                    n+=1
                elif grid[i][j-1] == black:
                    grid[i].append(zero(n))
                    clues["Across"].append(getNewClueWithLength(i,j,n,blackSquares,"Across",nSquares))
                    n+=1
                elif grid[i-1][j] == black:
                    grid[i].append(zero(n))
                    clues["Down"].append(getNewClueWithLength(i,j,n,blackSquares,"Down",nSquares))
                    n+=1
                else:
                    grid[i].append("__")
        # print(" ".join(grid[i]))
    return clues, grid

def mergeWithClues(year,month,**kwargs):
    print('stage 3: merging clues with grid shape')
    with open('./data/data_'+year+'.json','r') as f:
        clues_from_text = f.read()
        clues_from_text = json.loads(clues_from_text)
    for i in month:
        day = kwargs['day'] if 'day' in kwargs else calendar.monthrange(int(year),i)[1]
        for j in [x for x in range(day,day+1)]:
            nSquares = 21 if calendar.weekday(int(year), i, j) == 6 else 15
            url_date = zero(i)+zero(j)+"-"+year[2:]
            cal_date = year+"-"+zero(i)+'-'+zero(j)
            date_clues_from_image, grid = placeClues(getBlackSquares('./images/'+url_date+'.png',nSquares),nSquares)
            date_clues_from_text = [x for x in clues_from_text if x["date"] == cal_date][0]['clues']
            merge = list()
            for dir in ("Across","Down"):
                merge += [{'location': a['location'], 'clue': a['clue'], 'answer': a['answer'], 'coords': b.coords, 'length': b.length} for a in date_clues_from_text[dir] for b in date_clues_from_image[dir] if a['location'] == b.location]
            data = [{"date": cal_date, "clues": merge, "grid": grid}]
            with open('./data/merge_'+url_date+'.json','w') as f:
                f.write(json.dumps(data,indent=1))

## 4. Generate Candidates

#### sortVocab ####
* IN
    * maxlen (int): longest word we want to store
* OUT
    * sortedvocab (dict): NLTK vocabulary list sorted by length
    
#### cosine ####
* IN
    * v1 (list): vector
    * v2 (list): vector
* OUT
    * ???

In [5]:
def sortVocab(maxlen):
    sortedvocab = {}
    keys = []
    for i in [w for w in nlp.vocab if w.has_vector and w.orth_ == re.sub('[^A-Za-z]','',w.orth_) and w.orth_.islower() and len(w.orth_) <= maxlen]:
        k = len(i.orth_)
        if k not in keys:
            sortedvocab[k] = []
            keys.append(k)
        sortedvocab[k].append(i)
    return sortedvocab

cosine = lambda v1, v2: np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

In [6]:
vocab = sortVocab(15)

#### getKnowledgeGraphCandidates ####
Uses Google Knowledge Graph Search API to return articles
* IN
    * clue (str): clue to pass as API query
    * length (int): word length to filter response
* OUT
    * (set) words from API search of correct length

#### getTokensForKGSearch
* IN
	* clue (str): clue to get pos_
	* properonly (bool): include only proper nouns
* OUT
    * Tokenize clue return words with specific POS

#### getWikiCandidates ####
Uses Wikipedia API to return search results
* IN
    * clue (str): clue to pass as API search query
    * length (int): word length to filter response
* OUT
    * (set) words from API search of correct length

#### getWordnetCandidates ####
* IN
    * clue (str): one-word clue for which to find syno-,hypo-,hyper-nyms. 
    * length (int): length of expected answer
* OUT
    * Uses NLTK WordNet
        
#### getSpacyCandidates ####
* IN
    * clue (list): tokenized list of words whose vectors to sum
    * length (int): length of expected answer
    * ret_count (int): number of candidates to return
* OUT
    * Uses spaCy word vectors

In [192]:
def getKnowledgeGraphCandidates(clue,length):
    api_key = 'AIzaSyCz3EetlDMLlyU7LLWUH2n1U7mhUfqyxRk'
    service_url = 'https://kgsearch.googleapis.com/v1/entities:search'
    params = {
            'query': clue,
            'limit': 10,
            'indent': True,
            'key': api_key,
    }
    url = service_url + '?' + urllib.parse.urlencode(params)
    candidates = set()
    response = json.loads(urllib.request.urlopen(url).read())
    for element in response['itemListElement']:
        if element["result"].get("detailedDescription",{}).get("articleBody",None) is not None:
            text = removePunct(element["result"]["detailedDescription"]["articleBody"])
            candidates.update(matchLength(text,length))
        if element["result"].get("name") is not None:
            text = removePunct(element["result"]["name"])
            candidates.update(
                matchLength(
                    [text[i]+text[i+1] for (i,v) in enumerate(element['result']['name']) if i < len(text)-1],
                    length
                )
            )
    return candidates

def getTokensForKGSearch(clue):
    clue = " ".join(removePunct(clue[:1].lower() + clue[1:]))
    tokens = nltk.pos_tag(nltk.word_tokenize(clue))
    lookups = [["NNP","NNPS"],["NNP","NNPS","NN","NNS"],["NNP","NNPS","NN","NNS","ADJ","ADV","RB","CD"]]
    return [ [i for (i,j) in tokens if j in lookup] for lookup in lookups ]

def getWikiRawResponse(clue):
    service_url = 'https://en.wikipedia.org/w/api.php?action=query&list=search&prop=extracts&format=json&srprop=snippet'
    search_string = urllib.parse.urlencode({"srsearch":clue})
    url = service_url+'&'+search_string
    return json.loads(urllib.request.urlopen(url).read())

def getWikiCandidates(clue,length):
    candidates = set()
    response = getWikiRawResponse(clue)
    if len(response['query']['search']) == 0 and response['query']['searchinfo'].get('suggestion'):
        response = getWikiRawResponse(re.findall('<em>(.*?)</em>',response['query']['searchinfo']['suggestionsnippet']))
    for entry in response['query']['search']:
        text = removePunct(entry['title'])
        candidates.update(matchLength(text,length))
        candidates.update(
            matchLength([text[i]+text[i+1] for (i,v) in enumerate(text) if i < len(text)-1],length)
        )
        text = removePunct(entry['snippet'])
        candidates.update(matchLength(text,length))
        candidates.update({x+'s' for x in matchLength(text,length-1)})
    return candidates

def noSpace(word):
    return re.sub('_','',word)

def getWordnetCandidates(clue,length):
    candidates = set()
#     clue = re.sub('['+string.punctuation+']','',clue.lower())
    if clue is not wn.morphy(clue) and wn.morphy(clue) is not None:
        morphedclue = wn.morphy(clue)
        morph = clue.replace(morphedclue,'')
        clue = morphedclue
    synsets = wn.synsets(clue)
    candidates.update({y for x in synsets for y in x.lemma_names() if len(y) == length})
    if 'morph' in locals():
        candidates.update({y+morph for x in synsets for y in x.lemma_names() if len(y) == length - len(morph)})
    for syn in synsets:
        candidates.update(searchWordnetSpace(syn, length, candidates))
    return candidates

def searchWordnetSpace(synset, length, candidates=set(), iteration=0):
    if synset.lemma_names():
        candidates.update({ 
            noSpace(lemmaname) for lemmaname in synset.lemma_names() if noSpace(lemmaname) == length 
        })
    for attr in ['root_hypernyms','member_holonyms','hyponyms','hypernyms']:
        if getattr(synset,attr)():
            for nym in getattr(synset,attr)():
                candidates.update({ 
                    noSpace(lemma.name()) for lemma in nym.lemmas() if len(noSpace(lemma.name())) == length
                })
                if iteration < 3:
                    candidates.update(searchWordnetSpace(nym,length,candidates,iteration+1))
    return candidates

def getSpacyCandidates(clue,length,vocab,ret_count):
    vecs = [x.vector for x in clue]
    vecsum = functools.reduce(lambda x,y: np.add(x,y),vecs)
    vocab = [w for w in vocab if w not in clue and re.search("\'", w.lower_)==None]
    vocab.sort(key=lambda w: cosine(w.vector, vecsum))
#     saveVectors(clue,vecsum,vocab[-25:])
    return {w.orth_.lower() for w in vocab[-1*ret_count:]}

def getSpacyFormulations(clue):
    formulations = []
    formulations.append([nlp.vocab[x] for x in clue.split() if x not in stopwords.words('english')])
    ### ADD NLTK POS TAGGING nltk.pos_tag(nltk.word_tokenize(clue))
#     formulations.append([nlp.vocab[x.lower_] for x in nltk.])
    formulations.append([nlp.vocab[x.lower_] for x in nlp(clue) if x.pos_ == "NOUN" or x.pos_ == "PROPN"])
    formulations.append([nlp.vocab[x.lower_] for x in nlp(clue) if x.pos_ in ["NOUN","PROPN","ADJ","ADV"] ])
    formulations.append([nlp.vocab[x.lower_] for x in nlp(clue) if x.pos_ is not "PART"])
    formulations.append([nlp.vocab[x.lemma_] for x in nlp(clue.lower()) if x.tag_ in ["NNS","NNPS","NN","NNP"]])
    if clue.find(',') != -1:
        formulations += getSpacyFormulations(clue[:clue.find(',')])
    if clue.find(' (') != -1:
        formulations += getSpacyFormulations(clue[:clue.find(' (')])
    return formulations

def getSpacySimilar(word,length):
    by_similarity = sorted(filter(lambda x: len(x.lower_) == length, word.vocab), key=lambda w: word.similarity(w), reverse=True)
    return {w.orth_.lower() for w in by_similarity[:10]}

def getDictionaryCandidates(clue, length):
    cands = []
    syns1 = dictionary.synonym(clue)
    if syns1 is not None:
        for syn in syns1:
            if syn.find(' ') != -1:
                syn = re.sub(' ','',syn)
            cands += [syn]
            syns2 = dictionary.synonym(syn)
            cands += syns2 if syns2 is not None else []
        return filter(lambda x: len(x) == length, set(cands))
    else:
        return set()

def saveVectors(clue,vecsum,vocab):
    clue = "_".join([x.lower_ for x in clue])
    data = list()
    data.append({"word": clue, "vector": list(map(lambda x: x.item(),vecsum))})
    for i in vocab:
        data.append({"word": i.lower_, "vector": list(map(lambda x: x.item(),i.vector))})
    with open('./vector_data/'+clue+'.json','w') as f:
        f.write(json.dumps(data))

def removePunct(text):
    text = re.sub('[é,ê,è]','e',text)
    text = re.sub('[ø,ö]','o',text)
    text = re.sub('[ñ]','n',text)
    text = re.sub('[ü]','u',text)
    text = re.sub('[à,ã,á,â,å]','a',text)
    text = re.sub('[ç]','c',text)
    text = re.split('[^a-zA-Z]+',text)
    return text

def matchLength(text,length):
    return filter(lambda x: len(x) == length, { x.lower() for x in text })

#### getCandidates ####
Takes array of clues and appends candidate answers

In [194]:
def getCandidates(clues):
    for i,v in enumerate(clues):
        clue = v['clue']
        length = v['length']
        v["cand_wk"] = set()
        v["cand_vec"] = set()
        v["cand_kg"] = set()
        v["cand_wn"] = set()
        v["cand_dct"] = set()
        print(clue,length)
        if re.search('([0-9]+)(\-)(Across|Down)',clue) is not None:
            print('x-Across/y-Down')
        elif clue.find(' ') == -1:
#             print('single word')
            
            v["cand_dct"].update({x+'s' for x in getDictionaryCandidates(stemmer.stem(clue),length-1)})
            v["cand_dct"].update(getDictionaryCandidates(clue,length))
            
            v["cand_wn"].update({ x+'s' for x in getWordnetCandidates(stemmer.stem(clue),length-1)})
            v["cand_wn"].update(getWordnetCandidates(clue,length))
            
            v["cand_vec"].update(getSpacyCandidates([nlp.vocab[clue]],length,vocab[length],5))
            v["cand_vec"].update(getSpacySimilar(nlp.vocab[clue],length))
            
        elif "___" in clue:
#             print('space')
            
            v["cand_wk"].update(getWikiCandidates(re.sub("___","",clue),length))
            
            clue_tokens = [nlp.vocab[stripPunct(x)] for x in clue.split() if x != "___"]
            v["cand_vec"].update(getSpacyCandidates(clue_tokens,length,vocab[length],5))
            
            v["cand_kg"].update(getKnowledgeGraphCandidates(re.sub("___","",clue),length))
            
        else:
#             print('default')
            formulations = getSpacyFormulations(clue)
            for clue_tokens in formulations:
#                 print([x.lower_ for x in clue_tokens])
                if len(clue_tokens) != 0:
                    v["cand_vec"].update(
                        {x+'s' for x in getSpacyCandidates([nlp.vocab[stemmer.stem(ct.lower_)] for ct in clue_tokens],length,vocab[length-1],10)}
                    )
                    v["cand_vec"].update(getSpacyCandidates(clue_tokens,length,vocab[length],10))

            v["cand_wk"].update(getWikiCandidates(clue,length))
            for clue_tokens in getTokensForKGSearch(clue):
                if len(clue_tokens) > 0:
                    v["cand_kg"].update(getKnowledgeGraphCandidates(clue_tokens,length))
                    v["cand_wk"].update(getWikiCandidates(clue_tokens,length))
            
            ents = nlp(clue).ents
            if len(ents) > 0:
                for ent in nlp(clue).ents:
                    v["cand_wk"].update(getWikiCandidates(ent,length))
                    v["cand_kg"].update(getKnowledgeGraphCandidates(ent,length))

            if clue.find(',') and clue[:clue.find(',')].find(' ') == -1:
                v["cand_dct"].update(getDictionaryCandidates(clue[:clue.find(',')],length))
                v["cand_wn"].update({ x+'s' for x in getWordnetCandidates(stemmer.stem(clue[:clue.find(',')]),length-1)})
                v["cand_wn"].update(getWordnetCandidates(clue[:clue.find(',')],length))
            if clue.find(' (') and clue[:clue.find(' (')].find(' ') == -1:
                v["cand_dct"].update(getDictionaryCandidates(clue[:clue.find(' (')],length))
                v["cand_wn"].update({ x+'s' for x in getWordnetCandidates(stemmer.stem(clue[:clue.find(' (')]),length-1)})
                v["cand_wn"].update(getWordnetCandidates(clue[:clue.find(' (')],length))
            
        v["cand_vec"] = list(v["cand_vec"])
        v["cand_kg"] = list(v["cand_kg"])
        v["cand_wk"] = list(v["cand_wk"])
        v["cand_wn"] = list(v["cand_wn"])
        v["cand_dct"] = list(v["cand_dct"])
#         print('Wiki: ',v["cand_wk"])
#         print('Wordnet: ',v["cand_wn"])
#         print('Word Vectors: ', v["cand_vec"])
#         print('Knowledge Graph: ', v["cand_kg"])
#         print('Dictionary: ', v["cand_dct"])
        clues[i] = v
    return clues

In [193]:
with open('./data/merge_1002-17.json','r') as fr:
    puzzle = json.loads(fr.read())[0]
clues = getCandidates([puzzle['clues'][14]])
print(clues[0]['clue'], clues[0]['answer'])
for method in cand_methods:
    print(method, clues[0]['answer'].lower() in clues[0][method])#, clues[0][method]

Hay storage areas 5
Hay storage areas LOFTS
cand_kg False
cand_wk False
cand_vec False
cand_wn False
cand_dct False


#### loadPuzzle ####
* IN
	* fileloc (str): path to JSON file
* OUT
    * writes file with clues+candidates appended

In [9]:
def loadPuzzle(year,month,day):
    print("stage 4: generating candidates")
    with open('./data/merge_'+file_date(year,month,day)+'.json','r') as fr:
        puzzle = json.loads(fr.read())[0]
    puzzle['clues'] = getCandidates(puzzle['clues'])
    with open('./data/merge_'+file_date(year,month,day)+"_cands.json",'w') as fw:
        fw.write(json.dumps(puzzle,indent=1))

## 5. Match Candidates

### Run some analytics on my candidates

In [34]:
cand_methods = ['cand_kg','cand_wk','cand_vec','cand_wn','cand_dct']

def getHitCount(clues):
    success = []
    tally = []
    for i in clues:
        score = [0 for x in cand_methods]
        add = False
        for index,j in enumerate(cand_methods):
            if i.get(j) and re.sub(" ","",i['answer'].lower()) in [x.lower() for x in i[j]]:
                add = True
                score[index] = 1
        if add:
            success.append(i)
            tally.append(score)
    print(len(success),len(clues),len(success)/len(clues))
    return success,tally

### What is my hit rate?

In [11]:
def controller(**kwargs):
    year = kwargs['year'] if 'year' in kwargs else '2017'
    month = range(kwargs['month'],kwargs['month']+1) if 'month' in kwargs else range(1,13)
    day = kwargs['day'] if 'day' in kwargs else 1
    scrape(year,month,day=day)
    saveMultipleImages(year,month,day=day)
    parseAndPrintImage(year,month[0],day)
    mergeWithClues(year,month,day=day)
    loadPuzzle(year,month[0],day)

In [31]:
controller(year='2017',month=10,day=2)
# loadPuzzle('2017',10,2)

stage 4: generating candidates
Salmon or sole 4


  if sys.path[0] == '':


Somersault 4




 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


Rough on the eyes or ears 5
'Fancy seeing you here!' 4
Ages and ages 4
The same 5
Chicken pen 4
A B C D E F G 10
Fitting 3
Merriment 3
Longtime Time magazine rival, briefly 6
A B C D F 12
Opposite of 'neath 3
Slippery 1-Across 3
x-Across/y-Down
Hay storage areas 5
Plan that's 'hatched' 6
Campbell's container 3
Narrow inlet 3
B C F H I K N O P S U V W Y 15
Have a bug, maybe 3
French affirmative 3
Stops 6
Bundles of hay 5
Strands in a cell? 3
Not the main choice: Abbr 3
G R X 12
Taj Mahal material 6
'My Country, ___ of Thee' 3
Glass of 'This American Life' 3
A B O 10
Jump in an ice rink 4
Cottage or cabin 5
___ of Sandwich 4
Gardening tools 4
Lecherous figure of myth 5
Eye affliction 4
Paul who sang 'Eso Beso' 4
Centrally located 5
'Fingers crossed!' 5
Utterly ruined, informally 10
Cool 3
Longest and strongest bone in the human body 5
Take it easy 6
Drop-___ (surprise visitors) 3
It comes between chi and omega 3
Fairy tale character who leaves a trail of bread crumbs in the forest 6
King

In [195]:
puzzle = json.load(open('./data/merge_1002-17_cands.json','r'))
clues = puzzle['clues']
success,tally = getHitCount(clues)

32 76 0.42105263157894735


### Which methods did the hits come from?

In [196]:
tally = pd.DataFrame(tally,index=map(lambda x: x['answer'],success),columns=cand_methods)
print(tally)

            cand_kg  cand_wk  cand_vec  cand_wn  cand_dct
FISH              0        1         0        0         0
FLIP              0        0         0        1         0
APT               0        0         0        0         1
FUN               0        0         0        1         1
SCHEME            0        0         1        0         0
CAN               0        1         0        0         0
CEASES            0        0         0        1         0
BALES             0        1         0        0         0
DNA               0        1         0        0         0
MARBLE            1        1         0        0         0
IRA               1        1         0        0         0
EARL              1        1         0        0         0
ANKA              1        1         0        0         0
FEMUR             0        1         0        0         0
INS               0        1         0        0         0
PSI               0        1         0        0         0
HANSEL        

### What do these hits actually look like?

In [278]:
pd.set_option("display.max_colwidth",200)
df = pd.DataFrame(success)
# print(df.T)

### Cross-Reference the Candidates

In [37]:
def getCoordsLookup(puzzle):
    clues = puzzle['clues']
    clues = concatCands(clues)
    grid = puzzle['grid']
    lookup = dict()
    for i,row in enumerate(grid):
        for j,cell in enumerate(row):
            if cell != "  ":
                entry = dict()
                relevant_clues = [x for x in clues if [i,j] in x['coords']]
                for clue in relevant_clues:
                    position = clue['coords'].index([i,j])
                    cands = [(x[position],x) for x in clue['cands']]
                    for cand in cands:
                        if not entry.get(cand[0]):
                            entry[cand[0]] = {"Across":[],"Down":[]}
                        entry[cand[0]][clue['location'].split("-")[1]].append(cand[1])
                lookup[str(i)+"_"+str(j)] = entry
    return lookup

def concatCands(clues):
    for index,clue in enumerate(clues):
        clue['cands'] = set()
        for i in cand_methods:
            if clue.get(i):
                clue['cands'].update(clue[i])
        clue['cands'] = [x.lower() for x in clue['cands']]
        clues[index] = clue
    return clues

In [38]:
def getIntersections(puzzle):
    clues = puzzle['clues']
    clues = concatCands(clues)
    grid = puzzle['grid']
    intersections = dict()
    for clue in clues:
        direction = "Down" if clue['location'].split("-")[1] == "Across" else "Across"
        intersections[clue['location']] = dict()
        for cand in clue['cands']:
            intersections[clue['location']][cand] = 0
            for i,coord in enumerate(clue['coords']):
#                 print(cand,i,coord)
                if len(coordslookup[str(coord[0])+"_"+str(coord[1])][cand[i]][direction]) > 0:
#                     print(coordslookup[str(coord[0])+"_"+str(coord[1])][cand[i]][direction])
                    intersections[clue['location']][cand] += 1
            intersections[clue['location']][cand] = round(intersections[clue['location']][cand]/clue['length'],2)
    return intersections

### In each cell, what letters appear for candidates in both directions?

In [40]:
coordslookup = getCoordsLookup(puzzle)
# print(coordslookup)
# for k,v in coordslookup.items():
#     print(k)
#     print(pd.DataFrame(v).T)
intersections = getIntersections(puzzle)
print(intersections)

{'1-Across': {'ones': 1.0, 'that': 1.0, 'they': 0.75, 'shad': 1.0, 'prob': 1.0, 'upon': 1.0, 'bait': 1.0, 'lost': 1.0, 'list': 1.0, 'bass': 1.0, 'time': 1.0, 'bute': 1.0, 'anys': 0.75, 'thus': 1.0, 'true': 1.0, 'will': 1.0, 'uses': 1.0, 'skin': 1.0, 'like': 0.75, 'itss': 1.0, 'legs': 0.75, 'nots': 1.0, 'oily': 0.75, 'john': 1.0, 'wide': 1.0, 'such': 1.0, 'sees': 1.0, 'them': 1.0, 'made': 1.0, 'mech': 1.0, 'both': 1.0, 'quot': 0.75, 'gank': 1.0, 'boot': 1.0, 'eels': 1.0, 'when': 1.0, 'even': 0.75, 'only': 0.75, 'owns': 1.0, 'ikke': 0.75, 'hiss': 1.0, 'sole': 1.0, 'airs': 1.0, 'shoe': 1.0, 'buts': 1.0, 'host': 1.0, 'ares': 1.0, 'each': 1.0, 'pens': 1.0, 'swim': 1.0, 'ages': 1.0, 'zerg': 0.75, 'also': 0.75, 'food': 1.0, 'full': 1.0, 'with': 1.0, 'duty': 0.75, 'pass': 1.0, 'cans': 1.0, 'days': 0.75, 'dues': 1.0, 'lvls': 0.75, 'same': 1.0, 'wass': 1.0, 'side': 1.0, 'near': 1.0, 'wife': 1.0, 'many': 0.75, 'some': 1.0, 'nors': 1.0, 'pvps': 0.75, 'fors': 1.0, 'plss': 1.0, 'form': 1.0, 'look': 