# Imports & functions

In [1]:
import spacy # after installing, be sure to run 'python -m spacy download en_core_web_lg'
import pickle
import os
import random
import re
from sklearn.metrics.pairwise import cosine_distances
import numpy as np

# Load the pipeline once at module level; build_matrix() and the vocabulary
# scan further down both read this shared `nlp` object.
nlp = spacy.load("en_core_web_lg")

In [2]:
def build_matrix(clue_list):
    """
    Embed every string in ``clue_list`` (clues or boardwords) with the
    module-level spaCy pipeline ``nlp`` and stack the document vectors.

    Returns a NumPy array with one row per input string, i.e. of
    dim {n_words, n_features}.
    """
    rows = []
    for parsed in nlp.pipe(clue_list):
        # One document vector per input string
        rows.append(parsed.vector)
    return np.array(rows)

## How ``cosine_distances`` works:
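Each row of the returned matrix corresponds to a sample (here: a word vector) in the X input and each column corresponds to a sample in the Y input; entry (i, j) is the cosine distance, i.e. 1 − cosine similarity, between X[i] and Y[j].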

In [3]:
# Two 1x5 row vectors that differ only in their first component
vec1 = np.array([[1, 1, 0, 1, 1]])
vec2 = np.array([[0, 1, 0, 1, 1]])

# X has one row and Y has one row, so the result is a 1x1 matrix
demo_distance = cosine_distances(vec1, vec2)
print(demo_distance)

[[0.1339746]]


# Building clue word distances from spacy words

### Getting all spaCy words and cleaning them (lower-case, remove duplicates, keep only alphanumeric words, optionally hyphenated)

In [4]:
# Lower-cased text of every key that has an entry in spaCy's vector table
all_spacy_words = [
    nlp.vocab.strings[entry[0]].lower()
    for entry in list(nlp.vocab.vectors.items())
]

# Collapse duplicates (lower-casing can map several keys onto one string).
# NOTE: going through a set means the resulting list order is arbitrary.
spacy_words_unique = list(set(all_spacy_words))

# Keep only words made of ASCII letters/digits, optionally joined by single hyphens
word_pattern = re.compile(r"^[a-zA-Z0-9]+(-[a-zA-Z0-9]+)*$")
spacy_words_unique_clean = []
for word in spacy_words_unique:
    if word_pattern.match(word):
        spacy_words_unique_clean.append(word)
len(spacy_words_unique_clean)

295030

In [5]:
# Peek at the first ten cleaned words (their order is arbitrary because the
# list was built from a set).
spacy_words_unique_clean[:10]

['smeagol',
 'rakia',
 'soha',
 'pleated',
 'asslickers',
 'three-station',
 'poise',
 'nausia',
 'milley',
 'facultad']

### Read all boardwords from ``wordlist.txt``

In [6]:
# wordlist.txt is looked up one directory above the current working directory
path = os.getcwd()
parent = os.path.dirname(path)
board_words_path = os.path.join(parent, 'wordlist.txt')

# One list entry per line of the file; splitlines() drops the line terminators
with open(board_words_path) as word_file:
    boardwords = word_file.read().splitlines()

### Use ``build_matrix`` with the word lists (clues & boardwords)

In [7]:
%%time
# Takes a few minutes (about 1min 40s wall time in the recorded run below).
# Each call embeds one word list via build_matrix; the two statements are kept
# separate so the first result stays bound even if the second call fails.
clues_matrix = build_matrix(spacy_words_unique_clean)
boardwords_matrix = build_matrix(boardwords)

CPU times: user 1min 40s, sys: 154 ms, total: 1min 40s
Wall time: 1min 40s


### Create ``clue_word_distances``:

In [8]:
# Bundle the lookup tables and the distance matrix in one dictionary:
#   'boardwords' : board word -> row index into 'distances'
#   'clue_words' : clue word  -> column index into 'distances'
#   'distances'  : cosine distances, boardwords along rows, clues along columns
clue_word_distances = {
    'boardwords': {word: row for row, word in enumerate(boardwords)},
    'clue_words': {word: col for col, word in enumerate(spacy_words_unique_clean)},
    'distances': cosine_distances(boardwords_matrix, clues_matrix),
}

In [9]:
# Persist the whole dictionary to clue_word_distances.pkl (relative to the
# current working directory) using the newest pickle protocol available.
with open('clue_word_distances.pkl', 'wb') as pickle_file:
    pickle.dump(clue_word_distances, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)