In [19]:
# usage: python similarity.py cmudict-0.7b-simvecs
# reads a single line of standard input, displays the most similar items
# found in cmudict-0.7b-simvecs
import sys
from annoy import AnnoyIndex

from scipy.spatial.distance import cosine
from sklearn import svm
import numpy as np

In [2]:
# given two words 'string' -> returns their phonetic similarity
def pcosine(w1, w2):
    w1, w2 = w1.upper(), w2.upper()
    return 1-cosine(lookup[w1], lookup[w2])

In [3]:
t = AnnoyIndex(50, metric='angular')
words = list()
lookup = dict()

print("loading...", file=sys.stderr)
for i, line in enumerate(open("../data/cmudict-0.7b-simvecs", encoding="latin1")):
    line = line.strip()
    word, vec_s = line.split("  ")
    vec = [float(n) for n in vec_s.split()]
    t.add_item(i, vec)
    lookup[word] = vec
    words.append(word)
t.build(50)
print("done.", file=sys.stderr)

loading...
done.


In [4]:
# number of words in phonetic dictionary
len(lookup)

133859

In [5]:
w = "graph".upper()
vec = lookup[w]
num_neigh = 5
[words[i] for i in t.get_nns_by_vector(vec, num_neigh)]

['GRAEF(1)', 'GRAEFE(1)', 'GRAEFF(1)', 'GRAF', 'GRAFF']

In [6]:
pcosine("he", "she")

0.4266593016705489

In [7]:
pcosine("cat","banter")

0.020586781508952656

# Train classifier to segregate easy/difficult words

In [10]:
easy = "cat,rat,mat,bat,chair,table,mathematics,science,apple,banana,laptop,shirt,fan,book,boy,he,female,male".split(",")
difficult = "Graph,Group,Trivedi,Green,Grand,Provost,Printer,Alaska,Close,Italy".split(",")

X, y = [], []
for w in easy:
    word = w.upper()
    if word in lookup:
        X.append(lookup[word])
        y.append(0)
        
for w in difficult:
    word = w.upper()
    if word in lookup:
        X.append(lookup[word])
        y.append(1)

In [35]:
clf = svm.SVC(probability=True)
clf.fit(X, y)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [None]:
pred = clf.predict_proba(list(lookup.values()))

In [47]:
res_words = np.array(list(lookup.keys()))[pred[:,1]>0.95]
res_words.shape

(1382,)

In [48]:
res_words[:1000]

array(['"CLOSE-QUOTE', '+PLUS', 'AARDVARKS', 'ACCUMULATIVELY(1)',
       'ACCUSINGLY', 'ACREE', 'ACUTELY', 'AGGRIEVE', 'AGREES', 'AGRUSA',
       'AHLGRIM', 'AIRCREW', 'ALASKA', 'ALEGRE', 'ALEKSANDER',
       'ALEKSANDR', 'ALEX', 'ALEXANDRA(1)', 'ALEXANDRINE', 'ALEXANDRINES',
       'ALEXS', 'ALKALI', 'ALLOGRAPH', 'AMBROSINE', 'AMBROSINI',
       'ANCHOVIES', 'ANCHOVIES(1)', 'ANDREY', 'ANDRY', 'ANGLOPHONE',
       'ANGLOPHONES', 'ANGLOS(1)', 'ANGRILY', 'APPRECIATIVELY(1)',
       'APPROVINGLY', 'ARCHITRAVE', 'ARDINE', 'ARGENBRIGHT', 'ARGUES',
       'ARMBRISTER(1)', 'ARMBRUSTER(1)', 'ARQUILLA', 'ATTRACTIVELY',
       'BELGRAVE', 'BLANDFORD', "BLANKLY'S", 'BLEAKNEY', 'BLITZKRIEG',
       'BLUEPRINT', 'BLUEPRINTS', 'BRACKNEY', 'BRADFIELD', 'BRADLEES',
       "BRADLEY'S", 'BRADLEYS', 'BRADNEY', 'BRAENDSTROEM', 'BRAINTREE',
       'BRAND', "BRAND'S", 'BRAND-NEW', 'BRANDE', 'BRANDER', 'BRANDES',
       'BRANDI', 'BRANDIS', 'BRANDISH', 'BRANDNER', 'BRANDNEW', 'BRANDS',
       "BRANDS'", "BRA