In [4]:
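# Phonetic-similarity experiments on pre-computed phonetic word embeddings.
# NOTE: the original header described a command-line script
#   (usage: python similarity.py cmudict-0.7b-simvecs, reading a single line of
#   standard input and displaying the most similar items found in cmudict-0.7b-simvecs).
# The cells below instead load the embeddings from a pickle file and do not read standard input.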
import sys
from scipy.spatial.distance import cosine
from sklearn import svm
import numpy as np
import pickle

In [5]:
def pcosine(w1, w2):
    """Return the phonetic similarity of two words.

    Both words are upper-cased and looked up in the module-level ``lookup``
    table; the result is ``1 - cosine_distance`` of their two vectors.

    Parameters
    ----------
    w1, w2 : str
        Words to compare (any letter case).

    Returns
    -------
    float
        One minus the cosine distance between the two embedding vectors.

    Raises
    ------
    KeyError
        If either upper-cased word is not a key of ``lookup``.
    """
    key_a = w1.upper()
    key_b = w2.upper()
    vec_a = lookup[key_a]
    vec_b = lookup[key_b]
    return 1 - cosine(vec_a, vec_b)

In [6]:
# Load the phonetic embedding table (`lookup`) from its pickle file under `path`.
# Caution: pickle.load can run arbitrary code while unpickling, so only point
# this at a file from a source you trust.
path = "../data/"
embd_file = path + 'phonetic_embd.pickle'
with open(embd_file, 'rb') as handle:
    lookup = pickle.load(handle)

In [7]:
# number of entries in the phonetic dictionary (this counts keys, not distinct words:
# the recorded output further down shows alternate-pronunciation keys such as
# 'ALEXANDRA(1)' and symbol entries such as '+PLUS')
len(lookup)

116514

In [9]:
# spot-check on a similar-sounding pair; pcosine upper-cases its arguments,
# so lowercase input is fine
pcosine("he", "she")

0.4266593016705489

In [10]:
# contrast case: a dissimilar-sounding pair, expected to score far lower than "he"/"she"
pcosine("cat","banter")

0.020586781508952656

# Train classifier to segregate easy/difficult words

In [11]:
# Hand-picked seed vocabulary for the two classes.
easy = "cat,rat,mat,bat,chair,table,mathematics,science,apple,banana,laptop,shirt,fan,book,boy,he,female,male".split(",")
difficult = "Graph,Group,Trivedi,Green,Grand,Provost,Printer,Alaska,Close,Italy".split(",")

# Build the training set: X holds embedding vectors, y the matching labels
# (0 = easy, 1 = difficult). Seed words missing from `lookup` cannot be
# embedded; they are collected in `oov_words` instead of being dropped
# silently, so a shrunken training set is visible to the reader.
X, y, oov_words = [], [], []
for label, words in ((0, easy), (1, difficult)):
    for w in words:
        word = w.upper()  # lookup keys are upper-case
        if word in lookup:
            X.append(lookup[word])
            y.append(label)
        else:
            oov_words.append(w)

# The classifier needs at least one example of each class.
assert 0 in y and 1 in y, "every seed word of one class is missing from lookup: %r" % (oov_words,)

In [12]:
# Fit a support-vector classifier on the hand-labelled seed words (0 = easy, 1 = difficult).
# probability=True turns on probability estimates, which the predict_proba calls in
# later cells need; random_state=0 fixes the seed used for those estimates.
# NOTE(review): X holds at most 28 samples (18 easy + 10 difficult seed words), so the
# resulting probabilities should be treated as rough — confirm before relying on them.
clf = svm.SVC(probability=True, random_state=0)
clf.fit(X, y)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=0, shrinking=True, tol=0.001,
    verbose=False)

In [15]:
# Sanity check on a training word: "chair" is in the easy list, so column 0 should dominate.
# Columns are [P(class 0 = easy), P(class 1 = difficult)].
clf.predict_proba([lookup["CHAIR"]])

array([[0.97422497, 0.02577503]])

In [13]:
# Score every entry of the dictionary. Row i of `pred` belongs to the i-th value of
# `lookup`, and therefore to its i-th key; the filtering cell that follows relies on
# that alignment.
pred = clf.predict_proba(list(lookup.values()))

In [13]:
# True for every dictionary entry whose class-1 ("difficult") probability exceeds 0.95.
is_difficult = pred[:, 1] > 0.95
# Keys in the same order as the values that were passed to predict_proba,
# so the mask lines up entry by entry.
all_words = np.array(list(lookup.keys()))
res_words = all_words[is_difficult]
res_words.shape

(1533,)

In [14]:
# Peek at the first 1000 flagged entries.
# NOTE(review): many of them contain a consonant+R cluster (BR-, GR-, -DR-, ...), much like
# most of the "difficult" seed words (Graph, Group, Green, Grand, Provost, Printer) —
# the classifier may mostly be picking up that sound pattern; worth confirming.
res_words[:1000]

array(['"CLOSE-QUOTE', '+PLUS', 'AARDVARKS', 'ACCUMULATIVELY(1)',
       'ACCUSINGLY', 'ACREE', 'ACUTELY', 'AGGRIEVE', 'AGREES', 'AGRUSA',
       'AHLGRIM', 'AIRCREW', 'ALASKA', 'ALEGRE', 'ALEKSANDER',
       'ALEKSANDR', 'ALEX', 'ALEXANDRA', 'ALEXANDRA(1)', 'ALEXANDRINE',
       'ALEXANDRINES', 'ALEXS', 'ALKALI', 'ALLOGRAPH', 'AMBROSINE',
       'AMBROSINI', 'ANCHOVIES', 'ANCHOVIES(1)', 'ANDREY', 'ANDRIES',
       'ANDRY', 'ANGLOPHONE', 'ANGLOPHONES', 'ANGLOS(1)', 'ANGRILY',
       'APPRECIATIVELY(1)', 'APPROVINGLY', 'ARCAND', 'ARCHITRAVE',
       'ARDINE', 'ARGENBRIGHT', 'ARGOSY', 'ARGUES', 'ARMBRISTER(1)',
       'ARMBRUSTER(1)', 'ARQUILLA', 'ATTRACTIVELY', 'BELGRAVE',
       'BLANDFORD', "BLANKLY'S", 'BLEAKNEY', 'BLITZKRIEG', 'BLUEGRASS',
       'BLUEPRINT', 'BLUEPRINTS', 'BRACINGLY', 'BRACKNEY', 'BRADFIELD',
       'BRADLEES', "BRADLEY'S", 'BRADLEYS', 'BRADNER', 'BRADNEY',
       'BRAENDSTROEM', 'BRAFF', 'BRAINTREE', 'BRAND', "BRAND'S",
       'BRAND-NEW', 'BRANDE', 'BRANDER', 'BR

In [19]:
# Probability of class 1 ("difficult") for one of the flagged words.
# [0] picks the only row of the result, [1] the class-1 column.
vec = lookup["BRAND"]
clf.predict_proba([vec])[0][1]

0.9766210961016647