In [21]:
from gensim import models
from scipy import spatial
import time
import numpy as np

import warnings
warnings.filterwarnings('ignore')

# Our Model

Our model was trained on op-eds scraped from the Washington Post, CNN and Fox News on the topics of Obamacare and Trumpcare, so it reflects expert opinions on healthcare.

In [22]:
# Load the Word2Vec model trained on the scraped healthcare op-eds.
our_model = models.Word2Vec.load("our_model")
# Drop the empty-string token from the vocabulary mapping. pop() with a default
# keeps this cell runnable even if the saved model contains no such token
# (a bare `del` would raise KeyError in that case).
our_model.wv.vocab.pop('', None)

# Google Model

We compare "our model" to a model trained on generic Google News data (more general than our healthcare data).
Download: https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit
Open file and rename it "google.bin"

In [31]:
# Load the pretrained Google News vectors from 'google.bin' (binary word2vec
# format) and measure how long the load takes.
t1 = time.time()
google = models.KeyedVectors.load_word2vec_format('google.bin',binary=True)
t2 = time.time()
# Elapsed wall-clock time in seconds; displayed as the cell output.
t2-t1

90.79399991035461

## Built-in features

### Most similar

- `most_similar` solves word analogies with vector arithmetic. For example, the code:
- "our_model.most_similar(positive=['woman', 'female'], negative=['man'], topn=5)"
- looks for the words closest to 'woman' + 'female' - 'man', i.e. it completes the analogy: 'man' is to 'woman' as 'female' is to '___'

In [34]:
# Analogy query on our model: words closest to 'women' + 'female' - 'man'.
# Called through `.wv` because the Word2Vec-level shortcut is deprecated.
our_model.wv.most_similar(positive=['women', 'female'], negative=['man'], topn=5)

[(u'latinos', 0.6830320358276367),
 (u'(including', 0.6655078530311584),
 (u'minorities', 0.6600582003593445),
 (u'non-voters', 0.6593189835548401),
 (u'male', 0.6519131064414978)]

In [36]:
# Same analogy query ('women' + 'female' - 'man') on the Google News vectors, for comparison.
google.most_similar(positive=['women', 'female'], negative=['man'], topn=5)

[(u'females', 0.6208751201629639),
 (u'Female', 0.5989526510238647),
 (u'Women', 0.5655556321144104),
 (u'womens', 0.5516568422317505),
 (u'male', 0.5251103639602661)]

### Doesn't match

- Detects the word whose vector is most dissimilar to the others

In [38]:
# Which of the four words fits least with the others, according to our model?
# Called through `.wv` because the Word2Vec-level shortcut is deprecated.
our_model.wv.doesnt_match("man woman inhuman kevin".split())

'kevin'

In [39]:
# Same odd-one-out query on the Google News vectors, for comparison.
google.doesnt_match("man woman inhuman kevin".split())

'inhuman'

### Similarity

In [40]:
# Similarity score between the two word vectors in our model.
# Called through `.wv` because the Word2Vec-level shortcut is deprecated.
our_model.wv.similarity('women', 'woman')

0.5670666358568155

In [41]:
# Similarity score for the same word pair in the Google News vectors, for comparison.
google.similarity('woman', 'women')

0.5303777486481195

## What we have come up with

### Sentence Similarity
- Calculates the average vector of all words in each sentence and compares sentences by the cosine similarity between these average vectors.
- The average vector of a sentence is the element-wise mean of its word vectors. This method does not take word order into account. For example, "this is a test" gets exactly the same vector as "a test is this".

In [42]:
def avg_feature_vector(sentence, model, num_features):
    """Return the element-wise mean of the word vectors of ``sentence``.

    The sentence is split on whitespace; tokens that are not in the model's
    vocabulary are ignored.  Word order does not affect the result.

    Parameters
    ----------
    sentence : str
        Whitespace-separated text.
    model : object
        Anything exposing ``model.wv.vocab`` (a mapping keyed by word) and
        ``model.wv[word]`` (the vector of a word), e.g. the gensim models
        loaded in this notebook.
    num_features : int
        Width of the word vectors; used to size the accumulator.

    Returns
    -------
    numpy.ndarray
        The mean vector of the known words.  If no token of the sentence is
        in the vocabulary, a zero vector of length ``num_features`` is
        returned, so callers computing a cosine on the result should be
        prepared for that case.
    """
    wv = model.wv
    # Test membership against the vocabulary mapping directly instead of
    # building a set of the whole vocabulary on every call.
    known_words = [word for word in sentence.split() if word in wv.vocab]

    feature_vec = np.zeros((num_features, ), dtype='float32')
    for word in known_words:
        feature_vec = np.add(feature_vec, wv[word])
    if known_words:
        feature_vec = np.divide(feature_vec, len(known_words))
    return feature_vec

In [43]:
# Sentence similarity under our model: cosine similarity between the average
# vector of a healthcare phrase and the vector of 'democrats'.
# The vector width is read from the model instead of being hard-coded.
s1_afv = avg_feature_vector('obamacare abortion preexisting coverage', model=our_model, num_features=our_model.vector_size)
s2_afv = avg_feature_vector('democrats', model=our_model, num_features=our_model.vector_size)
sim = 1 - spatial.distance.cosine(s1_afv, s2_afv)
sim

0.4357183575630188

In [44]:
# Same sentence-similarity comparison on the Google News vectors.
# The vector width is read from the loaded vectors instead of being hard-coded.
s1_afv = avg_feature_vector('obamacare abortion preexisting coverage', model=google, num_features=google.vector_size)
s2_afv = avg_feature_vector('democrats', model=google, num_features=google.vector_size)
sim = 1 - spatial.distance.cosine(s1_afv, s2_afv)
sim

0.33329442143440247

## Test for vector subtraction and addition

- Following the idea of Mikolov, this tests vector addition and subtraction on both our model and Google's model. The resulting vector is then compared to an expected word. For example, following Mikolov, "king" - "man" + "woman" should be close to "queen".

In [51]:
# Vector arithmetic on our model: 'obamacare' - 'democrats' + 'republican',
# then cosine similarity of the result to 'healthcare'.
# Vectors are read through `.wv` because indexing the Word2Vec object directly is deprecated.
difO = our_model.wv['obamacare'] - our_model.wv['democrats'] + our_model.wv['republican']
simO = 1 - spatial.distance.cosine(difO, our_model.wv['healthcare'])
simO

0.3298827111721039

In [48]:
# Classic analogy on the Google News vectors: 'king' - 'man' + 'woman',
# then cosine similarity of the result to 'queen'.
difG = google['king'] - google['man'] + google['woman']
simG = 1 - spatial.distance.cosine(difG, google['queen'])
simG

0.7300516963005066

- Next we check whether the resulting vector is really meaningful by listing the words closest to it.


In [52]:
# Ten vocabulary words closest to the composite vector difO in our model.
# Called through `.wv` because the Word2Vec-level shortcut is deprecated.
our_model.wv.most_similar(positive=[difO], topn=10)

[(u'obamacare', 0.771660566329956),
 (u'\u201crepeal', 0.7148782014846802),
 (u'obamacare\u201d', 0.7135400772094727),
 (u'republican', 0.7030623555183411),
 (u'\u201cskinny', 0.6987356543540955),
 (u'repeal', 0.6965476870536804),
 (u"senate's", 0.6873770356178284),
 (u'sink', 0.6867955923080444),
 (u'obamacare)', 0.6823086738586426),
 (u'repeal\u201d', 0.6795483231544495)]

In [53]:
# Ten vocabulary words closest to the composite vector difG in the Google News vectors.
google.most_similar(positive=[difG], topn=10)

[(u'king', 0.8449392318725586),
 (u'queen', 0.7300517559051514),
 (u'monarch', 0.6454660892486572),
 (u'princess', 0.6156251430511475),
 (u'crown_prince', 0.5818676948547363),
 (u'prince', 0.5777117609977722),
 (u'kings', 0.5613663792610168),
 (u'sultan', 0.5376776456832886),
 (u'Queen_Consort', 0.5344247817993164),
 (u'queens', 0.5289887189865112)]

In [54]:
import copy
def find_similar(model, dem, vec, topn, threshold=0.2, filter_dims=2):
    """Print the vocabulary words nearest to ``vec`` by cosine distance.

    Works in two stages:

    1. Cheap pre-filter: a word stays a candidate only if none of its first
       ``filter_dims`` vector components differs from the corresponding
       component of ``vec`` by ``threshold`` or more.
    2. Ranking: the remaining candidates are sorted by ascending cosine
       distance to ``vec``.

    Parameters
    ----------
    model : object
        Anything exposing ``model.wv.vocab`` (a mapping keyed by word) and
        ``model.wv[word]`` (the vector of a word).
    dem : int
        Not used by the function; kept so existing calls keep working.
    vec : array-like
        Query vector; needs at least ``filter_dims`` components.
    topn : int
        Number of ranked candidates to print.
    threshold : float, optional
        Per-component tolerance of the pre-filter (default 0.2).
    filter_dims : int, optional
        Number of leading components the pre-filter looks at (default 2).

    Returns
    -------
    None
        Two lines are printed: the number of candidates that survived the
        pre-filter, then a list of ``(word, cosine_distance)`` pairs.
    """
    wv = model.wv

    def _passes_prefilter(word):
        # Look the vector up once per word, then compare the leading components.
        word_vec = wv[word]
        return not any(
            abs(vec[d] - word_vec[d]) >= threshold for d in range(filter_dims)
        )

    # Single filtering pass; avoids repeated list.remove() calls, each of
    # which rescans the candidate list.
    candidates = [word for word in wv.vocab if _passes_prefilter(word)]
    print(len(candidates))

    ranked = sorted(
        ((word, spatial.distance.cosine(vec, wv[word])) for word in candidates),
        key=lambda pair: pair[1],
    )
    # The slice starts at index 1, so the single best-ranked candidate is left
    # out of the printed result.
    # NOTE(review): presumably meant to drop an input word that ranks first; confirm.
    print(ranked[1:topn + 1])
    

In [55]:
# Nearest neighbours of the composite vector difO in our model (top 5 printed).
# The second argument (50) is passed as `dem`, which find_similar does not read.
find_similar(our_model, 50, difO, 5)

30
[(u'marketplaces', 0.46691805124282837), (u'2010', 0.473321795463562), (u'current', 0.5403972268104553), (u'form', 0.5978847146034241), (u'himself', 0.6058657169342041)]


### Models on Politics-Related Problems

- On politics-related queries, which make up most of our model's training data, our model seems to perform better than Google's news model.

In [52]:
# Politics analogy on our model: words closest to 'obamacare' + 'republicans' - 'abortion'.
# Called through `.wv` because the Word2Vec-level shortcut is deprecated.
our_model.wv.most_similar(positive=['obamacare', 'republicans'], negative=['abortion'], topn=5)

[('democrats', 0.7632335424423218),
 ('dems', 0.7000274062156677),
 ('repeal', 0.6927065253257751),
 ('filibuster', 0.6834850311279297),
 ('trumpcare', 0.6723067164421082)]

In [53]:
# Same politics analogy ('obamacare' + 'republicans' - 'abortion') on the Google News vectors, for comparison.
google.most_similar(positive=['obamacare', 'republicans'], negative=['abortion'], topn=5)

[('Rethugs', 0.6481903195381165),
 ('dems', 0.6273144483566284),
 ('repubs', 0.621368408203125),
 ('repugs', 0.5910837650299072),
 ('rethugs', 0.578933596611023)]