In [49]:
import os

import cPickle as pickle

from stream import SentenceStream #from the prior post

# Database connection string. Supplying it through the ARTICLES_SQL_URL
# environment variable keeps the real password out of the saved notebook;
# the redacted literal is kept only as a fallback.
sql_url = os.environ.get('ARTICLES_SQL_URL',
                         'postgres://postgres:**PASSWORD**@localhost/articles')

# query yields same text as last time (now includes alignment data)
# Rows are kept only when num_words > 100. Rows are dropped when the text
# mentions the Daily Caller News Foundation on a base_url other than
# dailycaller.com, or matches the Associated Press copyright pattern
# (each "_" in "20__" matches any single character in LIKE).
# NOTE(review): the doubled "%%" presumably collapses to a single "%" wildcard
# when SentenceStream submits the query -- confirm.
query = """
        SELECT side, article_text
        FROM articles a
        LEFT JOIN alignment s
        ON split_part(a.post_id, '_', 1) = s.fb_id
        WHERE num_words > 100 and not
              (lower(article_text) like '%%daily caller news foundation%%' and
               base_url != 'dailycaller.com') and not
               lower(article_text) like '%%copyright 20__ the associated press%%'
        ORDER BY side
        """

# pull in the noun n-grams
# NOTE: unpickling can execute arbitrary code stored in the file; only load
# pickles that this project produced itself.
with open('../intermediate/noun_ngrams_all.pkl', 'rb') as infile:
    noun_ngrams = pickle.load(infile)

# limit to those that appear with some frequency
# (the loaded object is indexed by n-gram to get its count; after this line
# `noun_ngrams` is a plain list of the n-grams whose count exceeds 100)
noun_ngrams = [n for n in noun_ngrams if noun_ngrams[n] > 100]

# the n-grams we've located will now be identified in the stream of text
# using the MWETokenizer from nltk
# since I'm using a Google Compute Engine I have RAM to spare
# and will gluttonously store it all in memory
sentences = list(SentenceStream(sqldb=sql_url, query=query,
                                ngrams=noun_ngrams, idcol='side'))

In [None]:
from gensim.models import Word2Vec
# Train a single embedding on every stored sentence, whatever its side.
# Each element of `sentences` is indexed as x[1] here to obtain the item that
# Word2Vec receives as one training sentence.
# NOTE(review): sg=1 / hs=1 presumably select skip-gram with hierarchical
# softmax and size=500 the vector width -- confirm against the installed
# gensim version.
model = Word2Vec([x[1] for x in sentences],
                 min_count=50, iter=5, sg=1, hs=1, workers=10, size=500)

In [81]:
# Saved with the `_all` suffix so the file name lines up with the
# word2vec_left / word2vec_right files and with the path the reload cell
# reads back ('../intermediate/word2vec_all.pkl').
model.save('../intermediate/word2vec_all.pkl')

In [None]:
# Both per-side models share the hyperparameters used for the combined model;
# only the subset of sentences differs (x[0] carries the side label).
side_w2v_kwargs = dict(min_count=50, iter=5, sg=1, hs=1, workers=10, size=500)

# left-aligned sources only
model_left = Word2Vec([x[1] for x in sentences if x[0] == 'left'],
                      **side_w2v_kwargs)
model_left.save('../intermediate/word2vec_left.pkl')

# right-aligned sources only
model_right = Word2Vec([x[1] for x in sentences if x[0] == 'right'],
                       **side_w2v_kwargs)
model_right.save('../intermediate/word2vec_right.pkl')

## Some initial explorations

Before we get into anything serious, we can begin by doing a little playing around with our models. One of the most basic things we can do with word embeddings is to find word vectors that are "close" to others, as defined by their cosine similarity. 

I thought it would be interesting to see what words are considered similar in one model, but not the other. This isn't especially sophisticated, but it's intuitive.

In [18]:
import pandas as pd

def partition_similar_terms(term, model_a, model_b, pool_n=100, n=10,
                            a_lab='left', b_lab='right'):
    """
    Compare the neighbourhood of `term` in two embedding models.

    The `pool_n` terms most similar to `term` are taken from `model_a`
    and `model_b`, then split into three columns, each capped at `n`
    rows and kept in similarity order:
    A - B   (terms only in model_a's pool)
    B - A   (terms only in model_b's pool)
    A & B   (terms in both pools, in model_a's ranking)

    Parameters
    ----------
    term : passed unchanged to each model's `most_similar`
    model_a, model_b : objects whose `most_similar(term, topn=...)`
        returns (term, score) pairs
    pool_n : number of neighbours requested from each model
    n : maximum number of rows per column
    a_lab, b_lab : labels used to build the column names

    Returns
    -------
    pandas.DataFrame with columns '<a> not <b>', '<b> not <a>' and
    '<a> and <b>'. Columns shorter than the longest one are padded
    with NaN.
    """
    labs = {'a': a_lab, 'b': b_lab}

    terms_a = [t for t, _ in model_a.most_similar(term, topn=pool_n)]
    terms_b = [t for t, _ in model_b.most_similar(term, topn=pool_n)]

    # Sets give constant-time membership checks; the lists keep the ranking.
    pool_a, pool_b = set(terms_a), set(terms_b)

    a_not_b = [t for t in terms_a if t not in pool_b][:n]
    b_not_a = [t for t in terms_b if t not in pool_a][:n]
    a_and_b = [t for t in terms_a if t in pool_b][:n]

    # The three partitions rarely have the same length when the pools overlap
    # heavily (or barely). Plain lists of unequal length make the DataFrame
    # constructor raise ValueError; Series are aligned on their index and
    # padded with NaN instead.
    return pd.DataFrame({
        '{a} not {b}'.format(**labs): pd.Series(a_not_b, dtype=object),
        '{b} not {a}'.format(**labs): pd.Series(b_not_a, dtype=object),
        '{a} and {b}'.format(**labs): pd.Series(a_and_b, dtype=object)})

In [19]:
# What does it mean to be "shady" on the right that is different from what it
# means on the left, and vice versa?
# (function defaults: pool of the 100 nearest terms per model, 10 rows per column)
partition_similar_terms('shady', model_left, model_right)

Unnamed: 0,left and right,left not right,right not left
0,questionable,dirty_tricks,influence-peddling
1,dealings,clever,scandals
2,sleazy,scam,clinton_foundation
3,unsavory,frauds,seedy
4,financial_dealings,astroturf,connections
5,sketchy,disreputable,sordid
6,unscrupulous,slimy,foreign_entities
7,fraudulent,self-dealing,clinton_foundations
8,business_practices,unseemly,illicit
9,business_dealings,fake,profiteering


Fairly interesting stuff! The right focuses on the Clinton Foundation and its influence, as expected, and the left appears to make more of the unseemliness of (presumably) Trump's business practices. Let's try something else:

In [20]:
# Same comparison for 'radical': left-trained model as A, right-trained model as B.
partition_similar_terms('radical', model_left, model_right)

Unnamed: 0,left and right,left not right,right not left
0,extremist,far-right,islamic_ideology
1,fringe,anti-imperialist,anti-american
2,progressive,populist,muslim_brotherhood
3,militant,socialist,salafist
4,revolutionary,conservatism,extremist_groups
5,fundamentalist,hard-line,deobandi
6,extremists,doctrinaire,islamists
7,leftist,mainstream,liberation_theology
8,radicals,wing,wahhabist
9,reactionary,fundamentalism,totalitarian


The right uses "radical" to refer to Muslim groups almost exclusively.

In [29]:
# Same comparison for 'alt-right': left-trained model as A, right-trained model as B.
partition_similar_terms('alt-right', model_left, model_right)

Unnamed: 0,left and right,left not right,right not left
0,white_nationalists,breitbart,nevertrump
1,nationalist,white_nationalism,bigot
2,supremacist,daily_stormer,nevertrump_movement
3,neo-nazis,stormfront,pepe
4,neo-nazi,vdarecom,movement
5,white_supremacists,breitbart_news,donald_trumps_supporters
6,anti-semitic,white-supremacist,feminism
7,alt,bannon,conservatives
8,anti-semites,steve_bannon,leftist
9,far-right,jared_taylor,black_lives_matter_movement


In [32]:
# Here `term` is a list of three related words; it is handed to each model's
# most_similar unchanged.
# NOTE(review): presumably gensim treats the list as several positive terms
# and combines them -- confirm.
partition_similar_terms(['feminism', 'feminist', 'feminists'], model_left, model_right)

Unnamed: 0,left and right,left not right,right not left
0,feminist_movement,sex-positive,social_justice_warriors
1,gloria_steinem,motherhood,radical_feminist
2,intersectional,women,leftist
3,womanhood,reproductive_justice,radical_feminists
4,womens_rights,traister,lefty
5,queer,freethenipple,dunham
6,liberals,femininity,lena
7,liberal,body_positivity,far-left
8,patriarchy,bernie_bros,sjw
9,progressives,anti-racist,lgbt_activists


# TESTING GROUNDS

{'__ignoreds': ['syn0norm', 'cum_table'],
 '__numpys': ['syn1neg', 'syn0', 'syn1'],
 '__recursive_saveloads': [],
 '__scipys': [],
 'alpha': 0.025,
 'batch_words': 10000,
 'cbow_mean': 1,
 'corpus_count': 5376975,
 'cum_table': array([  30014158,   48446814,   65668194, ..., 2147474761, 2147479204,
        2147483647], dtype=uint32),
 'hashfxn': <function hash>,
 'hs': 1,
 'index2word': [u'the',
  u'to',
  u'of',
  u'and',
  u'a',
  u'in',
  u'that',
  u'is',
  u'for',
  u'on',
  u'it',
  u'with',
  u'was',
  u'as',
  u'i',
  u'he',
  u'this',
  u'are',
  u'have',
  u'be',
  u'not',
  u'by',
  u'at',
  u'you',
  u'but',
  u'his',
  u'from',
  u'they',
  u'has',
  u'we',
  u'an',
  u's',
  u'who',
  u'about',
  u'their',
  u'or',
  u'said',
  u'its',
  u'people',
  u'her',
  u'more',
  u'one',
  u'she',
  u'were',
  u'what',
  u'all',
  u'when',
  u'will',
  u'would',
  u'if',
  u'so',
  u'out',
  u'been',
  u'like',
  u'had',
  u'can',
  u'which',
  u'do',
  u'there',
  u'up',
  u'just

In [197]:
# Analogy-style probes on the combined model: every call passes one list of
# terms as the first argument and a second list as the second argument.
# NOTE(review): presumably these are gensim's positive / negative term lists
# -- confirm.
# NOTE(review): a notebook cell only renders the value of its final
# expression, so every call except the last one is computed and then thrown
# away. Split these into separate cells (or collect the results) to see them.
model.most_similar(['clinton', 'shady'], ['trump'])
model.most_similar(['trump', 'shady'], ['clinton'])

model.most_similar(['clinton', 'controversial'], ['trump'])
model.most_similar(['trump', 'controversial'], ['clinton'])

model.most_similar(['liberal', 'feminist'], ['conservative'])

model.most_similar(['liberal', 'radical'], ['conservative'])

model.most_similar(['liberal', 'alt-right'], ['conservative'])
model.most_similar(['conservative', 'alt-right'], ['liberal'])

model.most_similar(['conservative', 'demagogue'], ['liberal'])

model.most_similar(['trump', 'violence'], ['clinton'])
model.most_similar(['clinton', 'violence'], ['trump'])

model.most_similar(['trump', 'crime'], ['clinton'])
model.most_similar(['clinton', 'crime'], ['trump'])

[(u'islamic_state', 0.6242399215698242),
 (u'islamic_state_group', 0.5898193120956421),
 (u'isil', 0.5716771483421326),
 (u'islamic_state_isis', 0.5501219034194946),
 (u'daesh', 0.5432126522064209),
 (u'terror_group', 0.49710047245025635),
 (u'isiss', 0.48887044191360474),
 (u'terrorist_group', 0.48086708784103394),
 (u'islamic_states', 0.4808511734008789),
 (u'jihadists', 0.46260446310043335)]

In [1]:
# trained_model.most_similar_cosmul(positive=['baghdad', 'england'], negative=['london'])
# (the line above is a leftover scratch example, kept for reference)

# Reload the three previously saved models from ../intermediate:
# the left-only, right-only and combined embeddings.
from gensim.models import Word2Vec
model_left = Word2Vec.load('../intermediate/word2vec_left.pkl')
model_right = Word2Vec.load('../intermediate/word2vec_right.pkl')
model = Word2Vec.load('../intermediate/word2vec_all.pkl')

In [2]:
# Export the left and right models through save_word2vec_format with
# binary=True, writing each vocabulary to its own file via `fvocab`.
# NOTE(review): presumably this is the original word2vec tool's format, meant
# for loading the vectors outside gensim -- confirm.
model_left.save_word2vec_format('../intermediate/word2vec_portable_left.bin',
                                fvocab='../intermediate/word2vec_vocab_left.bin', binary=True)
model_right.save_word2vec_format('../intermediate/word2vec_portable_right.bin',
                                 fvocab='../intermediate/word2vec_vocab_right.bin', binary=True)

In [60]:
# Nearest neighbours of 'urban' in each side's model.
# NOTE(review): only the final expression of a cell is rendered, so the
# left-model result on the first line is computed but never shown.
model_left.most_similar('urban')
model_right.most_similar('urban')

[(u'housing', 0.4908299744129181),
 (u'hud', 0.45650115609169006),
 (u'inner_city', 0.44377341866493225),
 (u'julian_castro', 0.44084376096725464),
 (u'neighborhoods', 0.43410342931747437),
 (u'inner-city', 0.43364620208740234),
 (u'affluent', 0.4262969493865967),
 (u'high-crime', 0.4080944061279297),
 (u'low-income', 0.40600207448005676),
 (u'ghettos', 0.3926388621330261),
 (u'rural', 0.3868141174316406),
 (u'development', 0.386753648519516),
 (u'suburban', 0.3820151090621948),
 (u'areas', 0.3819997310638428),
 (u'suburbs', 0.3768220543861389)]

In [61]:
def find_disjoint_terms(term, keep_model, reject_model, pool_n=100, n=10):
    """
    Find terms similar to `term` in one model that are not similar in a second.

    The `pool_n` nearest terms are requested from both models; the terms of
    `keep_model` that do not occur in `reject_model`'s pool are returned.

    Parameters
    ----------
    term : passed unchanged to each model's `most_similar`
    keep_model : model whose neighbours are reported
    reject_model : model whose neighbours are filtered out
    pool_n : number of neighbours requested from each model
    n : maximum number of terms returned

    Returns
    -------
    list of at most `n` terms, in `keep_model`'s similarity order
    """
    # The body previously referred to `in_model` / `out_model`, which are not
    # the parameter names, so every call failed with NameError.
    keep_terms = [t for t, _ in keep_model.most_similar(term, topn=pool_n)]
    reject_terms = set(t for t, _ in reject_model.most_similar(term, topn=pool_n))

    return [t for t in keep_terms if t not in reject_terms][:n]

In [65]:
# Terms near 'immigration' in the right-trained model that are missing from the
# left-trained model's pool. The helper defined in this notebook is named
# find_disjoint_terms; `find_partisan_similarity` is not defined anywhere.
find_disjoint_terms('immigration', model_right, model_left, n=25)

[u'immigrations',
 u'border_enforcement',
 u'mass_immigration',
 u'restrictionist',
 u'detainers',
 u'ice',
 u'anti-illegal',
 u'assimilation',
 u'issuances',
 u'detainer',
 u'entitlements',
 u'deport_illegal_immigrants',
 u'h1-b',
 u'guest-worker',
 u'chris_crane',
 u'immigration_system',
 u'criminal_illegal_immigrants',
 u'touchback',
 u'numbersusa',
 u'visas',
 u'senate_subcommittee',
 u'us_immigration_policy',
 u'h1b',
 u'mass_migration',
 u'open-borders']

In [138]:
import numpy as np
from fuzzywuzzy import process, fuzz
from six import string_types

class ExtendedWord2Vec(Word2Vec):
    """
    Add some desired functionality to Word2Vec:
    averaged vectors for lists of words, and a helper that looks for
    neighbours which are also spelled similarly.
    """
    def __getitem__(self, words):
        """
        Return a single vector for `words`.

        For our purposes, averaging the vectors of multiple
        words is most useful (as opposed to returning an ndarray
        of shape (len(words), self.vector_size) ).

        A single word yields the parent's vector unchanged; a list of
        words yields the mean of the row vectors, always with shape
        (self.vector_size,).
        """
        vecs = super(ExtendedWord2Vec, self).__getitem__(words)

        # A 2-D result holds one row per requested word. Testing ndim rather
        # than total size also collapses a one-element list, which otherwise
        # comes back with shape (1, vector_size) instead of (vector_size,).
        if vecs.ndim > 1:
            vecs = np.mean(vecs, axis=0)

        return vecs

    def likely_synonyms(self, words, threshold=75, sample_from_topn=10):
        """
        Return words that are similar both in context and spelling.

        Parameters
        ----------
        words : a single word or a list of words
        threshold : minimum fuzzy-match score (exclusive) a candidate
            must reach against at least one of `words`
        sample_from_topn : number of nearest neighbours taken from
            `most_similar` as candidates

        Returns
        -------
        set of candidate words whose score exceeds `threshold`
        """
        similar_words = [word for word, _ in
                         self.most_similar(words, topn=sample_from_topn)]

        if isinstance(words, string_types):
            words = [words]

        # process.extract keeps only its 5 best matches unless told otherwise;
        # request every candidate so that `threshold` alone decides what is
        # returned.
        return set(sim_word
                   for word in words
                   for sim_word, score in process.extract(word, similar_words,
                                                          limit=len(similar_words))
                   if score > threshold)
            
# Switch the already-loaded models over to ExtendedWord2Vec by reassigning
# __class__ on the existing objects; nothing is copied or retrained.
for loaded_model in (model_left, model_right, model):
    loaded_model.__class__ = ExtendedWord2Vec

# The *ext names are aliases: each refers to the very same object as the
# corresponding original name.
model_leftext, model_rightext, modelext = model_left, model_right, model

In [196]:
# Vocabulary entries that refer to each candidate.
clinton_terms = [u'clintons',
                 u'democratic_nominee_hillary_clinton',
                 u'hillary_clinton',
                 u'hillary_clintons',
                 u'hillary_rodham_clinton',
                 u'hillarys',
                 u'mrs_clinton',
                 u'hrc']

trump_terms = [u'trumps',
               u'donald_trump',
               u'mr_trump',
               u'real_estate_mogul',
               u'donalds',
               u'gop_front-runner_donald_trump',
               u'republican_nominee_donald_trump']

# Vocabulary entries that label each side of the political spectrum.
left_terms = ['liberal', 'left-wing', 'progressive', 'center-left', 'leftist',
              'liberals', 'democrat', 'democratic', 'democrats',
              'left-leaning', 'far-left', 'leftwing']

right_terms = ['conservative', 'right-wing', 'conservatives', 'right-leaning',
               'far-right', 'republican', 'republicans']

# Offset between the two candidates: ExtendedWord2Vec averages the vectors of
# each multi-word list, so this is mean(Clinton terms) - mean(Trump terms).
candidate_vec = modelext[clinton_terms] - modelext[trump_terms]

# 'leftwing' used to be the last entry here; it belongs to left_terms and was
# evidently pasted in by accident, so it is no longer part of this list.
russia_terms = ['putin', 'russia', 'russian', 'vladimir_putin',
                'moscows', 'moscow', 'kremlin', 'russian_president_vladimir',
                'president_vladimir_putin', 'vladimir_putins', 'russias', 'putins', 'russians',
                'russian_government']

In [218]:
# Difference between the liberal-minus-conservative offset and the
# democrat-minus-republican offset in the left-trained model.
# The outer parentheses let the expression continue on the next line; ending
# a line with a bare "-" outside any bracket is a SyntaxError.
partisan_rejector = ((model_leftext['liberal'] - model_leftext['conservative']) -
                     (model_leftext['democrat'] - model_leftext['republican']))
model_leftext_nonpart = [model_leftext[v] - partisan_rejector for v in model_leftext_

array([-0.20462778,  0.16366085, -0.00548428,  0.14233346, -0.32446277,
        0.03609439, -0.04971508, -0.23342769, -0.14397219, -0.11840305,
       -0.13927713,  0.0510698 , -0.24667627,  0.06406192, -0.04721181,
        0.02854012,  0.0997995 ,  0.01129037, -0.02115975, -0.00543809,
        0.10735297, -0.14480504, -0.00292246, -0.07363608, -0.01873149,
       -0.17365167, -0.06916165,  0.30533025,  0.02238379,  0.17596652,
       -0.13528068, -0.24979006,  0.03903581,  0.076314  , -0.26145479,
       -0.01821651,  0.11010098, -0.2024342 , -0.04129888, -0.00520282,
       -0.02740867,  0.24004759, -0.01050987,  0.03867228,  0.11851116,
        0.19107898, -0.01806241, -0.16182363,  0.02297141,  0.16541953,
       -0.04577152, -0.22203198, -0.02401115,  0.32177806,  0.15446866,
       -0.01838038,  0.00294004, -0.26690271, -0.17676209,  0.32002151,
        0.11630105, -0.01903433,  0.0049375 ,  0.20263997, -0.06431071,
       -0.24258183, -0.1079428 ,  0.17044337,  0.09561429,  0.01

In [209]:
# The 100 nearest neighbours of 'liberal' as seen by each side's model.
right_liberal_hits = model_rightext.most_similar(['liberal'], topn=100)
left_liberal_hits = model_leftext.most_similar(['liberal'], topn=100)

# keep just the words, dropping the similarity scores
right_on_left = [hit[0] for hit in right_liberal_hits]
left_on_left = [hit[0] for hit in left_liberal_hits]

# words the right associates with 'liberal' that the left does not
left_on_left_lookup = set(left_on_left)
[r for r in right_on_left if r not in left_on_left_lookup]

[u'leftwing',
 u'hard-left',
 u'ultra-liberal',
 u'social_justice',
 u'lefties',
 u'social_justice_warriors',
 u'bleeding-heart',
 u'environmentalist',
 u'lefts',
 u'elitist',
 u'statist',
 u'radical_pro-abortion',
 u'hollywood',
 u'hardcore',
 u'big-government',
 u'abortion_rights',
 u'open-borders',
 u'sjw',
 u'billionaire_george_soros',
 u'candidate_bernie_sanders',
 u'socialistic',
 u'anti-capitalist',
 u'right-of-center',
 u'liberal_media',
 u'marxist',
 u'social-justice',
 u'loons',
 u'like_bernie_sanders',
 u'anti-freedom',
 u'pro-abort',
 u'leftist_agenda',
 u'anti-life',
 u'lib',
 u'conservative-leaning',
 u'dyed-in-the-wool',
 u'lunacy',
 u'college_professors',
 u'mainstream_media',
 u'pundit',
 u'loony',
 u'globalist',
 u'typical',
 u'soros-funded',
 u'pro-amnesty',
 u'activists',
 u'daily_kos',
 u'anti-second',
 u'pro-homosexual',
 u'media_matters',
 u'talking_heads',
 u'columnists',
 u'filmmaker_michael_moore',
 u'sympathetic',
 u'commentators',
 u'anti-war',
 u'pro-gay',


In [None]:
# The 100 nearest neighbours of 'conservative' as seen by each side's model.
left_on_right = [w for w, s in model_leftext.most_similar(['conservative'], topn=100)]
right_on_right = [w for w, s in model_rightext.most_similar(['conservative'], topn=100)]

# Words the left associates with 'conservative' that the right does not.
# The filter has to test the loop variable itself; it previously tested `r`,
# a name this cell never assigns, so the condition did not depend on the
# word being examined.
right_on_right_lookup = set(right_on_right)
[term for term in left_on_right if term not in right_on_right_lookup]

In [5]:
# Rank the vocabulary along the Clinton-minus-Trump direction.
# Needs `modelext`, `candidate_vec` and `np` from earlier cells; on a kernel
# where those cells have not been run this fails with NameError.
# NOTE(review): topn=False presumably makes similar_by_vector return one
# similarity per vocabulary entry instead of (word, score) pairs -- the
# argsort below relies on that; confirm in the gensim docs.
cossim = modelext.similar_by_vector(candidate_vec, topn=False)
# argsort is ascending, so reversing gives indices from highest to lowest value
idx_cossim_ordered = np.argsort(cossim)[::-1]
# invert the vocabulary: map each entry's .index attribute back to its word
vocab = {v.index: k for k, v in modelext.vocab.items()}
# NOTE(review): this displays one word per index, i.e. the whole ordering;
# consider slicing before display.
[vocab[i] for i in idx_cossim_ordered]

NameError: name 'modelext' is not defined

In [7]:
# (term, score) pairs from ru_left whose term does not occur in ru_right.
# NOTE(review): ru_left / ru_right are not created in any visible cell; they
# must already hold (term, score) pairs before this cell runs.
# The right-hand terms are gathered once into a set instead of rebuilding a
# list for every left-hand entry.
ru_right_terms = set(r for r, rscore in ru_right)
[(l, lscore) for l, lscore in ru_left if l not in ru_right_terms]

NameError: name 'ru_left' is not defined

## Using Histwords on Data
