In [1]:
from pandas import read_json
from gensim.models import word2vec
from sklearn.feature_extraction.text import CountVectorizer
df = read_json("reddit_jokes.json")
sentences = df['title'] + ' ' + df['body']
# this analyzer from sklearn tokenizes including stop words and lower case
analyze = CountVectorizer(stop_words='english').build_analyzer()
sentences = [analyze(s) for s in sentences.tolist()]
# this trains a cbow on sentences
model = word2vec.Word2Vec(sentences, size=100, min_count=10, workers=4)
model.wv.save('reddit_jokes.model')

In [2]:
# Read the non-trainable part of the model from disk
from gensim.models.keyedvectors import KeyedVectors
word_vectors = KeyedVectors.load('reddit_jokes.model')
google_vectors = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)  



Here's some similarity comparisons between Google's massive model and ours.  Ours does surprisingly well

In [2]:
#w = word_vectors.most_similar(positive=['woman', 'king'], negative=['man'])
#g = google_vectors.most_similar(positive=['woman', 'king'], negative=['man'])
def Display(s):
    w = eval('word_vectors.' + s)
    g = eval('google_vectors.' + s)
    #import pdb; pdb.set_trace()
    print ('{}'.format(s))
    print('Trained by Jokes\tTrained by Google')
    for i in range(4):  #range(len(w)):
        print("{}\t{:4.2}\t\t{}\t{:4.2}".format(w[i][0], w[i][1], g[i][0], g[i][1]))
tests = ["most_similar(positive=['king'])", 
         "most_similar(positive=['king','woman'])",
         "most_similar(positive=['king','woman'], negative=['man'])",
         "most_similar(positive=['man'])",
         "most_similar(positive=['woman'])",
        ]
for s in tests:
    print('='*80)
    Display(s)

most_similar(positive=['king'])
Trained by Jokes	Trained by Google
queen	0.74		kings	0.71
arthur	0.72		queen	0.65
lancelot	0.65		monarch	0.64
kings	0.65		crown_prince	0.62
most_similar(positive=['king','woman'])
Trained by Jokes	Trained by Google
queen	0.63		man	0.66
arthur	0.57		queen	0.64
beauty	0.55		girl	0.61
bodyguard	0.53		princess	0.61
most_similar(positive=['king','woman'], negative=['man'])
Trained by Jokes	Trained by Google
queen	0.65		queen	0.71
elizabeth	0.59		monarch	0.62
birbal	0.57		princess	0.59
arthur	0.55		crown_prince	0.55
most_similar(positive=['man'])
Trained by Jokes	Trained by Google
mans	0.67		woman	0.77
gentleman	0.66		boy	0.68
guy	0.66		teenager	0.66
woman	0.62		teenage_girl	0.61
most_similar(positive=['woman'])
Trained by Jokes	Trained by Google
lady	0.75		man	0.77
man	0.62		girl	0.75
women	0.62		teenage_girl	0.73
gentleman	0.58		teenager	0.63


In [3]:
# Show me the vectorization of the word 'king'
vKing = word_vectors['king']
print ('Length of our vectorization of "king" (compare to the ctor for Word2Vec): {}'.format(len(vKing)))
print ('First 12 features of "king":  {}'.format(vKing[:12]))
# Show me the vocabulary itself# Export to CSV?  Real close here but I don't think we want to go this way.
# Instead, use the gensim API / "KeyedVector" (word_vector here).
from __future__ import print_function
def dump_embedding():
    for word in word_vectors.index2word[:10]:
        print ('{}, '.format(word), end='')
        for feat in word_vectors[word][:2]:
            print('{}, '.format(feat), end='')
        print('')
print(type(word_vectors))
dump_embedding()
print ('The vocabulary is stored as a standard {} of len {}'.format(type(word_vectors.vocab), len(word_vectors.vocab)))
print ('There is also a {}, also of len {}'.format(type(word_vectors.index2word), len(word_vectors.index2word)))
print ('Here are the first 10 entries:  {}'.format(word_vectors.index2word[:10]))

Length of our vectorization of "king" (compare to the ctor for Word2Vec): 100
First 12 features of "king":  [-1.53627563 -1.44087207  0.02402437  1.61712003 -0.89577299 -0.04609798
 -1.54079127  1.01438117  0.84849453  0.80059016  1.14108241 -1.77830291]
<class 'gensim.models.keyedvectors.KeyedVectors'>
man, 1.98347747326, 1.96043431759, 
says, 1.17057991028, 0.387207120657, 
said, 1.29437792301, -1.11761200428, 
did, -0.731746554375, -1.54007816315, 
like, 0.520845174789, -0.422606885433, 
just, 0.421174943447, 2.01930809021, 
don, -0.217230424285, 1.85953974724, 
know, 2.2788233757, 0.290828645229, 
guy, 2.4918589592, 2.01255893707, 
asks, 2.19117808342, 0.256881028414, 
The vocabulary is stored as a standard <type 'dict'> of len 20012
There is also a <type 'list'>, also of len 20012
Here are the first 10 entries:  [u'man', u'says', u'said', u'did', u'like', u'just', u'don', u'know', u'guy', u'asks']


In [4]:
femininity = word_vectors['king'] - word_vectors['queen']
near_woman = word_vectors['man'] - femininity
print('Joke-trained, near woman?')
print(word_vectors.most_similar(positive=[near_woman]))

femininity = google_vectors['king'] - google_vectors['queen']
near_woman = google_vectors['man'] - femininity
print('Google-trained, near woman?')
print(google_vectors.most_similar(positive=[near_woman]))


Joke-trained, near woman?
[(u'man', 0.8185614347457886), (u'gentleman', 0.5897939205169678), (u'woman', 0.5865741968154907), (u'mans', 0.5807508230209351), (u'lady', 0.5801395177841187), (u'businessman', 0.5530569553375244), (u'guy', 0.5443193912506104), (u'husband', 0.4750097990036011), (u'madam', 0.47310683131217957), (u'smiles', 0.4672502279281616)]
Google-trained, near woman?
[(u'woman', 0.718680202960968), (u'man', 0.6557512283325195), (u'girl', 0.5882835388183594), (u'lady', 0.5754351615905762), (u'teenage_girl', 0.5700528621673584), (u'teenager', 0.5378326177597046), (u'schoolgirl', 0.497780978679657), (u'policewoman', 0.49065014719963074), (u'blonde', 0.4870774447917938), (u'redhead', 0.4778464436531067)]


In [80]:
from __future__ import print_function
import numpy as np
def doc2avgcbow(sentence):
    # this analyzer from sklearn tokenizes including stop words and lower case
    analyze = CountVectorizer(stop_words='english').build_analyzer()
    tokens = analyze(sentence)
    avg_cbow = np.zeros(100)
    N = 0
    for token in tokens:
        #print('   {}'.format(token))
        if token in word_vectors.vocab:
            N += 1
            avg_cbow += word_vectors[token]
    if N != 0:
        # Some jokes contain nothing in our vocabulary.  Examples.
        #  What do you call a shart? Woopsie Poopsie :) 
        # 1 2 3 4 5 6
        # Who do you call when a sleepwalker injures himself? The somnambulance.
        avg_cbow = avg_cbow/float(N)
    return avg_cbow

def avg_cbow():
    sentences = df['title'] + ' ' + df['body']
    for s in sentences:
        yield doc2avgcbow(s)

df['cbow'] = [f for f in avg_cbow()]


In [88]:
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
X = df['cbow']
y = pd.cut(df.score, 3, labels=['low', 'med', 'high'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf = DecisionTreeClassifier()
clf.fit(X_train.tolist(), y_train.tolist())
score = clf.score(X_test.tolist(), y_test.tolist())
print('{}'.format(score))

0.9981496235
