## Training word vectors in Gensim
https://radimrehurek.com/gensim/models/word2vec.html

## Gensim on GitHub
https://github.com/RaRe-Technologies/gensim

## word2vec source code in Gensim on GitHub
https://github.com/RaRe-Technologies/gensim/blob/develop/gensim/models/word2vec.py

## Also: blog post on deep learning with word2vec and Gensim
https://rare-technologies.com/deep-learning-with-word2vec-and-gensim/

In [1]:
# Create some raw documents: a handful of short texts, one string per document
raw_documents = [
    'I love tacos.',
    'She ran with the chicken.',
    'I don’t choose to take a nap. The nap chooses me.',
    'That man is nice as pie with ice cream.',
    'This pizza is an affront to nature.',
]
from nltk.tokenize import word_tokenize
def get_tokens(text):
    """Return the tokens that NLTK's word_tokenize produces for `text`.

    The tokens are returned exactly as the tokenizer yields them; no case
    folding or filtering is applied here.
    """
    return word_tokenize(text)

In [2]:
import gensim

# Tokenize every raw document, then train a small word2vec model on the result.
# min_count=1 and size=5 are passed to gensim; the vectors printed below have five components.
sentences = list(map(get_tokens, raw_documents))
model = gensim.models.word2vec.Word2Vec(sentences, min_count=1, size=5)

# Print the learned vector for a few of the words
for word in ('ran', 'love', 'tacos'):
    print(model[word])



[-0.08233323 -0.06830415 -0.01001786  0.0555655  -0.00198256]
[ 0.04440577 -0.09361976 -0.07637561  0.01410124 -0.04199225]
[ 0.0774129   0.02489747  0.05281512 -0.07839555  0.08457994]


In [3]:
# Collect the keys of model.vocab (the words, as the output shows) and display the first ten
vocab = list(model.vocab.keys())
vocab[:10]

['with', 'tacos', 'man', 'pizza', 'ice', 'That', 'take', 'The', 'nap', 'This']

In [4]:
# Ask the tiny model which words it ranks as most similar to 'tacos' (returns word/score pairs)
model.most_similar('tacos')

[('chicken', 0.8110033273696899),
 ('is', 0.6826239824295044),
 ('nature', 0.607844352722168),
 ('choose', 0.5738837718963623),
 ('an', 0.5073656439781189),
 ('affront', 0.39505720138549805),
 ('The', 0.15771405398845673),
 ('a', 0.12146198749542236),
 ('.', 0.12046509981155396),
 ('This', 0.09019875526428223)]

In [5]:
# Make sure you are using C underneath.
# The assertion fails when FAST_VERSION is -1 or lower (presumably the value gensim
# reports when its compiled routines are unavailable; confirm against the gensim docs).
assert gensim.models.doc2vec.FAST_VERSION > -1

In [6]:
# http://scikit-learn.org/stable/datasets/twenty_newsgroups.html#newsgroups
# Fetch the training subset of the 20 Newsgroups dataset and list the attributes of the returned object
from sklearn.datasets import fetch_20newsgroups
texts = fetch_20newsgroups(subset='train')
dir(texts)

['DESCR', 'data', 'description', 'filenames', 'target', 'target_names']

In [7]:
# Show the file paths recorded for the posts (local paths on the machine that ran this cell)
print (texts.filenames)

[ '/Users/jmugan/scikit_learn_data/20news_home/20news-bydate-train/rec.autos/102994'
 '/Users/jmugan/scikit_learn_data/20news_home/20news-bydate-train/comp.sys.mac.hardware/51861'
 '/Users/jmugan/scikit_learn_data/20news_home/20news-bydate-train/comp.sys.mac.hardware/51879'
 ...,
 '/Users/jmugan/scikit_learn_data/20news_home/20news-bydate-train/comp.sys.ibm.pc.hardware/60695'
 '/Users/jmugan/scikit_learn_data/20news_home/20news-bydate-train/comp.graphics/38319'
 '/Users/jmugan/scikit_learn_data/20news_home/20news-bydate-train/rec.motorcycles/104440']


In [8]:
# 11,314 posts
# target is an array of integers, one per post; target_names lists the newsgroup names
# (presumably each integer indexes into target_names; confirm against the scikit-learn docs)
print(len(texts.target))
print(texts.target)
print(texts.target_names)

11314
[7 4 4 ..., 3 1 8]
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [9]:
# Keep a shorter handle on the raw post texts and check how many there are
data = texts.data
len(data)

11314

In [10]:
# Look at the first post as a raw string (header lines and body together)
data[0]

"From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n"

In [11]:
def get_tokens(text):
    """Tokenize `text` with NLTK's word_tokenize and lowercase every token.

    This replaces the get_tokens defined in the first cell, which returned
    the tokens without changing their case.
    """
    return [tok.lower() for tok in word_tokenize(text)]

# Check the tokenizer on the first post
print(get_tokens(data[0]))

['from', ':', 'lerxst', '@', 'wam.umd.edu', '(', 'where', "'s", 'my', 'thing', ')', 'subject', ':', 'what', 'car', 'is', 'this', '!', '?', 'nntp-posting-host', ':', 'rac3.wam.umd.edu', 'organization', ':', 'university', 'of', 'maryland', ',', 'college', 'park', 'lines', ':', '15', 'i', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'i', 'saw', 'the', 'other', 'day', '.', 'it', 'was', 'a', '2-door', 'sports', 'car', ',', 'looked', 'to', 'be', 'from', 'the', 'late', '60s/', 'early', '70s', '.', 'it', 'was', 'called', 'a', 'bricklin', '.', 'the', 'doors', 'were', 'really', 'small', '.', 'in', 'addition', ',', 'the', 'front', 'bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', '.', 'this', 'is', 'all', 'i', 'know', '.', 'if', 'anyone', 'can', 'tellme', 'a', 'model', 'name', ',', 'engine', 'specs', ',', 'years', 'of', 'production', ',', 'where', 'this', 'car', 'is', 'made', ',', 'history', ',', 'or', 'whatever', 'inf

In [12]:
# We will treat each document as a sentence:
# tokenize every post with get_tokens, then print the first token list as a check
sentences = [get_tokens(doc) for doc in data]
print(sentences[0])

['from', ':', 'lerxst', '@', 'wam.umd.edu', '(', 'where', "'s", 'my', 'thing', ')', 'subject', ':', 'what', 'car', 'is', 'this', '!', '?', 'nntp-posting-host', ':', 'rac3.wam.umd.edu', 'organization', ':', 'university', 'of', 'maryland', ',', 'college', 'park', 'lines', ':', '15', 'i', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'i', 'saw', 'the', 'other', 'day', '.', 'it', 'was', 'a', '2-door', 'sports', 'car', ',', 'looked', 'to', 'be', 'from', 'the', 'late', '60s/', 'early', '70s', '.', 'it', 'was', 'called', 'a', 'bricklin', '.', 'the', 'doors', 'were', 'really', 'small', '.', 'in', 'addition', ',', 'the', 'front', 'bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', '.', 'this', 'is', 'all', 'i', 'know', '.', 'if', 'anyone', 'can', 'tellme', 'a', 'model', 'name', ',', 'engine', 'specs', ',', 'years', 'of', 'production', ',', 'where', 'this', 'car', 'is', 'made', ',', 'history', ',', 'or', 'whatever', 'inf

In [13]:
# Train word2vec on the newsgroup token lists.
# min_count=3 presumably drops words seen fewer than three times and size=200 presumably
# sets the vector length (the size=5 model above printed five-component vectors); confirm against the gensim docs.
model_ng = gensim.models.word2vec.Word2Vec(sentences,min_count=3,size=200)

In [14]:
# Words the newsgroup-trained model ranks as most similar to 'man'
model_ng.most_similar('man')

[('woman', 0.8316221237182617),
 ('father', 0.7187923789024353),
 ('christ', 0.7182186245918274),
 ('god', 0.7141736745834351),
 ('son', 0.705515444278717),
 ('jesus', 0.7020677924156189),
 ('person', 0.7012173533439636),
 ('lord', 0.6988461017608643),
 ('child', 0.6978594064712524),
 ('satan', 0.6909868717193604)]

In [15]:
# Download and unzip corpus from http://mattmahoney.net/dc/text8.zip
# The corpus location is read from the TEXT8_PATH environment variable when it is set;
# otherwise it defaults to ~/Downloads/text8, so the notebook is not tied to one user's home directory.
import os
text8_path = os.environ.get('TEXT8_PATH', os.path.join(os.path.expanduser('~'), 'Downloads', 'text8'))
sents = gensim.models.word2vec.Text8Corpus(text8_path, max_sentence_length=10000)

In [16]:
# Train word2vec on the text8 corpus.
# iter=15 presumably sets the number of training passes over the corpus; confirm against the gensim docs.
model_t8 = gensim.models.word2vec.Word2Vec(sents,min_count=5,size=200,iter=15)

In [17]:
# Same query as for the newsgroup model, now against the text8-trained model
model_t8.most_similar('man')

[('woman', 0.6351103782653809),
 ('men', 0.526848316192627),
 ('person', 0.507014274597168),
 ('girl', 0.4971894919872284),
 ('boy', 0.492550253868103),
 ('creature', 0.47789499163627625),
 ('gentleman', 0.47342896461486816),
 ('stranger', 0.464810311794281),
 ('thief', 0.4601999521255493),
 ('god', 0.4542997479438782)]

In [18]:
# Words the text8-trained model ranks as most similar to 'happy'
model_t8.most_similar('happy')

[('quiet', 0.5618761777877808),
 ('lucky', 0.5422603487968445),
 ('merry', 0.5137842893600464),
 ('laugh', 0.5086448192596436),
 ('agony', 0.48725980520248413),
 ('miserable', 0.4734904170036316),
 ('awake', 0.4691343605518341),
 ('sad', 0.4645567834377289),
 ('hungry', 0.462251216173172),
 ('angry', 0.45838749408721924)]

In [19]:
# Analogy-style query: 'woman' and 'king' as positive terms, 'man' as the negative term;
# the top-ranked result in the output is 'queen'
model_t8.most_similar(positive=['woman', 'king'], negative=['man'])

[('queen', 0.6193341016769409),
 ('isabella', 0.5154644250869751),
 ('princess', 0.48500490188598633),
 ('husband', 0.4801233112812042),
 ('regent', 0.4659966230392456),
 ('throne', 0.4594484567642212),
 ('jadwiga', 0.458169549703598),
 ('matilda', 0.4560677409172058),
 ('monarch', 0.45564383268356323),
 ('consort', 0.45116063952445984)]

In [20]:
# Bonus doc2vec
# More details here https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/doc2vec-IMDB.ipynb
from gensim.models.doc2vec import TaggedDocument

In [21]:
# https://radimrehurek.com/gensim/models/doc2vec.html
# recall sentences from Newsgroup data
# NOTE(review): this tokenizes texts.data again; `data` was assigned from texts.data earlier
# and tokenized the same way, so this looks like a recomputation of that list. Confirm before removing.
sentences = [get_tokens(doc) for doc in texts.data]

In [22]:
# Wrap every token list in a TaggedDocument that carries a single "sent_<index>" tag
tagged_documents = [
    TaggedDocument(tokens, ["sent_{}".format(idx)])
    for idx, tokens in enumerate(sentences)
]
# Train doc2vec on the tagged documents
d2v_model = gensim.models.doc2vec.Doc2Vec(tagged_documents, size=300)

In [23]:
# The doc2vec model can also be queried for similar words, here for 'fast'
d2v_model.most_similar('fast')

[('long', 0.6099879145622253),
 ('far', 0.5357909798622131),
 ('slow', 0.5179504156112671),
 ('risky', 0.5038941502571106),
 ('well', 0.49702027440071106),
 ('soon', 0.4968246817588806),
 ('quickly', 0.49398237466812134),
 ('quick', 0.48880892992019653),
 ('busy', 0.4786657691001892),
 ('much', 0.47404593229293823)]

In [24]:
# Infer a document vector for the token list ['i', 'love', 'tacos'] and print it
vec0 = d2v_model.infer_vector('i love tacos'.split())
print(vec0)

[ -2.98566967e-02  -4.36381325e-02  -2.15346087e-02   1.03564030e-02
  -3.18286829e-02   1.97017882e-02  -4.06667445e-04   1.83063131e-02
  -2.96763089e-02   1.85645334e-02   2.87281927e-02  -1.09502999e-02
  -9.92021640e-04  -2.25025680e-04  -8.00123904e-04   4.98046819e-03
  -1.97231490e-02  -9.75857675e-03  -1.12661123e-02   1.80483516e-02
  -1.92526332e-03  -2.93154214e-02  -1.05524519e-02   2.79053897e-02
   9.89449397e-03   2.19989065e-02  -7.75479572e-03  -1.52771017e-02
  -2.49881726e-02   8.98548681e-03  -3.32210734e-02   1.88188087e-02
   9.11299046e-03   2.63141338e-02  -1.10818679e-02   8.20602942e-03
  -4.14928496e-02   6.96951430e-03  -9.75239370e-03   8.89556541e-04
  -1.07758055e-02   6.25817291e-03   1.98161621e-02  -4.36437167e-02
  -3.90608311e-02  -5.79272173e-02  -3.22807916e-02   1.80786650e-03
   3.50661166e-02  -1.69788506e-02   5.97968101e-05  -9.32477508e-03
   6.51244167e-03  -2.59092487e-02  -9.15811770e-03   5.91427600e-03
   9.22613144e-02  -1.23681249e-02

In [25]:
# Look up the tagged training documents whose vectors are most similar to the inferred vector
d2v_model.docvecs.most_similar( [ vec0 ] )

[('sent_9692', 0.370047926902771),
 ('sent_6653', 0.34805458784103394),
 ('sent_8179', 0.3007533550262451),
 ('sent_78', 0.29340916872024536),
 ('sent_1599', 0.2916225790977478),
 ('sent_4884', 0.27913421392440796),
 ('sent_4903', 0.27526798844337463),
 ('sent_52', 0.2742495536804199),
 ('sent_7978', 0.2725113332271576),
 ('sent_568', 0.2688180208206177)]

Exercise: compare doc2vec with the tf-idf method we did in a previous video.