# Word2Vec & Doc2Vec with Gensim

## Tim Hochberg

# What is Word2Vec?

* **Unsupervised** method of mapping words to a vector space


In [2]:
from gensim.models import Word2Vec
# Load pretrained model from https://github.com/3Top/word2vec-api/blob/master/GoogleNews-vectors-negative300.bin.gz
# The archive is read from a path relative to the working directory;
# binary=True tells the loader the file is in the binary word2vec format.
# NOTE(review): newer gensim releases expose this loader on KeyedVectors
# rather than Word2Vec -- confirm against the installed gensim version.
model = Word2Vec.load_word2vec_format("GoogleNews-vectors-negative300.bin.gz", binary=True)

Using gpu device 0: GeForce GT 750M (CNMeM is disabled)


In [3]:
len(model["car"]) # Indexing the model by a word returns its vector; words map to 300 element vectors

300

In [4]:
# Peek at the first three components of the vector for "car"
model["car"][:3]

array([ 0.04943277,  0.00318177,  0.01263487], dtype=float32)

* Similar words get mapped to nearby vectors


In [5]:
# Cosine similarity between the two word lists: related words score high.
model.n_similarity(['car'], ['truck'])

0.67357901840345347

In [6]:
# Same measure for an unrelated pair: the score is much lower.
model.n_similarity(['car'], ['fish'])

0.092147685651241318

In [7]:
# Words whose vectors lie closest to the vector for "cow", with their similarity scores.
model.most_similar(['cow'])

[('cows', 0.7792555093765259),
 ('pig', 0.6542098522186279),
 ('dairy_cow', 0.6442502737045288),
 ('bovines', 0.6437903642654419),
 ('bovine', 0.6407181620597839),
 ('goat', 0.6359611749649048),
 ('cattle', 0.631946325302124),
 ('sheep', 0.6081749200820923),
 ('Holstein_cow', 0.6066932678222656),
 ('goats', 0.5846577286720276)]

* Vector arithmetic solves analogy questions

In [8]:
# man:king as woman:?
# Vectors of the `positive` words are added and those of the `negative` words
# subtracted (king + woman - man); topn=4 keeps the four closest words.
model.most_similar(positive=["king", "woman"], negative=["man"], topn=4)

[('queen', 0.7118192911148071),
 ('monarch', 0.6189674139022827),
 ('princess', 0.5902431607246399),
 ('crown_prince', 0.5499460697174072)]

In [9]:
# Google:Android as Apple:?
# i.e. the four words closest to Android + Apple - Google
model.most_similar(positive=["Android", "Apple"], negative=["Google"], topn=4)

[('iPhone', 0.6760426759719849),
 ('Android_OS', 0.6658217906951904),
 ('iOS', 0.6550344228744507),
 ('WP7', 0.6534616947174072)]

In [10]:
# knee:leg as elbow:?
# i.e. the four words closest to leg + elbow - knee
model.most_similar(positive=["leg", "elbow"], negative=["knee"], topn=4)

[('forearm', 0.6154942512512207),
 ('arm', 0.5596616268157959),
 ('legs', 0.5394924879074097),
 ('puncturing_lung', 0.5146561861038208)]

*There are more examples like this, but many sensible analogies fail*

# How?

* Words appearing in similar contexts mapped close together
* word2vec is a family of related algorithms
   - Continuous Bag of Words (CBOW) | Skip-Gram
   - Hierarchical Softmax | Negative-Sampling
   

![skip-gram](skip_gram.png)
*From [word2vec Parameter Learning Explained](http://www-personal.umich.edu/~ronxin/pdf/w2vexp.pdf)*

* Words one-hot encoded (~1e6 words in English)
* Network trained to predict context based on word
* Hidden layer is the word vector

# How to Train on Custom Corpus

1. Convert corpus to list of 'sentences'
   - each sentence is a list of 'words'
   - 'words' may be phrases or punctuation


## Convert movie review data to correct form

Data from [http://ai.stanford.edu/~amaas/data/sentiment/](http://ai.stanford.edu/~amaas/data/sentiment/)

In [11]:
def review_to_wordlist(review_text):
    """Split a raw review into a list of lower-case tokens.

    HTML line breaks (``<br />``) become spaces, and each of the punctuation
    characters ``"()?.;:!,`` is padded with spaces so that it comes out as a
    token of its own. The text is then lower-cased and split on whitespace.

    Args:
        review_text: The review as a single string.

    Returns:
        A list of token strings (words and stand-alone punctuation marks).
    """
    punctuation = '"()?.;:!,'
    without_breaks = review_text.replace("<br />", " ")
    # Single pass over the text: pad punctuation, copy everything else as is.
    padded = "".join(
        (" " + ch + " ") if ch in punctuation else ch
        for ch in without_breaks
    )
    return padded.lower().split()

In [12]:
from glob import glob
base = "aclImdb/"

# Tokenised training reviews and their sentiment labels. Labelled reviews are
# appended first, so train_sentiment[i] is the label of train_sentences[i]
# for every i < len(train_sentiment); the unlabelled reviews follow.
train_sentences = []
train_sentiment = []

# Positive reviews -> label 1.
for path in glob(base + "train/pos/*.txt"):
    # The context manager closes each file as soon as it has been read,
    # instead of leaving thousands of handles for the garbage collector.
    with open(path) as review_file:
        train_sentences.append(review_to_wordlist(review_file.read()))
    train_sentiment.append(1)

# Negative reviews -> label 0.
for path in glob(base + "train/neg/*.txt"):
    with open(path) as review_file:
        train_sentences.append(review_to_wordlist(review_file.read()))
    train_sentiment.append(0)

# Unlabelled reviews: added to the corpus only, no sentiment entry.
for path in glob(base + "train/unsup/*.txt"):
    with open(path) as review_file:
        train_sentences.append(review_to_wordlist(review_file.read()))

In [13]:
# Tokenised test reviews and their sentiment labels, index-aligned.
test_sentences = []
test_sentiment = []

# Positive reviews -> label 1.
for path in glob(base + "test/pos/*.txt"):
    # The context manager closes each file as soon as it has been read,
    # instead of leaving thousands of handles for the garbage collector.
    with open(path) as review_file:
        test_sentences.append(review_to_wordlist(review_file.read()))
    test_sentiment.append(1)

# Negative reviews -> label 0.
for path in glob(base + "test/neg/*.txt"):
    with open(path) as review_file:
        test_sentences.append(review_to_wordlist(review_file.read()))
    test_sentiment.append(0)

In [14]:
# Check how many reviews we have in total (each 'sentence' is one whole tokenised review)
print(len(train_sentences)) 
print(len(test_sentences))
# Show the token list of the first training review
print(train_sentences[0])

75000
25000
['bromwell', 'high', 'is', 'a', 'cartoon', 'comedy', '.', 'it', 'ran', 'at', 'the', 'same', 'time', 'as', 'some', 'other', 'programs', 'about', 'school', 'life', ',', 'such', 'as', '"', 'teachers', '"', '.', 'my', '35', 'years', 'in', 'the', 'teaching', 'profession', 'lead', 'me', 'to', 'believe', 'that', 'bromwell', "high's", 'satire', 'is', 'much', 'closer', 'to', 'reality', 'than', 'is', '"', 'teachers', '"', '.', 'the', 'scramble', 'to', 'survive', 'financially', ',', 'the', 'insightful', 'students', 'who', 'can', 'see', 'right', 'through', 'their', 'pathetic', "teachers'", 'pomp', ',', 'the', 'pettiness', 'of', 'the', 'whole', 'situation', ',', 'all', 'remind', 'me', 'of', 'the', 'schools', 'i', 'knew', 'and', 'their', 'students', '.', 'when', 'i', 'saw', 'the', 'episode', 'in', 'which', 'a', 'student', 'repeatedly', 'tried', 'to', 'burn', 'down', 'the', 'school', ',', 'i', 'immediately', 'recalled', '.', '.', '.', '.', '.', '.', '.', '.', '.', 'at', '.', '.', '.', '.'

# How to Train on Custom Corpus

1. Convert corpus to list of 'sentences'
   - each sentence is a list of 'words'
   - 'words' may be phrases or punctuation
2. Train 


In [15]:
from gensim.models import word2vec       

# Train word vectors on every training review (labelled and unlabelled alike).
# NOTE(review): `size` is the keyword used by older gensim releases; newer
# releases name it `vector_size` -- confirm against the installed version.
movie_model = word2vec.Word2Vec(train_sentences, 
                                workers=6, # Number of threads to run in parallel
                                size=100,  # Word vector dimensionality
                                window=10  # Context window size 
                               )

In [16]:
# Nearest neighbours of "alien" according to the model trained on movie reviews
movie_model.most_similar("alien")

[('aliens', 0.7625104784965515),
 ('robot', 0.688758373260498),
 ('predator', 0.6847430467605591),
 ('mutant', 0.6433262825012207),
 ('planet', 0.6344485878944397),
 ('space', 0.6288543343544006),
 ('virus', 0.6277874708175659),
 ('attack', 0.6222692728042603),
 ('creature', 0.6197825074195862),
 ('monster', 0.617556095123291)]

In [17]:
# Same query against the pretrained GoogleNews model, for comparison
model.most_similar("alien")

[('aliens', 0.786628246307373),
 ('extraterrestrial', 0.6394338607788086),
 ('extra_terrestrials', 0.6255451440811157),
 ('Klaatu_Keanu_Reeves', 0.6106236577033997),
 ('extraterrestrials', 0.60221266746521),
 ('intergalactic', 0.6010125875473022),
 ('humanoid_aliens', 0.6000779867172241),
 ('earthling', 0.5918036699295044),
 ('Alien', 0.5912981629371643),
 ('alien_invader', 0.5830426812171936)]

# How to Train on Custom Corpus

1. Convert corpus to list of 'sentences'
   - each sentence is a list of 'words'
   - 'words' may be phrases or punctuation
2. Train 
3. **???**
4. Profit!

# ???

* Usually want to compare larger units of text
* Word vectors can be used as features for higher level model
   - Recurrent Neural Net (RNN)
   - Bag of centroids
   - Fixed length models
   - ...
* Ideally want equivalent to word2vec on chunks of text

# Doc2vec (aka Paragraph2vec)

* doc2vec is a family of related algorithms (again)
    - Distributed Memory (PV-DM) | Distributed Bag of Words (PV-DBOW)
    - Hierarchical Softmax | Negative-Sampling 
    - Maps paragraphs/documents to vectors
    - PV-DM also maps words to vectors
* Can compare web pages, movie reviews, etc.

![PV-DBOW](PV-DBOW.png)
*From [Distributed Representations of Sentences and Documents](https://cs.stanford.edu/~quocle/paragraph_vector.pdf)*

# Sentiment Analysis

In [18]:
from gensim.models.doc2vec import LabeledSentence, Doc2Vec
import numpy as np

# Wrap every training review as a LabeledSentence tagged 'P<i>', where i is the
# review's index in train_sentences; the tag is the key under which the
# review's document vector can be looked up after training.
labelled_sentences = [LabeledSentence(words=s, 
                                      tags=['P{0}'.format(i)]) for (i,s) in enumerate(train_sentences)]

def train(model, alpha=0.025, min_alpha=0.001, epochs=20):
    """Build the vocabulary of ``model`` and train it on ``labelled_sentences``.

    The learning rate is stepped linearly from ``alpha`` down to ``min_alpha``
    over ``epochs`` loop iterations. Every iteration prints the current rate,
    reshuffles a private copy of the corpus, pins both ``model.alpha`` and
    ``model.min_alpha`` to that rate and calls ``model.train`` once.

    Args:
        model: Object exposing ``build_vocab``, ``train``, ``alpha`` and
            ``min_alpha`` (in this notebook, a Doc2Vec model).
        alpha: Learning rate used in the first iteration.
        min_alpha: Learning rate used in the last iteration.
        epochs: Number of iterations (and of learning-rate steps).
    """
    model.build_vocab(labelled_sentences)
    # Shuffle a copy so the module-level list keeps its original order.
    corpus = list(labelled_sentences)
    learning_rates = np.linspace(alpha, min_alpha, epochs)
    for rate in learning_rates:
        print("alpha =", rate)
        np.random.shuffle(corpus)
        # Fix the rate for this call: start and end values are the same.
        model.alpha = rate
        model.min_alpha = rate
        model.train(corpus)

## Train two networks

In [19]:
# Distributed Memory variant (dm=1) with 100-dimensional vectors.
# NOTE(review): hs=0 together with negative=10 presumably selects negative
# sampling instead of hierarchical softmax -- confirm against the gensim docs.
pv_dm_model = Doc2Vec(size=100, negative=10, window=5, min_count=2, dm=1, hs=0, dm_concat=0)
# Builds the vocabulary and runs the decaying-learning-rate loop defined in train()
train(pv_dm_model)

alpha = 0.025
alpha = 0.0237368421053
alpha = 0.0224736842105
alpha = 0.0212105263158
alpha = 0.0199473684211
alpha = 0.0186842105263
alpha = 0.0174210526316
alpha = 0.0161578947368
alpha = 0.0148947368421
alpha = 0.0136315789474
alpha = 0.0123684210526
alpha = 0.0111052631579
alpha = 0.00984210526316
alpha = 0.00857894736842
alpha = 0.00731578947368
alpha = 0.00605263157895
alpha = 0.00478947368421
alpha = 0.00352631578947
alpha = 0.00226315789474
alpha = 0.001


In [20]:
# Distributed Bag of Words variant (dm=0) with 100-dimensional vectors.
pv_dbow_model = Doc2Vec(size=100, negative=10, window=10, min_count=2, hs=0, dm=0)
# Same training schedule as the PV-DM model, via train()
train(pv_dbow_model)

alpha = 0.025
alpha = 0.0237368421053
alpha = 0.0224736842105
alpha = 0.0212105263158
alpha = 0.0199473684211
alpha = 0.0186842105263
alpha = 0.0174210526316
alpha = 0.0161578947368
alpha = 0.0148947368421
alpha = 0.0136315789474
alpha = 0.0123684210526
alpha = 0.0111052631579
alpha = 0.00984210526316
alpha = 0.00857894736842
alpha = 0.00731578947368
alpha = 0.00605263157895
alpha = 0.00478947368421
alpha = 0.00352631578947
alpha = 0.00226315789474
alpha = 0.001


## Generate Features

*Note that we **infer** vectors for the test features*

In [21]:
# Only the first n_labeled training reviews carry a sentiment label: the
# positive and negative reviews were appended to train_sentences before the
# unlabelled ones, so indices 0..n_labeled-1 line up with train_sentiment.
n_labeled = len(train_sentiment)

# One feature row per labelled review: its PV-DM and PV-DBOW document vectors,
# looked up by the 'P<i>' tag, joined end to end.
train_features = np.array([np.concatenate([pv_dm_model.docvecs['P{0}'.format(i)], 
                                           pv_dbow_model.docvecs['P{0}'.format(i)]]
                                         ) for i in range(n_labeled) ])
train_labels = np.array(train_sentiment)

# Test reviews were not in the training corpus, so there is no stored vector
# to look up; infer_vector is called on each token list with both models.
test_features = np.array([np.concatenate([pv_dm_model.infer_vector(x),
                                          pv_dbow_model.infer_vector(x)
                                         ]) for x in test_sentences])

test_labels = np.array(test_sentiment)

## Train a Classifier

In [22]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with scikit-learn's default settings
classifier = LogisticRegression()

# Last expression of the cell, so the value returned by fit() is displayed
classifier.fit(train_features, train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [23]:
predicted = classifier.predict(test_features)
# Fraction of test reviews whose predicted label equals the true label
accuracy = (test_labels == predicted).mean()

# Reference test error rates, for comparison with the error printed below
# (note: printed as a fraction, not a percentage):
# Paragraph2Vec Paper: 7.42%
# Doc2Vec Example: 9.48%
print(1 - accuracy)

0.11524


# Links

* Original word2vec paper: http://www.cs.columbia.edu/~blei/seminar/2016_discrete_data/readings/MikolovSutskeverChenCorradoDean2013.pdf
* word2vec Parameter Learning Explained: http://www-personal.umich.edu/~ronxin/pdf/w2vexp.pdf
* Distributed Representations of Sentences and Documents: https://cs.stanford.edu/~quocle/paragraph_vector.pdf
* Gensim: https://radimrehurek.com/gensim/
* Kaggle word2vec tutorial: https://www.kaggle.com/c/word2vec-nlp-tutorial
* piskvorky IMDB Doc2Vec example: https://github.com/piskvorky/gensim/blob/develop/docs/notebooks/doc2vec-IMDB.ipynb