# Train word2vec locally

This allows a smart initialization of our neural net's word embeddings.
It seems that initializing the embeddings by training them locally, as opposed to using pre-trained word2vec embeddings (available online) can lead to better performance.

In [27]:
import os
import sys
print(sys.executable)

/Users/andrei/anaconda3/envs/cil/bin/python


In [28]:
from gensim.models.word2vec import Word2Vec

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [29]:
TRAIN = os.path.join('..', 'train')
TEST = os.path.join('..', 'test')
POS_TWEET_FILE = os.path.join(TRAIN, 'train_pos_full.txt')
NEG_TWEET_FILE = os.path.join(TRAIN, 'train_pos_full.txt')
TEST_TWEET_FILE = os.path.join(TEST, 'test_data.txt')
EMBEDDING_SIZE = 275

In [30]:
def read_tweets(fname):
    """Read the tweets in the given file.
    
    Returns a 2d array where every row is a tweet, split into words.
    """
    with open(fname, 'r') as f:
        return [l.split() for l in f.readlines()]

In [31]:
pos_tweets = read_tweets(POS_TWEET_FILE)

In [32]:
neg_tweets = read_tweets(NEG_TWEET_FILE)

In [33]:
test_tweets = read_tweets(TEST_TWEET_FILE)

In [34]:
sentences = pos_tweets + neg_tweets + test_tweets
print(len(sentences))

2510000


In [35]:
# Download this for testing: https://github.com/arfon/word2vec/blob/master/questions-words.txt
# Highly recommended!

question_file = "questions-words.txt"

def eval_embeddings(model):
    accuracy_results = model.accuracy(question_file)
    summary = accuracy_results[-1]
    assert summary['section'] == 'total'
    incorrect = summary['incorrect']
    correct = summary['correct']

    incorrect_n = len(incorrect)
    correct_n = len(correct)

    acc = correct_n / incorrect_n
    return acc, correct_n, incorrect_n

In [36]:
WORKERS = 8
# Note: Moises's team uses size=200 as of June 13.
# See: https://groups.google.com/forum/#!msg/gensim/ggCHGncd5rU/Z_pQDD69AAAJ
# for some parameter hints.
model = Word2Vec(sentences, size=EMBEDDING_SIZE, window=10, min_count=5, workers=WORKERS)# , alpha=0.05, cbow_mean=1)

In [37]:
# Should be queen
model.most_similar(positive=['woman', 'king'], negative=['man'])

[('queen', 0.4792458415031433),
 ('alpha', 0.38749757409095764),
 ('madonna', 0.3621598482131958),
 ('pageants', 0.353053480386734),
 ('kenny', 0.3485540747642517),
 ('goddess', 0.34342798590660095),
 ('hawk', 0.342306524515152),
 ('protects', 0.34013429284095764),
 ('sinner', 0.3393355906009674),
 ('president', 0.3371276557445526)]

In [13]:
# Should be germany
model.most_similar(positive=['france', 'berlin'], negative=['paris'])

[('croatia', 0.6066615581512451),
 ('finland', 0.6033172607421875),
 ('holland', 0.5851148366928101),
 ('norway', 0.5845724940299988),
 ('belgium', 0.5838773250579834),
 ('switzerland', 0.5790820121765137),
 ('sweden', 0.578895628452301),
 ('germany', 0.5740625858306885),
 ('spain', 0.5683605670928955),
 ('portugal', 0.5624834299087524)]

In [14]:
model.doesnt_match("breakfast cereal dinner lunch".split())

'cereal'

In [15]:
model.estimate_memory()

{'syn0': 83779200, 'syn1neg': 83779200, 'total': 202466400, 'vocab': 34908000}

In [19]:
# A few more sanity checks
print(model.similarity('woman', 'man'))
print(model.similarity('woman', 'coffee'))
print(model.similarity('woman', 'penis'))
print(model.similarity('woman', 'football'))

0.531663722765
0.148042275749
0.245260193706
0.0137228767854


In [20]:
print(model.similarity('car','man'))
print(model.similarity('car','truck'))

0.201689831471
0.686243034977


In [25]:
acc, correct_n, incorrect_n = eval_embeddings(model)
print("{0:5.3f} accuracy; Analogies: {1} correct, {2} incorrect".format(
    acc, correct_n, incorrect_n))

0.340 accuracy; Analogies: 2339 correct, 6871 incorrect


### Accuracies (full Twitter data)
 * Vanilla (size=225, window=5, min_count=5): 0.319 accuracy; Analogies: 2233 correct, 7004 incorrect
 * (size=300, window=5,  min_count=5): 0.337 accuracy; Analogies: 2329 correct, 6908 incorrect
 * (size=500, window=5,  min_count=5): 0.330 accuracy; Analogies: 2292 correct, 6945 incorrect
 * (size=300, window=10, min_count=5): 0.346 accuracy; Analogies: 2374 correct, 6863 incorrect
 * (size=300, window=15, min_count=5): 0.342 accuracy; Analogies: 2356 correct, 6881 incorrect



In [86]:
# TODO(andrei): Save the model!
fname = "./word2vec-local-gensim-{0}.bin".format(EMBEDDING_SIZE)
model.save(fname)

### Some experimentation

In [26]:
emb_sizes = [100, 150, 175, 200, 225, 250, 275, 300, 325, 350, 375, 400]

for emb_size in emb_sizes:
    print("Computing embeddings of size {0}...".format(emb_size))
    model = Word2Vec(sentences, size=emb_size, window=10, min_count=5, workers=WORKERS)
    print("Evaluating embeddings of size {0}...".format(emb_size))
    acc, correct_n, incorrect_n = eval_embeddings(model)
    print("Size {3}: {0:5.3f} accuracy; Analogies: {1} correct, {2} incorrect".format(
        acc, correct_n, incorrect_n, emb_size))

Computing embeddings of size 100...
Evaluating embeddings of size 100...
Size 100: 0.274 accuracy; Analogies: 1983 correct, 7227 incorrect
Computing embeddings of size 150...
Evaluating embeddings of size 150...
Size 150: 0.317 accuracy; Analogies: 2216 correct, 6994 incorrect
Computing embeddings of size 175...
Evaluating embeddings of size 175...
Size 175: 0.323 accuracy; Analogies: 2246 correct, 6964 incorrect
Computing embeddings of size 200...
Evaluating embeddings of size 200...
Size 200: 0.337 accuracy; Analogies: 2323 correct, 6887 incorrect
Computing embeddings of size 225...
Evaluating embeddings of size 225...
Size 225: 0.336 accuracy; Analogies: 2316 correct, 6894 incorrect
Computing embeddings of size 250...
Evaluating embeddings of size 250...
Size 250: 0.337 accuracy; Analogies: 2319 correct, 6891 incorrect
Computing embeddings of size 275...
Evaluating embeddings of size 275...
Size 275: 0.346 accuracy; Analogies: 2369 correct, 6841 incorrect
Computing embeddings of siz