# Train word2vec locally

This allows a smart initialization of our neural net's word embeddings.
It seems that initializing the embeddings by training them locally, as opposed to using pre-trained word2vec embeddings (available online) can lead to better performance.

In [2]:
import os
import sys
print(sys.executable)

/Users/andrei/anaconda3/envs/cil/bin/python


In [3]:
from gensim.models.word2vec import Word2Vec

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [4]:
TRAIN = os.path.join('..', 'train')
TEST = os.path.join('..', 'test')
POS_TWEET_FILE = os.path.join(TRAIN, 'train_pos_full.txt')
NEG_TWEET_FILE = os.path.join(TRAIN, 'train_neg_full.txt')
TEST_TWEET_FILE = os.path.join(TEST, 'test_data.txt')
EMBEDDING_SIZE = 300

In [5]:
def read_tweets(fname):
    """Read the tweets in the given file.
    
    Returns a 2d array where every row is a tweet, split into words.
    """
    with open(fname, 'r') as f:
        return [l.split() for l in f.readlines()]

In [6]:
pos_tweets = read_tweets(POS_TWEET_FILE)

In [7]:
neg_tweets = read_tweets(NEG_TWEET_FILE)

In [8]:
test_tweets = read_tweets(TEST_TWEET_FILE)

In [9]:
sentences = pos_tweets + neg_tweets + test_tweets
print(len(sentences))

2510000


In [10]:
tokens = [item.strip() for sentence in sentences for item in sentence]

In [14]:
# Check for Nikos's 1st stage substitutions.
assert '<num>' in tokens

In [13]:
# Another sanity check
print(len([t for t in tokens if 'bootstrap' == t]))

10


In [15]:
# Download this for testing: https://github.com/arfon/word2vec/blob/master/questions-words.txt
# Highly recommended!

question_file = "questions-words.txt"

def eval_embeddings(model):
    accuracy_results = model.accuracy(question_file)
    summary = accuracy_results[-1]
    assert summary['section'] == 'total'
    incorrect = summary['incorrect']
    correct = summary['correct']

    incorrect_n = len(incorrect)
    correct_n = len(correct)

    acc = correct_n / incorrect_n
    return acc, correct_n, incorrect_n

In [16]:
WORKERS = 8
# Note: Moises's team uses size=200 as of June 13.
# See: https://groups.google.com/forum/#!msg/gensim/ggCHGncd5rU/Z_pQDD69AAAJ
# for some parameter hints.
model = Word2Vec(sentences, size=EMBEDDING_SIZE, window=10, min_count=5, workers=WORKERS)# , alpha=0.05, cbow_mean=1)

In [17]:
# Yet another sanity check.
model.vocab['bootstrap'].count

10

In [18]:
# Should be queen
model.most_similar(positive=['woman', 'king'], negative=['man'])

[('queen', 0.5664982795715332),
 ('josephine', 0.4243753254413605),
 ("king's", 0.41293755173683167),
 ('witch', 0.40765368938446045),
 ('memoirs', 0.40616926550865173),
 ('empress', 0.3961029648780823),
 ('palliser', 0.3934062719345093),
 ('wealthy', 0.38778430223464966),
 ('oedipus', 0.38776156306266785),
 ('geisha', 0.3861485719680786)]

In [19]:
# Should be germany
model.most_similar(positive=['france', 'berlin'], negative=['paris'])

[('finland', 0.6568661332130432),
 ('germany', 0.6284235715866089),
 ('croatia', 0.6253992915153503),
 ('austria', 0.6232521533966064),
 ('sweden', 0.6187658905982971),
 ('switzerland', 0.615707516670227),
 ('belgium', 0.6112383604049683),
 ('denmark', 0.6090745329856873),
 ('russia', 0.6063501238822937),
 ('poland', 0.6030704975128174)]

In [20]:
model.doesnt_match("breakfast cereal dinner lunch".split())

'cereal'

In [21]:
model.estimate_memory()

{'syn0': 111271200,
 'syn1neg': 111271200,
 'total': 268905400,
 'vocab': 46363000}

In [22]:
# A few more sanity checks
print(model.similarity('woman', 'man'))
print(model.similarity('woman', 'coffee'))
print(model.similarity('woman', 'penis'))
print(model.similarity('woman', 'football'))

0.52955911881
0.114179236087
0.268376207035
0.0404427249823


In [23]:
print(model.similarity('car','man'))
print(model.similarity('car','truck'))

0.192689974775
0.640076543279


In [24]:
acc, correct_n, incorrect_n = eval_embeddings(model)
print("{0:5.3f} accuracy; Analogies: {1} correct, {2} incorrect".format(
    acc, correct_n, incorrect_n))

0.407 accuracy; Analogies: 3053 correct, 7493 incorrect


### Accuracies (full Twitter data)
 * Vanilla (size=225, window=5, min_count=5): 0.319 accuracy; Analogies: 2233 correct, 7004 incorrect
 * (size=300, window=5,  min_count=5): 0.337 accuracy; Analogies: 2329 correct, 6908 incorrect
 * (size=500, window=5,  min_count=5): 0.330 accuracy; Analogies: 2292 correct, 6945 incorrect
 * (size=300, window=10, min_count=5): 0.346 accuracy; Analogies: 2374 correct, 6863 incorrect
 * (size=300, window=15, min_count=5): 0.342 accuracy; Analogies: 2356 correct, 6881 incorrect
 * (size=400, window=10, min_count=5): 0.341 accuracy; Analogies: 2340 correct, 6870 incorrect

### Accuracties (full Twitter data + Nikos 1st stage preprocessing)
 * (size=200, window=10, min_count=5): 0.327 accuracy; Analogies: 2316 correct, 7093 incorrect
 * (size=225, window=10, min_count=5): 0.331 accuracy; Analogies: 2342 correct, 7067 incorrect
 * (size=250, window=10, min_count=5): 0.330 accuracy; Analogies: 2336 correct, 7073 incorrect
 * (size=275, window=10, min_count=5): 0.337 accuracy; Analogies: 2374 correct, 7035 incorrect
 * (size=300, window=10, min_count=5): 0.334 accuracy; Analogies: 2355 correct, 7054 incorrect
 * (size=325, window=10, min_count=5): 0.334 accuracy; Analogies: 2356 correct, 7053 incorrect
 * (size=350, window=10, min_count=5): 0.330 accuracy; Analogies: 2334 correct, 7075 incorrect
 * (size=400, window=10, min_count=5): 0.321 accuracy; Analogies: 2289 correct, 7120 incorrect
 
### After fixing Andrei's retarded bug
 * (size=300, window=10, min_count=5): 0.438 accuracy; Analogies: 3071 correct, 7019 incorrect



In [25]:
print("Embedding dimensionality: {0}".format(EMBEDDING_SIZE))

Embedding dimensionality: 300


In [26]:
fname = "./word2vec-local-gensim-{0}.bin".format(EMBEDDING_SIZE)
print("Writing embeddings to file {0}.".format(fname))
model.save(fname)
print("Done! Happy neural networking!")

Writing embeddings to file ./word2vec-local-gensim-300.bin.
Done! Happy neural networking!


### Some experimentation

In [None]:
emb_sizes = [225, 250, 275, 300, 325, 350]

for w_size in [5, 8, 10, 12]:
    for emb_size in emb_sizes:
        print("Computing embeddings of size {0} and window {1}...".format(emb_size, w_size))
        model = Word2Vec(sentences, size=emb_size, window=w_size, min_count=5, workers=4)
        print("Evaluating embeddings of size {0}...".format(emb_size))
        acc, correct_n, incorrect_n = eval_embeddings(model)
        print("Size {3}; wsize {4}: {0:5.3f} accuracy; Analogies: {1} correct, {2} incorrect".format(
            acc, correct_n, incorrect_n, emb_size, w_size))

```
Computing embeddings of size 225 and window 5...
Evaluating embeddings of size 225...
Size 225; wsize 5: 0.388 accuracy; Analogies: 2948 correct, 7598 incorrect
Computing embeddings of size 250 and window 5...
Evaluating embeddings of size 250...
Size 250; wsize 5: 0.381 accuracy; Analogies: 2907 correct, 7639 incorrect
Computing embeddings of size 275 and window 5...
Evaluating embeddings of size 275...
Size 275; wsize 5: 0.381 accuracy; Analogies: 2909 correct, 7637 incorrect
Computing embeddings of size 300 and window 5...
Evaluating embeddings of size 300...
Size 300; wsize 5: 0.392 accuracy; Analogies: 2968 correct, 7578 incorrect
Computing embeddings of size 325 and window 5...
Evaluating embeddings of size 325...
Size 325; wsize 5: 0.393 accuracy; Analogies: 2977 correct, 7569 incorrect
Computing embeddings of size 350 and window 5...
Evaluating embeddings of size 350...
Size 350; wsize 5: 0.391 accuracy; Analogies: 2967 correct, 7579 incorrect
Computing embeddings of size 225 and window 8...
Evaluating embeddings of size 225...
Size 225; wsize 8: 0.408 accuracy; Analogies: 3055 correct, 7491 incorrect
Computing embeddings of size 250 and window 8...
Evaluating embeddings of size 250...
Size 250; wsize 8: 0.410 accuracy; Analogies: 3064 correct, 7482 incorrect
Computing embeddings of size 275 and window 8...
Evaluating embeddings of size 275...
Size 275; wsize 8: 0.402 accuracy; Analogies: 3026 correct, 7520 incorrect
Computing embeddings of size 300 and window 8...
Evaluating embeddings of size 300...
Size 300; wsize 8: 0.410 accuracy; Analogies: 3069 correct, 7477 incorrect
Computing embeddings of size 325 and window 8...
Evaluating embeddings of size 325...
Size 325; wsize 8: 0.405 accuracy; Analogies: 3039 correct, 7507 incorrect
Computing embeddings of size 350 and window 8...
Evaluating embeddings of size 350...
Size 350; wsize 8: 0.407 accuracy; Analogies: 3049 correct, 7497 incorrect
Computing embeddings of size 225 and window 10...
Evaluating embeddings of size 225...
Size 225; wsize 10: 0.406 accuracy; Analogies: 3046 correct, 7500 incorrect
Computing embeddings of size 250 and window 10...
Evaluating embeddings of size 250...
Size 250; wsize 10: 0.417 accuracy; Analogies: 3102 correct, 7444 incorrect
Computing embeddings of size 275 and window 10...
Evaluating embeddings of size 275...
Size 275; wsize 10: 0.411 accuracy; Analogies: 3070 correct, 7476 incorrect
Computing embeddings of size 300 and window 10...
Evaluating embeddings of size 300...
Size 300; wsize 10: 0.417 accuracy; Analogies: 3106 correct, 7440 incorrect
Computing embeddings of size 325 and window 10...
Evaluating embeddings of size 325...
Size 325; wsize 10: 0.411 accuracy; Analogies: 3071 correct, 7475 incorrect
Computing embeddings of size 350 and window 10...
Evaluating embeddings of size 350...
Size 350; wsize 10: 0.404 accuracy; Analogies: 3035 correct, 7511 incorrect
Computing embeddings of size 225 and window 12...
Evaluating embeddings of size 225...
Size 225; wsize 12: 0.399 accuracy; Analogies: 3008 correct, 7538 incorrect
Computing embeddings of size 250 and window 12...
Evaluating embeddings of size 250...
Size 250; wsize 12: 0.419 accuracy; Analogies: 3115 correct, 7431 incorrect
Computing embeddings of size 275 and window 12...
Evaluating embeddings of size 275...
Size 275; wsize 12: 0.423 accuracy; Analogies: 3134 correct, 7412 incorrect
Computing embeddings of size 300 and window 12...
Evaluating embeddings of size 300...
Size 300; wsize 12: 0.417 accuracy; Analogies: 3104 correct, 7442 incorrect
Computing embeddings of size 325 and window 12...
Evaluating embeddings of size 325...
Size 325; wsize 12: 0.428 accuracy; Analogies: 3162 correct, 7384 incorrect
Computing embeddings of size 350 and window 12...
Evaluating embeddings of size 350...
Size 350; wsize 12: 0.413 accuracy; Analogies: 3080 correct, 7466 incorrect

Computing embeddings of size 225 and window 13...
Evaluating embeddings of size 225...
Size 225; wsize 13: 0.415 accuracy; Analogies: 3094 correct, 7452 incorrect
Computing embeddings of size 250 and window 13...
Evaluating embeddings of size 250...
Size 250; wsize 13: 0.412 accuracy; Analogies: 3078 correct, 7468 incorrect
Computing embeddings of size 275 and window 13...
Evaluating embeddings of size 275...
Size 275; wsize 13: 0.420 accuracy; Analogies: 3121 correct, 7425 incorrect
Computing embeddings of size 300 and window 13...
Evaluating embeddings of size 300...
Size 300; wsize 13: 0.410 accuracy; Analogies: 3067 correct, 7479 incorrect
Computing embeddings of size 325 and window 13...
Evaluating embeddings of size 325...
Size 325; wsize 13: 0.411 accuracy; Analogies: 3074 correct, 7472 incorrect
Computing embeddings of size 350 and window 13...
Evaluating embeddings of size 350...
Size 350; wsize 13: 0.426 accuracy; Analogies: 3150 correct, 7396 incorrect
Computing embeddings of size 225 and window 14...
Evaluating embeddings of size 225...
Size 225; wsize 14: 0.421 accuracy; Analogies: 3125 correct, 7421 incorrect
Computing embeddings of size 250 and window 14...
Evaluating embeddings of size 250...
Size 250; wsize 14: 0.426 accuracy; Analogies: 3150 correct, 7396 incorrect
Computing embeddings of size 275 and window 14...
Evaluating embeddings of size 275...
Size 275; wsize 14: 0.422 accuracy; Analogies: 3132 correct, 7414 incorrect
Computing embeddings of size 300 and window 14...
Evaluating embeddings of size 300...
Size 300; wsize 14: 0.426 accuracy; Analogies: 3149 correct, 7397 incorrect
Computing embeddings of size 325 and window 14...
Evaluating embeddings of size 325...
Size 325; wsize 14: 0.418 accuracy; Analogies: 3107 correct, 7439 incorrect
Computing embeddings of size 350 and window 14...
Evaluating embeddings of size 350...
Size 350; wsize 14: 0.426 accuracy; Analogies: 3150 correct, 7396 incorrect
Computing embeddings of size 225 and window 15...
Evaluating embeddings of size 225...
Size 225; wsize 15: 0.421 accuracy; Analogies: 3124 correct, 7422 incorrect
Computing embeddings of size 250 and window 15...
Evaluating embeddings of size 250...
Size 250; wsize 15: 0.431 accuracy; Analogies: 3174 correct, 7372 incorrect
Computing embeddings of size 275 and window 15...
Evaluating embeddings of size 275...
Size 275; wsize 15: 0.427 accuracy; Analogies: 3154 correct, 7392 incorrect
Computing embeddings of size 300 and window 15...
Evaluating embeddings of size 300...
Size 300; wsize 15: 0.432 accuracy; Analogies: 3183 correct, 7363 incorrect
Computing embeddings of size 325 and window 15...
Evaluating embeddings of size 325...
Size 325; wsize 15: 0.434 accuracy; Analogies: 3191 correct, 7355 incorrect
Computing embeddings of size 350 and window 15...
Evaluating embeddings of size 350...
Size 350; wsize 15: 0.441 accuracy; Analogies: 3227 correct, 7319 incorrect

Computing embeddings of size 225 and window 16...
Evaluating embeddings of size 225...
Size 225; wsize 16: 0.409 accuracy; Analogies: 3063 correct, 7483 incorrect
Computing embeddings of size 250 and window 16...
Evaluating embeddings of size 250...
Size 250; wsize 16: 0.423 accuracy; Analogies: 3133 correct, 7413 incorrect
Computing embeddings of size 275 and window 16...
Evaluating embeddings of size 275...
Size 275; wsize 16: 0.413 accuracy; Analogies: 3084 correct, 7462 incorrect
Computing embeddings of size 300 and window 16...
Evaluating embeddings of size 300...
Size 300; wsize 16: 0.421 accuracy; Analogies: 3126 correct, 7420 incorrect
Computing embeddings of size 325 and window 16...
Evaluating embeddings of size 325...
Size 325; wsize 16: 0.423 accuracy; Analogies: 3133 correct, 7413 incorrect
Computing embeddings of size 350 and window 16...
Evaluating embeddings of size 350...
Size 350; wsize 16: 0.421 accuracy; Analogies: 3122 correct, 7424 incorrect
Computing embeddings of size 225 and window 17...
Evaluating embeddings of size 225...
Size 225; wsize 17: 0.404 accuracy; Analogies: 3034 correct, 7512 incorrect
Computing embeddings of size 250 and window 17...
Evaluating embeddings of size 250...
Size 250; wsize 17: 0.429 accuracy; Analogies: 3168 correct, 7378 incorrect
Computing embeddings of size 275 and window 17...
Evaluating embeddings of size 275...
Size 275; wsize 17: 0.436 accuracy; Analogies: 3204 correct, 7342 incorrect
Computing embeddings of size 300 and window 17...
Evaluating embeddings of size 300...
Size 300; wsize 17: 0.427 accuracy; Analogies: 3158 correct, 7388 incorrect
Computing embeddings of size 325 and window 17...
Evaluating embeddings of size 325...
Size 325; wsize 17: 0.429 accuracy; Analogies: 3166 correct, 7380 incorrect
Computing embeddings of size 350 and window 17...
Evaluating embeddings of size 350...
Size 350; wsize 17: 0.417 accuracy; Analogies: 3106 correct, 7440 incorrect
Computing embeddings of size 225 and window 18...
Evaluating embeddings of size 225...
Size 225; wsize 18: 0.427 accuracy; Analogies: 3156 correct, 7390 incorrect
Computing embeddings of size 250 and window 18...
Evaluating embeddings of size 250...
Size 250; wsize 18: 0.417 accuracy; Analogies: 3105 correct, 7441 incorrect
Computing embeddings of size 275 and window 18...
Evaluating embeddings of size 275...
Size 275; wsize 18: 0.428 accuracy; Analogies: 3160 correct, 7386 incorrect
Computing embeddings of size 300 and window 18...
Evaluating embeddings of size 300...
Size 300; wsize 18: 0.421 accuracy; Analogies: 3126 correct, 7420 incorrect
Computing embeddings of size 325 and window 18...
Evaluating embeddings of size 325...
Size 325; wsize 18: 0.434 accuracy; Analogies: 3193 correct, 7353 incorrect
Computing embeddings of size 350 and window 18...
Evaluating embeddings of size 350...
Size 350; wsize 18: 0.418 accuracy; Analogies: 3107 correct, 7439 incorrect
Computing embeddings of size 225 and window 19...
Evaluating embeddings of size 225...
Size 225; wsize 19: 0.417 accuracy; Analogies: 3105 correct, 7441 incorrect
Computing embeddings of size 250 and window 19...
Evaluating embeddings of size 250...
Size 250; wsize 19: 0.421 accuracy; Analogies: 3125 correct, 7421 incorrect
Computing embeddings of size 275 and window 19...
Evaluating embeddings of size 275...
Size 275; wsize 19: 0.439 accuracy; Analogies: 3219 correct, 7327 incorrect
Computing embeddings of size 300 and window 19...
Evaluating embeddings of size 300...
Size 300; wsize 19: 0.438 accuracy; Analogies: 3212 correct, 7334 incorrect
Computing embeddings of size 325 and window 19...
Evaluating embeddings of size 325...
Size 325; wsize 19: 0.426 accuracy; Analogies: 3153 correct, 7393 incorrect
Computing embeddings of size 350 and window 19...
Evaluating embeddings of size 350...
Size 350; wsize 19: 0.428 accuracy; Analogies: 3161 correct, 7385 incorrect
Computing embeddings of size 225 and window 20...
Evaluating embeddings of size 225...
Size 225; wsize 20: 0.429 accuracy; Analogies: 3166 correct, 7380 incorrect
Computing embeddings of size 250 and window 20...
Evaluating embeddings of size 250...
Size 250; wsize 20: 0.424 accuracy; Analogies: 3139 correct, 7407 incorrect
Computing embeddings of size 275 and window 20...
Evaluating embeddings of size 275...
Size 275; wsize 20: 0.427 accuracy; Analogies: 3155 correct, 7391 incorrect
Computing embeddings of size 300 and window 20...
Evaluating embeddings of size 300...
Size 300; wsize 20: 0.419 accuracy; Analogies: 3116 correct, 7430 incorrect
Computing embeddings of size 325 and window 20...
Evaluating embeddings of size 325...
Size 325; wsize 20: 0.438 accuracy; Analogies: 3211 correct, 7335 incorrect
Computing embeddings of size 350 and window 20...
Evaluating embeddings of size 350...
Size 350; wsize 20: 0.409 accuracy; Analogies: 3061 correct, 7485 incorrect
Computing embeddings of size 225 and window 21...
Evaluating embeddings of size 225...
Size 225; wsize 21: 0.414 accuracy; Analogies: 3088 correct, 7458 incorrect
Computing embeddings of size 250 and window 21...
Evaluating embeddings of size 250...
Size 250; wsize 21: 0.415 accuracy; Analogies: 3094 correct, 7452 incorrect
Computing embeddings of size 275 and window 21...
Evaluating embeddings of size 275...
Size 275; wsize 21: 0.415 accuracy; Analogies: 3093 correct, 7453 incorrect
Computing embeddings of size 300 and window 21...
Evaluating embeddings of size 300...
Size 300; wsize 21: 0.438 accuracy; Analogies: 3213 correct, 7333 incorrect
Computing embeddings of size 325 and window 21...
Evaluating embeddings of size 325...
Size 325; wsize 21: 0.431 accuracy; Analogies: 3178 correct, 7368 incorrect
Computing embeddings of size 350 and window 21...
Evaluating embeddings of size 350...
Size 350; wsize 21: 0.429 accuracy; Analogies: 3164 correct, 7382 incorrect
Computing embeddings of size 225 and window 22...
Evaluating embeddings of size 225...
Size 225; wsize 22: 0.424 accuracy; Analogies: 3142 correct, 7404 incorrect
Computing embeddings of size 250 and window 22...
Evaluating embeddings of size 250...
Size 250; wsize 22: 0.410 accuracy; Analogies: 3066 correct, 7480 incorrect
Computing embeddings of size 275 and window 22...
Evaluating embeddings of size 275...
Size 275; wsize 22: 0.423 accuracy; Analogies: 3134 correct, 7412 incorrect
Computing embeddings of size 300 and window 22...
Evaluating embeddings of size 300...
Size 300; wsize 22: 0.427 accuracy; Analogies: 3158 correct, 7388 incorrect
Computing embeddings of size 325 and window 22...
Evaluating embeddings of size 325...
Size 325; wsize 22: 0.424 accuracy; Analogies: 3138 correct, 7408 incorrect
Computing embeddings of size 350 and window 22...
Evaluating embeddings of size 350...
Size 350; wsize 22: 0.426 accuracy; Analogies: 3151 correct, 7395 incorrect
```