# Train word2vec locally

This allows a smart initialization of our neural net's word embeddings.
It seems that initializing the embeddings by training them locally, as opposed to using pre-trained word2vec embeddings (available online) can lead to better performance.

In [1]:
import os
import sys
print(sys.executable)

/Users/andrei/anaconda3/envs/cil/bin/python


In [2]:
from gensim.models.word2vec import Word2Vec

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [45]:
TRAIN = os.path.join('..', 'train')
TEST = os.path.join('..', 'test')
POS_TWEET_FILE = os.path.join(TRAIN, 'train_pos_full.txt')
NEG_TWEET_FILE = os.path.join(TRAIN, 'train_pos_full.txt')
TEST_TWEET_FILE = os.path.join(TEST, 'test_data.txt')
EMBEDDING_SIZE = 300

In [46]:
def read_tweets(fname):
    """Read the tweets in the given file.
    
    Returns a 2d array where every row is a tweet, split into words.
    """
    with open(fname, 'r') as f:
        return [l.split() for l in f.readlines()]

In [47]:
pos_tweets = read_tweets(POS_TWEET_FILE)

In [48]:
neg_tweets = read_tweets(NEG_TWEET_FILE)

In [49]:
test_tweets = read_tweets(TEST_TWEET_FILE)

In [50]:
sentences = pos_tweets + neg_tweets + test_tweets
print(len(sentences))

2510000


In [None]:
# Download this for testing: https://github.com/arfon/word2vec/blob/master/questions-words.txt
# Highly recommended!

question_file = "questions-words.txt"

def eval_embeddings(model):
    accuracy_results = model.accuracy(question_file)
    summary = accuracy_results[-1]
    assert summary['section'] == 'total'
    incorrect = summary['incorrect']
    correct = summary['correct']

    incorrect_n = len(incorrect)
    correct_n = len(correct)

    acc = correct_n / incorrect_n
    return acc, correct_n, incorrect_n

In [None]:
WORKERS = 8
# Note: Moises's team uses size=200 as of June 13.
# See: https://groups.google.com/forum/#!msg/gensim/ggCHGncd5rU/Z_pQDD69AAAJ
# for some parameter hints.
model = Word2Vec(sentences, size=EMBEDDING_SIZE, window=10, min_count=5, workers=WORKERS)# , alpha=0.05, cbow_mean=1)

In [None]:
# Should be queen
model.most_similar(positive=['woman', 'king'], negative=['man'])

In [None]:
# Should be germany
model.most_similar(positive=['france', 'berlin'], negative=['paris'])

In [None]:
model.doesnt_match("breakfast cereal dinner lunch".split())

In [None]:
model.estimate_memory()

In [None]:
# A few more sanity checks
print(model.similarity('woman', 'man'))
print(model.similarity('woman', 'coffee'))
print(model.similarity('woman', 'penis'))
print(model.similarity('woman', 'football'))

In [None]:
print(model.similarity('car','man'))
print(model.similarity('car','truck'))

In [None]:
acc, correct_n, incorrect_n = eval_embeddings(model)
print("{0:5.3f} accuracy; Analogies: {1} correct, {2} incorrect".format(
    acc, correct_n, incorrect_n))

### Accuracies (full Twitter data)
 * Vanilla (size=225, window=5, min_count=5): 0.319 accuracy; Analogies: 2233 correct, 7004 incorrect
 * (size=300, window=5,  min_count=5): 0.337 accuracy; Analogies: 2329 correct, 6908 incorrect
 * (size=500, window=5,  min_count=5): 0.330 accuracy; Analogies: 2292 correct, 6945 incorrect
 * (size=300, window=10, min_count=5): 0.346 accuracy; Analogies: 2374 correct, 6863 incorrect
 * (size=300, window=15, min_count=5): 0.342 accuracy; Analogies: 2356 correct, 6881 incorrect
 * (size=400, window=10, min_count=5): 0.341 accuracy; Analogies: 2340 correct, 6870 incorrect

### Accuracties (full Twitter data + Nikos 1st stage preprocessing)
 * (size=200, window=10, min_count=5): 0.327 accuracy; Analogies: 2316 correct, 7093 incorrect
 * (size=225, window=10, min_count=5): 0.331 accuracy; Analogies: 2342 correct, 7067 incorrect
 * (size=250, window=10, min_count=5): 0.330 accuracy; Analogies: 2336 correct, 7073 incorrect
 * (size=275, window=10, min_count=5): 0.337 accuracy; Analogies: 2374 correct, 7035 incorrect
 * (size=300, window=10, min_count=5): 0.334 accuracy; Analogies: 2355 correct, 7054 incorrect
 * (size=325, window=10, min_count=5): 0.334 accuracy; Analogies: 2356 correct, 7053 incorrect
 * (size=350, window=10, min_count=5): 0.330 accuracy; Analogies: 2334 correct, 7075 incorrect
 * (size=400, window=10, min_count=5): 0.321 accuracy; Analogies: 2289 correct, 7120 incorrect


In [None]:
print("Embedding dimensionality: {0}".format(EMBEDDING_SIZE))

In [None]:
fname = "./word2vec-local-gensim-{0}.bin".format(EMBEDDING_SIZE)
model.save(fname)

### Some experimentation

In [44]:
emb_sizes = [225, 250, 275, 300, 325, 350]

for emb_size in emb_sizes:
    print("Computing embeddings of size {0}...".format(emb_size))
    model = Word2Vec(sentences, size=emb_size, window=10, min_count=5, workers=WORKERS)
    print("Evaluating embeddings of size {0}...".format(emb_size))
    acc, correct_n, incorrect_n = eval_embeddings(model)
    print("Size {3}: {0:5.3f} accuracy; Analogies: {1} correct, {2} incorrect".format(
        acc, correct_n, incorrect_n, emb_size))

Computing embeddings of size 225...
Evaluating embeddings of size 225...
Size 225: 0.331 accuracy; Analogies: 2342 correct, 7067 incorrect
Computing embeddings of size 250...
Evaluating embeddings of size 250...
Size 250: 0.330 accuracy; Analogies: 2336 correct, 7073 incorrect
Computing embeddings of size 275...
Evaluating embeddings of size 275...
Size 275: 0.337 accuracy; Analogies: 2374 correct, 7035 incorrect
Computing embeddings of size 300...
Evaluating embeddings of size 300...
Size 300: 0.334 accuracy; Analogies: 2355 correct, 7054 incorrect
Computing embeddings of size 325...
Evaluating embeddings of size 325...
Size 325: 0.334 accuracy; Analogies: 2356 correct, 7053 incorrect
Computing embeddings of size 350...
Evaluating embeddings of size 350...
Size 350: 0.330 accuracy; Analogies: 2334 correct, 7075 incorrect
