# Train word2vec locally

This allows a smart initialization of our neural net's word embeddings.
It seems that initializing the embeddings by training them locally, as opposed to using pre-trained word2vec embeddings (available online) can lead to better performance.

In [1]:
import os
import sys
print(sys.executable)

/Users/andrei/anaconda3/envs/cil/bin/python


In [2]:
from gensim.models.word2vec import Word2Vec

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [5]:
TRAIN = os.path.join('..', 'train')
TEST = os.path.join('..', 'test')
POS_TWEET_FILE = os.path.join(TRAIN, 'train_pos_full.txt')
NEG_TWEET_FILE = os.path.join(TRAIN, 'train_pos_full.txt')
TEST_TWEET_FILE = os.path.join(TEST, 'test_data.txt')
EMBEDDING_SIZE = 350

In [6]:
def read_tweets(fname):
    """Read the tweets in the given file.
    
    Returns a 2d array where every row is a tweet, split into words.
    """
    with open(fname, 'r') as f:
        return [l.split() for l in f.readlines()]

In [7]:
pos_tweets = read_tweets(POS_TWEET_FILE)

In [8]:
neg_tweets = read_tweets(NEG_TWEET_FILE)

In [9]:
test_tweets = read_tweets(TEST_TWEET_FILE)

In [10]:
sentences = pos_tweets + neg_tweets + test_tweets
print(len(sentences))

2510000


In [11]:
# Download this for testing: https://github.com/arfon/word2vec/blob/master/questions-words.txt
# Highly recommended!

question_file = "questions-words.txt"

def eval_embeddings(model):
    accuracy_results = model.accuracy(question_file)
    summary = accuracy_results[-1]
    assert summary['section'] == 'total'
    incorrect = summary['incorrect']
    correct = summary['correct']

    incorrect_n = len(incorrect)
    correct_n = len(correct)

    acc = correct_n / incorrect_n
    return acc, correct_n, incorrect_n

In [12]:
WORKERS = 8
# Note: Moises's team uses size=200 as of June 13.
# See: https://groups.google.com/forum/#!msg/gensim/ggCHGncd5rU/Z_pQDD69AAAJ
# for some parameter hints.
model = Word2Vec(sentences, size=EMBEDDING_SIZE, window=10, min_count=5, workers=WORKERS)# , alpha=0.05, cbow_mean=1)

In [13]:
# Should be queen
model.most_similar(positive=['woman', 'king'], negative=['man'])

[('queen', 0.480918288230896),
 ('alpha', 0.384929895401001),
 ('madonna', 0.3734423816204071),
 ('lawrence', 0.3621298670768738),
 ('protects', 0.3620363771915436),
 ('poet', 0.3521104156970978),
 ('hawk', 0.34590622782707214),
 ('child', 0.3413478136062622),
 ('kenny', 0.34024181962013245),
 ('sinner', 0.3332200050354004)]

In [14]:
# Should be germany
model.most_similar(positive=['france', 'berlin'], negative=['paris'])

[('croatia', 0.598901629447937),
 ('sweden', 0.5939220190048218),
 ('switzerland', 0.5885233283042908),
 ('norway', 0.5879368782043457),
 ('belgium', 0.5865507125854492),
 ('finland', 0.5794695019721985),
 ('holland', 0.5722660422325134),
 ('germany', 0.5641714334487915),
 ('italy', 0.5595147609710693),
 ('canada', 0.5537332892417908)]

In [15]:
model.doesnt_match("breakfast cereal dinner lunch".split())

'cereal'

In [16]:
model.estimate_memory()

{'syn0': 97742400, 'syn1neg': 97742400, 'total': 230392800, 'vocab': 34908000}

In [17]:
# A few more sanity checks
print(model.similarity('woman', 'man'))
print(model.similarity('woman', 'coffee'))
print(model.similarity('woman', 'penis'))
print(model.similarity('woman', 'football'))

0.519508487089
0.0890147546286
0.229477331801
0.0278565673937


In [18]:
print(model.similarity('car','man'))
print(model.similarity('car','truck'))

0.197424738269
0.698331116458


In [19]:
acc, correct_n, incorrect_n = eval_embeddings(model)
print("{0:5.3f} accuracy; Analogies: {1} correct, {2} incorrect".format(
    acc, correct_n, incorrect_n))

0.335 accuracy; Analogies: 2340 correct, 6992 incorrect


### Accuracies (full Twitter data)
 * Vanilla (size=225, window=5, min_count=5): 0.319 accuracy; Analogies: 2233 correct, 7004 incorrect
 * (size=300, window=5,  min_count=5): 0.337 accuracy; Analogies: 2329 correct, 6908 incorrect
 * (size=500, window=5,  min_count=5): 0.330 accuracy; Analogies: 2292 correct, 6945 incorrect
 * (size=300, window=10, min_count=5): 0.346 accuracy; Analogies: 2374 correct, 6863 incorrect
 * (size=300, window=15, min_count=5): 0.342 accuracy; Analogies: 2356 correct, 6881 incorrect
 * (size=400, window=10, min_count=5): 0.341 accuracy; Analogies: 2340 correct, 6870 incorrect

### Accuracties (full Twitter data + Nikos 1st stage preprocessing)
 * (size=200, window=10, min_count=5): 0.327 accuracy; Analogies: 2316 correct, 7093 incorrect
 * (size=225, window=10, min_count=5): 0.331 accuracy; Analogies: 2342 correct, 7067 incorrect
 * (size=250, window=10, min_count=5): 0.330 accuracy; Analogies: 2336 correct, 7073 incorrect
 * (size=275, window=10, min_count=5): 0.337 accuracy; Analogies: 2374 correct, 7035 incorrect
 * (size=300, window=10, min_count=5): 0.334 accuracy; Analogies: 2355 correct, 7054 incorrect
 * (size=325, window=10, min_count=5): 0.334 accuracy; Analogies: 2356 correct, 7053 incorrect
 * (size=350, window=10, min_count=5): 0.330 accuracy; Analogies: 2334 correct, 7075 incorrect
 * (size=400, window=10, min_count=5): 0.321 accuracy; Analogies: 2289 correct, 7120 incorrect


In [20]:
print("Embedding dimensionality: {0}".format(EMBEDDING_SIZE))

Embedding dimensionality: 350


In [21]:
fname = "./word2vec-local-gensim-{0}.bin".format(EMBEDDING_SIZE)
print("Writing embeddings to file {0}.".format(fname))
model.save(fname)

Writing embeddings to file ./word2vec-local-gensim-350.bin.


### Some experimentation

In [None]:
emb_sizes = [225, 250, 275, 300, 325, 350]

for emb_size in emb_sizes:
    print("Computing embeddings of size {0}...".format(emb_size))
    model = Word2Vec(sentences, size=emb_size, window=10, min_count=5, workers=WORKERS)
    print("Evaluating embeddings of size {0}...".format(emb_size))
    acc, correct_n, incorrect_n = eval_embeddings(model)
    print("Size {3}: {0:5.3f} accuracy; Analogies: {1} correct, {2} incorrect".format(
        acc, correct_n, incorrect_n, emb_size))