# 1. Word2Vec Model

In [None]:
import os
import gensim

from utils import *

import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')

Using TensorFlow backend.


## 1.1. Train Word2Vec Model

### Getting Data

In [None]:
class MySentences(object):
    """Memory-friendly sentence iterator for training a word2vec model.

    Streams every regular file found directly inside ``dirname`` line by
    line, so the corpus never has to be held in memory. The object can be
    iterated any number of times; each pass re-reads the files from disk.

    # Arguments
        dirname : directory path of the sentence/data files.
        encoding : text encoding used to read the files. ``None`` (the
            default) keeps the platform default, exactly as ``open()``
            does when no encoding is given; pass e.g. ``'utf-8'`` to make
            reading independent of the machine's locale.
    # Yields
        One list of whitespace-separated tokens per line of each file.
    """

    def __init__(self, dirname, encoding=None):
        self.dirname = dirname
        self.encoding = encoding

    def __iter__(self):
        # Sort the listing: os.listdir() returns entries in arbitrary order,
        # and a stable order keeps every pass over the corpus identical.
        for fname in sorted(os.listdir(self.dirname)):
            path = os.path.join(self.dirname, fname)
            # Skip sub-directories (e.g. Jupyter's .ipynb_checkpoints) and
            # other non-regular entries, which open() cannot read as text.
            if not os.path.isfile(path):
                continue
            # The with-block closes each file as soon as it is exhausted
            # instead of leaving the handle open until garbage collection.
            with open(path, encoding=self.encoding) as corpus_file:
                for line in corpus_file:
                    yield line.split()

In [None]:
# Stream the training corpus from ./Data/ through the MySentences iterator
# instead of loading every sentence into memory at once.
text_w2v = MySentences(dirname='./Data/')

### CBOW Version

In [None]:
# Train the CBOW variant (sg=0) with hierarchical softmax (hs=1) on the
# streamed corpus, producing 300-dimensional vectors.
# NOTE: `size` is the gensim < 4.0 spelling; gensim 4 renamed it to
# `vector_size`, so this call needs updating if gensim is upgraded.
model_w2v = gensim.models.Word2Vec(sentences=text_w2v, size=300, sg=0, hs=1)
# makedirs(exist_ok=True) creates the output directory only when it is
# missing, without the check-then-create race of a separate exists() test.
os.makedirs('./WE_models', exist_ok=True)
model_w2v.save('WE_models/w2v_cbow_300D')

### Skip-Gram Version

In [None]:
# Train the Skip-Gram variant (sg=1) with hierarchical softmax (hs=1) on the
# streamed corpus, producing 300-dimensional vectors.
# NOTE: `size` is the gensim < 4.0 spelling; gensim 4 renamed it to
# `vector_size`, so this call needs updating if gensim is upgraded.
model_w2v = gensim.models.Word2Vec(sentences=text_w2v, size=300, sg=1, hs=1)
# makedirs(exist_ok=True) creates the output directory only when it is
# missing, without the check-then-create race of a separate exists() test.
os.makedirs('./WE_models', exist_ok=True)
model_w2v.save('WE_models/w2v_sg_300D')

## 1.2. Load Word2Vec Model

### - CBOW Version

In [5]:
# Reload the CBOW model from the path the CBOW training cell saves it to,
# so this section can run without retraining.
model_w2v = gensim.models.Word2Vec.load('WE_models/w2v_cbow_300D')

- #### Similar Words

In [6]:
# Query the loaded CBOW vectors for the words closest to 'difficile';
# the result is a list of (word, similarity score) pairs.
model_w2v.wv.most_similar('difficile')

[('facile', 0.7146144509315491),
 ('dur', 0.675102710723877),
 ('pénible', 0.6174148321151733),
 ('agréable', 0.6010310649871826),
 ('amusant', 0.5417053699493408),
 ('étrange', 0.5337499976158142),
 ('impossible', 0.5333918929100037),
 ('triste', 0.5296621322631836),
 ('gênant', 0.5262279510498047),
 ('important', 0.5219485759735107)]

### - Skip-Gram Version

In [7]:
# Reload the Skip-Gram model from the path the Skip-Gram training cell saves
# it to; this rebinds model_w2v, replacing the CBOW model loaded earlier.
model_w2v = gensim.models.Word2Vec.load('WE_models/w2v_sg_300D')

- #### Similar Words

In [8]:
# Same query as for the CBOW model, now against the Skip-Gram vectors, so the
# two neighbour lists for 'difficile' can be compared.
model_w2v.wv.most_similar('difficile')

[('dur', 0.5741609334945679),
 ('pénible', 0.5671994686126709),
 ('rude', 0.5307800769805908),
 ('facile', 0.5286517143249512),
 ('éprouvante', 0.5169097185134888),
 ('éprouvant', 0.5106732845306396),
 ("s'imaginer", 0.5098478198051453),
 ('instructif', 0.49842825531959534),
 ('stressante', 0.49489402770996094),
 ('frustrant', 0.4873944818973541)]