# 1. Word2Vec Model

In [1]:
import os
import gensim

from utils import *

import warnings
# Ignore UserWarning messages whose originating module matches 'gensim'.
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')

Using TensorFlow backend.


## 1.1. Train Word2Vec Model

### Getting Data

In [2]:
class MySentences(object):
    """
    Memory-friendly sentence iterator for a word2vec model.

    Streams the regular files found directly inside `dirname` one line at a
    time, so the corpus never has to be held in memory. The object can be
    iterated any number of times; every pass re-reads the files from disk.

    # Arguments
        dirname : directory path of sentences/data files.
        encoding : text encoding passed to `open`. None (the default) keeps
            the platform default encoding.
    # Yields
        One list of whitespace-separated tokens per line (an empty list for
        a blank line).
    """

    def __init__(self, dirname, encoding=None):
        self.dirname = dirname
        self.encoding = encoding

    def __iter__(self):
        # os.listdir gives no ordering guarantee; sorting makes the corpus
        # order identical on every pass and on every machine.
        for fname in sorted(os.listdir(self.dirname)):
            path = os.path.join(self.dirname, fname)
            # Sub-directories and other non-regular entries cannot be read
            # line by line, so skip them instead of crashing mid-iteration.
            if not os.path.isfile(path):
                continue
            # The context manager closes each file once it is exhausted (or
            # when the generator is closed early) instead of leaking handles.
            with open(path, encoding=self.encoding) as handle:
                for line in handle:
                    yield line.split()

In [3]:
# Stream the training corpus from ../data/ through the MySentences iterator
# rather than loading every sentence into memory at once.
text_w2v = MySentences(dirname='../data/')

### CBOW Version

In [4]:
# Train the CBOW variant (sg=0) with hierarchical softmax (hs=1) and
# 300-dimensional vectors on the streamed corpus.
model_w2v = gensim.models.Word2Vec(sentences=text_w2v, size=300, sg=0, hs=1)
# exist_ok=True creates the output directory only when needed, without the
# check-then-create race of os.path.exists() followed by os.mkdir().
os.makedirs('../models', exist_ok=True)
model_w2v.save('../models/w2v_cbow_300D')

### Skip-Gram Version

In [5]:
# Train the Skip-Gram variant (sg=1) with hierarchical softmax (hs=1) and
# 300-dimensional vectors on the streamed corpus.
model_w2v = gensim.models.Word2Vec(sentences=text_w2v, size=300, sg=1, hs=1)
# exist_ok=True creates the output directory only when needed, without the
# check-then-create race of os.path.exists() followed by os.mkdir().
os.makedirs('../models', exist_ok=True)
model_w2v.save('../models/w2v_sg_300D')

## 1.2. Load Word2Vec Model

### - CBOW Version

In [6]:
# Reload the CBOW model from the path it was saved to in section 1.1,
# replacing whatever model_w2v currently refers to.
model_w2v = gensim.models.Word2Vec.load('../models/w2v_cbow_300D')

- #### Similar Words

In [7]:
# Ask the loaded model's word vectors for the words most similar to
# 'difficile'; the result is a list of (word, similarity score) pairs.
model_w2v.wv.most_similar('difficile')

[('dur', 0.701798677444458),
 ('facile', 0.6994661092758179),
 ('pénible', 0.5951988101005554),
 ('agréable', 0.5823794603347778),
 ('étrange', 0.5800133943557739),
 ('amusant', 0.5518156290054321),
 ('compliqué', 0.5509512424468994),
 ('impossible', 0.5317744612693787),
 ('gênant', 0.5241851806640625),
 ('important', 0.5237008333206177)]

### - Skip-Gram Version

In [7]:
# Reload the Skip-Gram model from the path it was saved to in section 1.1,
# replacing the CBOW model previously bound to model_w2v.
model_w2v = gensim.models.Word2Vec.load('../models/w2v_sg_300D')

- #### Similar Words

In [8]:
# Same query as for the CBOW model, now against the Skip-Gram vectors:
# returns (word, similarity score) pairs for the neighbours of 'difficile'.
model_w2v.wv.most_similar('difficile')

[('dur', 0.5741609334945679),
 ('pénible', 0.5671994686126709),
 ('rude', 0.5307800769805908),
 ('facile', 0.5286517143249512),
 ('éprouvante', 0.5169097185134888),
 ('éprouvant', 0.5106732845306396),
 ("s'imaginer", 0.5098478198051453),
 ('instructif', 0.49842825531959534),
 ('stressante', 0.49489402770996094),
 ('frustrant', 0.4873944818973541)]