# 3. FastText Model

In [2]:
import os
import gensim

from utils import *

import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')

Using TensorFlow backend.


## 3.1. Train FastText Model

### Getting Data

In [9]:
class MySentences(object):
    """
    Memory-friendly sentence iterator for training word-embedding models.

    The files are re-read from disk on every iteration pass, so the corpus is
    never held in memory and the object can be iterated more than once.
    """

    def __init__(self, dirname):
        """
        # Arguments
            dirname : path of the directory holding the sentence/data files
                (one sentence per line, tokens separated by whitespace).
        # Returns
            Iterating over the instance yields one list of tokens per line.
        """
        self.dirname = dirname

    def __iter__(self):
        # Sorted so that the corpus is streamed in the same order on every pass,
        # independent of the order the filesystem happens to list entries in.
        for fname in sorted(os.listdir(self.dirname)):
            path = os.path.join(self.dirname, fname)
            # os.listdir also returns sub-directories, which open() cannot read.
            if not os.path.isfile(path):
                continue
            # The context manager closes each file instead of leaking the handle.
            with open(path) as handle:
                for line in handle:
                    yield line.split()

In [10]:
# Stream the corpus lazily from disk instead of loading it all into memory
text_w2v = MySentences('../data/')

### CBOW Version

In [11]:
# Create the output directory before training so that a problem with the
# destination shows up before the long training run rather than after it.
# exist_ok=True replaces the racy "check, then mkdir" pattern.
os.makedirs('../models', exist_ok=True)

# sg=0 selects the CBOW architecture (see the section heading).
model_w2v = gensim.models.FastText(sentences=text_w2v, size=300, sg=0, hs=1, word_ngrams=1)
model_w2v.save('../models/fast_cbow_300D')

### Skip-Gram Version

In [12]:
# Create the output directory before training so that a problem with the
# destination shows up before the long training run rather than after it.
# exist_ok=True replaces the racy "check, then mkdir" pattern.
os.makedirs('../models', exist_ok=True)

# sg=1 selects the Skip-Gram architecture (see the section heading).
model_w2v = gensim.models.FastText(sentences=text_w2v, size=300, sg=1, hs=1, word_ngrams=1)
model_w2v.save('../models/fast_sg_300D')

## 3.2. Load FastText Model

### - CBOW Version

In [6]:
# Reload the CBOW model from the path it was saved to by the training cell
model_w2v = gensim.models.FastText.load('../models/fast_cbow_300D')

- #### Similar Words

In [11]:
# Nearest neighbours of "m'appelle" in the currently loaded model; the bare
# expression is the cell's displayed output: (word, similarity) pairs.
model_w2v.wv.most_similar("m'appelle")

[("t'appelle", 0.9224079847335815),
 ("l'appelle", 0.8807339072227478),
 ("m'appelait", 0.854578971862793),
 ("m'appelais", 0.8342265486717224),
 ('appelle', 0.8243040442466736),
 ("m'appelleras", 0.8209994435310364),
 ("t'appellera", 0.8152071237564087),
 ("s'appelle", 0.8091527223587036),
 ("t'appellerai", 0.8021713495254517),
 ("t'appelait", 0.7965459823608398)]

In [21]:
# NOTE(review): this cell's execution count (In [21]) is higher than that of the
# Skip-Gram load cell (In [16]), and the output below is identical to the
# Skip-Gram result for the same query. It was probably produced while the
# Skip-Gram model was loaded; re-run right after loading the CBOW model.
model_w2v.wv.most_similar('difficile')

[('difficiles', 0.7628830671310425),
 ('diffère', 0.6367588043212891),
 ('difficulté', 0.6088806390762329),
 ('facile', 0.5900378227233887),
 ('difficultés', 0.5419623851776123),
 ('stressante', 0.5203461647033691),
 ('éprouvante', 0.5177608728408813),
 ('dur', 0.5096060037612915),
 ('intéressante', 0.4871010184288025),
 ('pénible', 0.4869566261768341)]

- #### Similar Words : Out of Vocabulary Words

In [15]:
# 'diffici' is a truncated form of 'difficile', used here as the
# out-of-vocabulary query announced by the heading above.
# Presumably absent from the training corpus - TODO confirm.
model_w2v.wv.most_similar('diffici')

[('difficile', 0.926388680934906),
 ('difficiles', 0.8317775726318359),
 ('difficulté', 0.8042018413543701),
 ('difficultés', 0.7532557249069214),
 ('diffère', 0.726020872592926),
 ('durable', 0.6889504194259644),
 ('désagréable', 0.6839538812637329),
 ('efficacité', 0.6713117957115173),
 ('douloureux', 0.6706247329711914),
 ('dur', 0.6706138849258423)]

### - Skip-Gram Version

In [16]:
# Reload the Skip-Gram model; this rebinds model_w2v, replacing the CBOW model
model_w2v = gensim.models.FastText.load('../models/fast_sg_300D')

- #### Similar Words

In [17]:
# Same query as in the CBOW section, now against the Skip-Gram model.
# NOTE(review): the output shown here is identical to the one displayed in the
# CBOW section for 'difficile'; verify which model produced each.
model_w2v.wv.most_similar('difficile')

[('difficiles', 0.7628830671310425),
 ('diffère', 0.6367588043212891),
 ('difficulté', 0.6088806390762329),
 ('facile', 0.5900378227233887),
 ('difficultés', 0.5419623851776123),
 ('stressante', 0.5203461647033691),
 ('éprouvante', 0.5177608728408813),
 ('dur', 0.5096060037612915),
 ('intéressante', 0.4871010184288025),
 ('pénible', 0.4869566261768341)]

- #### Similar Words : Out of Vocabulary Words

In [18]:
# 'diffici' is a truncated form of 'difficile', used here as the
# out-of-vocabulary query announced by the heading above.
# Presumably absent from the training corpus - TODO confirm.
model_w2v.wv.most_similar('diffici')

[('difficile', 0.9102813005447388),
 ('difficiles', 0.8346987962722778),
 ('difficulté', 0.7629407048225403),
 ('difficultés', 0.7220677137374878),
 ('diffère', 0.7090779542922974),
 ('diffusion', 0.5623677372932434),
 ('diffusé', 0.5351862907409668),
 ('différend', 0.5241727828979492),
 ('différends', 0.5176236629486084),
 ('différencie', 0.5041417479515076)]