## Word2Vec Training on Corpus

In [2]:
import numpy as np
import fasttext
import pandas as pd
from resources.tokTT import CommentTokenizer
from resources.basicIO import InputOutput as IO
from resources.filterLang import FilterLanguage as FL
from gensim.models import Word2Vec
from gensim.test.utils import common_texts


In [3]:
# Print gensim's bundled sample corpus (imported from gensim.test.utils);
# it is a list of documents, each document being a list of string tokens.
print(common_texts)

[['human', 'interface', 'computer'], ['survey', 'user', 'computer', 'system', 'response', 'time'], ['eps', 'user', 'interface', 'system'], ['system', 'human', 'system', 'eps'], ['user', 'response', 'time'], ['trees'], ['graph', 'trees'], ['graph', 'minors', 'trees'], ['graph', 'minors', 'survey']]


### Cleaning the Corpus

In [4]:
# Run the raw corpus file through the project's CommentTokenizer and write
# the cleaned result back to disk.
corpus_path = 'datasets/corpus.txt'
tokenized_corpus_path = 'datasets/tokenized_corpus.txt'

# NOTE(review): 'utf-8' is presumably the file encoding and ['0', '1'] some
# label/filter list -- both are interpreted by CommentTokenizer.cleaned; confirm there.
tokenized_comment = CommentTokenizer.cleaned(corpus_path, 'utf-8', ['0', '1'])
IO.save_text(tokenized_corpus_path, tokenized_comment)

In [5]:
# Turn every cleaned comment into a list of whitespace-separated tokens,
# giving the list-of-token-lists that is passed to Word2Vec as `sentences`.
tokenized_texts = list(map(lambda comment: comment.split(), tokenized_comment))

### Training the model on the corpus

### N = 2

In [6]:
# Train a Word2Vec model on the tokenized corpus.
# NOTE(review): per the gensim docs, training with more than one worker thread
# is not bit-for-bit reproducible between runs; use workers=1 if exact
# reproducibility of the vectors matters.
model = Word2Vec(
    sentences=tokenized_texts,
    vector_size=100,  # dimensionality of each word vector
    window=5,         # maximum distance between the current and predicted word
    min_count=1,      # keep every word, even those seen only once
    workers=4,        # number of training threads
)

In [7]:
# Persist the trained model to disk.
word2vec_model_path = "models/word2vec.model"
model.save(word2vec_model_path)

In [10]:
# Fetch the 20 nearest neighbours of 'modi' and flip each (word, similarity)
# pair into (similarity, word) order before printing.
neighbours = model.wv.most_similar('modi', topn=20)
x = [(similarity, word) for word, similarity in neighbours]
print(x)

[(0.8407846689224243, 'bjp'), (0.8352469801902771, 'pm'), (0.8287320733070374, 'modiji'), (0.8255611658096313, 'yogi'), (0.808468222618103, 'vihari'), (0.8020656108856201, 'strict'), (0.7930921316146851, 'shah'), (0.7921292781829834, 'resign'), (0.7843181490898132, 'him'), (0.7835392355918884, 'sek'), (0.777195155620575, 'mr'), (0.7693535089492798, 'irani'), (0.7671559453010559, 'action'), (0.7658399939537048, 'weak'), (0.7648589015007019, 'vote'), (0.7642906308174133, 'spineless'), (0.7585046887397766, 'narendra'), (0.7573955655097961, 'manish'), (0.7555968165397644, 'amit'), (0.7513961791992188, 'kejriwal')]


In [12]:
# 20 nearest neighbours of 'modi' as (word, similarity) pairs, in the order
# gensim returns them.
x = model.wv.most_similar(positive='modi', topn=20)
print(x)

[('bjp', 0.8407846689224243), ('pm', 0.8352469801902771), ('modiji', 0.8287320733070374), ('yogi', 0.8255611658096313), ('vihari', 0.808468222618103), ('strict', 0.8020656108856201), ('shah', 0.7930921316146851), ('resign', 0.7921292781829834), ('him', 0.7843181490898132), ('sek', 0.7835392355918884), ('mr', 0.777195155620575), ('irani', 0.7693535089492798), ('action', 0.7671559453010559), ('weak', 0.7658399939537048), ('vote', 0.7648589015007019), ('spineless', 0.7642906308174133), ('narendra', 0.7585046887397766), ('manish', 0.7573955655097961), ('amit', 0.7555968165397644), ('kejriwal', 0.7513961791992188)]


In [22]:
# Display the raw embedding matrix stored on the model's keyed vectors
# (NumPy abbreviates the printed array with "..." when it is large).
model.wv.vectors


array([[-1.0841911e+00, -2.9203877e-01,  5.3299743e-01, ...,
        -1.1112421e+00,  7.3087756e-03,  5.2137601e-01],
       [-3.0588737e-01,  6.4980751e-01,  1.6923246e-01, ...,
        -1.2952470e+00,  3.2830164e-01, -3.2935888e-01],
       [ 1.4465623e-01,  6.3111506e-02, -7.9254681e-01, ...,
        -8.4721380e-01,  2.2876136e-01, -9.7772294e-01],
       ...,
       [ 7.9495885e-04,  2.5331898e-02,  7.3992484e-03, ...,
        -2.3389194e-02,  1.5564258e-03, -7.3236128e-04],
       [ 1.0034982e-02,  1.0424025e-02,  1.0902163e-02, ...,
        -1.5874788e-02, -5.1443134e-03, -1.7828853e-03],
       [ 3.0757452e-05,  1.7287364e-02,  1.7987363e-02, ...,
        -3.3061348e-02,  1.4847288e-02,  8.4822671e-04]], dtype=float32)