In [4]:
import os 
from django.conf import settings
from django.core.management import call_command
from word_embeddings.models import Word2Vec
from data.models import SentenceBatch, Task, TrainSentence, TestSentence
from data.utils.preprocessing import Preprocessing
from data.utils.processing import Processing
from tqdm import tqdm_notebook

from gensim.models.wrappers import FastText
from gensim.models import KeyedVectors

In [2]:
# Locate the raw XML datasets: a `raw_data` directory that sits next to the
# Django project's BASE_DIR.
raw_data_dir = os.path.join(os.path.dirname(settings.BASE_DIR), 'raw_data')
assert os.path.exists(raw_data_dir), f'raw data directory not found: {raw_data_dir}'

# Import each split only when its table is still empty, so re-running this
# cell does not load the same file twice. `exists()` asks the database for at
# most one row instead of counting the whole table.
if not TrainSentence.objects.exists():
    call_command('load_sentences', os.path.join(raw_data_dir, 'se16_ru_rest_train.xml'))
if not TestSentence.objects.exists():
    call_command('load_sentences', os.path.join(raw_data_dir, 'se16_ru_rest_test.xml'))

# FAST TEXT

In [3]:
# Fetch the stored embedding record with database id 2.
# NOTE(review): which model has id 2 depends on the local database; presumably
# this is the fastText model, since its file is loaded with FastText in the
# next cell - confirm before re-running on another machine.
w2v_model = Word2Vec.objects.get(id=2)


In [None]:
# Load the fastText model from the file attached to the `w2v_model` record.
# NOTE(review): `gensim.models.wrappers` (imported at the top of the notebook)
# is not available in gensim >= 4.0; if gensim is upgraded this call will
# presumably need `gensim.models.fasttext.load_facebook_vectors` - confirm
# against the installed gensim version.
fast_text = FastText.load_fasttext_format(w2v_model.file.path)

In [None]:
# Sanity check of the loaded model: nearest neighbours of 'пиво' ("beer").
fast_text.most_similar('пиво')

In [5]:
# Location of the `wiki.ru.vec` vectors file (word2vec text format, as
# required by `load_word2vec_format`). The path can be overridden with the
# WIKI_RU_VEC_PATH environment variable so the notebook is not tied to a
# single machine; the default keeps the location originally used here.
wiki_ru_vec_path = os.environ.get(
    'WIKI_RU_VEC_PATH',
    '/Users/andrei/Downloads/wiki.ru/wiki.ru.vec',
)
model = KeyedVectors.load_word2vec_format(wiki_ru_vec_path)

In [20]:
# Inspect the 100 nearest neighbours of 'обслуживание' ("service").
# The recorded output below shows that many neighbours are the query word
# itself with punctuation attached ('обслуживание»', 'обслуживания,') or its
# inflected forms.
model.most_similar('обслуживание', topn=100)

[('обслуживание\xa0—', 0.8725684285163879),
 ('медобслуживание', 0.8570499420166016),
 ('обслуживание»', 0.8405522704124451),
 ('техобслуживание', 0.8302351832389832),
 ('обслуживаний', 0.7906952500343323),
 ('обслуживанием', 0.774075448513031),
 ('обслуживания', 0.7729679942131042),
 ('обслуживания…', 0.7633925080299377),
 ('обслуживания\xa0—', 0.7526903748512268),
 ('обслуживания,', 0.7517712116241455),
 ('обслуживанию', 0.7498734593391418),
 ('обслуживания»', 0.7410985231399536),
 ('обслуживании»', 0.7370441555976868),
 ('обслуживании\xa0—', 0.7310150861740112),
 ('медобслуживания', 0.7306712865829468),
 ('самообслуживание', 0.727219820022583),
 ('обслуживании', 0.7236009240150452),
 ('сервисное', 0.721953272819519),
 ('техобслуживанием', 0.7078423500061035),
 ('обслуживанию\xa0—', 0.7047327160835266),
 ('техобслуживания', 0.6994536519050598),
 ('спецобслуживания', 0.6965265870094299),
 ('медоборудование', 0.695289671421051),
 ('техобслуживанию', 0.6801865100860596),
 ('обслуживающе

In [None]:
def _build_split(sentences, category, preprocessing, w2v_model):
    """Build the raw, preprocessed and indexed data for one dataset split.

    Args:
        sentences: iterable of sentence objects exposing ``text`` and
            ``categories``.
        category: category label of the form ``ENTITY#ATTRIBUTE`` passed to
            ``get_ys``.
        preprocessing: preprocessing setting forwarded to ``preprocess``.
        w2v_model: embedding model forwarded to ``word2vec_indexes_v1``.

    Returns:
        Tuple ``(x_raw, x_preproc, y_raw, x, y)``.
    """
    x_raw = [sentence.text for sentence in sentences]
    x_preproc = preprocess(x_raw, preprocessing)
    y_raw = [','.join(sentence.categories) for sentence in sentences]
    x = word2vec_indexes_v1(w2v_model, x_preproc)
    y = get_ys(sentences, category)
    return x_raw, x_preproc, y_raw, x, y


def process_sentence_batch(sentence_batch):
    """Fill ``sentence_batch`` with train/test data for its task.

    The original cell used top-level ``return`` statements, which is a
    syntax error outside a function, so the logic is wrapped in a function
    taking the batch as its argument.

    NOTE(review): ``preprocess``, ``word2vec_indexes_v1`` and ``get_ys`` are
    not defined or imported in the visible part of this notebook; they must be
    in scope before this function is called.

    Args:
        sentence_batch: object exposing ``preprocessing``, ``processing``,
            ``task`` and ``w2v_model``; the ``x_*``/``y_*`` train and test
            attributes are assigned on it.

    Returns:
        ``sentence_batch`` when no processing is requested or after it has
        been filled; ``None`` for any task/processing combination other than
        aspect detection with ``Processing.W2V_INDEXES_1``.

    Raises:
        ValueError: if either the train or the test sentence set is empty.
    """
    preprocessing = sentence_batch.preprocessing
    processing = sentence_batch.processing
    task = sentence_batch.task

    if processing == Processing.NO_PROCESSING:
        return sentence_batch

    handled = (
        task.type == Task.Type.ASPECT_DETECTION
        and processing == Processing.W2V_INDEXES_1
    )
    if not handled:
        # Only one combination is implemented; everything else yields None,
        # exactly as the original fall-through did.
        return None

    entity = task.aspect_entity or ''
    attribute = task.aspect_attribute or ''
    category = f'{entity.upper()}#{attribute.upper()}'

    train_sentences = TrainSentence.objects.filter(out_of_scope=False)
    test_sentences = TestSentence.objects.filter(out_of_scope=False)

    if not len(train_sentences) or not len(test_sentences):
        raise ValueError('You should load datasets')

    # Compute both splits before touching the batch, so a failure while
    # building the test split does not leave the batch half-filled.
    train_data = _build_split(
        train_sentences, category, preprocessing, sentence_batch.w2v_model
    )
    test_data = _build_split(
        test_sentences, category, preprocessing, sentence_batch.w2v_model
    )

    (
        sentence_batch.x_train_raw,
        sentence_batch.x_train_preproc,
        sentence_batch.y_train_raw,
        sentence_batch.x_train,
        sentence_batch.y_train,
    ) = train_data

    (
        sentence_batch.x_test_raw,
        sentence_batch.x_test_preproc,
        sentence_batch.y_test_raw,
        sentence_batch.x_test,
        sentence_batch.y_test,
    ) = test_data

    return sentence_batch