In [1]:
import gensim
gensim.__version__



'3.6.0'

In [2]:
import string
from pymorphy2 import MorphAnalyzer
from stop_words import get_stop_words
import annoy
from gensim.models import Word2Vec, FastText
import pickle
import numpy as np
from tqdm.notebook import tqdm
import pandas as pd

In [3]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [4]:
from sklearn.metrics import accuracy_score

In [5]:
from preprocess_func import morpher, sw, exclude, preprocess_txt, embed_txt

In [6]:
import os, sys

module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

In [7]:
anecd_data = []
anecd_origin = []

c = 0

with open("../data/anek_djvu.txt", "r", encoding='utf-8') as fin:
    for line in tqdm(fin):
        if line.startswith("<|startoftext|>"):
            line = line.replace('<|startoftext|>', '')
            anecd_origin.append(line)
            spls = preprocess_txt(line)
            anecd_data.append(spls)
            
            if c > 10000:
                break
            c += 1

0it [00:00, ?it/s]

In [8]:
print(len(anecd_data))
anecd_data[5]

10002


['посещать',
 'мысль',
 'смерть',
 'полбеды',
 'беда',
 'смерть',
 'посещать',
 'мысль']

In [9]:
modelFT = FastText.load("../data/bot_trained/ft_model")

In [10]:
with open("../data/bot_trained/index_speaker.pkl","rb") as f:
    bin_data = f.read()
    index_loaded = pickle.loads(bin_data)

In [11]:
vectorizer = CountVectorizer(ngram_range=(1, 2))

In [12]:
idxs = set(np.random.randint(0, len(index_loaded), len(anecd_data)))
negative_speaker_texts = [" ".join(preprocess_txt(index_loaded[i])) for i in idxs]
positive_texts = [" ".join(val) for val in anecd_data]

In [13]:
len(negative_speaker_texts)

9034

In [14]:
dataset = negative_speaker_texts + positive_texts
labels = np.zeros(len(dataset))
labels[len(negative_speaker_texts):] = np.ones(len(positive_texts))

In [15]:
print(len(dataset))
print(len(labels))

19036
19036


In [16]:
X_train, X_test, y_train, y_test = train_test_split(dataset, labels, test_size=0.2, stratify=labels,
                                                    random_state=13)

In [17]:
x_train_vec = vectorizer.fit_transform(X_train)
x_test_vec = vectorizer.transform(X_test)


In [18]:
len(X_train)

15228

In [19]:
x_test_vec.shape

(3808, 319819)

In [20]:
lr_anecd = LogisticRegression(max_iter=1000).fit(x_train_vec, y_train)

In [21]:
accuracy_score(y_true=y_test, y_pred=lr_anecd.predict(x_test_vec))

0.7765231092436975

In [22]:
#pickle.dump(lr_anecd, open('../data/bot_trained/lr_model_anecd.sav', 'wb'))


In [23]:
# vectorizing for annoy
tfidf_vect_anecd = TfidfVectorizer().fit(X_train)

In [24]:
with open('../data/bot_trained/lr_vect_tfidf_anecd.pkl', 'wb') as fout:
    pickle.dump((vectorizer, tfidf_vect_anecd, lr_anecd), fout)

In [25]:
np.mean(tfidf_vect_anecd.idf_)

9.589800403314408

In [26]:
# idfs_anecd, midf_anecd

In [27]:
idfs_anecd = {v[0]: v[1] for v in zip(tfidf_vect_anecd.vocabulary_, tfidf_vect_anecd.idf_)}


In [28]:
print(list(idfs_anecd.keys())[:10])
print(list(idfs_anecd.values())[:10])

['протестировать', 'микро', 'работать', 'вообще', 'любой', 'желание', 'причина', 'создавать', 'цепь', 'событие']
[9.2446624227318, 7.0756087223622774, 9.937809603291745, 9.937809603291745, 9.937809603291745, 9.937809603291745, 9.937809603291745, 9.937809603291745, 9.937809603291745, 9.937809603291745]


In [29]:
# creating indexes for anecdote texts

ft_index_anecd = annoy.AnnoyIndex(100 ,'angular')

midf_anecd = np.mean(tfidf_vect_anecd.idf_)

index_map_anecd = {}
counter = 0

for i in tqdm(range(len(anecd_data))):
    n_ft = 0
    index_map_anecd[counter] = (anecd_origin[i])
    vector_ft = np.zeros(100)
    for word in anecd_data[i]:
        if word in modelFT.wv:
            vector_ft += modelFT.wv[word] * idfs_anecd.get(word, midf_anecd)
            n_ft += idfs_anecd.get(word, midf_anecd)
    if n_ft > 0:
        vector_ft = vector_ft / n_ft
    ft_index_anecd.add_item(counter, vector_ft)
    counter += 1

ft_index_anecd.build(10)
ft_index_anecd.save('../data/bot_trained/anecd.ann')

  0%|          | 0/10002 [00:00<?, ?it/s]

True

In [30]:
with open("../data/bot_trained/index_anecd.pkl", "wb") as f:
    pickle.dump(index_map_anecd, f)

In [31]:
ft_index_anecd.get_nns_by_vector(np.zeros(100), 1, include_distances=True)

([76], [1.4142135381698608])

In [32]:
np.save('../data/bot_trained/midf_anecd', midf_anecd)