In [1]:
import os
import string
import annoy

from pymorphy2 import MorphAnalyzer
from stop_words import get_stop_words
from gensim.models import Word2Vec

import numpy as np
from tqdm import tqdm_notebook
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import pickle
from gensim.models.word2vec import Word2Vec
from multiprocessing import cpu_count
import pymorphy2
from nltk.corpus import stopwords

In [52]:
# Functions

def _preprocess_resources():
    """Return the shared (punctuation set, morph analyzer, stop-word set) triple.

    The triple is built on first use and stored on the function object, so the
    ``pymorphy2.MorphAnalyzer()`` construction and the NLTK stop-word lookup
    happen once per kernel session instead of once per processed line.
    """
    if not hasattr(_preprocess_resources, "_cached"):
        _preprocess_resources._cached = (
            set(string.punctuation),
            pymorphy2.MorphAnalyzer(),
            set(stopwords.words("russian")),
        )
    return _preprocess_resources._cached


def preprocess_txt(line):
    """Turn a raw text line into a list of lemmas.

    Steps: strip the line, drop every character found in
    ``string.punctuation``, split on whitespace, lowercase and lemmatize each
    token with pymorphy2 (first parse variant), then discard empty strings and
    NLTK Russian stop words.

    Args:
        line: text to process.

    Returns:
        list of lemma strings, in the order the tokens appeared.
    """
    exclude, morpher, sw = _preprocess_resources()
    # Clean the line from punctuation
    spls = "".join(i for i in line.strip() if i not in exclude).split()
    # Lemmatize all words
    spls = [morpher.parse(i.lower())[0].normal_form for i in spls]
    spls = [i for i in spls if i not in sw and i != ""]
    return spls

def prepro_txt(line):
    """Strip punctuation from ``line``, lemmatize its tokens, drop stop words.

    Reads three notebook-level globals that are NOT defined in this cell:
    ``exclude`` (characters to remove), ``morpher`` (object exposing
    ``parse``) and ``sw`` (stop-word collection). The cells that assign them
    must have been executed before the first call.

    Args:
        line: text to process.

    Returns:
        list of lemma strings with stop words and empty strings removed.
    """
    without_punct = "".join(ch for ch in line.strip() if ch not in exclude)
    # First parse variant of each lowercased token gives its normal form.
    lemmas = (morpher.parse(token.lower())[0].normal_form for token in without_punct.split())
    return [lemma for lemma in lemmas if lemma not in sw and lemma != ""]

# Train the "product request vs. talker" classifier

Load and preprocess the dataset for training the classifier:

In [3]:
# Read the labelled texts and discard the index column that was saved with the CSV.
dataset = pd.read_csv('text_dataset').drop(columns=['Unnamed: 0'])
# Cut the first and last character off every text
# (presumably the brackets of a stringified token list - TODO confirm).
dataset['text'] = dataset['text'].map(lambda raw: raw[1:-1])

In [4]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71072 entries, 0 to 71071
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   71072 non-null  int64 
 1   text    71072 non-null  object
dtypes: int64(1), object(1)
memory usage: 1.1+ MB


In [5]:
dataset # the dataset has already been preprocessed

Unnamed: 0,label,text
0,0,"'юбка', 'детский', 'orby', 'новый', 'носить', ..."
1,0,"'ботильон', 'новыепривезти', 'чехия', 'указать..."
2,0,"'брюки', 'размер', '4042', 'брюки', 'новый', '..."
3,0,"'продать', 'детский', 'шапка', 'продать', 'шап..."
4,0,"'блузка', 'темносиний', '42', 'размерсостояние..."
...,...,...
71067,1,"'энергетика', 'вредный', 'пить', 'энергетик', ..."
71068,1,"'почему', 'мочь', 'прищуриться', 'правый', 'гл..."
71069,1,"'правильно', 'произнести', 'английский', '16',..."
71070,1,"'вести', 'новый', 'школа', 'недавно', 'перейти..."


Split the data:

In [6]:
# Fix the shuffle seed so the split, and every metric computed from it,
# is identical after Restart Kernel -> Run All.
train, test = train_test_split(dataset, test_size=0.2, shuffle=True, random_state=42)

Vectorize the text:

In [7]:
# Create a TfidfVectorizer over unigrams and bigrams, with the vocabulary
# capped at 50,000 features, and fit it on our training-set texts only
# (the test split is never seen during fitting).

vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features = 50000)
vectorizer.fit(train['text'], train['label'])

TfidfVectorizer(max_features=50000, ngram_range=(1, 2))

In [8]:
# Features: tf-idf matrices produced by the already-fitted vectorizer.
# Targets: the label columns as integer numpy arrays.

X_train, X_test = (vectorizer.transform(part['text']) for part in (train, test))
y_train, y_test = (np.array(part['label'], int) for part in (train, test))

Train the classifier:

In [9]:
# Baseline classifier: LogisticRegression with default settings,
# fitted on the tf-idf training matrix and the integer labels.

model = LogisticRegression()
model.fit(X_train, y_train)

LogisticRegression()

In [10]:
# Predicted labels of the LogisticRegression baseline for the held-out texts.
predictions = model.predict(X_test)

Test:

In [11]:
# Accuracy = share of held-out samples whose predicted label equals the true one.
accuracy = (predictions == y_test).mean()
accuracy

0.9865634892718959

In [12]:
# Spot check with a product-like query; the raw string is vectorized as is,
# without punctuation stripping or lemmatization.
q_1 = 'Блузка Темно-синяя'

vec = vectorizer.transform([q_1])
model.predict(vec)

array([0])

In [13]:
# Spot check with a conversational query; the raw string is vectorized as is,
# without punctuation stripping or lemmatization.
q_2 = 'Энергетики, вредно или нет?'

vec = vectorizer.transform([q_2])
model.predict(vec)

array([1])

Compare with the LinearSVC model:

In [14]:
# Fit a LinearSVC on the same features and score it exactly like the baseline.
clf = LinearSVC().fit(X_train, y_train)
predictions_clf = clf.predict(X_test)
accuracy_clf = np.mean(predictions_clf == y_test)
accuracy_clf

0.990995427365459

LinearSVC scores slightly higher on the test split, so we choose it as the final model.

In [15]:
# Round-trip the fitted LinearSVC through a pickle file: write it out, then
# read it back into `regressor_from_file` (a classifier, despite the name).
# NOTE: pickle.load executes arbitrary code - only load files you created yourself.

with open('project14_clf.pkl', 'wb') as dump_target:
    pickle.dump(clf, dump_target)

with open('project14_clf.pkl', 'rb') as dump_source:
    regressor_from_file = pickle.load(dump_source)

In [127]:
def get_predictions(question):
    """Classify a user message with the saved-and-reloaded classifier.

    The message is vectorized with the fitted tf-idf ``vectorizer`` and
    passed to ``regressor_from_file``, the LinearSVC that was selected,
    pickled to 'project14_clf.pkl' and loaded back. Calling this function
    therefore also verifies that the pickle round-trip produced a usable model.

    Args:
        question: raw message text.

    Returns:
        The predicted label; ``get_answer`` treats 0 as a product request
        and any other value as small talk.
    """
    vec = vectorizer.transform([question])
    # Use the reloaded LinearSVC, not the LogisticRegression baseline `model`.
    predicted_answer = regressor_from_file.predict(vec)[0]
    return predicted_answer

Check that the model is saved and loaded correctly

In [128]:
get_predictions('Блузка Темно-синяя')

0

In [129]:
get_predictions('Энергетики, вредно или нет?')

1

# Implement the search for similar products in the content part of the bot

In [98]:
# Product catalogue; later cells read its `title` and `product_id` columns.
product_data = pd.read_csv('ProductsDataset.csv')

All product names will be represented as Word2Vec vectors.

In [108]:
sentences = []

# Shared preprocessing resources; prepro_txt reads them as notebook globals.
morpher = MorphAnalyzer()
sw = set(get_stop_words("ru"))
exclude = set(string.punctuation)
c = 0

# Tokenize/lemmatize every product title; stop once the running count exceeds 500000.
for c, line in enumerate(product_data['title'], start=1):
    sentences.append(prepro_txt(line))
    if c > 500000:
        break

In [109]:
# Train a Word2Vec model on the product titles.
# Titles with two tokens or fewer are dropped first; words seen fewer than
# 5 times are ignored (min_count=5). The trained model is saved to "w2v_model".
sentences = [i for i in sentences if len(i) > 2]
model_wv = Word2Vec(sentences=sentences, vector_size=100, min_count=5, window=5)
model_wv.save("w2v_model")

Now we need to build an index over the product titles, using the annoy library. We go through all the titles and take the vector of each title to be the average of the Word2Vec vectors of the words it contains.

In [110]:
index_goods = annoy.AnnoyIndex(100 ,'angular')

index_map_goods = {}
counter = 0

# One index item per product title: the item id is the running counter and
# the item vector is the mean Word2Vec vector of the title's known words
# (an all-zero vector when no word is in the vocabulary).
for line in product_data['title']:
    spls = line.split("\t")
    index_map_goods[counter] = spls[0]
    known_words = [word for word in prepro_txt(spls[0]) if word in model_wv.wv]
    vector = np.zeros(100)
    for word in known_words:
        vector += model_wv.wv[word]
    if known_words:
        vector = vector / len(known_words)
    index_goods.add_item(counter, vector)
    counter += 1

# Build the index with 10 trees and persist it to disk.
index_goods.build(10)
index_goods.save('smth.ann')

True

Implement the search for the answer by index

In [111]:
def find_answer(question, model):
    """Return the indexed product title closest to ``question``.

    The question is preprocessed with ``prepro_txt`` and embedded as the mean
    of the vectors of its words that ``model`` knows (an all-zero vector if it
    knows none). The nearest item in ``index_goods`` is then looked up in
    ``index_map_goods``.

    Args:
        question: raw user text.
        model: trained Word2Vec model supplying ``model.wv``. It must be the
            model whose vectors were used to fill ``index_goods``, otherwise
            the nearest-neighbour lookup is meaningless.

    Returns:
        The title stored in ``index_map_goods`` for the nearest neighbour.
    """
    word_vectors = model.wv
    known_words = [word for word in prepro_txt(question) if word in word_vectors]
    # Size the accumulator from the model instead of hard-coding 100.
    vector = np.zeros(word_vectors.vector_size)
    for word in known_words:
        vector += word_vectors[word]
    if known_words:
        vector = vector / len(known_words)
    answer_index = index_goods.get_nns_by_vector(vector, 1)
    return index_map_goods[answer_index[0]]

In [97]:
# Test... 
find_answer('Юбка детская ORBY', model_wv)

'Юбка детская'

# Implement a talker

Preprocess the mail.ru Q&A dump: pair each question with its first answer and write the pairs to a file for later use. This saves time and resources during further text preprocessing.

In [99]:
question = None
written = False

# Scan the dump line by line. A line starting with "---" separates records.
# Within a record the first line is the question and the line right after it
# is the answer we keep; every further line of the record is skipped.
with open("prepared_answers.txt", "w") as fout, open("Otvety.txt", "r") as fin:
    for line in tqdm_notebook(fin):
        if line.startswith("---"):
            # Separator: allow the next record to be written.
            written = False
        elif written:
            # This record already produced its question/answer pair.
            continue
        elif question is None:
            question = line.strip()
        else:
            # Tabs inside the texts are replaced so the only tab is the field separator.
            fout.write(question.replace("\t", " ").strip() + "\t" + line.replace("\t", " "))
            written = True
            question = None

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for line in tqdm_notebook(fin):


0it [00:00, ?it/s]

Now we need to preprocess the text in order to train Word2Vec and get embeddings: remove punctuation marks and lemmatize the words.

In [100]:
sentences = []

# Shared preprocessing resources; prepro_txt reads them as notebook globals.
morpher = MorphAnalyzer()
sw = set(get_stop_words("ru"))
exclude = set(string.punctuation)
c = 0

# Tokenize/lemmatize the dump line by line; stop once the running count exceeds 500000.
with open("Otvety.txt", "r") as fin:
    for c, line in enumerate(tqdm_notebook(fin), start=1):
        sentences.append(prepro_txt(line))
        if c > 500000:
            break

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for line in tqdm_notebook(fin):


0it [00:00, ?it/s]

In [101]:
# Train the Word2Vec model on the preprocessed lines of the Q&A dump.
# Lines with two tokens or fewer are dropped first; min_count=1 keeps every
# remaining word in the vocabulary. The model is saved to "w2v_model_chat".
sentences = [i for i in sentences if len(i) > 2]
model_chat = Word2Vec(sentences=sentences, vector_size=100, min_count=1, window=5)
model_chat.save("w2v_model_chat")

Now we need to add all the questions to the index, using the annoy library. We go through all the question/answer pairs and take the vector of each question to be the average of the Word2Vec vectors of the words it contains.

In [103]:
index = annoy.AnnoyIndex(100 ,'angular')

index_map = {}
counter = 0

# Each line of prepared_answers.txt is "<question>\t<answer>". The question
# is embedded as the mean Word2Vec vector of its known words and added to
# the index; the answer is remembered under the same integer id.
with open("prepared_answers.txt", "r") as f:
    for line in tqdm_notebook(f):
        spls = line.split("\t")
        index_map[counter] = spls[1]
        known_words = [word for word in prepro_txt(spls[0]) if word in model_chat.wv]
        vector = np.zeros(100)
        for word in known_words:
            vector += model_chat.wv[word]
        if known_words:
            vector = vector / len(known_words)
        index.add_item(counter, vector)
        counter += 1

# Build the index with 10 trees and persist it to disk.
index.build(10)
index.save('speaker.ann')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for line in tqdm_notebook(f):


0it [00:00, ?it/s]

True

What remains is to implement a function that takes a question as input and finds an answer to it. We preprocess the question, find the closest indexed question, and return the answer stored for that question.

In [104]:
def find_answer_chat(question):
    """Return the stored answer of the indexed question closest to ``question``.

    The question is preprocessed with ``prepro_txt`` and embedded as the mean
    of the ``model_chat`` vectors of its known words (an all-zero vector if
    none is known). The single nearest item in ``index`` is then mapped to
    its answer through ``index_map``.
    """
    tokens = [token for token in prepro_txt(question) if token in model_chat.wv]
    vector = np.zeros(100)
    for token in tokens:
        vector += model_chat.wv[token]
    if tokens:
        vector = vector / len(tokens)
    nearest = index.get_nns_by_vector(vector, 1)
    return index_map[nearest[0]]

In [115]:
# test
find_answer_chat('Как погодка?')

'у нас тепло и дождей не предвидеться до ноября. \n'

# Implementing a chat bot

In [155]:
def get_answer(question):
    """Answer a user message with either a product record or a chat reply.

    The message is classified with ``get_predictions``. Label 0 is handled as
    a product request: the closest indexed title is found with
    ``find_answer`` and the first row of ``product_data`` carrying that title
    is returned as the string ``"<product_id repr>, <title repr>"``. Any
    other label is handled as small talk and answered by ``find_answer_chat``.

    Args:
        question: raw message text.

    Returns:
        str: the formatted product record, the chat answer, or - when no
        catalogue row has exactly the title returned by the index - that
        title itself.
    """
    # chat
    if get_predictions(question) != 0:
        return find_answer_chat(question)

    # look for an answer in the table
    found_title = find_answer(question, model_wv)
    for position, title in enumerate(product_data.title):
        if title == found_title:
            return str([product_data.product_id[position], product_data.title[position]])[1:-1]

    # No exact title match: previously this path raised UnboundLocalError
    # because the result variable was never assigned.
    return found_title

In [156]:
get_answer('Юбка детская ORBY')

"'5922cd12de885467545e72a2', 'Юбка для девочки.'"

In [157]:
get_answer('Как погодка?')

'у нас тепло и дождей не предвидеться до ноября. \n'

# Autotest

In [143]:
# A small-talk question must not be answered with a product record.
# Product answers are formatted as "'<product_id>', '<title>'", so they begin
# with a quote character; the prefix to rule out is therefore "'5", not "5"
# (the old check could never fail).
assert not get_answer('Где ключи от танка').startswith("'5")