In [6]:
pip install -q pymorphy2 stop_words keras_preprocessing

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.6/42.6 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [8]:
import gensim
import pandas as pd
from sklearn.model_selection import train_test_split

from string import punctuation
from stop_words import get_stop_words
from pymorphy2 import MorphAnalyzer
import re

import nltk
from nltk.tokenize import word_tokenize
nltk.download("punkt")
from nltk.probability import FreqDist

import numpy as np
import tensorflow as tf
import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Input, Embedding, Conv1D, GlobalMaxPool1D
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.callbacks import TensorBoard
from keras.losses import SparseCategoricalCrossentropy
from keras.callbacks import EarlyStopping

from sklearn.preprocessing import LabelEncoder

from pathlib import Path
import keras.backend as K

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
def get_f1(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

In [11]:
df = pd.read_excel('отзывы за лето.xls')
df.sample(2)

Unnamed: 0,Rating,Content,Date
6594,5,Меня все устраивает!) спасибо),2017-08-04
12973,5,Мне нравиться),2017-07-26


In [12]:
df['Rating'].value_counts()

5    14586
1     2276
4     2138
3      911
2      748
Name: Rating, dtype: int64

In [13]:
df_train, df_test = train_test_split(df, test_size=0.33, random_state=42)
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
df_train.shape, df_test.shape

((13841, 3), (6818, 3))

In [14]:
df_train.sample(2)

Unnamed: 0,Rating,Content,Date
11194,1,"Поменял оценку на единицу, после последнего об...",2017-07-31
6316,5,"Удобно,надёжно,спасибо",2017-08-04


In [15]:
sw = set(get_stop_words("ru"))
exclude = set(punctuation)
morpher = MorphAnalyzer()

def preprocess_text(txt):
    txt = str(txt)
    txt = "".join(c for c in txt if c not in exclude)
    txt = txt.lower()
    txt = re.sub("\sне", "не", txt)
    txt = [morpher.parse(word)[0].normal_form for word in txt.split() if word not in sw]
    return " ".join(txt)

df_train['Content'] = df_train['Content'].apply(preprocess_text)
df_test['Content'] = df_test['Content'].apply(preprocess_text)

In [16]:
train_corpus = " ".join(df_train["Content"])
train_corpus = train_corpus.lower()
tokens = word_tokenize(train_corpus)

In [18]:
tokens_filtered = [word for word in tokens if word.isalnum()]
dist = FreqDist(tokens_filtered)
dist

FreqDist({'приложение': 4123, 'удобно': 2201, 'работать': 1288, 'удобный': 1182, 'отлично': 860, 'нравиться': 763, 'хороший': 681, 'отличный': 677, 'телефон': 627, 'супер': 540, ...})

In [19]:
max_words = 200
tokens_filtered_top = [pair[0] for pair in dist.most_common(max_words-1)]
tokens_filtered_top[:5]

['приложение', 'удобно', 'работать', 'удобный', 'отлично']

In [20]:
vocabulary = {v: k for k, v in dict(enumerate(tokens_filtered_top, 1)).items()}

In [21]:
max_len = 40
def text_to_sequence(text, maxlen):
    result = []
    tokens = word_tokenize(text.lower())
    tokens_filtered = [word for word in tokens if word.isalnum()]
    for word in tokens_filtered:
        if word in vocabulary:
            result.append(vocabulary[word])
    padding = [0]*(maxlen-len(result))
    return padding + result[-maxlen:]

In [23]:
x_train = np.asarray([text_to_sequence(text, max_len) for text in df_train["Content"]], dtype=np.int32)
x_test = np.asarray([text_to_sequence(text, max_len) for text in df_test["Content"]], dtype=np.int32)
x_train[0]

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0, 42, 22,  1,  2, 15], dtype=int32)

In [24]:
le = LabelEncoder()
train_enc_labels = le.fit_transform(df_train['Rating'])
test_enc_labels = le.transform(df_test['Rating'])
le.classes_

array([1, 2, 3, 4, 5])

In [25]:
train_enc_labels

array([4, 4, 4, ..., 0, 4, 4])

In [26]:
num_classes = 5
y_train = tf.keras.utils.to_categorical(train_enc_labels, num_classes=num_classes)
y_test = tf.keras.utils.to_categorical(test_enc_labels, num_classes=num_classes)
y_train[0]

array([0., 0., 0., 0., 1.], dtype=float32)

In [27]:
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_len))
model.add(Conv1D(128, 3))
model.add(Activation("relu"))
model.add(GlobalMaxPool1D())
model.add(Dense(10))
model.add(Activation("relu"))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

In [28]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=[get_f1])

In [29]:
tensorboard=TensorBoard(log_dir='./logs', write_graph=True, write_images=True)
early_stopping=EarlyStopping(monitor='val_loss')

epochs = 20
batch_size = 512

history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[tensorboard, early_stopping])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20


In [30]:
score = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=1)
print('\n')
print('Test loss:', score[0])
print('Test f1_score:', score[1])



Test loss: 0.671967625617981
Test f1_score: 0.7780848741531372


## Keras CONV модель с предобученным Embedding слоем

In [31]:
!wget -q http://vectors.nlpl.eu/repository/20/180.zip

In [32]:
!unzip -q 180.zip

In [33]:
word_vectors = gensim.models.KeyedVectors.load_word2vec_format('/content/model.bin', binary=True).vectors

In [35]:
word_vectors_matrix = [word_vectors[i][:128] for i in range(200)]
len(word_vectors_matrix[0])

128

In [36]:
initializer = tf.keras.initializers.Constant(word_vectors_matrix)

model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=128, embeddings_initializer =initializer, input_length=max_len))
model.add(Conv1D(128, 3))
model.add(Activation("relu"))
model.add(GlobalMaxPool1D())
model.add(Dense(10))
model.add(Activation("relu"))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

In [37]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=[get_f1])

In [38]:
tensorboard=TensorBoard(log_dir='./logs', write_graph=True, write_images=True)
early_stopping=EarlyStopping(monitor='val_loss')

epochs = 20
batch_size = 512

history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[tensorboard, early_stopping])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20


In [39]:
score = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=1)
print('\n')
print('Test loss:', score[0])
print('Test f1_score:', score[1])



Test loss: 0.7631232142448425
Test f1_score: 0.7507708668708801


Нейронная сеть без инициализации дает лучшую метрику f1 по сравнению с  предобученной сетью - 0.78 и 0.75 соответственно.