Берем отызывы за лето (из архива с материалами или предыдущего занятия)
1. Учим conv сеть для классификации.
2. Рассмотреть 2-а варианта:
* Инициализировать tf.keras.layers.Embedding предобученными векторами (зять к примеру с https://rusvectores.org/ru/.)
* Инициализировать слой tf.keras.layers.Embedding по умолчанию.
3. Сделать вывод

In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.float_format', '{:.2f}'.format)

In [2]:
from string import punctuation
from stop_words import get_stop_words
from pymorphy2 import MorphAnalyzer
import re
from spacy.lang.ru.stop_words import STOP_WORDS
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from navec import Navec

In [3]:
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Activation, Input, Embedding, Conv1D, GlobalMaxPool1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical, get_file
from tensorflow.keras.callbacks import TensorBoard 
from tensorflow.keras.losses import categorical_crossentropy, SparseCategoricalCrossentropy
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.initializers import Constant

*****************************

In [4]:
MAX_WORDS = 2000
MAX_LEN = 100

In [5]:
data = pd.read_excel('отзывы за лето.xls')
le = LabelEncoder()
data['Rating'] = le.fit_transform(data['Rating'])
data.sample(5)

Unnamed: 0,Rating,Content,Date
15497,3,По два раза заходит и просит пароль,2017-07-21
17710,2,Перестало работать смс - оповещение. Ни при вх...,2017-07-10
2666,4,Отлично,2017-08-10
6076,0,Тупее приложения не найти.как сделать перевод ...,2017-08-05
15016,4,"Спасибо,что Вы есть!",2017-07-22


In [6]:
sw = set((get_stop_words("ru") + list(STOP_WORDS))) - {'не', 'ни', 'нет'}
exclude = set(punctuation)
morpher = MorphAnalyzer()

def preprocess_text(txt, exclude=exclude, sw=sw, morpher=morpher):
    txt = str(txt)
    txt = "".join(c for c in txt if c not in exclude)
    txt = txt.lower()
#     txt = re.sub("не\s", "не", txt)
    txt = [morpher.parse(word)[0].normal_form for word in txt.split() if word not in sw]
    return " ".join(txt)

In [7]:
data['preprocessed_text'] = data['Content'].apply(preprocess_text)

In [8]:
data.sample(5)

Unnamed: 0,Rating,Content,Date,preprocessed_text
8491,4,Довольно удобно,2017-08-02,удобно
1969,4,Быстро и удобно мне очень помогает,2017-08-11,быстро удобно помогать
7616,4,Удобное приложение. Всё логично и понятно.,2017-08-03,удобный приложение логично понятно
18801,0,Удаляю,2017-06-30,удалять
17985,0,Уберите этот вирусник он только мешает апарат ...,2017-07-08,убрать вирусникнуть мешать апаратый мейза


In [9]:
train_corpus = " ".join(data["preprocessed_text"])
tokens = word_tokenize(train_corpus)
tokens_filtered = [word for word in tokens if word.isalnum()]
len(tokens_filtered)

105224

In [10]:
dist = FreqDist(tokens_filtered)
tokens_filtered_top = [pair[0] for pair in dist.most_common(MAX_WORDS-1)]
tokens_filtered_top[:7]

['приложение', 'не', 'удобно', 'работать', 'удобный', 'отлично', 'нравиться']

In [11]:
vocabulary = {v: k for k, v in enumerate(tokens_filtered_top, 1)}

In [12]:
len(vocabulary)

1999

In [13]:
def text_to_sequence(text, maxlen):
    result = []
    tokens = word_tokenize(text.lower())
    tokens_filtered = [word for word in tokens if word.isalnum()]
    for word in tokens_filtered:
        if word in vocabulary:
            result.append(vocabulary[word])
    padding = [0]*(maxlen-len(result))
    return padding + result[-maxlen:]

In [14]:
data_train = np.asarray([text_to_sequence(text, MAX_LEN) for text in data['preprocessed_text']], dtype=np.int32)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(data_train, data['Rating'], test_size=0.33)

*********************************

## Cлой Embedding по умолчанию

In [16]:
BATCH_SIZE = 1024
EPOCHES = 10

In [17]:
model = Sequential()
model.add(Embedding(input_dim=MAX_WORDS, output_dim=128, input_length=MAX_LEN))
model.add(Conv1D(128, 5))
model.add(Activation("relu"))
model.add(GlobalMaxPool1D())
model.add(Dense(20))
model.add(Activation("relu"))
model.add(Dropout(0.25))
model.add(Dense(5))
model.add(Activation('softmax'))

In [18]:
opt = Adam(learning_rate=0.001)
model.compile(loss=SparseCategoricalCrossentropy(),
              optimizer=opt,
              metrics=['accuracy'])

In [19]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 128)          256000    
_________________________________________________________________
conv1d (Conv1D)              (None, 96, 128)           82048     
_________________________________________________________________
activation (Activation)      (None, 96, 128)           0         
_________________________________________________________________
global_max_pooling1d (Global (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 20)                2580      
_________________________________________________________________
activation_1 (Activation)    (None, 20)                0         
_________________________________________________________________
dropout (Dropout)            (None, 20)                0

In [20]:
history = model.fit(X_train, y_train,
                    batch_size=BATCH_SIZE,
                    epochs=EPOCHES,
                    verbose=0,
                    validation_split=0.1)

In [21]:
score = model.evaluate(X_test, y_test, batch_size=BATCH_SIZE, verbose=1)
print('\n')
print('Test loss:', score[0])
print('Test accuracy:', score[1])



Test loss: 0.6366176009178162
Test accuracy: 0.7760340571403503


***********************************************

## Embedding с предобученными векторами

In [22]:
path = 'navec_news_v1_1B_250K_300d_100q.tar'
emb = Navec.load(path)

In [23]:
embedding_dim = 300

embedding_matrix = np.zeros((MAX_WORDS, embedding_dim))
for word in tokens_filtered_top:
    embedding_vector = emb.get(word)
    if embedding_vector is not None:
        embedding_matrix[tokens_filtered_top.index(word)] = embedding_vector

In [24]:
embedding_matrix.shape

(2000, 300)

In [25]:
embedding_layer = Embedding(
    MAX_WORDS,
    embedding_dim,
    embeddings_initializer=Constant(embedding_matrix),
    trainable=False,
)

In [26]:
model = Sequential()
model.add(embedding_layer)
model.add(Conv1D(128, 5))
model.add(Activation("relu"))
model.add(GlobalMaxPool1D())
model.add(Dense(20))
model.add(Activation("relu"))
model.add(Dropout(0.25))
model.add(Dense(5))
model.add(Activation('softmax'))

In [27]:
model.compile(loss=SparseCategoricalCrossentropy(),
              optimizer='adam',
              metrics=['accuracy'])

In [28]:
history = model.fit(X_train, y_train,
          batch_size=BATCH_SIZE,
          epochs=EPOCHES,
          verbose=0,
          validation_split=0.1)

In [29]:
score = model.evaluate(X_test, y_test, batch_size=BATCH_SIZE, verbose=1)
print('\n')
print('Test loss:', score[0])
print('Test accuracy:', score[1])



Test loss: 0.674392580986023
Test accuracy: 0.764447033405304


**Вывод:** использование предобученных векторов показало схожий результат.