# Урок 7. Сверточные нейронные сети для анализа текста

Задание
Берем отызывы за лето (из архива с материалами или предыдущего занятия)
- Учим conv сеть для классификации
- Рассмотреть 2-а варианта сеточек
  - Инициализировать tf.keras.layers.Embedding предобученными векторами взять к примеру с https://rusvectores.org/ru/
  - Инициализировать слой tf.keras.layers.Embedding по умолчанию (ну то есть вам ничего не делать с весами)
- Сравнить две архитектуры с предобученными весами и когда tf.keras.layers.Embedding обучается сразу со всей сеточкой, что получилось лучше

## Загрузка библиотек

In [1]:
!pip install stop_words pymorphy2

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from google.colab import drive

from string import punctuation
from stop_words import get_stop_words
from pymorphy2 import MorphAnalyzer
import re

import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

from gensim.models import KeyedVectors

drive.mount('/content/drive')

Collecting stop_words
  Downloading stop-words-2018.7.23.tar.gz (31 kB)
Collecting pymorphy2
  Downloading pymorphy2-0.9.1-py3-none-any.whl (55 kB)
[K     |████████████████████████████████| 55 kB 1.6 MB/s 
Collecting dawg-python>=0.7.1
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Collecting pymorphy2-dicts-ru<3.0,>=2.4
  Downloading pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl (8.2 MB)
[K     |████████████████████████████████| 8.2 MB 2.0 MB/s 
[?25hBuilding wheels for collected packages: stop-words
  Building wheel for stop-words (setup.py) ... [?25l[?25hdone
  Created wheel for stop-words: filename=stop_words-2018.7.23-py3-none-any.whl size=32912 sha256=9aae171c288b4624540823599f61de8dcea7851e75f97cfb35e3ea3086e45f42
  Stored in directory: /root/.cache/pip/wheels/fb/86/b2/277b10b1ce9f73ce15059bf6975d4547cc4ec3feeb651978e9
Successfully built stop-words
Installing collected packages: pymorphy2-dicts-ru, dawg-python, stop-words, pymorphy2
Successfully i

In [2]:
max_words = 200
max_len = 40
num_classes = 5
epochs = 20
batch_size = 128
print_batch_n = 100

In [47]:
url = '/content/drive/MyDrive/Colab Notebooks/data/gb_NLP_les7_data/'

df = pd.read_csv(url + 'отзывы за лето.csv')

In [48]:
df.head()

Unnamed: 0,Rating,Content,Date
0,5,It just works!,2017-08-14
1,4,В целом удобноное приложение...из минусов хотя...,2017-08-14
2,5,Отлично все,2017-08-14
3,5,Стал зависать на 1% работы антивируса. Дальше ...,2017-08-14
4,5,"Очень удобно, работает быстро.",2017-08-14


## Предобработка

In [49]:
sw = set(get_stop_words("ru"))
exclude = set(punctuation)
morpher = MorphAnalyzer()

def preprocess_text(txt):
    txt = str(txt)
    txt = "".join(c for c in txt if c not in exclude)
    txt = txt.lower()
    txt = re.sub("\sне", "не", txt)
    txt = [morpher.parse(word)[0].normal_form for word in txt.split() if word not in sw]
    return " ".join(txt)

df['Content'] = df['Content'].apply(preprocess_text)
df = df[df['Content'].loc[:] != ""]

In [50]:
X_train, X_test, y_train, y_test = train_test_split(
    df['Content'],
    df['Rating']
)

In [51]:
X_train.head(10)

12004                                                норма
5323     приложениен плохой проблема вход приходиться д...
16819    чертовый антивирус блокировать фига возможност...
7828     обновление введение ключ выдавать сообщение от...
9919                     удобный привязка телефонный книга
9980                            приложение быстрый удобный
4352                                      работать отлично
370      приложение работать дажена запускаться антивир...
11897          мочь зайти приложениеслетаетний поймупочему
3957     дать возможность использовать приложение гориз...
Name: Content, dtype: object

In [52]:
nltk.download("punkt")

train_corpus = " ".join(X_train)
tokens = word_tokenize(train_corpus)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Фильтрация

In [53]:
tokens_filtered = [word for word in tokens if word.isalnum()]

In [54]:
dist = FreqDist(tokens_filtered)
tokens_filtered_top = [pair[0] for pair in dist.most_common(max_words-1)]

In [55]:
tokens_filtered_top[:10]

['приложение',
 'удобно',
 'работать',
 'удобный',
 'отлично',
 'нравиться',
 'отличный',
 'хороший',
 'телефон',
 'супер']

In [56]:
vocabulary = {v: k for k, v in dict(enumerate(tokens_filtered_top, 1)).items()}

In [13]:
def text_to_sequence(text, maxlen):
    result = []
    tokens = word_tokenize(text.lower())
    tokens_filtered = [word for word in tokens if word.isalnum()]
    for word in tokens_filtered:
        if word in vocabulary:
            result.append(vocabulary[word])
    padding = [0]*(maxlen-len(result))
    return padding + result[-maxlen:]

In [57]:
X_train = np.asarray([text_to_sequence(text, max_len) for text in X_train], dtype=np.int32)
X_test = np.asarray([text_to_sequence(text, max_len) for text in X_test], dtype=np.int32)

In [58]:
X_train.shape

(15031, 40)

In [60]:
X_train[1]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,  85, 121,  24,  19,  34, 179,  36,  47,
        14], dtype=int32)

## Keras model (default)

In [18]:
import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Input, Embedding, Conv1D, GlobalMaxPool1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping

In [61]:
def change_category(y):
  return [cat - 1 for cat in y]

y_train = change_category(y_train)
y_test = change_category(y_test)

In [62]:
y_train = to_categorical(y_train, num_classes)
y_test = to_categorical(y_test, num_classes)

y_train[0]

array([0., 0., 0., 0., 1.], dtype=float32)

In [63]:
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_len))
model.add(Conv1D(128, 3))
model.add(Activation("relu"))
model.add(GlobalMaxPool1D())
model.add(Dense(10))
model.add(Activation("relu"))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [64]:
history = model.fit(X_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [65]:
score = model.evaluate(X_test, y_test, batch_size=batch_size, verbose=1)

print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])



Test score: 0.8212544918060303
Test accuracy: 0.7533426284790039


## Keras model with weights

In [25]:
embedding_matrix = KeyedVectors.load_word2vec_format(url + 'tayga_1_2.vec')

In [34]:
# embedding_matrix.vocab

In [66]:
vocabulary = embedding_matrix.vocab

In [67]:
X_train, X_test, y_train, y_test = train_test_split(df['Content'], df['Rating'])

X_train = X_train.apply(preprocess_text)
X_test = X_test.apply(preprocess_text)

X_train = np.asarray([text_to_sequence(text, max_len) for text in X_train], dtype=np.int32)
X_test = np.asarray([text_to_sequence(text, max_len) for text in X_test], dtype=np.int32)

y_train = change_category(y_train)
y_test = change_category(y_test)

y_train = to_categorical(y_train, num_classes)
y_test = to_categorical(y_test, num_classes)

In [68]:
model_emb = Sequential()
model_emb.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_len))
model_emb.add(Conv1D(128, 3))
model_emb.add(Activation("relu"))
model_emb.add(GlobalMaxPool1D())
model_emb.add(Dense(10))
model_emb.add(Activation("relu"))
model_emb.add(Dense(num_classes))
model_emb.add(Activation('softmax'))

model_emb.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [69]:
history_emb = model_emb.fit(X_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [70]:
score = model_emb.evaluate(X_test, y_test, batch_size=batch_size, verbose=1)

print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])



Test score: 1.0000011920928955
Test accuracy: 0.6958690881729126


## Вывод

* Keras default = 0.75
* Keras with loaded weights = 0.69