# Классификация текстов сверточными нейронными сетями [convolutional neural network]

### Слой свертки

#### Фильтр [filter]:
* $w_{1,n}$ – последовательность слов, $k$  – размер окна
* $w_i$ – $i$-е слово, $d_{emb}$ – размерность эмбеддинга слова, $\textbf{w}_i \in \mathbb{R}^{d_{emb}}$ – эмбеддинг слова $w_i$
* $\textbf{x}_i = [\textbf{w}_{i}, \textbf{w}_{i+1}, \ldots, \textbf{w}_{i+k-1}]$, $\textbf{x}_i \in \mathbb{R}^{k d_{emb}}$

Фильтр: $p_i = g(\textbf{x}_i \cdot u)$, $p_i \in \mathbb{R}$, $u \in \mathbb{R}^{k d_{emb}}$, где $g$ – нелинейная функция активации


![title](img/cnn1.png)


Преобразуем каждое входное окно, но пока размерность входа не уменьшается!

#### Слой субдискретизации (пулинга, [pooling])

* $h_i$ – выходные значения фильтра (выше обозначены как $p_i$)

$\max$-пулинг:	$c = \max_i h_i$


![title](img/cnn2.png)

* Выбираем самый важный признак из полученных на предыдущем шаге 
* Можем использовать и $\min$, и усреднение



### Классификатор на основе сверточной сети

* $y \in \{0,1\}$ - истинные значения
* $\widehat{y} = c$ - предсказанные значения

![title](img/cnn3.png)

Для обучения сверточной сети можно использовать обычный алгоритм обратного распространения ошибки

Одномерные фильтры – это сильное ограничение. Что делать, если $c=0.5$?

In [5]:
from keras.preprocessing.text import Tokenizer
# from keras.preprocessing.sequence import pad_sequences
from keras.utils.data_utils import pad_sequences
from keras.utils import np_utils
# from sklearn.preprocessing import LabelBinarizer, LabelEncoder

import keras

from keras.layers import Embedding, Input, Conv1D, MaxPooling1D, Flatten, Dense, Dropout, Masking
from keras.models import Model, Sequential

import pandas as pd
import numpy as np

import seaborn as sns

import matplotlib.pyplot as plt

import random
random.seed(1228)

from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, classification_report, confusion_matrix

%matplotlib inline

from keras.layers import Convolution1D, MaxPooling1D, GlobalMaxPooling1D
from keras.layers import  concatenate

import warnings

warnings.filterwarnings('ignore')

ModuleNotFoundError: No module named 'tensorflow'

In [4]:
from pymystem3 import Mystem
import re
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

# Shared Mystem analyzer instance; lemmatize() below takes it as the default
# value of its `mystem` argument. Constructing it may trigger a one-time
# download of the mystem binary (see the cell output).
m = Mystem()


# Runs of Cyrillic letters, the ASCII range A-z and a few emoticon/punctuation
# characters. Written as a raw string so that "\)", "\_", "\%" reach the regex
# engine unchanged instead of being parsed as (invalid) string escapes.
# NOTE(review): "A-z" also spans the ASCII characters between "Z" and "a"
# ("[", "\", "]", "^", "_", "`"), and "А-Яа-я" does not cover "ё"/"Ё" —
# confirm whether that is intended before tightening the character class.
regex = re.compile(r"[А-Яа-я:=!\)\()A-z\_\%/|]+")


def words_only(text, regex=regex):
    """Keep only the fragments of *text* matched by *regex*.

    Args:
        text: String to filter.
        regex: Compiled pattern whose matches are kept.

    Returns:
        The matches joined by single spaces, or "" when *text* is not a
        string (for example a NaN/None cell coming from a DataFrame column).
    """
    try:
        return " ".join(regex.findall(text))
    except TypeError:
        # findall() rejects non-string input; treat such cells as empty text.
        return ""



def lemmatize(text, mystem=m):
    """Lemmatize *text* with the given Mystem analyzer.

    Args:
        text: String to lemmatize.
        mystem: Analyzer exposing ``lemmatize(text)``; defaults to the shared
            module-level instance.

    Returns:
        The lemmatizer output joined into one string with surrounding
        whitespace stripped, or " " if lemmatization fails.
    """
    try:
        # Use the analyzer that was passed in rather than the global `m`,
        # so a caller-supplied instance is actually honoured.
        return "".join(mystem.lemmatize(text)).strip()
    except Exception:
        # Best-effort: one bad row must not abort a whole DataFrame.apply(),
        # but KeyboardInterrupt/SystemExit are no longer swallowed.
        return " "


# Load the two labelled corpora (only column 3 of each ';'-separated file is
# read), clean and lemmatize the texts, then split into train and test sets.
# The dataset directory is named once so that it only has to be changed in a
# single place when the notebook is run on another machine.
DATA_DIR = "C:/Users/adwiz/Documents/Courses/datascience_netology/datasets/nlp"

df_neg = pd.read_csv(DATA_DIR + "/negative.csv", sep=';', encoding='utf8', header=None, usecols=[3])
df_pos = pd.read_csv(DATA_DIR + "/positive.csv", sep=';', encoding='utf8', header=None, usecols=[3])
df_neg['sent'] = 'neg'
df_pos['sent'] = 'pos'
df = pd.concat([df_neg, df_pos])
df.columns = ['text', 'sent']
# df = df[:5]  # uncomment for a quick smoke run on a handful of rows
df.text = df.text.apply(words_only)
df.text = df.text.apply(lemmatize)


X = np.array(df.text.tolist())
y = np.array(df.sent.tolist())

# Fix the split explicitly: seeding Python's `random` module does not seed the
# NumPy RNG that scikit-learn draws from, so without random_state every run
# would produce a different train/test partition.
X_text_train, X_text_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=1228
)
print("total train examples %s" % len(y_train))
print("total test examples %s" % len(y_test))

Installing mystem to /Users/adwiz/.local/bin/mystem from http://download.cdn.yandex.net/mystem/mystem-3.1-macosx.tar.gz


FileNotFoundError: [Errno 2] No such file or directory: 'C:/Users/adwiz/Documents/Courses/datascience_netology/datasets/nlp/negative.csv'

In [None]:
# --- Input sizing ---------------------------------------------------------
# Length every token sequence is padded/truncated to; also the model input size.
TEXT_LENGTH = 10

# NOTE(review): the next four constants are not referenced by any of the
# visible cells; in particular the embedding matrix is built with 300 columns,
# not EMBEDDING_DIM — confirm whether they can be dropped or should be wired in.
VOCABULARY_SIZE = 20000
MAX_FEATURES = 5000
EMBEDDING_DIM = 100
DIMS = 250

# --- Convolution block ----------------------------------------------------
nb_filter = 50      # number of convolution filters
filter_length = 3   # convolution kernel size

# --- Dense head and training ----------------------------------------------
hidden_dims = 100   # units in the hidden Dense layer
batch_size = 32
# NOTE(review): 20000 epochs is passed straight to model.fit() — confirm.
nb_epoch = 20000

### Подготовка данных

In [None]:
# Fit the tokenizer on the training texts only; its word_index is used further
# down to build the embedding matrix.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_text_train)
# Bare expression: shows the object's repr as the notebook cell output.
tokenizer

In [None]:
# Encode the texts with the tokenizer fitted on the training set, then bring
# every sequence to TEXT_LENGTH entries with pad_sequences.
# `sequences` is reused as a scratch variable and ends up holding the test data.
sequences = tokenizer.texts_to_sequences(X_text_train)
X_train = pad_sequences(sequences, maxlen=TEXT_LENGTH)
sequences = tokenizer.texts_to_sequences(X_text_test)
X_test = pad_sequences(sequences, maxlen=TEXT_LENGTH)

In [None]:
# Peek at the first encoded and padded training example.
X_train[0]

In [None]:
# LabelEncoder only appears in a commented-out import at the top of the
# notebook, so import it here; without this the cell fails with NameError.
from sklearn.preprocessing import LabelEncoder

# Map the string labels to integer ids and then to one-hot rows with 2 columns.
le = LabelEncoder()
le.fit(['pos', 'neg'])
y_train_cat = np_utils.to_categorical(le.transform(y_train), 2)
y_test_cat = np_utils.to_categorical(le.transform(y_test), 2)

# Sanity check: one-hot vector of the first training label.
print(y_train_cat[0])

In [None]:
%%time

import numpy as np
emb_path = 'C:/Users/adwiz/Documents/Courses/datascience_netology/datasets/nlp/wiki.ru.vec'

words = []

embeddings_index = {}
f = open(emb_path, encoding='utf8')
for line in f:
    values = line.split()
    if len(values) == 301:
        word = values[0]
        words.append(word)
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
f.close()

In [None]:
# Number of words for which a pretrained vector was loaded.
print(len(embeddings_index))

In [None]:
# Word -> integer index mapping learned by the tokenizer on the training texts.
word_index = tokenizer.word_index
# Vocabulary size (displayed as the cell output).
len(word_index)

In [None]:
# Row i holds the pretrained vector of the word whose tokenizer index is i.
# One extra row is allocated on top of the vocabulary size (presumably row 0
# is the padding index — confirm); rows without a pretrained vector stay zero.
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # No pretrained vector for this word: leave its row all-zeros.
        continue
    embedding_matrix[i] = embedding_vector

In [None]:
# Model input: TEXT_LENGTH integer token ids per example.
words_input = Input(name='words_input', shape=(TEXT_LENGTH,), dtype='int32')

# Embedding lookup initialised from the pretrained matrix and kept frozen.
wordsEmbeddingLayer = Embedding(
    embedding_matrix.shape[0],
    embedding_matrix.shape[1],
    weights=[embedding_matrix],
    trainable=False,
)
words = wordsEmbeddingLayer(words_input)

# A single bank of nb_filter filters with window filter_length ...
words_conv = Convolution1D(
    filters=nb_filter,
    kernel_size=filter_length,
    strides=1,
    padding='same',
    activation='relu',
)(words)
# ... reduced to one value per filter by taking the maximum over positions.
words_conv = GlobalMaxPooling1D()(words_conv)

# `output` is the tensor the classifier head in the next cell is attached to.
output = words_conv

In [None]:
# Classifier head: dropout -> tanh hidden layer -> dropout -> 2-way softmax,
# with L2 weight regularisation on both Dense layers.
# NOTE: this cell reads and rebinds `output`, so re-running it without first
# re-running the cell that defines `output` stacks a second head on top.
output = Dropout(0.5)(output)
output = Dense(hidden_dims, activation='tanh', kernel_regularizer=keras.regularizers.l2(0.01))(output)
output = Dropout(0.25)(output)
output = Dense(2, activation='softmax',  kernel_regularizer=keras.regularizers.l2(0.01))(output)

# Wire input and output into a trainable model; targets are one-hot rows,
# hence categorical cross-entropy.
model = Model(inputs=[words_input], outputs=[output])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

In [None]:
# Train on the padded sequences, holding out 10% of the training data for validation.
# NOTE(review): nb_epoch is 20000 and no early-stopping callback is passed —
# confirm this is intended before running the cell.
model.fit(X_train, y_train_cat, epochs=nb_epoch, batch_size=batch_size,  validation_split=0.1)

In [None]:
# Softmax outputs of the model for the test set (the final layer has 2 units).
pred = model.predict(X_test)

In [None]:
# Take the most probable class index per example and map it back to its label.
y_pred = le.inverse_transform(pred.argmax(axis=-1))

# Macro-averaged scores, overall accuracy and the per-class report.
print(f"Precision: {precision_score(y_test, y_pred, average='macro'):6.2f}")
print(f"Recall: {recall_score(y_test, y_pred, average='macro'):6.2f}")
print(f"F1-measure: {f1_score(y_test, y_pred, average='macro'):6.2f}")
print(f"Accuracy: {accuracy_score(y_test, y_pred):6.2f}")
print(classification_report(y_test, y_pred))

# Confusion matrix drawn as an annotated heatmap, class names on both axes.
sns.heatmap(
    data=confusion_matrix(y_test, y_pred),
    annot=True,
    fmt="d",
    cbar=False,
    xticklabels=le.classes_,
    yticklabels=le.classes_,
)
plt.title("Confusion matrix")
plt.show()

#### Задание

Реализуйте сверточную сеть с несколькими каналами с фильтрами разной размерности (т.е. с разными входными свертками ```words_conv```). 

Размеры фильтров: ```filter_lengths = [1,2,3]```.

Для конкатенации сверток используйте ```concatenate```.

In [None]:
### Solution

# Window sizes of the parallel convolution branches.
filter_lengths = [1, 2, 3]

# Same input and frozen pretrained embedding as in the single-filter model.
words_input = Input(name='words_input', shape=(TEXT_LENGTH,), dtype='int32')

wordsEmbeddingLayer = Embedding(
    embedding_matrix.shape[0],
    embedding_matrix.shape[1],
    weights=[embedding_matrix],
    trainable=False,
)

words = wordsEmbeddingLayer(words_input)

# One convolution + max-pooling branch per window size; every branch reads the
# same embedded sequence.
words_convolutions = []
for filter_length in filter_lengths:
    words_conv = Convolution1D(
        filters=nb_filter,
        kernel_size=filter_length,
        strides=1,
        padding='same',
        activation='relu',
    )(words)
    words_conv = GlobalMaxPooling1D()(words_conv)
    words_convolutions.append(words_conv)

# Join the pooled branches into a single feature vector.
# NOTE(review): no Model is built in this cell; the head/compile cell has to
# be re-run on this `output` for `model` to reflect the multi-filter network.
output = concatenate(words_convolutions)

In [None]:
from keras.utils.vis_utils import plot_model

In [None]:
# Write a diagram of `model` to model_plot.png, including shapes and layer names.
# NOTE(review): `model` is whatever the last Model(...) cell built — unless the
# head/compile cell was re-run after the multi-filter cell, this plots the
# earlier single-filter network.
plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)

In [None]:
# Call the method: a bare `model.summary` only displays the bound-method repr
# and never prints the layer table.
model.summary()