# Задание 8
### На вебинаре мы говорили что долгое время CNN и RNN архитектуры были конурируещими выяснить какая архитектура больше подходит для задачи сантимент анализа на данных с вебинара

In [3]:
import pandas as pd
import numpy as np

import nltk
from nltk import word_tokenize
from string import punctuation
from stop_words import get_stop_words
from pymorphy2 import MorphAnalyzer
import re
import time

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Dense, Conv1D, LSTM, Flatten, Dropout, MaxPooling1D, Embedding,\
        GlobalMaxPool1D, BatchNormalization, Bidirectional, SimpleRNN, GRU, Masking
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras import utils

In [4]:
df_train = pd.read_csv('data/train.csv', index_col='id')
df_val = pd.read_csv('data/val.csv', index_col='id')
# test.csv без классов, здесь не пригодиться

In [5]:
sw = set(get_stop_words("ru"))
exclude = set(punctuation)
morpher = MorphAnalyzer()

def preprocess_text(txt):
    txt = str(txt)
    txt = "".join(c for c in txt if c not in exclude)
    txt = txt.lower()
    txt = re.sub("\sне", "не", txt)
    txt = [morpher.parse(word)[0].normal_form for word in txt.split() if word not in sw]
    return " ".join(txt)

df_train['text'] = df_train['text'].apply(preprocess_text)
df_val['text'] = df_val['text'].apply(preprocess_text)

In [6]:
# считаем количество уникальных слов в отзыве (максимум, в среднем)
df_train['text'].apply(lambda x: len(np.unique(str(x).split(' ')))).max(),\
df_train['text'].apply(lambda x: len(np.unique(str(x).split(' ')))).mean()

(24, 7.0138537585346095)

In [7]:
text_corpus_train = df_train['text'].values
text_corpus_valid = df_val['text'].values

In [8]:
tokenizer = Tokenizer(num_words=None, 
                     filters='#$%&()*+-<=>@[\\]^_`{|}~\t\n',
                     lower = False, split = ' ')
tokenizer.fit_on_texts(text_corpus_train)

sequences_train = tokenizer.texts_to_sequences(text_corpus_train)
sequences_val = tokenizer.texts_to_sequences(text_corpus_valid)

word_count = len(tokenizer.index_word) + 1
training_length = max([len(i.split()) for i in text_corpus_train])


In [9]:
%%time
X_train = pad_sequences(sequences_train, maxlen=training_length)
X_test = pad_sequences(sequences_val, maxlen=training_length)

Wall time: 609 ms


In [10]:
# y_train = utils.to_categorical(df_train['class'], num_classes=num_classes)
# y_test = utils.to_categorical(df_val['class'], num_classes=num_classes)

In [11]:
y_train = df_train['class'].values
y_test = df_val['class'].values

In [12]:
word_count, training_length

(258108, 27)

### 1. построить свёрточные архитектуры

In [14]:
model_cnn = Sequential([
    Embedding(input_dim=word_count, output_dim=64, input_length=training_length),
    Conv1D(filters=128, kernel_size=3, padding='same', activation='relu'),
    GlobalMaxPool1D(),
    Dense(units=32, activation='relu'),
    Dense(units=1, activation='sigmoid')
])

model_cnn.compile(
    optimizer='Adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

In [15]:
batch_size = 512
epochs = 5

In [16]:
%%time
history = model_cnn.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_test, y_test), verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Wall time: 6min 47s


In [17]:
history.history['val_accuracy'][-1]

0.7211127281188965

In [18]:
model_cnn = Sequential([
    Embedding(input_dim=word_count, output_dim=32, input_length=training_length),
    Conv1D(filters=128, kernel_size=5, padding='same', activation='relu'),
    Conv1D(filters=64, kernel_size=5, padding='same', activation='relu'),
    GlobalMaxPool1D(),
    Dense(units=32, activation='relu'),
    Dense(units=1, activation='sigmoid')
])

model_cnn.compile(
    optimizer='Adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

In [19]:
%%time
history = model_cnn.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_test, y_test), verbose=1)
history.history['val_accuracy'][-1]

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Wall time: 3min 28s


0.7160428762435913

In [20]:
model_cnn = Sequential([
    Embedding(input_dim=word_count, output_dim=32, input_length=training_length),
    Conv1D(filters=128, kernel_size=5, padding='same', activation='relu'),
    BatchNormalization(),
    Conv1D(filters=64, kernel_size=5, padding='same', activation='relu'),
    GlobalMaxPool1D(),
    Dense(units=32, activation='relu'),
    Dropout(0.2),
    Dense(units=1, activation='sigmoid')
])

model_cnn.compile(
    optimizer='Adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

In [21]:
%%time
history = model_cnn.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_test, y_test), verbose=1)
history.history['val_accuracy'][-1]

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Wall time: 3min 32s


0.7252568006515503

###  2. построить различные архитектуры с RNN

In [22]:
model_rnn = Sequential([
    Embedding(input_dim=word_count, output_dim=32, input_length=training_length, mask_zero=True),
    Masking(mask_value=0.0),    
    LSTM(32),
    Dense(units=32, activation='relu'),
    Dropout(0.2),
    Dense(units=1, activation='sigmoid')
])

model_rnn.compile(
    optimizer='Adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

In [23]:
%%time
history = model_rnn.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_test, y_test), verbose=1)
history.history['val_accuracy'][-1]

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Wall time: 3min 33s


0.7185998558998108

In [26]:
model_rnn = Sequential([
    Embedding(input_dim=word_count, output_dim=32, input_length=training_length, mask_zero=True),
    Masking(mask_value=0.0),    
    SimpleRNN(64),
    Dense(units=32, activation='relu'),
    Dropout(0.2),
    Dense(units=1, activation='sigmoid')
])

model_rnn.compile(
    optimizer='Adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

In [27]:
%%time
history = model_rnn.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_test, y_test), verbose=1)
history.history['val_accuracy'][-1]

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Wall time: 3min 36s


0.7159106135368347

In [28]:
model_rnn = Sequential([
    Embedding(input_dim=word_count, output_dim=32, input_length=training_length, mask_zero=True),
    Masking(mask_value=0.0),    
    GRU(64, recurrent_dropout=0.2),
    Dense(units=32, activation='relu'),
    Dropout(0.2),
    Dense(units=1, activation='sigmoid')
])

model_rnn.compile(
    optimizer='Adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)



In [29]:
%%time
history = model_rnn.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_test, y_test), verbose=1)
history.history['val_accuracy'][-1]

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Wall time: 4min 41s


0.7211568355560303

In [36]:
model_rnn = Sequential([
    Embedding(input_dim=word_count, output_dim=32, input_length=training_length, mask_zero=True),
    Masking(mask_value=0.0),    
    Bidirectional(LSTM(64, recurrent_dropout=0.2)),
    Dense(units=32, activation='relu'),
    Dropout(0.2),
    Dense(units=1, activation='sigmoid')
])

model_rnn.compile(
    optimizer='Adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)



In [37]:
%%time
history = model_rnn.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_test, y_test), verbose=1)
history.history['val_accuracy'][-1]

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Wall time: 7min 17s


0.7184234857559204

### 3. попробовать использовать совместно CNN и RNN

In [38]:
model_mix = Sequential([
    Embedding(input_dim=word_count, output_dim=64, input_length=training_length),
    Conv1D(filters=128, kernel_size=5, padding='same', activation='relu'),
    BatchNormalization(),
    Conv1D(filters=64, kernel_size=5, padding='same', activation='relu'),
    Bidirectional(LSTM(64, recurrent_dropout=0.2)),
    Dense(units=32, activation='relu'),
    Dropout(0.2),
    Dense(units=1, activation='sigmoid')
])

model_mix.compile(
    optimizer='Adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)



In [39]:
%%time
history = model_mix.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_test, y_test), verbose=1)
history.history['val_accuracy'][-1]

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Wall time: 9min 24s


0.7294008731842041

### 4. сделать выводы что получилось лучше

Все подходы показывают соизмеримый результат, около ~0.72. По крайней мере разница не настолько существенна, чтобы какой-то из методов отправлять на свалку. <br>
Комбинирование методов усложняет модели, и соответственно время работы.