# Выяснить какая архитектура больше подходит для задачи сантимент анализа на данных RNN | CNN
1. построить свёрточные архитектуры
2. построить различные архитектуры с RNN
3. построить совместные архитектуры CNN -> RNN и (RNN -> CNN)
4. сдлать выводы что получилось лучше

****************************

In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.float_format', '{:.2f}'.format)

In [2]:
from string import punctuation
from stop_words import get_stop_words
from pymorphy2 import MorphAnalyzer
import re
from spacy.lang.ru.stop_words import STOP_WORDS

In [3]:
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Activation, Input, Embedding, GlobalMaxPool1D, Flatten
from tensorflow.keras.layers import  Conv1D, SimpleRNN, LSTM, GRU, Masking
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.losses import categorical_crossentropy
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

***************

## Preprocessing

In [4]:
df_train = pd.read_csv("data/train.csv")
df_val = pd.read_csv("data/val.csv")
df_train.head(2)

Unnamed: 0,id,text,class
0,0,@alisachachka не уезжаааааааай. :(❤ я тоже не ...,0
1,1,RT @GalyginVadim: Ребята и девчата!\nВсе в кин...,1


In [5]:
sw = set((get_stop_words("ru") + list(STOP_WORDS))) - {'не', 'ни', 'нет'}
exclude = set(punctuation)
morpher = MorphAnalyzer()


def preprocess_text(txt, exclude=exclude, sw=sw, morpher=morpher):
    txt = str(txt)
#     txt = re.sub("@[\w]*", "", txt)
    txt = "".join(c for c in txt if c not in exclude)
    txt = txt.lower()
    txt = [morpher.parse(
        word)[0].normal_form for word in txt.split() if word not in sw]
    return " ".join(txt)

In [6]:
df_train['text'] = df_train['text'].apply(preprocess_text)
df_val['text'] = df_val['text'].apply(preprocess_text)

In [7]:
df_train.head(2)

Unnamed: 0,id,text,class
0,0,alisachachka не уезжаааааааать ❤ не хотеть уез...,0
1,1,rt galyginvadim ребята девчата кино любовь зав...,1


In [8]:
text_corpus_train = df_train['text'].values
text_corpus_valid = df_val['text'].values
y_train = df_train['class'].values
y_val = df_val['class'].values

In [9]:
tokenizer = Tokenizer(num_words=None,
                      filters='#$%&()*+-<=>@[\\]^_`{|}~\t\n',
                      lower=False, split=' ')
tokenizer.fit_on_texts(text_corpus_train)

sequences_train = tokenizer.texts_to_sequences(text_corpus_train)
sequences_val = tokenizer.texts_to_sequences(text_corpus_valid)

word_count = len(tokenizer.index_word) + 1
training_length = max([len(i.split()) for i in text_corpus_train])

X_train = pad_sequences(sequences_train, maxlen=training_length)
X_valid = pad_sequences(sequences_val, maxlen=training_length)

In [10]:
BATCH_SIZE = 2048
EPOCHES = 10
opt = Adam(learning_rate=0.0001)
early_stopping = EarlyStopping(monitor='val_accuracy', patience=2)

## CNN

In [11]:
model1 = Sequential()
model1.add(
    Embedding(input_dim=word_count,
              input_length=training_length,
              output_dim=30,
              trainable=True,
              mask_zero=True))
model1.add(Masking(mask_value=0.0))
model1.add(Conv1D(64, 3))
model1.add(Activation("relu"))
model1.add(Flatten())
model1.add(Dense(128))
model1.add(Dropout(0.25))
model1.add(Activation("relu"))
model1.add(Dense(1))
model1.add(Activation('sigmoid'))

In [12]:
model1.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])

history1 = model1.fit(X_train, y_train,
                    batch_size=BATCH_SIZE,
                    epochs=EPOCHES,
                    verbose=0,
                    validation_split=0.1,
                    callbacks=[early_stopping]
                    )

In [13]:
score = model1.evaluate(X_valid, y_val, batch_size=512, verbose=1)
print('\n')
print('CNN Test loss:', score[0])
print('CNN Test accuracy:', score[1])



CNN Test loss: 0.5827595591545105
CNN Test accuracy: 0.73482346534729


## LSTM

In [14]:
model2 = Sequential()

model2.add(
    Embedding(input_dim=word_count,
              input_length=training_length,
              output_dim=30,
              trainable=True,
              mask_zero=True))
model2.add(Masking(mask_value=0.0))
model2.add(LSTM(64, recurrent_dropout=0.25))
model2.add(Dense(128, activation='relu'))
model2.add(Dropout(0.25))
model2.add(Dense(1, activation='sigmoid'))



In [15]:
model2.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])

history2 = model2.fit(X_train, y_train,
                    batch_size=BATCH_SIZE,
                    epochs=EPOCHES,
                    verbose=0,
                    validation_split=0.1,
                    callbacks=[early_stopping]
                    )

In [16]:
score = model2.evaluate(X_valid, y_val, batch_size=512, verbose=1)
print('\n')
print('LSTM Test loss:', score[0])
print('LSTM Test accuracy:', score[1])



LSTM Test loss: 0.5759655237197876
LSTM Test accuracy: 0.7409513592720032


## LSTM + CNN

In [17]:
model3 = Sequential()

model3.add(
    Embedding(input_dim=word_count,
              input_length=training_length,
              output_dim=30,
              trainable=True,
              mask_zero=True))
model3.add(Masking(mask_value=0.0))
model3.add(LSTM(64, recurrent_dropout=0.25, return_sequences=True))
model3.add(Conv1D(64, 3))
model3.add(Flatten())
model3.add(Dense(128, activation='relu'))
model3.add(Dropout(0.25))
model3.add(Dense(1, activation='sigmoid'))



In [18]:
model3.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])

history3 = model3.fit(X_train, y_train,
                    batch_size=BATCH_SIZE,
                    epochs=EPOCHES,
                    verbose=0,
                    validation_split=0.1,
                    callbacks=[early_stopping]
                    )

In [19]:
score = model3.evaluate(X_valid, y_val, batch_size=512, verbose=1)
print('\n')
print('LSTM+CNN Test loss:', score[0])
print('LSTM+CNN Test accuracy:', score[1])



LSTM+CNN Test loss: 0.6593348979949951
LSTM+CNN Test accuracy: 0.727064311504364


## CNN + LSTM

In [20]:
model4 = Sequential()

model4.add(
    Embedding(input_dim=word_count,
              input_length=training_length,
              output_dim=30,
              trainable=True,
              mask_zero=True))
model4.add(Masking(mask_value=0.0))
model4.add(Conv1D(64, 3))
model4.add(Activation("relu"))
model4.add(LSTM(64, recurrent_dropout=0.25))
model4.add(Dense(128, activation='relu'))
model4.add(Dropout(0.25))
model4.add(Dense(1, activation='sigmoid'))



In [21]:
model4.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])

history4 = model4.fit(X_train, y_train,
                    batch_size=BATCH_SIZE,
                    epochs=EPOCHES,
                    verbose=0,
                    validation_split=0.1,
                    callbacks=[early_stopping]
                    )

In [22]:
score = model4.evaluate(X_valid, y_val, batch_size=512, verbose=1)
print('\n')
print('CNN+LSTM Test loss:', score[0])
print('CNN+LSTM Test accuracy:', score[1])



CNN+LSTM Test loss: 0.661159098148346
CNN+LSTM Test accuracy: 0.7292686104774475


**Вывод:** В базовых исполнениях All about the same.