# Task

* Build CNN

* Build RNN

* Build CNN -> RNN and/or RNN -> CNN

In [46]:
!pip install -q pymorphy2
!pip install -q stop_words

In [47]:
import numpy as np
import pandas as pd
import keras
import re
from pymorphy2 import MorphAnalyzer
from stop_words import get_stop_words
from string import punctuation
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Input, Embedding, Conv1D,\
                         GlobalMaxPool1D, SimpleRNN, LSTM, GRU, Masking
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.callbacks import EarlyStopping
from keras.losses import categorical_crossentropy

In [48]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")
df_val = pd.read_csv("val.csv")

In [49]:
df_train.head(3)

Unnamed: 0,id,text,class
0,0,@alisachachka не уезжаааааааай. :(❤ я тоже не ...,0
1,1,RT @GalyginVadim: Ребята и девчата!\nВсе в кин...,1
2,2,RT @ARTEM_KLYUSHIN: Кто ненавидит пробки ретви...,0


In [50]:
df_test.head(3)

Unnamed: 0,id,text
0,204150,Тектоника и рельеф-самое ужасное в мире мучение(
1,204151,"Ходили запускать шар желаний, но у нас не полу..."
2,204152,"Хочу лето только ради того, что бы направить н..."


In [51]:
df_val.head(2)

Unnamed: 0,id,text,class
0,181467,RT @TukvaSociopat: Максимальный репост! ))) #є...,1
1,181468,чтоб у меня з.п. ежегодно индексировали на инд...,0


# Preprocessing

In [52]:
sw = set(get_stop_words("ru"))
exclude = set(punctuation)
morpher = MorphAnalyzer()

def preprocess_text(txt):
    txt = str(txt)
    txt = "".join(c for c in txt if c not in exclude)
    txt = txt.lower()
    txt = re.sub("\sне", "не", txt)
    txt = [morpher.parse(word)[0].normal_form for word in txt.split() if word not in sw]
    return " ".join(txt)

df_train['text'] = df_train['text'].apply(preprocess_text)
df_val['text'] = df_val['text'].apply(preprocess_text)
df_test['text'] = df_test['text'].apply(preprocess_text)

In [53]:
text_corpus_train = df_train['text'].values
text_corpus_valid = df_val['text'].values
text_corpus_test = df_test['text'].values

In [54]:
tokenizer = Tokenizer(num_words=None,
                     filters='#$%&()*+-<=>@[\\]^_`{|}~\t\n',
                     lower=False, split=' ')
tokenizer.fit_on_texts(text_corpus_train)

sequences_train = tokenizer.texts_to_sequences(text_corpus_train)
sequences_val = tokenizer.texts_to_sequences(text_corpus_valid)
sequences_test = tokenizer.texts_to_sequences(text_corpus_test)

word_count = len(tokenizer.index_word) + 1
training_length = max([len(i.split()) for i in text_corpus_train])

X_train = pad_sequences(sequences_train, maxlen=training_length)
X_valid = pad_sequences(sequences_val, maxlen=training_length)

In [55]:
y_train = df_train['class'].values
y_val = df_val['class'].values

In [56]:
epochs = 5
batch_size = 512
comparing_list = []

# CNN

In [57]:
model_name = 'CNN'

model = Sequential()

model.add(
    Embedding(input_dim=word_count,
              input_length=training_length,
              output_dim=30,
              trainable=True,
              mask_zero=True))
model.add(Conv1D(128, 3))
model.add(Activation("relu"))
model.add(GlobalMaxPool1D())
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
    )

model.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, 27, 30)            7743240   
                                                                 
 conv1d_3 (Conv1D)           (None, 25, 128)           11648     
                                                                 
 activation_3 (Activation)   (None, 25, 128)           0         
                                                                 
 global_max_pooling1d_3 (Glo  (None, 128)              0         
 balMaxPooling1D)                                                
                                                                 
 dense_14 (Dense)            (None, 64)                8256      
                                                                 
 dropout_7 (Dropout)         (None, 64)                0         
                                                      

In [58]:
def get_early_stop():
    early_stopping = EarlyStopping(monitor='val_loss')


    history = model.fit(X_train, y_train,
                        batch_size=batch_size,
                        epochs=epochs,
                        verbose=1,
                        validation_split=0.1,
                        callbacks=[early_stopping])

In [59]:
get_early_stop()

Epoch 1/5
Epoch 2/5


In [60]:
def get_score():
    score = model.evaluate(X_valid, y_val, batch_size=batch_size, verbose=1)

    print('\nVal loss:', round(score[0], 4))
    print('Val accuracy:', round(score[1], 4))

    comparing_list.append([model_name, round(score[1], 4)])

In [61]:
get_score()


Val loss: 0.5838
Val accuracy: 0.7417


# Simple RNN

In [62]:
model_name = 'Simple_RNN'
model = Sequential()

model.add(
    Embedding(input_dim=word_count,
              input_length=training_length,
              output_dim=30,
              trainable=True,
              mask_zero=True))
model.add(Masking(mask_value=0.0))

model.add(SimpleRNN(64))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'])
model.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_8 (Embedding)     (None, 27, 30)            7743240   
                                                                 
 masking_6 (Masking)         (None, 27, 30)            0         
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 64)                6080      
                                                                 
 dense_16 (Dense)            (None, 64)                4160      
                                                                 
 dropout_8 (Dropout)         (None, 64)                0         
                                                                 
 dense_17 (Dense)            (None, 1)                 65        
                                                                 
Total params: 7,753,545
Trainable params: 7,753,545
No

In [63]:
get_early_stop()

Epoch 1/5
Epoch 2/5


In [64]:
get_score()


Val loss: 0.5726
Val accuracy: 0.7362


# LSTM

In [65]:
model_name = 'LSTM'

model = Sequential()

model.add(
    Embedding(input_dim=word_count,
              input_length=training_length,
              output_dim=30,
              trainable=True,
              mask_zero=True))
model.add(Masking(mask_value=0.0))
model.add(LSTM(64, recurrent_dropout=0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'])
model.summary()

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_9 (Embedding)     (None, 27, 30)            7743240   
                                                                 
 masking_7 (Masking)         (None, 27, 30)            0         
                                                                 
 lstm_3 (LSTM)               (None, 64)                24320     
                                                                 
 dense_18 (Dense)            (None, 64)                4160      
                                                                 
 dropout_9 (Dropout)         (None, 64)                0         
                                                                 
 dense_19 (Dense)            (None, 1)                 65        
                                                                 
Total params: 7,771,785
Trainable params: 7,771,785
No

In [66]:
get_early_stop()

Epoch 1/5
Epoch 2/5


In [67]:
get_score()


Val loss: 0.5759
Val accuracy: 0.7368


# GRU

In [68]:
model_name = 'GRU'

model = Sequential()

model.add(
    Embedding(input_dim=word_count,
              input_length=training_length,
              output_dim=30,
              trainable=True,
              mask_zero=True))
model.add(Masking(mask_value=0.2))
model.add(GRU(64, recurrent_dropout=0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))


model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'])
model.summary()

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_10 (Embedding)    (None, 27, 30)            7743240   
                                                                 
 masking_8 (Masking)         (None, 27, 30)            0         
                                                                 
 gru_2 (GRU)                 (None, 64)                18432     
                                                                 
 dense_20 (Dense)            (None, 64)                4160      
                                                                 
 dropout_10 (Dropout)        (None, 64)                0         
                                                                 
 dense_21 (Dense)            (None, 1)                 65        
                                                                 
Total params: 7,765,897
Trainable params: 7,765,897
N

In [69]:
get_early_stop()

Epoch 1/5
Epoch 2/5


In [70]:
get_score()


Val loss: 0.6071
Val accuracy: 0.7324


# CNN_RNN

In [71]:
model_name = 'CNN_RNN'

model = Sequential()

model.add(
    Embedding(input_dim=word_count,
              input_length=training_length,
              output_dim=30,
              trainable=True,
              mask_zero=True))
model.add(Masking(mask_value=0.3))

model.add(Conv1D(128, 3))
model.add(Activation("relu"))
model.add(GlobalMaxPool1D(keepdims=True))

model.add(LSTM(128, recurrent_dropout=0.2))

model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'])
model.summary()

Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_11 (Embedding)    (None, 27, 30)            7743240   
                                                                 
 masking_9 (Masking)         (None, 27, 30)            0         
                                                                 
 conv1d_4 (Conv1D)           (None, 25, 128)           11648     
                                                                 
 activation_4 (Activation)   (None, 25, 128)           0         
                                                                 
 global_max_pooling1d_4 (Glo  (None, 1, 128)           0         
 balMaxPooling1D)                                                
                                                                 
 lstm_4 (LSTM)               (None, 128)               131584    
                                                     

In [72]:
get_early_stop()

Epoch 1/5
Epoch 2/5


In [73]:
get_score()


Val loss: 0.5996
Val accuracy: 0.7327


# RNN_CNN

In [74]:
model_name = 'RNN_CNN'

model = Sequential()

model.add(
    Embedding(input_dim=word_count,
              input_length=training_length,
              output_dim=30,
              trainable=True,
              mask_zero=True))
model.add(Masking(mask_value=0.5))
model.add(LSTM(64, recurrent_dropout=0.2, return_sequences="True"))

model.add(Conv1D(64, 3))
model.add(Activation("relu"))
model.add(GlobalMaxPool1D())

model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'])
model.summary()

Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_12 (Embedding)    (None, 27, 30)            7743240   
                                                                 
 masking_10 (Masking)        (None, 27, 30)            0         
                                                                 
 lstm_5 (LSTM)               (None, 27, 64)            24320     
                                                                 
 conv1d_5 (Conv1D)           (None, 25, 64)            12352     
                                                                 
 activation_5 (Activation)   (None, 25, 64)            0         
                                                                 
 global_max_pooling1d_5 (Glo  (None, 64)               0         
 balMaxPooling1D)                                                
                                                     

In [75]:
get_early_stop()

Epoch 1/5
Epoch 2/5


In [76]:
get_score()


Val loss: 0.58
Val accuracy: 0.7292


In [77]:
pd.DataFrame(comparing_list, columns=['model_name', 'accuracy']).sort_values(by='accuracy', ascending=False)

Unnamed: 0,model_name,accuracy
0,CNN,0.7417
2,LSTM,0.7368
1,Simple_RNN,0.7362
4,CNN_RNN,0.7327
3,GRU,0.7324
5,RNN_CNN,0.7292


# Conclusion

All neural networks showed approximately the same result, the difference in hundredths