In [1]:
pip install -q stop_words pymorphy2

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.5/55.5 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m37.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for stop_words (setup.py) ... [?25l[?25hdone
  Building wheel for docopt (setup.py) ... [?25l[?25hdone


In [3]:
import pandas as pd
from string import punctuation
from stop_words import get_stop_words
from pymorphy2 import MorphAnalyzer
import re
from tqdm import tqdm
from utils import apostrophe_dict, emoticon_dict, short_word_dict
tqdm.pandas()

In [5]:
df_train = pd.read_csv("train.csv")
df_val = pd.read_csv("val.csv")

In [6]:
df_train.sample(2)

Unnamed: 0,id,text,class
49484,49484,"посмотрел в окно, увидел дождь, включил плеер ...",1
119572,119572,"Собираюсь смотреть фильм ""Реальная Любовь"" ))\...",1


In [7]:
sw = set(get_stop_words("ru"))
exclude = set(punctuation)
morpher = MorphAnalyzer()

def replace_words(text,dict_):
    output = ''
    for word in text.split(' '):
        word = word.strip()
        if word in dict_.keys():
            output += ' ' + dict_[word]
        else:
            output += ' ' + word
    return output

def preprocess_text(txt):
    txt = str(txt)
    txt = re.sub("[\,]","",txt)
    txt = re.sub("@[\w]*","",txt)
    txt = replace_words(txt, emoticon_dict)
    txt = replace_words(txt, apostrophe_dict)
    txt = replace_words(txt, short_word_dict)
    txt = "".join(c for c in txt if c not in exclude)
    txt = txt.lower()
    txt = re.sub("\sне", "не", txt)
    txt = [morpher.parse(word)[0].normal_form for word in txt.split() if word not in sw]
    return " ".join(txt)

In [8]:
df_train['text'] = df_train['text'].progress_apply(preprocess_text)
df_val['text'] = df_val['text'].progress_apply(preprocess_text)

100%|██████████| 181467/181467 [04:08<00:00, 729.13it/s]
100%|██████████| 22683/22683 [00:31<00:00, 720.46it/s]


In [9]:
df_train.sample(2)

Unnamed: 0,id,text,class
98384,98384,rt паранормальный явление херня полный,0
159699,159699,мын вместе другть дажена любить тебять сердце ...,0


In [10]:
import numpy as np
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Activation, Input, Embedding, Conv1D, GlobalMaxPool1D, SimpleRNN, LSTM, GRU, Masking,MaxPooling1D, BatchNormalization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.callbacks import EarlyStopping

In [11]:
text_corpus_train = df_train['text'].values
text_corpus_valid = df_val['text'].values

In [12]:
tokenizer = Tokenizer(num_words=None,
                     filters='#$%&()*+-<=>@[\\]^_`{|}~\t\n',
                     lower = False, split = ' ')
tokenizer.fit_on_texts(text_corpus_train)

sequences_train = tokenizer.texts_to_sequences(text_corpus_train)
sequences_val = tokenizer.texts_to_sequences(text_corpus_valid)

word_count = len(tokenizer.index_word) + 1
training_length = max([len(i.split()) for i in text_corpus_train])

X_train = pad_sequences(sequences_train, maxlen=training_length)
X_valid = pad_sequences(sequences_val, maxlen=training_length)

In [None]:
word_count, training_length

(188809, 28)

In [14]:
y_train = df_train['class'].values
y_val = df_val['class'].values

In [15]:
results = {
    "NN":[],
    "loss":[],
    "accuracy":[]
}

In [16]:
early_stopping=EarlyStopping(monitor='val_loss', restore_best_weights=True, patience=3)

### CNN

In [17]:
model = Sequential()
model.add(Embedding(input_dim=word_count,
              input_length=training_length,
              output_dim=30,
              trainable=True,
              mask_zero=True))
model.add(Masking(mask_value=0.0))
model.add(Conv1D(128, 3))
model.add(Activation("relu"))
model.add(GlobalMaxPool1D())
model.add(Dense(10))
model.add(Activation("relu"))
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(
    optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [18]:
early_stopping=EarlyStopping(monitor='val_loss')


history = model.fit(X_train, y_train,
                    batch_size=512,
                    epochs=5,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[early_stopping]
                    )

Epoch 1/5
Epoch 2/5


In [19]:
score = model.evaluate(X_valid, y_val, batch_size=512, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])

results['NN'].append("CNN")
results['loss'].append(score[0])
results['accuracy'].append(score[1])

Test score: 0.4788275957107544
Test accuracy: 0.7681964635848999


## SimpleRNN

In [20]:

model = Sequential()

model.add(
    Embedding(input_dim=word_count,
              input_length=training_length,
              output_dim=30,
              trainable=True,
              mask_zero=True))
model.add(Masking(mask_value=0.0))

model.add(SimpleRNN(64))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [21]:
history = model.fit(X_train, y_train,
                    batch_size=512,
                    epochs=10,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[early_stopping]
                    )

Epoch 1/10
Epoch 2/10


In [22]:
score = model.evaluate(X_valid, y_val, batch_size=512, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])

results['NN'].append("SimpleRNN")
results['loss'].append(score[0])
results['accuracy'].append(score[1])

Test score: 0.5035539865493774
Test accuracy: 0.7632588148117065


## LSTM

In [23]:
model = Sequential()

model.add(
    Embedding(input_dim=word_count,
              input_length=training_length,
              output_dim=30,
              trainable=True,
              mask_zero=True))
model.add(Masking(mask_value=0.0))

model.add(LSTM(64))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [24]:
history = model.fit(X_train, y_train,
                    batch_size=512,
                    epochs=10,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10


In [25]:
score = model.evaluate(X_valid, y_val, batch_size=512, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])

results['NN'].append("LSTM")
results['loss'].append(score[0])
results['accuracy'].append(score[1])

Test score: 0.4834473133087158
Test accuracy: 0.7603050470352173


## GRU

In [26]:
model = Sequential()

model.add(
    Embedding(input_dim=word_count,
              input_length=training_length,
              output_dim=30,
              trainable=True,
              mask_zero=True))
model.add(Masking(mask_value=0.0))

model.add(GRU(64))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [27]:
history = model.fit(X_train, y_train,
                    batch_size=512,
                    epochs=10,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10


In [28]:
score = model.evaluate(X_valid, y_val, batch_size=512, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])

results['NN'].append("GRU")
results['loss'].append(score[0])
results['accuracy'].append(score[1])

Test score: 0.4946509599685669
Test accuracy: 0.7622889280319214


## CNN+RNN

In [29]:
model = Sequential()
model.add(Embedding(input_dim=word_count,
                    input_length=training_length,
                    output_dim=30,
                    trainable=True,
                    mask_zero=True))
model.add(Masking(mask_value=0.0))

model.add(Conv1D(32, 2))
model.add(Activation("relu"))

model.add(Conv1D(16, 2))
model.add(Activation("relu"))
model.add(MaxPooling1D(1))

model.add(LSTM(16,return_sequences=True))

model.add(LSTM(16))

model.add(Dense(32))
model.add(Activation("relu"))
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(
    optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [30]:
history = model.fit(X_train, y_train,
                    batch_size=512,
                    epochs=10,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[early_stopping])

score = model.evaluate(X_valid, y_val, batch_size=512, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])

results['NN'].append("CNN+RNN")
results['loss'].append(score[0])
results['accuracy'].append(score[1])

Epoch 1/10
Epoch 2/10
Test score: 0.47856590151786804
Test accuracy: 0.7638760209083557


## RNN+CNN

In [31]:
model = Sequential()
model.add(Embedding(input_dim=word_count,
              input_length=training_length,
              output_dim=30,
              trainable=True,
              mask_zero=True))

model.add(Masking(mask_value=0.0))
model.add(LSTM(32,return_sequences=True))
model.add(LSTM(32,return_sequences=True))

model.add(Conv1D(32, 3))
model.add(Activation("relu"))

model.add(MaxPooling1D(2))
model.add(Activation("relu"))

model.add(Conv1D(16, 3))
model.add(Activation("relu"))

model.add(GlobalMaxPool1D())
model.add(Dense(64))
model.add(Activation("relu"))
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(
    optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [32]:
history = model.fit(X_train, y_train,
                    batch_size=512,
                    epochs=10,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[early_stopping])

score = model.evaluate(X_valid, y_val, batch_size=512, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])

results['NN'].append("RNN+CNN")
results['loss'].append(score[0])
results['accuracy'].append(score[1])

Epoch 1/10
Epoch 2/10
Test score: 0.47120895981788635
Test accuracy: 0.7663448452949524


In [33]:
pd.DataFrame(results)

Unnamed: 0,NN,loss,accuracy
0,CNN,0.478828,0.768196
1,SimpleRNN,0.503554,0.763259
2,LSTM,0.483447,0.760305
3,GRU,0.494651,0.762289
4,CNN+RNN,0.478566,0.763876
5,RNN+CNN,0.471209,0.766345


Рассмотренные архитектуры показали примерно оденаковый результат.  