In [9]:
import pandas as pd
import numpy as np
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from pymystem3 import Mystem
from nltk import word_tokenize
from nltk.corpus import stopwords
import re
# Seed NumPy's global random generator so NumPy-driven randomness repeats between runs.
# NOTE(review): only NumPy is seeded here; the Keras/TensorFlow model trained below
# presumably uses its own random state, so training results may still vary — confirm.
np.random.seed(42)

In [3]:
def stemming(tokens: list) -> list:
    """Lemmatize every text with Mystem and split it into word tokens.

    Each text is passed to ``Mystem.lemmatize``, the returned pieces are glued
    back into a single string, and that string is tokenized with NLTK's
    ``word_tokenize``. Tokens of one or two characters are discarded.

    Args:
        tokens: Iterable of text strings, one entry per document. Despite the
            annotation, any iterable works (the notebook passes a Series).

    Returns:
        A plain list holding one inner list of token strings per input text,
        in input order.
    """
    stem = Mystem()  # a single analyzer instance serves all texts of this call
    lemmatized_texts = ("".join(stem.lemmatize(sentence)) for sentence in tokens)
    return [
        [word for word in word_tokenize(lemmatized) if len(word) > 2]
        for lemmatized in lemmatized_texts
    ]

In [4]:
def check_stopwords(text: pd.Series, stop_words=None) -> list:
    """Remove stop words and single-space tokens from tokenized texts.

    Args:
        text: Iterable of token lists, one list per document.
        stop_words: Optional iterable of words to drop. When omitted, the
            notebook-level ``ru_stopwords`` is used, as before.

    Returns:
        A plain list with one filtered token list per input document, in
        input order.
    """
    if stop_words is None:
        stop_words = ru_stopwords
    # Build the lookup once: a set gives constant-time membership tests,
    # and the lone " " token is rejected through the same lookup.
    rejected = set(stop_words)
    rejected.add(" ")
    return [[word for word in sentence if word not in rejected] for sentence in text]

In [5]:
# NLTK's Russian stop-word list; check_stopwords() reads this notebook-level name
# when filtering tokens.
# NOTE(review): presumably needs the NLTK "stopwords" corpus to be available
# locally — confirm for fresh environments.
ru_stopwords = stopwords.words("russian")

In [6]:
# Load the labelled mails. The first CSV column is read as the index and then
# discarded in favour of a fresh 0..n-1 index.
df = (
    pd.read_csv('labeled_mails.csv', index_col=0)
    .reset_index(drop=True)
)

In [10]:
# Build the normalised text column in four steps:
#   1. delete digit runs (no separator is inserted),
#   2. turn every run of non-Cyrillic characters into a single space,
#   3. lowercase,
#   4. fold 'ё' into 'е' so both spellings of a word end up identical.
# 'ё'/'Ё' sit outside the а-я / А-Я code-point ranges, so they are listed
# explicitly; otherwise they would be treated as separators and split words.
# The .str methods pass missing values through instead of raising.
df['clean_text'] = (
    df.text
    .str.replace(r'[0-9]+', '', regex=True)
    .str.replace(r'[^а-яА-ЯёЁ]+', ' ', regex=True)
    .str.lower()
    .str.replace('ё', 'е', regex=False)
)

In [11]:
# Exclude the row labelled 13 and every row without text.
# NOTE(review): the reason for excluding row 13 is not recorded here — confirm.
# drop/dropna leave the input frame untouched, and .copy() makes the result an
# independent frame, so later column assignments do not act on a slice of
# another frame (the situation pandas reports as SettingWithCopyWarning).
df = df.drop(index=13, errors='ignore').dropna(subset=['text']).copy()

In [12]:
# Replace each cleaned text with its list of lemmatized tokens.
df['clean_text'] = stemming(df['clean_text'])

In [13]:
# Filter stop words (and stray " " tokens) out of every token list.
df['clean_text'] = check_stopwords(df['clean_text'])

In [15]:
# Targets come from the 'class' column; features are the token lists,
# renumbered from 0.
y = df['class']
X = df['clean_text'].reset_index(drop=True)

In [16]:
# Glue every token list back into one space-separated string.
X = X.apply(" ".join)

In [17]:
# Vocabulary cap handed to the tokenizer and used as the embedding's input size.
MAX_NB_WORDS = 50_000
# Length every encoded text is padded or truncated to.
MAX_SEQUENCE_LENGTH = 250
# Width of each word-embedding vector.
EMBEDDING_DIM = 100

In [26]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import SpatialDropout1D
from keras.callbacks import EarlyStopping

In [19]:
# Fit the Keras tokenizer on the joined texts.
# The backslash in `filters` is written as "\\" so the literal holds exactly the
# same characters as before without relying on the invalid escape sequence "\]",
# which newer Python versions warn about.
tokenizer = Tokenizer(
    num_words=MAX_NB_WORDS,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~',
    lower=True,
)
tokenizer.fit_on_texts(X.values)
# Mapping from word to integer index, as learned by fit_on_texts.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 896 unique tokens.


In [20]:
# Encode each text as a sequence of word indices and bring all sequences to
# MAX_SEQUENCE_LENGTH entries.
X = pad_sequences(
    tokenizer.texts_to_sequences(X),
    maxlen=MAX_SEQUENCE_LENGTH,
)
print('Shape of data tensor:', X.shape)

Shape of data tensor: (29, 250)


In [21]:
# One-hot encode the class labels: pd.get_dummies yields one indicator column
# per distinct label, and .values extracts the underlying 2-D array.
y = pd.get_dummies(y).values
print('Shape of label tensor:', y.shape)

Shape of label tensor: (29, 4)


In [32]:
# Width of the output layer, taken from the one-hot label matrix instead of
# being hard-coded, so the model follows the number of classes in the data.
num_classes = y.shape[1]

# Embedding -> spatial dropout -> LSTM -> softmax over the classes.
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

epochs = 5
batch_size = 64

# Stop training once val_loss has failed to improve by at least min_delta
# for `patience` consecutive epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)

history = model.fit(
    X,
    y,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.1,  # 10% of the samples are held out for validation
    callbacks=[early_stopping],
)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
