#Сверточная нейронная сеть для классификации твитов

#Обучение без аугментации

##Загрузка модели FastText от DeepPavlov, предобученной на твитах

In [None]:
!wget -P /root/input/ -c "http://files.deeppavlov.ai/embeddings/ft_native_300_ru_twitter_nltk_word_tokenize.bin"

--2022-01-11 10:46:46--  http://files.deeppavlov.ai/embeddings/ft_native_300_ru_twitter_nltk_word_tokenize.bin
Resolving files.deeppavlov.ai (files.deeppavlov.ai)... 178.63.27.41
Connecting to files.deeppavlov.ai (files.deeppavlov.ai)|178.63.27.41|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://files.deeppavlov.ai/embeddings/ft_native_300_ru_twitter_nltk_word_tokenize.bin [following]
--2022-01-11 10:46:46--  https://files.deeppavlov.ai/embeddings/ft_native_300_ru_twitter_nltk_word_tokenize.bin
Connecting to files.deeppavlov.ai (files.deeppavlov.ai)|178.63.27.41|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3417475450 (3.2G) [application/octet-stream]
Saving to: ‘/root/input/ft_native_300_ru_twitter_nltk_word_tokenize.bin’


2022-01-11 10:49:42 (18.7 MB/s) - ‘/root/input/ft_native_300_ru_twitter_nltk_word_tokenize.bin’ saved [3417475450/3417475450]



In [None]:
from gensim.models.fasttext import FastText

In [None]:
# Load the pre-trained FastText binary that was downloaded to /root/input/ above.
# NOTE(review): constructing an empty model, setting `file_name` and calling
# `load_binary_data()` presumably relies on an older gensim API - confirm that it
# exists in the installed gensim version.
fasttext = FastText()
fasttext.file_name='/root/input/ft_native_300_ru_twitter_nltk_word_tokenize.bin'
fasttext.load_binary_data()

##Загрузка и предобработка обучающих данных

In [None]:
from lxml import etree
import csv
import numpy as np
from typing import List, Tuple

In [None]:
import nltk
nltk.download('punkt')
from nltk import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
def load_sentirueval_2016(file_name: str) -> Tuple[List[str], List[str]]:
    """Read tweets and their sentiment labels from a SentiRuEval-2016 XML file.

    Only the first ``database`` element under the XML root is processed.
    Every ``table`` element inside it must provide a column named ``text``
    and at least one other column (apart from ``id``, ``twitid`` and
    ``date``) whose value is ``-1``, ``0`` or ``1``; the first such value
    defines the label of the tweet.

    Args:
        file_name: Path to the XML file.

    Returns:
        Two parallel lists: the tweet texts and their labels, each label
        being ``'negative'``, ``'neutral'`` or ``'positive'``.

    Raises:
        ValueError: If some table has no text or no recognizable label.
    """
    label_names = {'-1': 'negative', '0': 'neutral', '1': 'positive'}
    service_columns = {'id', 'twitid', 'date'}
    texts = []
    labels = []
    with open(file_name, mode='rb') as fp:
        root = etree.fromstring(fp.read())
    # Elements are iterated directly: `getchildren()` is deprecated.
    for database in root:
        if database.tag != 'database':
            continue
        for table in database:
            if table.tag != 'table':
                continue
            new_text = None
            new_label = None
            for column in table:
                column_name = column.get('name')
                if column_name == 'text':
                    new_text = str(column.text).strip()
                elif column_name not in service_columns and new_label is None:
                    # Unrecognized values (e.g. 'NULL') leave the label unset.
                    new_label = label_names.get(str(column.text).strip())
                if new_text is not None and new_label is not None:
                    # Both parts are known, the remaining columns are irrelevant.
                    break
            if (new_text is None) or (new_label is None):
                raise ValueError('File `{0}` contains some error!'.format(file_name))
            texts.append(new_text)
            labels.append(new_label)
        # Only the first `database` element is used.
        break
    return texts, labels

In [None]:
texts,labels=load_sentirueval_2016('bank_train_2016.xml')

In [None]:
label_to_num={'negative':0,'neutral':1,'positive':2}

In [None]:
num_labels=[label_to_num[label] for label in labels]

In [None]:
from sklearn.model_selection import train_test_split
import random
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

In [None]:
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, num_labels, test_size=0.1, random_state=RANDOM_SEED)

In [None]:
# Ordered vocabulary: every distinct nltk token of the corpus, in order of first
# occurrence. dict.fromkeys de-duplicates with constant-time lookups (the former
# `word not in vocab` test on a list was quadratic) and keeps insertion order.
vocab = list(dict.fromkeys(
    word
    for text in texts
    for word in word_tokenize(text)
))


In [None]:
# Embedding table with one 100-dimensional row per vocabulary entry plus one extra row.
# Row i receives the FastText vector of vocab[i] (0-based position in `vocab`).
# NOTE(review): the sequences fed to the Embedding layer are produced further down by
# the Keras Tokenizer, which keeps its own word -> index mapping; verify that these
# rows line up with the indices the tokenizer emits.
embedding_matrix=np.zeros((len(vocab)+1, 100))
for i,word in enumerate(vocab):
  embedding_matrix[i,0:100]=fasttext[word]

  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
# Length (in nltk tokens) of the longest text; used as the padded sequence length.
max_length = max(len(word_tokenize(text)) for text in texts)

In [None]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

# `num_words` caps the indices produced by texts_to_sequences so that they always
# fit into the Embedding layer, which is declared with len(vocab)+1 rows.
tokenizer = Tokenizer(num_words=len(vocab) + 1)
tokenizer.fit_on_texts(texts)

# The network looks embeddings up by the indices this tokenizer emits (1-based,
# lower-cased words, 0 reserved for padding), so the matrix has to be indexed by
# tokenizer.word_index. Filling it by position in the nltk-based `vocab` attaches
# every vector to an unrelated word.
embedding_matrix = np.zeros((len(vocab) + 1, 100))
for word, word_idx in tokenizer.word_index.items():
  if word_idx > len(vocab):
    # Index beyond the table; such words are never emitted because of num_words.
    continue
  try:
    embedding_matrix[word_idx, 0:100] = fasttext[word]
  except KeyError:
    # No vector can be built for this word: keep the all-zero row.
    pass

In [None]:
encoded_docs_train = tokenizer.texts_to_sequences(train_texts)
X_train = pad_sequences(encoded_docs_train, maxlen=max_length, padding='post')
y_train=np.asarray(train_labels)

In [None]:
encoded_docs_val = tokenizer.texts_to_sequences(val_texts)
X_val = pad_sequences(encoded_docs_val, maxlen=max_length, padding='post')
y_val=np.asarray(val_labels)

In [None]:
X_train.shape,y_train.shape

((8452, 46), (8452,))

In [None]:
X_val.shape,y_val.shape

((940, 46), (940,))

In [None]:
train_labels.count(0),train_labels.count(1), train_labels.count(2), len(y_train)

(1547, 6272, 633, 21416)

In [None]:
X_train

array([[   7,   12,  147, ...,    0,    0,    0],
       [ 436,    5,  230, ...,    0,    0,    0],
       [3112,  237,   38, ...,    0,    0,    0],
       ...,
       [ 123,    9,    4, ...,    0,    0,    0],
       [   9,    4,   33, ...,    0,    0,    0],
       [   9,  276,    4, ...,    0,    0,    0]], dtype=int32)

In [None]:
X_val

array([[  139,    17,  5343, ...,     0,     0,     0],
       [  123,   205,   388, ...,     0,     0,     0],
       [ 2160,  3701,    90, ...,     0,     0,     0],
       ...,
       [   28,    52,    21, ...,     0,     0,     0],
       [    8,    15,     6, ...,     0,     0,     0],
       [15011,    92,   251, ...,     0,     0,     0]], dtype=int32)

##Создание и обучение сверточной нейронной сети

In [None]:
import tensorflow as tf
tf.random.set_seed(42)

In [None]:
from tensorflow.keras.layers import Flatten, Conv1D, MaxPooling1D, SpatialDropout1D,Dense,Dropout,Embedding
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from keras.models import Sequential
from keras.initializers import he_uniform, glorot_uniform
from keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.schedules import ExponentialDecay

In [None]:
# Exponentially decaying learning rate: starts at 2e-3 and is multiplied by 0.9
# every `decay_steps` steps.
# NOTE(review): the models below are compiled with optimizer='adam' (a string), so this
# schedule is not attached to any optimizer in the visible code.
# NOTE(review): decay_steps is derived from the number of training samples; the schedule
# presumably counts optimizer steps (batches) - confirm the intended decay speed.
lr_schedule = ExponentialDecay(
    initial_learning_rate=2e-3,
    decay_steps=2 * len(X_train),
    decay_rate=0.9
)

In [None]:
# Convolutional classifier over frozen pre-trained word embeddings.
cnn = Sequential()
# Embedding layer initialised from embedding_matrix and excluded from training.
cnn.add(Embedding(len(vocab)+1, 100,weights=[embedding_matrix], input_length=max_length, trainable=False))
# Block 1: two convolutions with kernel size 2, max pooling and spatial dropout.
cnn.add(Conv1D(64, 2, padding='valid', activation='relu',
               kernel_initializer=he_uniform(seed=RANDOM_SEED), name='Conv_Block1_Layer1'))
cnn.add(Conv1D(64, 2, activation='relu', kernel_initializer=he_uniform(seed=RANDOM_SEED),
               name='Conv_Block1_Layer2'))
cnn.add(MaxPooling1D(pool_size=2, name='MaxPool1'))
cnn.add(SpatialDropout1D(rate=0.15, name='SpatialDropout1', seed=RANDOM_SEED))

# Block 2: two convolutions with kernel size 3, max pooling and spatial dropout.
cnn.add(Conv1D(64, 3, padding='valid', activation='relu', kernel_initializer=he_uniform(seed=RANDOM_SEED),name='Conv_Block2_Layer1'))
cnn.add(Conv1D(64, 3, activation='relu', kernel_initializer=he_uniform(seed=RANDOM_SEED),name='Conv_Block2_Layer2'))
cnn.add(MaxPooling1D(pool_size=2, name='MaxPool2'))
cnn.add(SpatialDropout1D(rate=0.15, name='SpatialDropout2', seed=RANDOM_SEED))

# Classification head: flatten, one hidden dense layer with dropout, 3-way softmax.
cnn.add(Flatten())
cnn.add(Dense(512, activation='relu', kernel_initializer=he_uniform(seed=RANDOM_SEED), name='HiddenLayer'))
cnn.add(Dropout(rate=0.5, seed=RANDOM_SEED, name='DropoutAfterHidden'))
cnn.add(Dense(3, activation='softmax', kernel_initializer=glorot_uniform(seed=RANDOM_SEED), name='OutputLayer'))
# Labels are integer class ids, hence the sparse loss and metric.
cnn.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['sparse_categorical_accuracy'])
cnn.summary()

Model: "sequential_71"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_71 (Embedding)    (None, 46, 100)           2170200   
                                                                 
 Conv_Block1_Layer1 (Conv1D)  (None, 45, 64)           12864     
                                                                 
 Conv_Block1_Layer2 (Conv1D)  (None, 44, 64)           8256      
                                                                 
 MaxPool1 (MaxPooling1D)     (None, 22, 64)            0         
                                                                 
 SpatialDropout1 (SpatialDro  (None, 22, 64)           0         
 pout1D)                                                         
                                                                 
 Conv_Block2_Layer1 (Conv1D)  (None, 20, 64)           12352     
                                                     

In [None]:
BATCH_SIZE=128
cnn.fit(
    X_train, y_train,
    batch_size=BATCH_SIZE,
    validation_data=(X_val, y_val),
    shuffle=True, epochs=100,
    callbacks=[
        EarlyStopping(
            monitor='val_loss', patience=5, restore_best_weights=True, verbose=1
        )
    ],
    verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 00011: early stopping


<keras.callbacks.History at 0x7ef9cef7e710>

##Оценка качества классификации

In [None]:
test_texts,test_labels=load_sentirueval_2016('banks_test_etalon.xml')

In [None]:
test_num_labels=[label_to_num[label] for label in test_labels]

In [None]:
encoded_docs_test = tokenizer.texts_to_sequences(test_texts)
X_test = pad_sequences(encoded_docs_test, maxlen=max_length, padding='post')
y_test=np.asarray(test_num_labels)

In [None]:
X_test

array([[ 64,   4,  25, ...,   0,   0,   0],
       [ 11,   2,   1, ...,   0,   0,   0],
       [ 64,   4,  25, ...,   0,   0,   0],
       ...,
       [  8,   7,  12, ...,   0,   0,   0],
       [ 60,  57, 173, ...,   0,   0,   0],
       [ 60,  57,  89, ...,   0,   0,   0]], dtype=int32)

Результаты классификации твитов с помощью сверточной нейронной сети

In [None]:
y_pred = np.argmax(cnn.predict(X_test, batch_size=128), axis=1)

In [None]:
from sklearn.metrics import classification_report

Результат не радует, но это лучшее, чего удалось достичь в результате экспериментов с количеством слоёв, количеством карт признаков и размером ядер свёртки. Ещё пыталась менять learning rate и размер мини-батча — не помогло.

In [None]:
print(classification_report(y_test, y_pred, target_names=label_to_num.keys(), digits=4))

              precision    recall  f1-score   support

    negative     0.5504    0.3272    0.4105       767
     neutral     0.7277    0.9004    0.8049      2238
    positive     0.1477    0.0422    0.0657       308

    accuracy                         0.6879      3313
   macro avg     0.4753    0.4233    0.4270      3313
weighted avg     0.6327    0.6879    0.6448      3313



In [None]:
from sklearn.metrics import f1_score

In [None]:
macro=f1_score(test_num_labels,y_pred,average='macro',labels=[0,2])
micro=f1_score(test_num_labels,y_pred,average='micro',labels=[0,2])
macro,micro

(0.23806131635240382, 0.32612723903644225)

#Обучение с аугментацией

##Аугментация опечатками

In [None]:
pip install textattack

Collecting textattack
  Downloading textattack-0.3.4-py3-none-any.whl (373 kB)
[K     |████████████████████████████████| 373 kB 5.1 MB/s 
[?25hCollecting bert-score>=0.3.5
  Downloading bert_score-0.3.11-py3-none-any.whl (60 kB)
[K     |████████████████████████████████| 60 kB 5.1 MB/s 
Collecting num2words
  Downloading num2words-0.5.10-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 9.2 MB/s 
[?25hCollecting flair
  Downloading flair-0.10-py3-none-any.whl (322 kB)
[K     |████████████████████████████████| 322 kB 9.5 MB/s 
Collecting language-tool-python
  Downloading language_tool_python-2.6.2-py3-none-any.whl (30 kB)
Collecting lru-dict
  Downloading lru-dict-1.1.7.tar.gz (10 kB)
Collecting transformers>=3.3.0
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 33.6 MB/s 
[?25hCollecting datasets
  Downloading datasets-1.17.0-py3-none-any.whl (306 kB)
[K     |████████████████████████████████|

In [None]:
pip install tensorflow_text

Collecting tensorflow_text
  Downloading tensorflow_text-2.7.3-cp37-cp37m-manylinux2010_x86_64.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 5.2 MB/s 
Collecting importlib-metadata>=4.4
  Downloading importlib_metadata-4.10.0-py3-none-any.whl (17 kB)
Installing collected packages: importlib-metadata, tensorflow-text
  Attempting uninstall: importlib-metadata
    Found existing installation: importlib-metadata 3.10.1
    Uninstalling importlib-metadata-3.10.1:
      Successfully uninstalled importlib-metadata-3.10.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
konoha 4.6.5 requires importlib-metadata<4.0.0,>=3.7.0, but you have importlib-metadata 4.10.0 which is incompatible.[0m
Successfully installed importlib-metadata-4.10.0 tensorflow-text-2.7.3


In [None]:
from textattack.transformations import WordSwapRandomCharacterDeletion
from textattack.transformations import WordSwapQWERTY
from textattack.transformations import CompositeTransformation


from textattack.constraints.pre_transformation import RepeatModification
from textattack.constraints.pre_transformation import StopwordModification

from textattack.augmentation import Augmenter

Класс WordSwapQWERTY от textAttack не поддерживает кириллицу, поэтому я попыталась решить эту проблему, изменив свойство keyboard_adjacency в его конструкторе

In [None]:
class RusWordSwapQWERTY(WordSwapQWERTY):
    """Keyboard-typo word swap for Cyrillic text.

    Works like the parent transformation, but the adjacency table maps
    Russian letters to their neighbouring letters, so that one character
    of a word is replaced with an adjacent one.

    Args:
        random_one: If True, produce a single candidate with one randomly
            chosen character replaced; otherwise produce every possible
            single-character replacement.
        skip_first_char: If True, never replace the first character.
        skip_last_char: If True, never replace the last character.
        **kwargs: Passed to the parent constructor.
    """

    def __init__(
        self, random_one=True, skip_first_char=False, skip_last_char=False, **kwargs
    ):
        super().__init__(**kwargs)
        self.random_one = random_one
        self.skip_first_char = skip_first_char
        self.skip_last_char = skip_last_char

        # Lower-case letter -> letters that may be typed instead of it.
        self._keyboard_adjacency = {
            "ё": ["е"],
            "й": ["ц", "ф", "ё"],
            "ц": ["й", "у", "ф", "ы", "в"],
            "у": ["ц", "ы", "в", "а", "к"],
            "к": ["у", "в", "а", "п", "е"],
            "е": ["к", "а", "п", "р", "н"],
            "н": ["е", "п", "р", "о", "г"],
            "г": ["н", "р", "о", "л", "ш"],
            "ш": ["г", "о", "л", "д", "щ"],
            "щ": ["ш", "л", "д", "з"],
            "з": ["щ", "д", "ж", "э", "х"],
            "х": ["з", "ж", "э", "ъ"],
            "ъ": ["х", "ж", "э"],
            "ф": ["й", "ц", "ы", "я", "ч"],
            "ы": ["й", "ц", "у", "а", "в", "я", "ч"],
            "в": ["ц", "у", "к", "а", "с", "ч", "ы"],
            "а": ["у", "к", "е", "п", "м", "с", "в"],
            "п": ["к", "е", "н", "р", "и", "м", "в"],
            "р": ["е", "н", "г", "п", "о", "и", "т"],
            "о": ["н", "г", "ш", "л", "ь", "т", "р"],
            "л": ["г", "ш", "щ", "д", "ь", "о"],
            "д": ["ш", "щ", "з", "л", "ж", "ю", "б"],
            "ж": ["д", "щ", "з", "х", "э", "ю"],
            "э": ["ж", "з", "х", "ъ", "ю"],
            "я": ["ф", "ы", "ч"],
            "ч": ["ы", "в", "я", "с"],
            "с": ["ч", "в", "а", "м"],
            "м": ["с", "а", "п", "и"],
            "и": ["м", "п", "р", "т"],
            "т": ["и", "р", "о", "ь"],
            "ь": ["т", "о", "л", "б"],
            "б": ["ь", "л", "д", "ю"],
            "ю": ["б", "д", "ж"],
        }

    def _get_adjacent(self, s):
        """Return the letters adjacent to character `s`, keeping its case.

        Characters missing from the adjacency table yield an empty list.
        """
        s_lower = s.lower()
        if s_lower not in self._keyboard_adjacency:
            return []
        adjacent_keys = self._keyboard_adjacency[s_lower]
        if s.isupper():
            return [key.upper() for key in adjacent_keys]
        return adjacent_keys

    def _get_replacement_words(self, word):
        """Return candidate words that differ from `word` in one character.

        Words of length <= 1 and words without any replaceable character
        (e.g. digits, Latin letters, punctuation only) yield an empty list.
        """
        if len(word) <= 1:
            return []

        start_idx = 1 if self.skip_first_char else 0
        end_idx = len(word) - (1 + self.skip_last_char)

        # start_idx == end_idx still leaves exactly one eligible position,
        # so only a truly empty range is rejected.
        if start_idx > end_idx:
            return []

        # Eligible positions together with their non-empty neighbour lists.
        swaps = []
        for i in range(start_idx, end_idx + 1):
            adjacent_keys = self._get_adjacent(word[i])
            if adjacent_keys:
                swaps.append((i, adjacent_keys))
        if not swaps:
            return []

        if self.random_one:
            # Sampling only among positions that have neighbours avoids
            # random.choice() failing on an empty list for characters that
            # are absent from the adjacency table.
            i, adjacent_keys = random.choice(swaps)
            return [word[:i] + random.choice(adjacent_keys) + word[i + 1:]]

        return [
            word[:i] + swap_key + word[i + 1:]
            for i, adjacent_keys in swaps
            for swap_key in adjacent_keys
        ]

In [None]:
typo_transformation = CompositeTransformation([WordSwapRandomCharacterDeletion(),RusWordSwapQWERTY(random_one=False)])
typo_constraints = [RepeatModification(),StopwordModification(language='russian')]
typo_augmenter = Augmenter(transformation=typo_transformation, constraints=typo_constraints, pct_words_to_swap=0.5, transformations_per_example=5)

In [None]:
s = texts[1]
typo_augmenter.augment(s)

Аугментация обучающей выборки опечатками

In [None]:
# tqdm is imported here because the notebook-wide import only happens in a later cell.
from tqdm.notebook import tqdm

# Typo-augmented variants of every training text together with the source label.
texts_with_typos=[]
labels_of_texts_with_typos=[]
for indx,text in enumerate(tqdm(train_texts)):
    augmented_texts = typo_augmenter.augment(text)
    texts_with_typos.extend(augmented_texts)
    # One label per text actually produced: the augmenter is not guaranteed here to
    # return exactly 5 variants, and a fixed multiplier would misalign texts and labels.
    labels_of_texts_with_typos.extend([train_labels[indx]]*len(augmented_texts))

Код выполнялся очень долго - 2 ч. 1 мин. 44 сек. Повторять это как-то не хочется) Поэтому запишем получившиеся тексты в csv-файл

In [None]:
# Persist the typo-augmented texts with their labels so that the long augmentation
# run does not have to be repeated.
# newline='' lets the csv module control line endings itself.
with open('texts_with_typos.csv', 'w', encoding='utf-8', newline='') as file:
    writer = csv.DictWriter(file,fieldnames=['text','label'])
    writer.writeheader()
    # Iterate the augmented texts (not the original `texts`) paired with their labels.
    writer.writerows(
        {'text': augmented_text, 'label': augmented_label}
        for augmented_text, augmented_label in zip(texts_with_typos, labels_of_texts_with_typos)
    )

Считывание полученных при аугментации текстов из файла

In [None]:
# Restore the typo-augmented texts and their integer labels from the CSV file.
texts_with_typos=[]
labels_of_texts_with_typos=[]
# newline='' is what the csv module expects for files it reads.
with open('texts_with_typos.csv', 'r', encoding='utf-8', newline='') as file:
  # Each row is a dict; the former `row != ''` test compared a dict with a string
  # and was therefore always true.
  for row in csv.DictReader(file):
    texts_with_typos.append(row['text'])
    labels_of_texts_with_typos.append(int(row['label']))

In [None]:
print(len(labels_of_texts_with_typos),len(texts_with_typos))

42260 42260


In [None]:
print(len(train_texts),len(train_labels))

8452 8452


In [None]:
# Typo-augmented variants of positive tweets (label 2)
pos_typo_pairs = [
  (typo_text, typo_label)
  for typo_text, typo_label in zip(texts_with_typos, labels_of_texts_with_typos)
  if typo_label==2
]
pos_texts_with_typos=[typo_text for typo_text, _ in pos_typo_pairs]
pos_labels_of_texts_with_typos=[typo_label for _, typo_label in pos_typo_pairs]

In [None]:
# Typo-augmented variants of negative tweets (label 0)
neg_typo_pairs = [
  (typo_text, typo_label)
  for typo_text, typo_label in zip(texts_with_typos, labels_of_texts_with_typos)
  if typo_label==0
]
neg_texts_with_typos=[typo_text for typo_text, _ in neg_typo_pairs]
neg_labels_of_texts_with_typos=[typo_label for _, typo_label in neg_typo_pairs]

In [None]:
# Typo-augmented variants of positive and negative tweets (every label except 1)
pos_neg_typo_pairs = [
  (typo_text, typo_label)
  for typo_text, typo_label in zip(texts_with_typos, labels_of_texts_with_typos)
  if typo_label!=1
]
pos_neg_texts_with_typos=[typo_text for typo_text, _ in pos_neg_typo_pairs]
pos_neg_labels_of_texts_with_typos=[typo_label for _, typo_label in pos_neg_typo_pairs]

##Аугментация синонимами

Соответствующий класс из TextAttack не поддерживает русский язык. Я реализовала аугментацию синонимами с помощью библиотеки wiki_ru_wordnet.

In [None]:
pip install spacy_udpipe

Collecting spacy_udpipe
  Downloading spacy_udpipe-1.0.0-py3-none-any.whl (11 kB)
Collecting ufal.udpipe>=1.2.0
  Downloading ufal.udpipe-1.2.0.3.tar.gz (304 kB)
[K     |████████████████████████████████| 304 kB 10.3 MB/s 
[?25hCollecting spacy<4.0.0,>=3.0.0
  Downloading spacy-3.2.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.0 MB)
[K     |████████████████████████████████| 6.0 MB 52.5 MB/s 
Collecting pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4
  Downloading pydantic-1.8.2-cp37-cp37m-manylinux2014_x86_64.whl (10.1 MB)
[K     |████████████████████████████████| 10.1 MB 54.1 MB/s 
Collecting catalogue<2.1.0,>=2.0.6
  Downloading catalogue-2.0.6-py3-none-any.whl (17 kB)
Collecting spacy-legacy<3.1.0,>=3.0.8
  Downloading spacy_legacy-3.0.8-py2.py3-none-any.whl (14 kB)
Collecting srsly<3.0.0,>=2.4.1
  Downloading srsly-2.4.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (451 kB)
[K     |████████████████████████████████| 451 kB 37.9 MB/s 
[?25hCollecting langcodes

In [None]:
import spacy_udpipe

spacy_udpipe.download("ru") 
nlp = spacy_udpipe.load("ru")

Downloaded pre-trained UDPipe model for 'ru' language


In [None]:
pip install wiki_ru_wordnet

Collecting wiki_ru_wordnet
  Downloading wiki_ru_wordnet-1.0.3.tar.gz (20.6 MB)
[K     |████████████████████████████████| 20.6 MB 1.3 MB/s 
[?25hBuilding wheels for collected packages: wiki-ru-wordnet
  Building wheel for wiki-ru-wordnet (setup.py) ... [?25l[?25hdone
  Created wheel for wiki-ru-wordnet: filename=wiki_ru_wordnet-1.0.3-py3-none-any.whl size=20890441 sha256=13796108fcdab09f7a016d34521b1585ea7555912d43b1b650d2e190ecaf8fe7
  Stored in directory: /root/.cache/pip/wheels/35/43/b7/7dc9d93394ea52e85730fde0ab446267f4855cb6c3a2a137ce
Successfully built wiki-ru-wordnet
Installing collected packages: wiki-ru-wordnet
Successfully installed wiki-ru-wordnet-1.0.3


In [None]:
from wiki_ru_wordnet import WikiWordnet
wikiwordnet = WikiWordnet()

In [None]:
from nltk.corpus import stopwords
nltk.download('stopwords')
rus_stopwords=stopwords.words('russian')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Transformation that replaces a single word of the text with a synonym
def augment_with_synonym(text,n:int)->List[str]:
  """Build up to `n` variants of `text` with one word replaced by a synonym.

  The text is lower-cased and lemmatized; in every variant all occurrences of
  one randomly chosen non-stop-word lemma are replaced with a lemma sampled
  from one of its synsets. Variants are returned as space-joined lemmas.

  Args:
    text: Source text.
    n: Number of replacement attempts.

  Returns:
    The distinct changed texts in the order they were produced; the list is
    empty when the text has no tokens or no replacement could be made.
  """
  lemmatized_text=[token.lemma_ for token in nlp(text.lower())]
  if not lemmatized_text:
    # Nothing to sample from; random.choice would fail on an empty list.
    return []
  changed_texts=[]
  for _ in range(n):
    word_to_change=None
    changed_word=None
    # Try at most len(lemmatized_text) random words until one has synsets.
    for _attempt in range(len(lemmatized_text)):
      candidate=random.choice(lemmatized_text)
      if candidate in rus_stopwords:
        continue
      synsets=wikiwordnet.get_synsets(candidate)
      if not synsets:
        continue
      synonyms=list(random.choice(synsets).get_words())
      word_to_change=candidate
      changed_word=random.choice(synonyms).lemma()
      break
    if word_to_change is None or changed_word==word_to_change:
      # No replaceable word found, or the sampled synonym is the word itself.
      continue
    changed_texts.append(' '.join(
      changed_word if word==word_to_change else word
      for word in lemmatized_text
    ))
  # De-duplicate while keeping a stable order and the declared list type.
  return list(dict.fromkeys(changed_texts))

In [None]:
# Transformation that replaces every word of the text (except stop words) with a synonym where possible
def augment_with_synonyms(text,n:int)->List[str]:
  """Build up to `n` variants of `text` with its words replaced by synonyms.

  The text is lower-cased and lemmatized; every non-stop-word lemma that has
  synsets is replaced with a lemma sampled from one of them. Stop words and
  lemmas without synsets are kept. Variants are returned as space-joined lemmas.

  Args:
    text: Source text.
    n: Number of variants to attempt.

  Returns:
    The distinct changed texts in the order they were produced.
  """
  lemmatized_text=[token.lemma_ for token in nlp(text.lower())]
  changed_texts=[]
  for _ in range(n):
    changed_text=[]
    for word in lemmatized_text:
      replacement=word
      if word not in rus_stopwords:
        synsets=wikiwordnet.get_synsets(word)
        if synsets:
          synonyms=list(random.choice(synsets).get_words())
          # When the sampled synonym equals the word itself the word is kept;
          # previously it was dropped from the text in that case.
          replacement=random.choice(synonyms).lemma()
      changed_text.append(replacement)
    if changed_text!=lemmatized_text:
      changed_texts.append(' '.join(changed_text))
  # De-duplicate while keeping a stable order and the declared list type.
  return list(dict.fromkeys(changed_texts))

In [None]:
from tqdm.notebook import tqdm

In [None]:
# Synonym-augmented variants of the positive training tweets (label 2) and their labels.
# An alternative branch that also augments negative tweets (label 0) with n=3 is
# intentionally left disabled.
texts_with_synonyms=[]
labels_of_texts_with_synonyms=[]
for text_idx,text in enumerate(tqdm(train_texts)):
  if train_labels[text_idx]!=2:
    continue
  augmented_texts=augment_with_synonyms(text,5)
  texts_with_synonyms.extend(augmented_texts)
  labels_of_texts_with_synonyms.extend([train_labels[text_idx]]*len(augmented_texts))


  0%|          | 0/8452 [00:00<?, ?it/s]

In [None]:
print(len(texts_with_synonyms),len(labels_of_texts_with_synonyms))

3592 3592


In [None]:
texts_with_synonyms[:20]

['экономика : сбербанк запустить новый редакция свой сайт # news',
 'экономика : сбербанк запустить свежий версия свой сайт # news',
 'rt @gicenewopyv : ненавидеть сбербанк . брать , блин , мода принимать коммунальный платеж , только через банкомат . который однозначно не давать с',
 'rt @gicenewopyv : ненавидеть сбербанк . получать , блин , мода принимать коммунальный платеж , только через банкомат . который естественно не давать с',
 '# россия под последний санкция ес попасть крупный банк россия - сбербанк и втб http://t.co/1aa3jpbxqh',
 'райффайзенбанк запустить свежий версия мобильный приложение r-mobile : в свежий мобильный приложение r-mobile исп . . . http://t.co/bqbxgjirt0',
 'райффайзенбанк запустить новый редакция мобильный приложение r-mobile : в новый мобильный приложение r-mobile исп . . . http://t.co/bqbxgjirt0',
 'райффайзенбанк запустить недавний версия мобильный приложение r-mobile : в недавний мобильный приложение r-mobile исп . . . http://t.co/bqbxgjirt0',
 'rt @inte

##Аугментация с помощью BERT

In [None]:
from textattack.transformations.word_swaps.word_swap_masked_lm import WordSwapMaskedLM

In [None]:
bert_transformation = WordSwapMaskedLM(masked_language_model='DeepPavlov/rubert-base-cased')
bert_augmenter = Augmenter(transformation=bert_transformation, pct_words_to_swap=0.8)

Downloading:   0%|          | 0.00/642 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/681M [00:00<?, ?B/s]

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.57M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
from time import time

In [None]:
before=time()
print(bert_augmenter.augment(train_texts[0]))
time_passed=time()-before

['втб 24 бита вокруг цифры [UNK]://t.T/fBUE1VVdLD']


In [None]:
time_passed/60

3.882046409447988

Аугментация одного текста выполнялась почти 4 минуты

In [None]:
time_passed*len(train_texts)/(60*60*24)

62.18283505578284

Чтобы увеличить количество обучающих текстов в два раза понадобится 62 дня... Поэтому использовать этот вариант, видимо, нет смысла

In [None]:
bert_tiny_transformation = WordSwapMaskedLM(masked_language_model='cointegrated/rubert-tiny')
bert_tiny_augmenter = Augmenter(transformation=bert_tiny_transformation, pct_words_to_swap=0.8)

Downloading:   0%|          | 0.00/632 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/45.5M [00:00<?, ?B/s]

Some weights of the model checkpoint at cointegrated/rubert-tiny were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/341 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/235k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/457k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
before=time()
print(bert_tiny_augmenter.augment(text))
time_passed=time()-before

['@ovegexifww d #11 #29 активно Ямал 2009 ет на $2,94 млрд:']


In [None]:
time_passed*len(train_texts)/(60*60*24)

10.343072936683892

Модель поменьше работает быстрее (всего 38 секунд на один текст). Но чтобы проаугментировать всю обучающую выборку все равно понадобится несколько дней

##Обучение модели и оценка качества

In [None]:
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, num_labels, test_size=0.1, random_state=RANDOM_SEED)

In [None]:
train_texts.extend(texts_with_synonyms)
train_labels.extend(labels_of_texts_with_synonyms)

In [None]:
train_texts.extend(pos_neg_texts_with_typos)
train_labels.extend(pos_neg_labels_of_texts_with_typos)

In [None]:
# Per-class counts and total size of the augmented training set. The total is taken
# from train_labels: y_train is rebuilt only in the next cell and is still stale here.
train_labels.count(0),train_labels.count(1), train_labels.count(2), len(train_labels)

(9282, 6272, 6159, 21403)

Теперь дисбаланс классов не так сильно выражен

In [None]:
# Re-encode the augmented training texts with the tokenizer that was fitted earlier
# on `texts`, pad them to max_length and rebuild the label array.
# NOTE(review): the tokenizer is not refitted here, so tokens introduced by the
# augmentation that are missing from its vocabulary are presumably dropped by
# texts_to_sequences - confirm that this is intended.
encoded_docs_train = tokenizer.texts_to_sequences(train_texts)
X_train = pad_sequences(encoded_docs_train, maxlen=max_length, padding='post')
y_train=np.asarray(train_labels)

In [None]:
# Convolutional classifier for the augmented data; convolution kernels here are
# of size 3 (block 1) and 4 (block 2).
cnn = Sequential()
# Embedding layer initialised from embedding_matrix and excluded from training.
cnn.add(Embedding(len(vocab)+1, 100,weights=[embedding_matrix], input_length=max_length, trainable=False))
# Block 1: two convolutions with kernel size 3, max pooling and spatial dropout.
cnn.add(Conv1D(64, 3, padding='valid', activation='relu',
               kernel_initializer=he_uniform(seed=RANDOM_SEED), name='Conv_Block1_Layer1'))
cnn.add(Conv1D(64, 3, activation='relu', kernel_initializer=he_uniform(seed=RANDOM_SEED),
               name='Conv_Block1_Layer2'))
cnn.add(MaxPooling1D(pool_size=2, name='MaxPool1'))
cnn.add(SpatialDropout1D(rate=0.15, name='SpatialDropout1', seed=RANDOM_SEED))

# Block 2: two convolutions with kernel size 4, max pooling and spatial dropout.
cnn.add(Conv1D(64, 4, padding='valid', activation='relu', kernel_initializer=he_uniform(seed=RANDOM_SEED),name='Conv_Block2_Layer1'))
cnn.add(Conv1D(64, 4, activation='relu', kernel_initializer=he_uniform(seed=RANDOM_SEED),name='Conv_Block2_Layer2'))
cnn.add(MaxPooling1D(pool_size=2, name='MaxPool2'))
cnn.add(SpatialDropout1D(rate=0.15, name='SpatialDropout2', seed=RANDOM_SEED))

# Classification head: flatten, one hidden dense layer with dropout, 3-way softmax.
cnn.add(Flatten())
cnn.add(Dense(512, activation='relu', kernel_initializer=he_uniform(seed=RANDOM_SEED), name='HiddenLayer'))
cnn.add(Dropout(rate=0.5, seed=RANDOM_SEED, name='DropoutAfterHidden'))
cnn.add(Dense(3, activation='softmax', kernel_initializer=glorot_uniform(seed=RANDOM_SEED), name='OutputLayer'))
# Labels are integer class ids, hence the sparse loss and metric.
cnn.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['sparse_categorical_accuracy'])
cnn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 46, 100)           2170200   
                                                                 
 Conv_Block1_Layer1 (Conv1D)  (None, 44, 64)           19264     
                                                                 
 Conv_Block1_Layer2 (Conv1D)  (None, 42, 64)           12352     
                                                                 
 MaxPool1 (MaxPooling1D)     (None, 21, 64)            0         
                                                                 
 SpatialDropout1 (SpatialDro  (None, 21, 64)           0         
 pout1D)                                                         
                                                                 
 Conv_Block2_Layer1 (Conv1D)  (None, 18, 64)           16448     
                                                        

In [None]:
BATCH_SIZE=128
cnn.fit(X_train, y_train, batch_size=BATCH_SIZE, validation_data=(X_val, y_val), shuffle=True, epochs=100, callbacks=[EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True, verbose=1)], verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 00012: early stopping


<keras.callbacks.History at 0x7ef999d963d0>

In [None]:
y_pred = np.argmax(cnn.predict(X_test, batch_size=128), axis=-1)

Результат аугментации положительных текстов синонимами (с увеличением в 5 раз), при которой в тексте по возможности заменяется каждое слово (а не только одно), и аугментации положительных и отрицательных текстов опечатками.
Это лучший результат, которого удалось достичь.
Ещё пробовала варианты с полной аугментацией (для всех классов), различными комбинациями аугментаций для положительных и отрицательных текстов, изменением количества трансформаций с синонимами, а также аугментацию синонимами, при которой заменяется только одно слово.

In [None]:
print(classification_report(y_test, y_pred, target_names=label_to_num.keys(), digits=4))

              precision    recall  f1-score   support

    negative     0.3181    0.6910    0.4357       767
     neutral     0.7651    0.3682    0.4971      2238
    positive     0.1035    0.1916    0.1344       308

    accuracy                         0.4265      3313
   macro avg     0.3956    0.4169    0.3557      3313
weighted avg     0.6001    0.4265    0.4492      3313



Качество распознавания нейтральных текстов заметно упало, поэтому общие метрики стали ниже, чем для исходных данных, но положительные и в меньшей степени отрицательные твиты стали классифицироваться чуть лучше.

In [None]:
macro=f1_score(test_num_labels,y_pred,average='macro',labels=[0,2])
micro=f1_score(test_num_labels,y_pred,average='micro',labels=[0,2])
macro,micro

(0.28503623768475783, 0.3557837511325883)