# Preparing Data

### Loading train and test data

In [None]:
# Download the training and test sets from Dropbox and store them under local file names.
! wget -O task2_ru_training.tsv https://www.dropbox.com/s/2nvhmusyozfrrn9/train.tsv?dl=0
! wget -O task2_ru_test.tsv https://www.dropbox.com/s/77s33v3q3q1i5mr/test.tsv?dl=0

### Splitting train set into train and dev, balancing and preparing data

In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Downloaded input files and the directory that receives the prepared splits.
train_path = r"task2_ru_training.tsv"
test_path = r"task2_ru_test.tsv"
res_dataset_dir = r"corpus_normalized/"
# exist_ok=True keeps the cell safe to re-run and avoids the check-then-create race.
os.makedirs(res_dataset_dir, exist_ok=True)

In [None]:
# NOTE(review): the inputs are named *.tsv but are parsed with sep=","; confirm the delimiter
# against the downloaded files.
labelled_df = pd.read_csv(train_path, sep=",", encoding="utf-8")
test_df = pd.read_csv(test_path, sep=",", encoding="utf-8")
# Hold out 10% of the labelled data as the dev set; the fixed seed makes the split reproducible
train_df, dev_df = train_test_split(labelled_df, test_size=0.1, random_state=42)

train_positive_class_df = train_df[train_df['class'] == 1]
train_negative_class_df = train_df[train_df['class'] == 0]
num_positive_examples = train_positive_class_df.shape[0]
# For the training set, we take the same amount of positive and negative examples.
# The count is capped by the number of available negatives so that sampling without
# replacement cannot fail, and it is seeded so that re-running yields the same subset.
num_negative_examples = min(num_positive_examples, train_negative_class_df.shape[0])
train_negative_class_df = train_negative_class_df.sample(num_negative_examples, random_state=42)
# Concatenating positive and negative examples and shuffling the training set (seeded as well)
class_normalized_train_df = pd.concat(
    [train_positive_class_df, train_negative_class_df]
).sample(frac=1, random_state=42)

out_train_path = os.path.join(res_dataset_dir, "train.tsv")
out_test_path = os.path.join(res_dataset_dir, "test.tsv")
out_dev_path = os.path.join(res_dataset_dir, "dev.tsv")

# Only the training split is balanced; dev and test are written as they are
class_normalized_train_df.to_csv(out_train_path, sep="\t", encoding="utf-8", index=False)
test_df.to_csv(out_test_path, sep="\t", encoding="utf-8", index=False)
dev_df.to_csv(out_dev_path, sep="\t", encoding="utf-8", index=False)

# Training CNN classifier

The model architecture is adapted from:

https://github.com/ShawnyXiao/TextClassification-Keras/tree/master/model/TextCNN

In [None]:
# %pip install tensorflow

For the Russian language, we can use the [Fasttext model](https://drive.google.com/file/d/1su3IYY1avcj95tez69JI8f5qsTng72-I/view?usp=sharing) pretrained on the raw part of the [RuDReC corpus](https://github.com/cimm-kzn/RuDReC) to initialize the embedding matrix.

In [None]:
# Download the pretrained fastText model from Google Drive into fasttext_rudrec_raw.bin.
# The inner wget requests the file page, saves the session cookies and extracts the `confirm`
# token with sed; the outer wget reuses those cookies and that token for the actual download.
# The cookie file is removed once the download command succeeds.
! wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1su3IYY1avcj95tez69JI8f5qsTng72-I' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1su3IYY1avcj95tez69JI8f5qsTng72-I" -O fasttext_rudrec_raw.bin && rm -rf /tmp/cookies.txt

In [None]:
! pip install fasttext

Collecting fasttext
[?25l  Downloading https://files.pythonhosted.org/packages/f8/85/e2b368ab6d3528827b147fdb814f8189acc981a4bc2f99ab894650e05c40/fasttext-0.9.2.tar.gz (68kB)
[K     |████▊                           | 10kB 28.5MB/s eta 0:00:01[K     |█████████▌                      | 20kB 22.8MB/s eta 0:00:01[K     |██████████████▎                 | 30kB 17.8MB/s eta 0:00:01[K     |███████████████████             | 40kB 15.5MB/s eta 0:00:01[K     |███████████████████████▉        | 51kB 9.9MB/s eta 0:00:01[K     |████████████████████████████▋   | 61kB 10.3MB/s eta 0:00:01[K     |████████████████████████████████| 71kB 6.1MB/s 
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp36-cp36m-linux_x86_64.whl size=3023454 sha256=9b0c6f43f31bc12f61777465cc1252f55be0f74dc303fd5cf609fc4bc2b02c26
  Stored in directory: /root/.cache/pip/wheels/98/ba/7f/b154944a1cf5a8cee91

In [None]:
import fasttext
import numpy as np
import pandas as pd
from keras_preprocessing import sequence
from keras_preprocessing.text import Tokenizer
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Locations of the prepared train/dev/test splits and of the pretrained fastText binary.
train_path, dev_path, test_path = (
    r"corpus_normalized/train.tsv",
    r"corpus_normalized/dev.tsv",
    r"corpus_normalized/test.tsv",
)
fasttext_model_path = r"fasttext_rudrec_raw.bin"

In [None]:
# Read the three prepared splits (tab-separated, UTF-8) in train / dev / test order
train_df, dev_df, test_df = (
    pd.read_csv(split_path, sep='\t', encoding="utf-8")
    for split_path in (train_path, dev_path, test_path)
)
# Pretrained fastText model, used later to initialize the embedding matrix
fasttext_model = fasttext.load_model(fasttext_model_path)

# Raw tweet texts of every split
train_tweet_texts = train_df["tweet"].values
test_tweet_texts = test_df["tweet"].values
dev_tweet_texts = dev_df["tweet"].values

# Labels are read for the train and dev splits only; no labels are read for the test split
train_labels = train_df['class'].values
dev_labels = dev_df['class'].values




### Preprocessing

Preprocessing is adapted from:

https://github.com/akutuzov/webvectors/blob/master/preprocessing/modular_processing/unify.py

We unify letters to decrease the size of the dictionary. We also unify punctuation marks and then remove all of them.

In [None]:
import re
def list_replace(search, replacement, text):
    """
    Substitutes the replacement string for every occurrence in text
    of each element of search.

    Elements are handled in the order in which search yields them, and
    an element is only acted upon when it occurs in text as it was passed in.
    """
    source = text
    for symbol in search:
        # Presence is tested against the untouched input, not the partially rewritten text
        if symbol in source:
            text = text.replace(symbol, replacement)
    return text

# Currency signs that survive the final character filter of clean_text.
_CURRENCY_SYMBOLS = (
    '\u20BD\u0024\u00A3\u20A4\u20AC\u20AA\u2133\u20BE\u00A2\u058F\u0BF9\u20BC\u20A1\u20A0\u20B4\u20A7\u20B0\u20BF\u20A3\u060B\u0E3F\u20A9\u20B4\u20B2\u0192\u20AB\u00A5\u20AD\u20A1\u20BA\u20A6\u20B1\uFDFC\u17DB\u20B9\u20A8\u20B5\u09F3\u20B8\u20AE\u0192'
)

# Tab, carriage return, space and the Cyrillic/Latin letters that survive the final filter.
_ALPHABET_SYMBOLS = (
    '\t\r абвгдеёзжийклмнопрстуфхцчшщьыъэюяАБВГДЕЁЗЖИЙКЛМНОПРСТУФХЦЧШЩЬЫЪЭЮЯabcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ ')

# Built once instead of on every call.
_ALLOWED_SYMBOLS = frozenset(_CURRENCY_SYMBOLS + _ALPHABET_SYMBOLS)

# Latin letters with a diaeresis (and the eszett) mapped to plain ASCII letters,
# applied in this order.
_LETTER_UNIFICATION = (
    ('\u00C4', 'A'), ('\u00E4', 'a'),
    ('\u00CB', 'E'), ('\u00EB', 'e'),
    ('\u1E26', 'H'), ('\u1E27', 'h'),
    ('\u00CF', 'I'), ('\u00EF', 'i'),
    ('\u00D6', 'O'), ('\u00F6', 'o'),
    ('\u00DC', 'U'), ('\u00FC', 'u'),
    ('\u0178', 'Y'), ('\u00FF', 'y'),
    ('\u00DF', 's'), ('\u1E9E', 'S'),
)


def clean_text(text):
    """
    Normalizes a text and strips every symbol outside a fixed whitelist.

    Steps, in order: typographic quotes, dashes, hyphens, exotic spaces and
    bullet-like marks are unified; letters with a diaeresis are mapped to plain
    ASCII letters; punctuation (including the newline) is replaced with spaces;
    every digit is masked with 'x'; finally only tab, carriage return, space,
    Cyrillic and Latin letters and the listed currency signs are kept.

    :param text: the string to clean
    :return: the cleaned string; case is preserved

    NOTE(review): '$' is replaced with a space in the punctuation step, so the
    dollar sign listed among the currencies can never reach the output.
    NOTE(review): the unified spaces U+2002/U+2003 are not in the whitelist, so
    the final filter deletes them instead of turning them into ordinary spaces;
    confirm that merging the neighbouring words is intended.
    """
    # Typographic quotes -> plain double quote
    text = list_replace \
        ('\u00AB\u00BB\u2039\u203A\u201E\u201A\u201C\u201F\u2018\u201B\u201D\u2019', '\u0022', text)

    # Long dashes and overlines -> "--" surrounded by em spaces
    text = list_replace \
        ('\u2012\u2013\u2014\u2015\u203E\u0305\u00AF', '\u2003\u002D\u002D\u2003', text)

    # Typographic hyphens -> ASCII hyphen
    text = list_replace('\u2010\u2011', '\u002D', text)

    # Assorted Unicode spaces -> en space
    text = list_replace(
        '\u2000\u2001\u2002\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u200B\u202F\u205F\u2060\u3000',
        '\u2002', text)

    # Collapse doubled em spaces and doubled tabs (a single pass each)
    text = re.sub('\u2003\u2003', '\u2003', text)
    text = re.sub('\t\t', '\t', text)

    # Bullet-like and dot-like marks -> full stop
    text = list_replace(
        '\u02CC\u0307\u0323\u2022\u2023\u2043\u204C\u204D\u2219\u25E6\u00B7\u00D7\u22C5\u2219\u2062',
        '.', text)

    text = list_replace('\u2217', '\u002A', text)

    text = list_replace('…', '...', text)

    for accented, plain in _LETTER_UNIFICATION:
        text = list_replace(accented, plain, text)

    # Removing punctuation.
    # The backslash is spelled '\\' explicitly: the former '\|' and '\?' were invalid
    # escape sequences that only worked because Python keeps unknown escapes verbatim.
    text = list_replace(',.[]{}()=+-−*&^%$#@!~;:§/\\|?"\n', ' ', text)
    # Replacing all numbers with masks
    text = list_replace('0123456789', 'x', text)

    # Drop everything that is not whitelisted
    return ''.join(sym for sym in text if sym in _ALLOWED_SYMBOLS)

In [None]:
def preprocess_tweets(tweet_texts):
    """
    Cleans, lower-cases and whitespace-normalizes a collection of tweets.

    Every split goes through the same steps: clean_text, lower-casing, then
    splitting on whitespace and re-joining the tokens with single spaces.

    :param tweet_texts: iterable of raw tweet strings
    :return: list of preprocessed tweet strings, in the input order
    """
    return [" ".join(clean_text(tweet_text).lower().split()) for tweet_text in tweet_texts]


# Preprocessing training, test and validation tweets
cleaned_train_texts = preprocess_tweets(train_tweet_texts)
cleaned_test_texts = preprocess_tweets(test_tweet_texts)
cleaned_dev_texts = preprocess_tweets(dev_tweet_texts)

# Max length of all training tweets in tokens (0 when there are no training tweets)
maxlen = max((len(text.split()) for text in cleaned_train_texts), default=0)

### Encoding each tweet as a sequence of token ids. Initializing an embedding matrix using a pretrained Fasttext model

In [None]:
EMBEDDINGS_DIM = 200

tokenizer = Tokenizer(lower=True, char_level=False)
# The vocabulary is fitted on all three splits, so dev and test words also receive an id
tokenizer.fit_on_texts(cleaned_train_texts + cleaned_test_texts + cleaned_dev_texts)
word_index = tokenizer.word_index

# Converting texts to lists of ids
word_seq_train = tokenizer.texts_to_sequences(cleaned_train_texts)
word_seq_test = tokenizer.texts_to_sequences(cleaned_test_texts)
word_seq_dev = tokenizer.texts_to_sequences(cleaned_dev_texts)

# Padding too short tweet texts with '0's
word_seq_train = sequence.pad_sequences(word_seq_train, maxlen=maxlen)
word_seq_test = sequence.pad_sequences(word_seq_test, maxlen=maxlen)
word_seq_dev = sequence.pad_sequences(word_seq_dev, maxlen=maxlen)

dictionary_size = len(word_index)
# 0-th token of embedding matrix is a padding token, hence the extra row
embedding_matrix = np.zeros((dictionary_size + 1, EMBEDDINGS_DIM))

for word, i in word_index.items():
    embedding_vector = fasttext_model.get_word_vector(word)
    # A row stays all-zeros only when no vector comes back for the word.
    # NOTE(review): fastText presumably returns a vector for every word, which would
    # make this guard always true - confirm.
    if embedding_vector is not None and len(embedding_vector) > 0:
        embedding_matrix[i] = embedding_vector

### Compiling the model

We add an early stopping callback and keep the model weights from the epoch with the highest validation accuracy:

In [None]:
from tensorflow.keras import Model
from tensorflow.keras.layers import Embedding, Dense, Conv1D, GlobalMaxPooling1D, Concatenate, Flatten, Dropout, LeakyReLU

class TextCNN(Model):
    """
    Convolutional text classifier: an embedding layer feeds parallel Conv1D
    branches (one per kernel size), each followed by global max pooling; the
    pooled features are concatenated and passed to a dense output layer.
    """

    def __init__(self,
                 maxlen,
                 max_features,
                 embedding_dims,
                 kernel_sizes=(3, 4, 5),
                 class_num=1,
                 last_activation='sigmoid',
                 embedding_weights=None):
        """
        :param maxlen: length of every input sequence, checked in call()
        :param max_features: number of rows of the embedding table
        :param embedding_dims: size of one embedding vector
        :param kernel_sizes: one Conv1D branch with 128 filters is built per entry
        :param class_num: number of units of the output layer
        :param last_activation: activation of the output layer
        :param embedding_weights: optional initial embedding matrix; when omitted
            the embedding layer keeps its default initialization
        """
        super(TextCNN, self).__init__()
        self.maxlen = maxlen
        self.max_features = max_features
        self.embedding_dims = embedding_dims
        self.kernel_sizes = list(kernel_sizes)
        self.class_num = class_num
        self.last_activation = last_activation

        # Pass `weights` only when a matrix is given; weights=[None] is not a valid value
        embedding_kwargs = {'input_length': self.maxlen}
        if embedding_weights is not None:
            embedding_kwargs['weights'] = [embedding_weights]
        self.embedding = Embedding(self.max_features, self.embedding_dims, **embedding_kwargs)

        # One convolution + pooling branch per requested kernel size
        self.convs = [Conv1D(128, kernel_size, activation=LeakyReLU(0.1))
                      for kernel_size in self.kernel_sizes]
        self.max_poolings = [GlobalMaxPooling1D() for _ in self.kernel_sizes]
        self.concatenate = Concatenate()

        self.classifier = Dense(self.class_num, activation=self.last_activation)

    def call(self, inputs):
        """
        Runs the forward pass.

        :param inputs: rank-2 tensor whose second dimension equals maxlen
        :return: output of the dense classification layer
        :raises ValueError: if the rank is not 2 or the second dimension differs from maxlen
        """
        input_shape = inputs.get_shape()
        if len(input_shape) != 2:
            raise ValueError('The rank of inputs of TextCNN must be 2, but now is %d' % len(input_shape))
        if input_shape[1] != self.maxlen:
            # %s rather than %d for the actual value: the dimension may be None
            raise ValueError(
                'The maxlen of inputs of TextCNN must be %d, but now is %s' % (self.maxlen, input_shape[1]))
        # Embedding part can try multichannel as same as origin paper

        embedding = self.embedding(inputs)
        pooled = [max_pooling(conv(embedding))
                  for conv, max_pooling in zip(self.convs, self.max_poolings)]

        # Concatenate needs at least two inputs; a single branch is used as is
        x = pooled[0] if len(pooled) == 1 else self.concatenate(pooled)

        output = self.classifier(x)
        return output

In [None]:
# Training hyperparameters
BATCH_SIZE = 128
EMBEDDINGS_DIM = 200
CLASSIFIER_TRAIN_EPOCHS = 20

# The embedding table has one row more than the dictionary size
model = TextCNN(
    maxlen,
    dictionary_size + 1,
    EMBEDDINGS_DIM,
    embedding_weights=embedding_matrix,
)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Stop after 3 epochs without a better validation accuracy and roll back to the best weights
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    mode='max',
    patience=3,
    restore_best_weights=True,
)

### Training:

In [None]:
# The dev split is passed as validation data, which the early stopping callback monitors
fit_options = {
    'batch_size': BATCH_SIZE,
    'epochs': CLASSIFIER_TRAIN_EPOCHS,
    'callbacks': [early_stopping],
    'validation_data': (word_seq_dev, dev_labels),
}
model.fit(word_seq_train, train_labels, **fit_options)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20


<tensorflow.python.keras.callbacks.History at 0x7f97e4404f28>

### Predicting labels for dev and test sets:

In [None]:
CLASSIFICATION_THRESHOLD = 0.5

# Predicted scores for the test and dev splits
predicted_test_prob = model.predict(word_seq_test)
predicted_dev_prob = model.predict(word_seq_dev)

# A tweet is labelled 1 when the first value of its prediction row reaches the threshold
predicted_test_labels = [
    1 if row[0] >= CLASSIFICATION_THRESHOLD else 0 for row in predicted_test_prob
]
predicted_dev_labels = [
    1 if row[0] >= CLASSIFICATION_THRESHOLD else 0 for row in predicted_dev_prob
]

### Calculating dev precision, recall, f1-score and ROC_AUC:

In [None]:
# Precision, recall and F1 are computed on the thresholded labels
dev_precision = precision_score(dev_labels, predicted_dev_labels)
dev_recall = recall_score(dev_labels, predicted_dev_labels)
dev_f_measure = f1_score(dev_labels, predicted_dev_labels)
# ROC AUC is a ranking metric, so it is computed on the predicted scores; feeding it the
# thresholded 0/1 labels would collapse the curve to a single operating point
predicted_dev_scores = [row[0] for row in predicted_dev_prob]
dev_roc_auc = roc_auc_score(dev_labels, predicted_dev_scores)
print(f"Dev:\nPrecision: {dev_precision}\n"
        f"Recall: {dev_recall}\n"
        f"F-measure: {dev_f_measure}\n"
        f"ROC_AUC: {dev_roc_auc}")

Dev:
Precision: 0.27111111111111114
Recall: 0.8356164383561644
F-measure: 0.40939597315436244
ROC_AUC: 0.8245203921018592
