In [None]:
!pip install deeppavlov

In [None]:
!pip install tensorflow==1.15

In [None]:
from deeppavlov.models.embedders.elmo_embedder import ELMoEmbedder

import pandas as pd
import numpy as np
import seaborn as sns

from spacy.lang.ru import Russian
import nltk
from nltk.corpus import stopwords
from string import punctuation

from joblib import dump, load

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# Fetch the NLTK stopword corpus that stopwords.words("russian") reads later in
# the notebook. The call is the last expression of the cell, so its return value
# is what the notebook shows as the cell result.
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package perluniprops to /root/nltk_data...
[nltk_data]   Unzipping misc/perluniprops.zip.
[nltk_data] Downloading package nonbreaking_prefixes to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping corpora/nonbreaking_prefixes.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Load the pre-split train and test tables from CSV files in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('trainSet.csv', 'testSet.csv'))

In [None]:
# Pull the raw class ids out of each frame as NumPy arrays (train first, then test).
y_train, y_test = (np.array(frame['Class']) for frame in (train, test))

In [None]:
# Show which distinct class ids occur in each split: training labels first,
# then test labels.
for labels in (y_train, y_test):
  print(np.unique(labels))

[ 1  3  8 21 22 23 31 32 41 42 43 44]
[ 1  3  8 21 22 23 31 32 41 42 43 44 51 52 53 54 55 56 57 58 59 60 61]


In [None]:
# Keep only the test rows whose class also occurs in the training data.
# The previous filter was the hard-coded threshold `Class <= 44`, which matches
# the training label set only by coincidence of the current data: any unseen
# class id at or below 44 would pass the filter and then raise KeyError when the
# labels are mapped through a lookup built from the training classes.
# Filtering on membership states the actual intent and is safe to re-run.
test = test[test['Class'].isin(train['Class'].unique())]

y_test = np.array(test['Class'])

# Both splits should now list the same class ids.
print(np.unique(y_train))
print(np.unique(y_test))

[ 1  3  8 21 22 23 31 32 41 42 43 44]
[ 1  3  8 21 22 23 31 32 41 42 43 44]


In [None]:
# Re-encode the sparse class ids as contiguous indices 0..n_classes-1.
#
# The lookup tables are derived from the raw 'Class' columns instead of from
# y_train/y_test. This cell rebinds y_train/y_test to the encoded labels, so
# deriving the tables from them would make a second run of the cell silently
# replace `d`/`inv_d` with an identity mapping and lose the original class ids.
l = np.unique(train['Class'])                        # sorted distinct class ids
d = dict(enumerate(l))                               # index -> original class id
inv_d = {label: idx for idx, label in enumerate(l)}  # original class id -> index
print(d)
print(inv_d)

# A class id missing from the training data raises KeyError here.
y_train = np.array([inv_d[label] for label in train['Class']])
y_test = np.array([inv_d[label] for label in test['Class']])

# Sanity output: encoded label sets and the number of samples per split.
print(np.unique(y_train))
print(np.unique(y_test))
print(len(y_train))
print(len(y_test))

{0: 1, 1: 3, 2: 8, 3: 21, 4: 22, 5: 23, 6: 31, 7: 32, 8: 41, 9: 42, 10: 43, 11: 44}
{1: 0, 3: 1, 8: 2, 21: 3, 22: 4, 23: 5, 31: 6, 32: 7, 41: 8, 42: 9, 43: 10, 44: 11}
[ 0  1  2  3  4  5  6  7  8  9 10 11]
[ 0  1  2  3  4  5  6  7  8  9 10 11]
2244
963


In [None]:
# Raw documents for each split; at this point X_train/X_test still hold text.
X_train, X_test = train['Text'], test['Text']

# Number of documents per split, train first.
for texts in (X_train, X_test):
  print(len(texts))

2244
963


In [None]:
# Location of the ELMo model archive handed to the DeepPavlov embedder.
ELMO_MODEL_URL = "http://files.deeppavlov.ai/deeppavlov_data/elmo_ru-news_wmt11-16_1.5M_steps.tar.gz"

# Embedder used below to turn token lists into vectors.
elmo = ELMoEmbedder(ELMO_MODEL_URL)

In [None]:
# Russian stopword list from NLTK; preprocess_text filters tokens against it.
russian_stopwords = stopwords.words("russian")
# spaCy Russian language object; preprocess_text calls it to split text into tokens.
nlp = Russian()

def preprocess_text(text):
  """Lower-case and tokenise `text`, dropping stopwords and punctuation.

  The text is lower-cased, split into tokens with the module-level `nlp`
  object, and filtered: tokens found in `russian_stopwords` are removed, as
  are tokens that consist solely of whitespace and/or characters from
  `string.punctuation`.

  Args:
    text: the document to process; must support `.lower()` (i.e. a `str`).

  Returns:
    A list of the remaining token strings, in their original order. The list
    is empty when every token is filtered out.
  """
  kept = []
  for token in nlp(text.lower()):
    word = token.text
    if word in russian_stopwords:
      continue
    # Character-wise test instead of `word.strip() not in punctuation`: the
    # latter is a substring search on a string, so multi-character tokens
    # such as "..." were kept while others were dropped only by accident.
    # all() over an empty string is True, so whitespace-only tokens are
    # dropped here as well.
    if all(char in punctuation for char in word.strip()):
      continue
    kept.append(word)
  return kept

In [None]:
# Embed every document with ELMo. Each text is tokenised by preprocess_text and
# passed to the embedder as a batch containing that single token list.
#
# The texts are read from the 'Text' columns rather than from X_train/X_test:
# this cell rebinds those names to the embedding matrices, so using them as the
# input as well would make any second run of the cell fail on non-text data.
X_train = np.array([elmo([preprocess_text(text)]) for text in train['Text']])
X_test = np.array([elmo([preprocess_text(text)]) for text in test['Text']])

# Drop the singleton batch axis: (n_docs, 1, dim) -> (n_docs, dim).
X_train = X_train.reshape((X_train.shape[0], X_train.shape[2]))
X_test = X_test.reshape((X_test.shape[0], X_test.shape[2]))

print(X_train.shape)
print(X_test.shape)

(2244, 1024)
(963, 1024)


In [None]:
# Fit a logistic-regression classifier on the training features and encoded labels.
# random_state=0 is the only explicit setting; every other hyper-parameter is
# whatever the installed scikit-learn uses by default.
clf = LogisticRegression(random_state=0).fit(X_train, y_train)



In [None]:
# Score the classifier currently bound to `clf` on the held-out split.
preds = clf.predict(X_test)

# Macro-averaged F1 first, then plain accuracy, one value per line.
for score in (f1_score(y_test, preds, average='macro'),
              accuracy_score(y_test, preds)):
  print(score)

# Left as the final expression so the notebook shows the matrix as the cell result.
confusion_matrix(y_test, preds)

0.9331643139670344
0.9418483904465212


array([[120,   0,   2,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  1,  85,   3,   0,   0,   0,   0,   1,   0,   0,   0,   0],
       [  2,   4,  74,   0,   0,   0,   0,   1,   0,   0,   0,   0],
       [  0,   0,   0, 109,   3,   2,   1,   0,   0,   1,   0,   0],
       [  0,   0,   0,   3,  87,   0,   2,   0,   0,   0,   1,   0],
       [  0,   0,   0,   1,   2,  56,   0,   0,   0,   0,   0,   0],
       [  0,   0,   0,   1,   0,   0,  71,   4,   0,   0,   0,   0],
       [  0,   1,   0,   0,   0,   0,   0,  82,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0, 105,   1,   3,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   4,  40,   0,   1],
       [  0,   0,   0,   0,   3,   0,   0,   0,   0,   1,  54,   0],
       [  0,   0,   0,   0,   0,   5,   0,   0,   2,   0,   0,  24]])

In [None]:
# Persist the model currently bound to `clf` (the logistic regression, when the
# cells are run in order) with joblib. The call's return value is the cell result.
dump(clf, 'log_reg_elmo.joblib')

['log_reg_elmo.joblib']

In [None]:
# Fit a support-vector classifier on the same features. This rebinds `clf`,
# replacing the previously fitted model.
# NOTE(review): no hyper-parameters are set, so the result depends on the
# defaults of the installed scikit-learn version — confirm which one was used.
clf = SVC().fit(X_train, y_train)



In [None]:
# Score the classifier currently bound to `clf` on the held-out split.
preds = clf.predict(X_test)

# Macro-averaged F1 on the first line, accuracy on the second.
# NOTE(review): the recorded run emitted a scikit-learn warning at this point,
# and the last column of its confusion matrix is all zeros — presumably that
# class is never predicted; confirm.
print(f1_score(y_test, preds, average='macro'),
      accuracy_score(y_test, preds),
      sep='\n')

# Left as the final expression so the notebook shows the matrix as the cell result.
confusion_matrix(y_test, preds)

0.7741326962463085
0.8483904465212876


  'precision', 'predicted', average, warn_for)


array([[115,   0,   7,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  2,  79,   9,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  3,   3,  74,   0,   0,   0,   0,   1,   0,   0,   0,   0],
       [  0,   0,   0, 104,   6,   2,   0,   0,   3,   1,   0,   0],
       [  0,   0,   0,   6,  87,   0,   0,   0,   0,   0,   0,   0],
       [  0,   0,   0,  11,   2,  45,   0,   0,   1,   0,   0,   0],
       [  0,   0,   0,   1,   2,   0,  66,   7,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,  83,   0,   0,   0,   0],
       [  0,   0,   0,   2,   1,   0,   0,   0, 102,   0,   4,   0],
       [  0,   0,   0,   4,   0,   0,   0,   0,  19,  22,   0,   0],
       [  0,   0,   0,   0,   6,   0,   0,   0,  12,   0,  40,   0],
       [  0,   0,   0,   0,   0,  19,   0,   0,  11,   1,   0,   0]])

In [None]:
# Persist the model currently bound to `clf` (the SVC, when the cells are run in
# order) with joblib. The call's return value is the cell result.
dump(clf, 'svm_elmo.joblib')

['svm_elmo.joblib']

In [None]:
# Fit a random forest of 200 trees, each limited to depth 16, with a fixed seed.
# This rebinds `clf`, replacing the previously fitted model.
clf = RandomForestClassifier(n_estimators=200, max_depth=16, random_state=0).fit(X_train, y_train)

In [None]:
# Score the classifier currently bound to `clf` on the held-out split.
preds = clf.predict(X_test)

# Report macro-averaged F1, then accuracy, each on its own line.
reported_scores = [
  f1_score(y_test, preds, average='macro'),
  accuracy_score(y_test, preds),
]
for value in reported_scores:
  print(value)

# Left as the final expression so the notebook shows the matrix as the cell result.
confusion_matrix(y_test, preds)

0.8171760705954747
0.857736240913811


array([[117,   0,   5,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  2,  81,   7,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  3,   4,  74,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  0,   0,   0,  97,   5,   3,   1,   0,   9,   1,   0,   0],
       [  0,   0,   0,   8,  81,   0,   2,   0,   0,   0,   2,   0],
       [  0,   0,   0,   6,   1,  46,   0,   0,   5,   0,   1,   0],
       [  0,   0,   0,   0,   2,   1,  67,   6,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   1,  82,   0,   0,   0,   0],
       [  0,   0,   0,   1,   0,   0,   0,   0, 106,   1,   1,   0],
       [  0,   0,   0,   0,   0,   2,   0,   0,  16,  27,   0,   0],
       [  0,   0,   0,   0,   7,   0,   0,   0,  11,   0,  40,   0],
       [  0,   0,   0,   0,   0,  13,   0,   0,   9,   1,   0,   8]])

In [None]:
# Persist the model currently bound to `clf` (the random forest, when the cells
# are run in order) with joblib. The call's return value is the cell result.
dump(clf, 'rf_elmo.joblib')

['rf_elmo.joblib']

In [None]:
# NOTE(review): `elmo2` is not defined anywhere in the visible notebook — only
# `elmo` is created above. Until a second embedder is constructed and bound to
# `elmo2`, this cell raises NameError on a fresh Restart & Run All.
#
# The texts are read from the 'Text' columns, not from X_train/X_test: by this
# point those names hold the float matrices produced by the first embedding
# cell, and preprocess_text needs strings.
X_train = np.array([elmo2([preprocess_text(text)]) for text in train['Text']])
X_test = np.array([elmo2([preprocess_text(text)]) for text in test['Text']])

# Drop the singleton batch axis: (n_docs, 1, dim) -> (n_docs, dim).
X_train = X_train.reshape((X_train.shape[0], X_train.shape[2]))
X_test = X_test.reshape((X_test.shape[0], X_test.shape[2]))

print(X_train.shape)
print(X_test.shape)

(2244, 1024)
(963, 1024)
