# Загрузка необходимых библиотек

Все используемые библиотеки представлены в открытом доступе, их выбор основывался на качестве получаемого результата - модели и алгоритмы различных библиотек были сравнены между собой, среди множества решений было выбрано самое оптимальное и точное.

In [77]:
import re
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
russian_stopwords = stopwords.words("russian")

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/annavlasova/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Загрузка данных и предобработка

In [78]:
import json

with open('../data/full_docs.json', 'r+') as fp:
    full_docs1 = json.load(fp)

In [79]:
files = list(full_docs1.keys())

In [80]:
df_list = []
for i in range(len(files)):
    df1 = pd.DataFrame(full_docs1[files[i]])
    df1['id_doc'] = i
    df_list.append(df1)

In [81]:
df = pd.concat(df_list)

# df = df[df['target']!=0]

In [84]:
df_list = []
for i in range(1, df.shape[0]+1):
    try:
        df2 = df.iloc[i-1:i].explode(['few_texts', 'few_targets'], ignore_index=True)
        df_list.append(df2)
    except:
        continue

In [85]:
df = pd.concat(df_list)
df['ref_text'] = df['few_texts'].apply(lambda x: re.sub('[\W\d]+', ' ', x.lower()))

# Разделение на обучение и валидацию

In [61]:
X = df[df['id_doc']<=140]['ref_text']
# X = df['ref_text']

In [62]:
y = df[df['id_doc']<=140]['few_targets']
# y = df['few_targets']

In [69]:
y=y.astype('int')

# Обучение, сохранение и оценка моделей

# SVM

In [63]:
text_transformer = TfidfVectorizer(stop_words=russian_stopwords, 
                                   ngram_range=(1, 2), min_df = 2, lowercase=True, max_features=80000)

In [64]:
X_text = text_transformer.fit_transform(X)

In [76]:
# import pickle
# filename = '../models/tfidf_model_all_data.pickle'

# pickle.dump(text_transformer, open(filename, 'wb'))

In [16]:
# import pickle
# filename = '../models/svm_model.pickle'

# pickle.dump(clf, open(filename, 'wb'))

In [17]:
%%time
clf = SVC(probability=True, kernel='rbf')
clf.fit(X_text, y)

CPU times: user 8min 10s, sys: 922 ms, total: 8min 11s
Wall time: 8min 11s


SVC(probability=True)

### Оценим качество

In [18]:
X_test = df[df['id_doc']>142]['ref_text']
y_test = df[df['id_doc']>142]['few_targets']

In [19]:
X_test_text = text_transformer.transform(X_test)

In [20]:
test_preds = clf.predict(X_test_text)

In [21]:
from sklearn.metrics import accuracy_score, classification_report

accuracy_score(y_test, test_preds)

0.6646639128802103

In [22]:
df_results = pd.DataFrame()

In [23]:
df_results['true'] = y_test

In [24]:
df_results['svm'] = test_preds

# Log Reg

In [70]:
#%%time

logit = LogisticRegression(C=5e1, solver='lbfgs', multi_class='multinomial', random_state=17, n_jobs=4)
logit.fit(X_text, y)

LogisticRegression(C=50.0, multi_class='multinomial', n_jobs=4, random_state=17)

In [71]:
import pickle
filename = '../models/logreg_model_all_data.pickle'

pickle.dump(logit, open(filename, 'wb'))

In [72]:
# test_preds = logit.predict(X_test_text)
test_preds = logit.predict(X_text)

In [74]:
df_results['logreg'] = test_preds

In [75]:
# accuracy_score(y_test, test_preds)
accuracy_score(y, test_preds)

0.9327576280944156

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [30]:
df_results

Unnamed: 0,true,svm,logreg
0,2,2,2
0,4,4,4
0,3,3,3
0,1,1,1
0,1,1,1
...,...,...,...
0,0,0,0
0,0,24,24
0,0,0,0
0,0,0,0


# XGBoost

In [31]:
import xgboost as xgb
xgb_model = xgb.XGBClassifier()

In [32]:
xgb_model.fit(X_text, y)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, objective='multi:softprob',
              predictor='auto', random_state=0, reg_alpha=0, ...)

In [33]:
test_preds = xgb_model.predict(X_test_text)

In [34]:
df_results['xgb'] = test_preds

In [35]:
accuracy_score(y_test, test_preds)

0.6549004881712355

In [36]:
df_results

Unnamed: 0,true,svm,logreg,xgb
0,2,2,2,2
0,4,4,4,4
0,3,3,3,3
0,1,1,1,1
0,1,1,1,1
...,...,...,...,...
0,0,0,0,2
0,0,24,24,0
0,0,0,0,2
0,0,0,0,0


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


# Ensamble

In [49]:
from sklearn.ensemble import GradientBoostingClassifier

In [50]:
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import StackingClassifier

estimators = [
     ('logreg', LogisticRegression(C=5e1, solver='lbfgs', multi_class='multinomial', random_state=17, n_jobs=4)),
     ('svc', SVC(probability=True, kernel='rbf')),
     ('xgboost', GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0))]
clf = StackingClassifier(
    estimators=estimators, final_estimator=LogisticRegression())

clf.fit(X_text, y).score(X_test_text, y_test)                   

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

0.6597822005257229

In [94]:
import pickle
filename = '../models/ensamble_model.pickle'

# pickle.dump(clf, open(filename, 'wb'))