In [7]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import TruncatedSVD
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import roc_auc_score, classification_report
import numpy as np
import pandas as pd
import re

In [2]:
# Для лучшей оценки воспользуемся kfold
kf = KFold(n_splits=5)
r = re.compile(r'[a-z]+')

In [3]:
train = pd.read_csv('train.csv')

X = train['comment_text']
y = train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values

In [4]:
def build_pipeline():
    p = Pipeline([
                  ('feature', TfidfVectorizer(preprocessor=lambda x: " ".join(r.findall(x.lower())), 
                                              max_features=300)),
                    ('svd', TruncatedSVD(n_components=100)),
                  ('classifier', OneVsRestClassifier(LogisticRegression(class_weight='balanced', dual=True)))
      ])
    return p

In [5]:
roc_auc_scores = {'micro': [], 'macro': []}

for tr_index, ts_index in kf.split(X):
    X_train, X_test = X[tr_index], X[ts_index]
    y_train, y_test = y[tr_index], y[ts_index]
    pipeline = build_pipeline()
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    for i in roc_auc_scores:
        roc_auc_scores[i].append(roc_auc_score(y_test, y_pred, average=i))





In [6]:
roc_auc_scores = pd.DataFrame(roc_auc_scores)
roc_auc_scores.mean()

micro    0.821548
macro    0.827017
dtype: float64

In [12]:
print(classification_report(y_test, y_pred, target_names=['toxic', 'severe_toxic', 
                                                          'obscene', 'threat', 'insult', 'identity_hate']))

               precision    recall  f1-score   support

        toxic       0.31      0.79      0.45      3037
 severe_toxic       0.07      0.87      0.13       311
      obscene       0.22      0.81      0.35      1669
       threat       0.02      0.79      0.04        92
       insult       0.21      0.83      0.34      1582
identity_hate       0.04      0.77      0.07       305

    micro avg       0.17      0.81      0.28      6996
    macro avg       0.15      0.81      0.23      6996
 weighted avg       0.24      0.81      0.36      6996
  samples avg       0.04      0.08      0.05      6996



  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
