Создание модели

Определение токсичности комментария

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('train.csv')

In [3]:
df.head(3)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\r\nWhy the edits made under my use...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0


In [4]:
df.isna().sum()

id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             159571 non-null  object
 1   comment_text   159571 non-null  object
 2   toxic          159571 non-null  int64 
 3   severe_toxic   159571 non-null  int64 
 4   obscene        159571 non-null  int64 
 5   threat         159571 non-null  int64 
 6   insult         159571 non-null  int64 
 7   identity_hate  159571 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


In [6]:
df['target'] = 0

In [7]:
df.loc[(df['toxic'] == 1) | (df['severe_toxic'] == 1) |
       (df.obscene == 1) | (df.threat) == 1 |
       (df.insult == 1) | (df.identity_hate == 1), 'target'] = 1

In [8]:
df.target.value_counts()

0    143729
1     15842
Name: target, dtype: int64

In [9]:
X = pd.concat([df.id, df.comment_text], axis=1)
y = df.target

In [10]:
cat_col = ['comment_text']

In [11]:
df['target'].value_counts()

0    143729
1     15842
Name: target, dtype: int64

In [12]:
from sklearn.metrics import classification_report, precision_recall_curve, roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.preprocessing import StandardScaler

Создаем классы обработки признаков

In [13]:
class CatSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X[self.key]

In [14]:
# final_transformers = []

# for cat_col in cat_cols:
#     cat_transformer = Pipeline([
#         ('selector', CatSelector(key=cat_col)),
#         ('tfdf', TfidfVectorizer())
#     ])
    
#     final_transformers.append((cat_col, cat_transformer))

In [15]:
# feats = FeatureUnion(final_transformers)

In [16]:
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

Разбиение данных на обучение и тест

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=50, stratify=y)

Подбор гиперпараметров, кросс-валидация

In [18]:
pipeline = Pipeline([
    ('selector', CatSelector(key='comment_text')),
    ('tfdf', TfidfVectorizer()),
    ('model', LogisticRegression(random_state=1, max_iter=1000))
])

In [19]:
pipeline.get_params()

{'memory': None,
 'steps': [('selector', CatSelector(key='comment_text')),
  ('tfdf', TfidfVectorizer()),
  ('model', LogisticRegression(max_iter=1000, random_state=1))],
 'verbose': False,
 'selector': CatSelector(key='comment_text'),
 'tfdf': TfidfVectorizer(),
 'model': LogisticRegression(max_iter=1000, random_state=1),
 'selector__key': 'comment_text',
 'tfdf__analyzer': 'word',
 'tfdf__binary': False,
 'tfdf__decode_error': 'strict',
 'tfdf__dtype': numpy.float64,
 'tfdf__encoding': 'utf-8',
 'tfdf__input': 'content',
 'tfdf__lowercase': True,
 'tfdf__max_df': 1.0,
 'tfdf__max_features': None,
 'tfdf__min_df': 1,
 'tfdf__ngram_range': (1, 1),
 'tfdf__norm': 'l2',
 'tfdf__preprocessor': None,
 'tfdf__smooth_idf': True,
 'tfdf__stop_words': None,
 'tfdf__strip_accents': None,
 'tfdf__sublinear_tf': False,
 'tfdf__token_pattern': '(?u)\\b\\w\\w+\\b',
 'tfdf__tokenizer': None,
 'tfdf__use_idf': True,
 'tfdf__vocabulary': None,
 'model__C': 1.0,
 'model__class_weight': None,
 'model__d

In [20]:
parameters = {
#     'model__reg_lambda': [0.001, 0.01, 100, 10, 1, 0.1],
#     'model__learning_rate': [0.001, 0.01, 0.1, 0.5, 1],
#     'model__class_weights': ({1: 5}, None, {1:2}, {1:3}),
#     'tfdf__stop_words': ('english', None),
#     'tfdf__max_features': [5000, 10000, None],
#     'tfdf__lowercase': (True, False)
}


In [None]:
grid = GridSearchCV(pipeline, parameters, cv=3, verbose=1, n_jobs=-1)
grid.fit(X_train, y_train)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


In [None]:
grid.best_params_

In [None]:
grid.best_score_

In [None]:
pipeline = Pipeline([
    ('selector', CatSelector(key='comment_text')),
    ('tfdf', TfidfVectorizer(max_features=10000)),
    ('model', LogisticRegression(random_state=1, max_iter=1000, class_weight={1:2}))
])

In [None]:
pipeline.fit(X_train, y_train)

In [26]:
train_pred = pipeline.predict(X_train)

In [27]:
print(classification_report(y_train, train_pred))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98    107797
           1       0.88      0.77      0.82     11881

    accuracy                           0.97    119678
   macro avg       0.93      0.88      0.90    119678
weighted avg       0.97      0.97      0.97    119678



In [28]:
# сохранение модели

In [29]:
X_test.to_csv('X_test.csv', index=False)
y_test.to_csv('y_test.csv', index=False)

In [30]:
import dill

In [31]:
with open('logreg_pipeline.dill', 'wb') as f:
    dill.dump(pipeline, f)

Проверка модели

In [32]:
import pandas as pd
import numpy as np
import dill
from sklearn.metrics import roc_auc_score, precision_recall_curve

In [33]:
X_test = pd.read_csv('X_test.csv')
y_test = pd.read_csv('y_test.csv')

In [34]:
X_test.head(2)

Unnamed: 0,id,comment_text
0,bb91be48ab5fd05f,Sources and notes \r\n\r\nPlease explain why y...
1,ecdf575af5c4f15f,"""\r\n\r\nThanks for clearing up the Absolute J..."


In [35]:
with open('logreg_pipeline.dill', 'rb') as f:
    pipeline = dill.load(f)

In [36]:
pipeline

Pipeline(steps=[('selector', CatSelector(key='comment_text')),
                ('tfdf', TfidfVectorizer(max_features=10000)),
                ('model',
                 LogisticRegression(class_weight={1: 2}, max_iter=1000,
                                    random_state=1))])

In [37]:
preds = pipeline.predict_proba(X_test)[:, 1]

In [38]:
print(roc_auc_score(y_test, preds))

0.9727670823030392


In [39]:
precision, recall, threshold = precision_recall_curve(y_test, preds)
b = 1 
f_score = (1 + b ** 2) * (precision * recall) / (b ** 2 * precision + recall)
idx = np.nanargmax(f_score)
print(f'b={b}, \
      \nthreshold = {threshold[idx]}, \
      \nprecision = {precision[idx]}, \
      \nrecall = {recall[idx]}, \
      \nf_score = {f_score[idx]}')

b=1,       
threshold = 0.4715937431746385,       
precision = 0.8470790378006873,       
recall = 0.7467811158798283,       
f_score = 0.7937743190661478


In [40]:
threshold[idx]

0.4715937431746385

## Flask

In [41]:
from flask import Flask, request, jsonify

In [42]:
import pandas as pd
import dill

In [43]:
with open('logreg_pipeline.dill', 'rb') as f:
    model = dill.load(f)

In [44]:
X_test = pd.read_csv('X_test.csv')
y_test = pd.read_csv('y_test.csv')

In [45]:
X_test.sample(1)

Unnamed: 0,id,comment_text
11809,8b9bc929bd744fb2,"""\r\n\r\n STOP VANDALIZING \r\n\r\nPlease stop..."


In [46]:
app = Flask(__name__)


@app.route('/', methods=['GET'])
def general():
    return 'Welcome to prediction process'

@app.route('/predict', methods=['POST'])
def predict():
    data = {'succes': False}
    
    comment_text = ''
    request_json = request.get_json()
    
    if request_json['comment_text']:
        comment_text = request_json['comment_text']
    
    
    preds = model.predict_proba(pd.DataFrame({'comment_text': [comment_text]}))
    
    data['predictions'] = preds[:, 1][0]
    data['comment_text'] = comment_text
    
    data['succes'] = True
    
    return jsonify(data)

if __name__ == '__main__':
    app.run()

 * Serving Flask app '__main__' (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on http://127.0.0.1:5000 (Press CTRL+C to quit)
