# Создание бейзлайна

## Подгрузка необходимых библиотек и данных

### Библиотеки

In [54]:
# Core libraries
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import warnings
# Suppress every warning for the rest of the session.
warnings.filterwarnings('ignore')
# Render floats with two decimal places when pandas displays them.
pd.options.display.float_format = '{:.2f}'.format
# Seed both the stdlib and the NumPy global random generators.
random.seed(42)
np.random.seed(42)

# ML libraries

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MaxAbsScaler  

# Hyperparameter search, metrics, feature transformation
import optuna
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score, roc_curve, precision_recall_curve, auc

# Additional libraries
from functools import partial
from IPython.display import display, HTML
from pandarallel import pandarallel
# Start the pandarallel worker pool with progress bars enabled.
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


### Данные

In [3]:
# Load the prepared training table; `created_date` is parsed as ISO 8601 timestamps.
df = pd.read_csv(
    'modified_train.csv',
    parse_dates=['created_date'],
    date_format='ISO8601',
)

In [5]:
# Preview the first five rows of the loaded table.
df.head(n=5)

Unnamed: 0,id,toxicity,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,created_date,parent_id,...,threat_b,sexual_explicit_b,toxicity_any_kind,num_words,num_sent,num_punct,tokens,tokens_cnt,tokens_ws,ctws
0,5118044,0.98,You can stick your dick up anyone's butt. Why ...,0.13,0.94,0.01,0.25,0.01,2017-04-10 23:30:10.076014+00:00,,...,0,1,1,17,2,3,"['you', 'can', 'stick', 'your', 'dick', 'up', ...",15,"['stick', 'dick', 'anyone', 'butt', 'law']",stick dick anyone butt law
1,5380193,0.98,I bet your fat bitter little nerd ass was wait...,0.2,0.94,0.02,0.97,0.02,2017-06-08 18:24:06.236150+00:00,5377903.0,...,0,1,1,30,2,4,"['i', 'bet', 'your', 'fat', 'bitter', 'little'...",28,"['bet', 'fat', 'bitter', 'little', 'nerd', 'as...",bet fat bitter little nerd as wait day find wa...
2,5292392,0.95,I hope you die today. The world will be much b...,0.25,0.08,0.01,0.76,0.86,2017-05-23 18:10:27.505027+00:00,5292009.0,...,1,0,1,20,2,2,"['i', 'hope', 'you', 'die', 'today', 'the', 'w...",18,"['hope', 'die', 'today', 'world', 'much', 'goo...",hope die today world much good without filth like
3,5481314,0.91,Let them die ..\nMaybe we should build concent...,0.28,0.01,0.05,0.1,0.88,2017-06-26 21:37:55.561744+00:00,5480942.0,...,1,0,1,22,1,7,"['let', 'them', 'die', 'maybe', 'we', 'should'...",18,"['let', 'die', 'maybe', 'build', 'concentratio...",let die maybe build concentration camp gas cha...
4,5588434,0.94,If we shoot them it will be a permanent soluti...,0.15,0.0,0.03,0.5,0.91,2017-07-13 21:15:03.037890+00:00,5586783.0,...,1,0,1,19,2,2,"['if', 'we', 'shoot', 'them', 'it', 'will', 'b...",17,"['shoot', 'permanent', 'solution', 'creature',...",shoot permanent solution creature waste space


In [7]:
# Share of each class in the binary target column.
df['toxicity_b'].value_counts(normalize=True)

toxicity_b
0   0.89
1   0.11
Name: proportion, dtype: float64

In [9]:
# Dataset dimensions as (rows, columns).
df.shape

(1799551, 39)

In [15]:
# Number of rows where the `ctws` column is missing.
df['ctws'].isna().sum()

5052

In [19]:
# Discard rows without a `ctws` value; the models below are fitted on that column.
df = df.dropna(subset=['ctws'])

In [21]:
# List the available column names before selecting the ones to keep.
df.columns

Index(['id', 'toxicity', 'comment_text', 'severe_toxicity', 'obscene',
       'identity_attack', 'insult', 'threat', 'created_date', 'parent_id',
       'rating', 'funny', 'wow', 'sad', 'likes', 'disagree', 'sexual_explicit',
       'identity_annotator_count', 'toxicity_annotator_count', 'text_length',
       'year', 'year_month', 'toxicity_b', 'any_reaction', 'toxicity_bins',
       'severe_toxicity_b', 'obscene_b', 'identity_attack_b', 'insult_b',
       'threat_b', 'sexual_explicit_b', 'toxicity_any_kind', 'num_words',
       'num_sent', 'num_punct', 'tokens', 'tokens_cnt', 'tokens_ws', 'ctws'],
      dtype='object')

In [23]:
# Columns that are not used by the baseline models and are removed from the frame.
unused_columns = [
    # identifiers, raw scores and comment metadata
    'id', 'toxicity', 'severe_toxicity', 'obscene', 'identity_attack',
    'insult', 'threat', 'created_date', 'parent_id', 'rating',
    # reactions and annotation counters
    'funny', 'wow', 'sad', 'likes', 'disagree', 'sexual_explicit',
    'identity_annotator_count', 'toxicity_annotator_count',
    # derived date / length / binned features
    'text_length', 'year', 'year_month', 'any_reaction', 'toxicity_bins',
    # binary labels other than `toxicity_b`
    'severe_toxicity_b', 'obscene_b', 'identity_attack_b', 'insult_b',
    'threat_b', 'sexual_explicit_b', 'toxicity_any_kind',
    # text statistics and intermediate token columns
    'num_words', 'num_sent', 'num_punct', 'tokens', 'tokens_cnt', 'tokens_ws',
]
df = df.drop(columns=unused_columns)

In [29]:
# Rename the binary target column to `toxic`, then preview the result.
# The preview has to be the last expression of the cell: a bare `df.head()`
# placed before another statement is evaluated but its output is discarded.
df = df.rename(columns={'toxicity_b': 'toxic'})
df.head()

In [35]:
# Separate the target from the features, then hold out 7% of the rows for
# testing, stratified on the target so both parts keep the same class ratio.
y = df['toxic']
X = df.drop(columns='toxic')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.07,
    stratify=y,
    random_state=42,
)

In [37]:
# Size of the training split as (rows, columns).
X_train.shape

(1668884, 2)

In [39]:
# Size of the hold-out split as (rows, columns).
X_test.shape

(125615, 2)

## Обучение и сохранение моделей

Возьмем наилучшие гиперпараметры моделей Baseline и обучим модели для демонстрации

### Logistic Regression

In [74]:
# Logistic regression on token counts of the `ctws` text column.
lr_params = {
    'C': 0.15039683695924158,
    'max_iter': 4959,
    'class_weight': 'balanced',
}
log_reg_example = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    # No scaling stage: the MaxAbsScaler step is left disabled here.
    ('classifier', LogisticRegression(random_state=42, **lr_params)),
])

# Fit on the training texts, then print per-class metrics for the hold-out split.
log_reg_example.fit(X_train['ctws'], y_train)
preds = log_reg_example.predict(X_test['ctws'])
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.97      0.90      0.94    111537
           1       0.51      0.80      0.62     14078

    accuracy                           0.89    125615
   macro avg       0.74      0.85      0.78    125615
weighted avg       0.92      0.89      0.90    125615



### SVM

In [72]:
# Linear SVM on token counts of the `ctws` text column.
svc_params = {
    'C': 0.012478543429588021,
    'penalty': 'l2',
    'class_weight': 'balanced',
}
svc_example = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    # No scaling stage: the MaxAbsScaler step is left disabled here.
    ('classifier', LinearSVC(random_state=42, **svc_params)),
])

# Fit on the training texts, then print per-class metrics for the hold-out split.
svc_example.fit(X_train['ctws'], y_train)
preds_2 = svc_example.predict(X_test['ctws'])
print(classification_report(y_test, preds_2))

              precision    recall  f1-score   support

           0       0.97      0.91      0.94    111537
           1       0.52      0.78      0.63     14078

    accuracy                           0.89    125615
   macro avg       0.75      0.85      0.78    125615
weighted avg       0.92      0.89      0.90    125615



### Multinomial NB

In [70]:
# Multinomial naive Bayes on token counts of the `ctws` text column.
mnb_params = {
    'alpha': 1.2560819456336068,
}
mnb_example = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('classifier', MultinomialNB(**mnb_params)),
])

# Fit on the training texts, then print per-class metrics for the hold-out split.
mnb_example.fit(X_train['ctws'], y_train)
preds_3 = mnb_example.predict(X_test['ctws'])
print(classification_report(y_test, preds_3))

              precision    recall  f1-score   support

           0       0.95      0.93      0.94    111537
           1       0.49      0.58      0.53     14078

    accuracy                           0.89    125615
   macro avg       0.72      0.75      0.73    125615
weighted avg       0.89      0.89      0.89    125615



## Сохранение моделей и данных для последующего использования

### Сохранение моделей

In [83]:
import cloudpickle

In [93]:
# Serialize the fitted logistic-regression pipeline to disk.
with open("model_lr_e.cloudpickle", "wb") as model_file:
    cloudpickle.dump(log_reg_example, model_file)

In [89]:
# Serialize the fitted linear-SVM pipeline to disk.
with open("model_svc_e.cloudpickle", "wb") as model_file:
    cloudpickle.dump(svc_example, model_file)

In [91]:
# Serialize the fitted naive-Bayes pipeline to disk.
with open("model_mnb_e.cloudpickle", "wb") as model_file:
    cloudpickle.dump(mnb_example, model_file)

### Сохранение данных

Сохраним тестовую выборку (test) в качестве демонстрационного датасета для пользователя.

In [107]:
# Put the hold-out target next to its features, target column first.
data = pd.concat([y_test.to_frame(), X_test], axis=1)

In [113]:
# Remove the `ctws` column and renumber the rows from zero.
data = data.drop(columns='ctws').reset_index(drop=True)

In [117]:
# Write the demo dataset to CSV.
# NOTE(review): the row index is written as the first column (pandas default);
# confirm that consumers of this file expect it before switching to index=False.
data.to_csv('demo_data_2.csv', index=True)