# Проект для «Викишоп»

Интернет-магазин «Викишоп» запускает новый сервис. Теперь пользователи могут редактировать и дополнять описания товаров, как в вики-сообществах. То есть клиенты предлагают свои правки и комментируют изменения других. Магазину нужен инструмент, который будет искать токсичные комментарии и отправлять их на модерацию.

Обучите модель классифицировать комментарии на позитивные и негативные. В вашем распоряжении набор данных с разметкой о токсичности правок.

Постройте модель со значением метрики качества *F1* не меньше 0.75.


**Описание данных**

Данные находятся в файле `toxic_comments.csv`. Столбец *text* в нём содержит текст комментария, а *toxic* — целевой признак.

## Подготовка

In [None]:
import pandas as pd
import matplotlib as plt
import warnings
import nltk
import re
import spacy

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from lightgbm import LGBMClassifier

from catboost import CatBoostClassifier

from random import shuffle

from nltk.stem import WordNetLemmatizer

from tqdm.notebook import tqdm

In [None]:
# Silence all Python warnings so library notices do not clutter the cell outputs.
warnings.filterwarnings("ignore")

In [None]:
# Load the comments dataset into a DataFrame.
# NOTE(review): the path '.csv' looks truncated — the task description names
# the file `toxic_comments.csv`; confirm the real location before running.
df = pd.read_csv('.csv')

In [None]:
# Number of rows and columns in the loaded dataset.
df.shape

(159292, 3)

In [None]:
# Column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159292 entries, 0 to 159291
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Unnamed: 0  159292 non-null  int64 
 1   text        159292 non-null  object
 2   toxic       159292 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 3.6+ MB


In [None]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0.1,Unnamed: 0,text,toxic
0,0,Explanation\nWhy the edits made under my usern...,0
1,1,D'aww! He matches this background colour I'm s...,0
2,2,"Hey man, I'm really not trying to edit war. It...",0
3,3,"""\nMore\nI can't make any real suggestions on ...",0
4,4,"You, sir, are my hero. Any chance you remember...",0


In [None]:
# Preview the last rows of the raw data.
df.tail()

Unnamed: 0.1,Unnamed: 0,text,toxic
159287,159446,""":::::And for the second time of asking, when ...",0
159288,159447,You should be ashamed of yourself \n\nThat is ...,0
159289,159448,"Spitzer \n\nUmm, theres no actual article for ...",0
159290,159449,And it looks like it was actually you who put ...,0
159291,159450,"""\nAnd ... I really don't think you understand...",0


In [None]:
# Summary statistics of the numeric columns; the mean of `toxic` is the share of toxic comments.
df.describe()

Unnamed: 0.1,Unnamed: 0,toxic
count,159292.0,159292.0
mean,79725.697242,0.101612
std,46028.837471,0.302139
min,0.0,0.0
25%,39872.75,0.0
50%,79721.5,0.0
75%,119573.25,0.0
max,159450.0,1.0


In [None]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0

In [None]:
# Count missing values per column.
df.isna().sum()

Unnamed: 0    0
text          0
toxic         0
dtype: int64

In [None]:
# Remove the 'Unnamed: 0' column and lower-case the comment text
# in a single chained expression.
df = (
    df.drop(columns='Unnamed: 0')
      .assign(text=lambda frame: frame['text'].str.lower())
)

Лемматизируем данные.

In [None]:
from tqdm.notebook import tqdm
tqdm.pandas()

# Only lemmas are needed here, so the dependency parser and the named-entity
# recognizer are switched off to reduce per-comment processing time.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

# Matches every character that is not an ASCII letter.
# (The range must be `A-Z`: `A-z` would also let `[ \ ] ^ _` and the backtick through.)
_NON_LETTERS_RE = re.compile(r'[^a-zA-Z]')


def clear_text(text):
    """Lemmatize a comment and keep only ASCII letters.

    The text is run through the spaCy pipeline, the token lemmas are joined
    with spaces, every non-letter character is replaced by a space, and runs
    of whitespace are collapsed.

    Args:
        text: Raw comment text.

    Returns:
        The lemmatized text consisting of ASCII-letter words separated by
        single spaces (an empty string if nothing is left).
    """
    # The regex has to run on the lemmatized string, not on the raw input;
    # otherwise the lemmatization result is thrown away.
    lemmatized = ' '.join(token.lemma_ for token in nlp(text))
    letters_only = _NON_LETTERS_RE.sub(' ', lemmatized)
    return ' '.join(letters_only.split())


df['clean_text'] = df['text'].progress_apply(clear_text)

  0%|          | 0/159292 [00:00<?, ?it/s]

In [None]:
# Compare the original `text` with the new `clean_text` column on the first rows.
df.head()

Unnamed: 0,text,toxic,clean_text
0,explanation\nwhy the edits made under my usern...,0,explanation why the edits made under my userna...
1,d'aww! he matches this background colour i'm s...,0,d aww he matches this background colour i m se...
2,"hey man, i'm really not trying to edit war. it...",0,hey man i m really not trying to edit war it s...
3,"""\nmore\ni can't make any real suggestions on ...",0,more i can t make any real suggestions on impr...
4,"you, sir, are my hero. any chance you remember...",0,you sir are my hero any chance you remember wh...


In [None]:
# Model input is the cleaned text; the label to predict is the `toxic` flag.
features, target = df['clean_text'], df['toxic']

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
split = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=12345,
    shuffle=True,
)
features_train, features_test, target_train, target_test = split

# Report the size of every part of the split.
for part in (features_train, target_train, features_test, target_test):
    print(part.shape)

(127433,)
(127433,)
(31859,)
(31859,)


In [None]:
# TF-IDF vectorizer that drops scikit-learn's built-in English stop words.
count_tf_idf = TfidfVectorizer(stop_words = 'english')

In [None]:
# Learn the vocabulary and IDF weights from the training texts only,
# then reuse the fitted vectorizer to encode the test texts.
tf_idf_train = count_tf_idf.fit_transform(features_train)
tf_idf_test = count_tf_idf.transform(features_test)

for matrix in (tf_idf_train, tf_idf_test):
    print(matrix.shape)

(127433, 150734)
(31859, 150734)


## Обучение

In [None]:
# Logistic regression with balanced class weights; the grid searches the
# regularization strength C over 1..9 using 3-fold cross-validated F1.
parametrs_lr = {'C': range(1, 10), 'max_iter': [100]}
lr = LogisticRegression(
    solver='sag',
    class_weight='balanced',
    random_state=12345,
)
grid_lr = GridSearchCV(estimator=lr, param_grid=parametrs_lr, scoring='f1', cv=3)
grid_lr.fit(tf_idf_train, target_train)
print(grid_lr.best_score_)

0.7619707857185424


0.7619707857185424



In [None]:
# LightGBM classifier; the grid varies max_depth over 15..19 and scores by F1
# with GridSearchCV's default cross-validation splitting.
LGBM = LGBMClassifier()
parameters = {
    'max_iter': [90],
    'max_depth': range(15, 20),
    'random_state': [42],
}

grid_LGBM = GridSearchCV(estimator=LGBM, param_grid=parameters, scoring='f1')
grid_LGBM.fit(tf_idf_train, target_train)
print(grid_LGBM.best_score_)

0.7123331955438316

Лучшей моделью по F1 на кросс-валидации пока остаётся LogisticRegression (0.762 против 0.712 у LGBMClassifier).

In [None]:
# CatBoost classifier evaluated on a single parameter combination;
# GridSearchCV is used here only to obtain the cross-validated F1.
cbc = CatBoostClassifier()
parametrs = {
    'iterations': [100],
    'depth': [5],
    'random_state': [42],
}

grid_cbc = GridSearchCV(estimator=cbc, param_grid=parametrs, scoring='f1')
grid_cbc.fit(tf_idf_train, target_train)
print(grid_cbc.best_score_)

0.7235303868632853

In [None]:
# Predict on the held-out test set with the logistic-regression grid search
# and compute the F1 score of those predictions.
predicted_lr = grid_lr.predict(tf_idf_test)
f1_log_r_1 = f1_score(y_true=target_test, y_pred=predicted_lr)
f1_log_r_1

0.7689904870083771

Получили на тесте F1 выше 0,75 — это удовлетворяет требованию задачи.




</div>

0.7689904870083771

<div class="alert alert-info"> <b>ВЫВОД:</b> Датасет объёмный. В ходе проекта данные были загружены и изучены, удалён неинформативный столбец, тексты приведены к нижнему регистру и лемматизированы с помощью spaCy, данные разделены на обучающую и тестовую выборки и обучены три модели: LogisticRegression, показавшая на кросс-валидации F1 = 0.7619707857185424 и впоследствии выбранная как лучшая, CatBoostClassifier с результатом F1 = 0.7235303868632853 и LGBMClassifier с F1 = 0.7123331955438316. Лучшая по F1 модель использована на финальном тестировании и показала результат 0.7689904870083771, что превышает требуемое значение метрики 0.75.</div>