<a href="https://colab.research.google.com/github/aabitokh/home_repo/blob/develop/ntb_bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# анализ отзывов для «Викишоп» с BERT

# imports

In [1]:
!pip install transformers
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.1-py3-none-any.whl (6.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m78.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.2-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.2/199.2 KB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.2 tokenizers-0.13.2 transformers-4.27.1
Looking in indexes: https://pypi.org/simple, http

In [31]:
import pandas as pd 
import numpy as np 

from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, recall_score, precision_score
from sklearn.model_selection import RandomizedSearchCV

import torch
import transformers
from tqdm import notebook
from torch import tensor


from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
import re
from sklearn.utils import shuffle

from catboost import Pool, CatBoostClassifier

In [3]:
#DATA_PATH = 'toxic_comments.csv'
DATA_PATH = '/datasets/toxic_comments.csv'

In [4]:
def clear_text(text):
#очистка текста 
    return ' '.join(re.sub(r'[^a-zA-Z\' ]', ' ', text).split())

In [5]:
#проверка на GPU, если есть, то ускорит BERT
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [6]:
!nvidia-smi

Sat Mar 18 11:20:41 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   63C    P0    26W /  70W |      3MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# first look 

In [9]:
df = pd.read_csv(DATA_PATH)

In [10]:
df.sample(10)

Unnamed: 0.1,Unnamed: 0,text,toxic
18311,18329,Probably right re commenting first. Just have...,0.0
2554,2554,"Hi \n\nHi, welcome to wikipedia. I just wante...",1.0
51381,51438,ps: I do realise that Mr Bahram Moshiri's reli...,0.0
22009,22029,""" (UTC)\n\nNot that I'm going to argue that Re...",0.0
2092,2092,Finally stop your edit wars and your crusade! ...,0.0
54047,54108,russian sources circa 1992-2004 have clarified...,0.0
55170,55231,""" Feb 2005 (UTC)\n\nI would like to know how y...",0.0
65489,65556,"""\n\nBridget Marquardt being divorced\nAfterno...",0.0
27757,27794,To start: this cannot be discussed with you. Y...,0.0
37827,37873,3/14/07 - Do not vandalize the talk pages\n an...,0.0


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71681 entries, 0 to 71680
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  71681 non-null  int64  
 1   text        71681 non-null  object 
 2   toxic       71680 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 1.6+ MB


# data preprocessing

In [12]:
df = df.drop(['Unnamed: 0'], axis = 1) 

In [13]:
df

Unnamed: 0,text,toxic
0,Explanation\nWhy the edits made under my usern...,0.0
1,D'aww! He matches this background colour I'm s...,0.0
2,"Hey man, I'm really not trying to edit war. It...",0.0
3,"""\nMore\nI can't make any real suggestions on ...",0.0
4,"You, sir, are my hero. Any chance you remember...",0.0
...,...,...
71676,Citation needed \n\nCitation needed - but I c...,0.0
71677,"Trouton did not use Ohm, you definitely don't ...",0.0
71678,"""\n\nAfD nomination of Phil Mims\nI have nomin...",0.0
71679,"""\n\n Please do not vandalize pages, as you di...",0.0


In [14]:
df.toxic.value_counts(normalize=True)

0.0    0.897573
1.0    0.102427
Name: toxic, dtype: float64

таргет сильно несбалансирован, лучше сделать downsampling, так модель луше выхватит целевой признак, заодно и сэко

In [15]:
df = pd.concat([df.query('toxic == 1'), df.query('toxic != 1').sample(20000, replace = False)])
df = shuffle(df)

In [16]:
df['toxic'].value_counts()

0.0    20000
1.0     7342
Name: toxic, dtype: int64

In [17]:
df['text'] = df['text'].apply(lambda x: clear_text(x))

# BERT

In [18]:
# For DistilBERT:
model_class, tokenizer_class, pretrained_weights = (transformers.DistilBertModel,
                                                    transformers.DistilBertTokenizer,
                                                    'distilbert-base-uncased')

#model_class, tokenizer_class, pretrained_weights = (transformers.BertModel, transformers.BertTokenizer, 'bert-base-uncased')

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [19]:
tokenized = df['text'].apply(lambda x: \
                             tokenizer.encode(x, add_special_tokens=True, 
                                              truncation=True, max_length=500))

In [20]:
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)
        
padded = np.array([i + [0]*(max_len - len(i)) for i in tokenized.values])
attention_mask = np.where(padded != 0, 1, 0)

In [21]:
#перенесем все на GPU 
padded = torch.LongTensor(padded).to(DEVICE)
attention_mask = torch.LongTensor(attention_mask).to(DEVICE)
model.to(DEVICE)

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0): TransformerBlock(
        (attention): MultiHeadSelfAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): Linear(i

In [22]:
!nvidia-smi

Sat Mar 18 11:22:27 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   54C    P0    26W /  70W |   1055MiB / 15360MiB |     16%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [23]:
padded.shape

torch.Size([27342, 500])

In [24]:
batch_size = 100
embeddings = []

for i in notebook.tqdm(range((padded.shape[0] // batch_size)+1)):
    batch = padded[batch_size*i:batch_size*(i+1)]
    attention_mask_batch = attention_mask[batch_size*i:batch_size*(i+1)]

    with torch.no_grad():
        batch_embeddings = model(batch, attention_mask=attention_mask_batch)

    embeddings.append(batch_embeddings[0][:,0,:].cpu().numpy())

  0%|          | 0/274 [00:00<?, ?it/s]

In [25]:
features = np.concatenate(embeddings)
features

array([[-0.0775227 ,  0.03678919, -0.22271825, ..., -0.23536018,
         0.51783913,  0.01164163],
       [-0.18239626, -0.03707014, -0.04821394, ..., -0.21273685,
         0.40674904,  0.41701597],
       [ 0.05416229,  0.08990414, -0.14486082, ..., -0.07652187,
         0.41481078,  0.2078826 ],
       ...,
       [-0.14991918,  0.03042052, -0.10720155, ..., -0.0354307 ,
         0.44256067,  0.38860616],
       [-0.25284636, -0.03582216, -0.06295356, ..., -0.19481094,
         0.40149382,  0.31408086],
       [ 0.0447886 ,  0.14826185,  0.0346882 , ..., -0.03125236,
         0.3200365 ,  0.22304462]], dtype=float32)

# classification

In [26]:
x_train, x_test, y_train, y_test = train_test_split(features, df['toxic'], test_size= 0.2)

## LR 

In [27]:
lr = LogisticRegression()
scores = cross_val_score(lr, x_train, y_train, cv = 3, scoring= 'f1')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [28]:
print(scores.mean())

0.8257506802818666


отличный результат для простой модели

## ctb

In [29]:
%%time 

X_train_ctb = x_train.copy()

y_train_ctb = y_train.copy()

#без перебора гиперпаратметров, чтобы просто сэкономить ресурсы/время 
params = {'random_seed': 42,
          'eval_metric': 'F1',
          'task_type': 'GPU', 'verbose':False}

model = CatBoostClassifier(**params)

scores = cross_val_score(model, X_train_ctb, y_train_ctb, 
                         cv = 5, scoring= 'f1')

scores.mean()



CPU times: user 7min 14s, sys: 47.8 s, total: 8min 2s
Wall time: 7min 49s


In [33]:
params_grid = {'iterations': [100, 500, 1000],
               'learning_rate': [0.01, 0.05, 0.1],
               'depth': [4, 6, 8],
               'l2_leaf_reg': [1, 3, 5]}

# Create a CatBoostClassifier instance with the fixed parameters
model = CatBoostClassifier(random_seed=42, eval_metric='F1', 
                           task_type='GPU', verbose=False)

# Use RandomizedSearchCV to search over the parameter grid
random_search = RandomizedSearchCV(model, params_grid, cv=5, 
                                   scoring='f1', random_state=42)

In [34]:
random_search.fit(X_train_ctb, y_train_ctb)

# Print the best parameter combination and F1 score
print('Best parameters: ', random_search.best_params_)
print('Best F1 score: ', random_search.best_score_)



Best parameters:  {'learning_rate': 0.05, 'l2_leaf_reg': 5, 'iterations': 500, 'depth': 8}
Best F1 score:  0.809260150836329


## best model test

In [38]:
lr.fit(x_train, y_train) 
pred = lr.predict(x_test)
print(f1_score(pred, y_test))

0.8207115181401902


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


#Итог

Лучше себя показала модель линейной регресси: быстро, просто, дешево. 

F1 мера около 0.82, что не так уж плохо. 

Что еще можно сделать: 
  - поиграться с трешхолдом 
  - апсемплинг 
  - поперебирать еще параметры катбуста 
  - добавить другой бустинг 
  - посмотреть другие модели для эмбедингов