# imports

In [1]:
import pandas as pd 
import numpy as np 

from sklearn.metrics import confusion_matrix, accuracy_score, f1_score

import torch
import transformers
from tqdm import notebook

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
import re

In [2]:
DATA_PATH = 'toxic_comments.csv'
#DATA_PATH = '/datasets/toxic_comments.csv'

In [3]:
def clear_text(text):
    return ' '.join(re.sub(r'[^a-zA-Z\' ]', ' ', text).split())

# first look 

In [4]:
df = pd.read_csv(DATA_PATH)

In [5]:
df.sample(10)

Unnamed: 0.1,Unnamed: 0,text,toxic
158146,158305,Jizz page needs cleanup but not Semen \n\nI wo...,1
41901,41951,"Hi, Could you please vote on whether [] articl...",0
21339,21359,Nothing what you say here is true. There was ...,0
51637,51694,Unspecified source for Image:Wupper.jpg\n\nTha...,0
55003,55064,"""\n{| style=""""background-color:#F5FFFA; paddin...",0
75185,75261,Theres also a cheat where u lag and can still ...,0
54600,54661,"""::::Here's what I experienced. (1)Fyslee has ...",0
32170,32210,"""====Regarding edits made during December 9 20...",0
123170,123279,""":You're no fun LSD. No fun at all. Talk• C...",0
85232,85313,No it does not. It is begging their case as be...,0


In [6]:
df = df.sample(1000)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 28261 to 120634
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  1000 non-null   int64 
 1   text        1000 non-null   object
 2   toxic       1000 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 31.2+ KB


In [8]:
#print(df['toxic'][42])
#print(df['text'][42])

# data preprocessing

In [9]:
df = df.drop(['Unnamed: 0'], axis = 1) 

In [10]:
df

Unnamed: 0,text,toxic
28261,Stop messing with my userpage \nI can see from...,1
59524,This is really pathetic - I posted in good fai...,0
65762,""" March 2007 (UTC)\n\nThe only vandal here is ...",1
115001,"Antifreeze article \n\nHi, Andy Dingley. Perha...",0
3534,First the North Nicosia article will be elimin...,0
...,...,...
17988,"""\n\n While I respect the fact that you've obv...",1
50710,Also please note that Ubikwit has his own (I t...,0
123256,REDIRECT Talk:The Test (The O.C.),0
156489,AndyTheGrump \n\nMy issue is with AndyTheGrump...,0


In [11]:
df.toxic.value_counts(normalize=True)

0    0.899
1    0.101
Name: toxic, dtype: float64

In [12]:
df['text'] = df['text'].apply(lambda x: clear_text(x))

# BERT

In [77]:
import transformers
from torch import tensor
import torch


In [78]:
# For DistilBERT:
model_class, tokenizer_class, pretrained_weights = (transformers.DistilBertModel, transformers.DistilBertTokenizer, 'distilbert-base-uncased')

## Want BERT instead of distilBERT? Uncomment the following line:
#model_class, tokenizer_class, pretrained_weights = (transformers.BertModel, transformers.BertTokenizer, 'bert-base-uncased')

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [79]:
tokenized = df['text'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True, truncation=True, max_length=500))

In [80]:
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)
        
padded = np.array([i + [0]*(max_len - len(i)) for i in tokenized.values])
attention_mask = np.where(padded != 0, 1, 0)

In [81]:
batch_size = 10
embeddings = []

for i in notebook.tqdm(range(padded.shape[0] // batch_size)):
    batch = torch.LongTensor(padded[batch_size*i:batch_size*(i+1)]) 
    attention_mask_batch = torch.LongTensor(attention_mask[batch_size*i:batch_size*(i+1)])

    with torch.no_grad():
        batch_embeddings = model(batch, attention_mask=attention_mask_batch)

    embeddings.append(batch_embeddings[0][:,0,:].numpy())

  0%|          | 0/100 [00:00<?, ?it/s]

In [82]:
features = np.concatenate(embeddings)
features

array([[ 0.03420232,  0.1805147 , -0.10840572, ..., -0.04821339,
         0.31065   ,  0.3482343 ],
       [ 0.16186336,  0.22357984, -0.00281759, ..., -0.03284591,
         0.4788848 ,  0.29641178],
       [ 0.16350597,  0.209815  , -0.12980905, ..., -0.2642115 ,
         0.4378849 ,  0.28107995],
       ...,
       [-0.10930499, -0.1766999 , -0.06792328, ..., -0.08012712,
         0.2104788 ,  0.62221926],
       [ 0.03608531,  0.15552112,  0.05457584, ..., -0.15147392,
         0.3687446 ,  0.33932787],
       [ 0.02723861, -0.02974499, -0.09860745, ..., -0.13710612,
         0.33990124,  0.31552246]], dtype=float32)

In [84]:
from sklearn.linear_model import LogisticRegression

In [85]:
lr = LogisticRegression()

lr.fit(features, df['toxic'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [88]:
pred = lr.predict(features)

In [89]:
from sklearn.metrics import accuracy_score

In [90]:
accuracy_score(pred, df['toxic'])

0.98