In [38]:
import pandas as pd
import numpy as np
import random

In [71]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from skmultilearn.problem_transform import BinaryRelevance

In [39]:
# Seed every global RNG the notebook can draw from. scikit-learn's
# train_test_split uses NumPy's global RNG when no random_state is passed, so
# seeding only the stdlib `random` module does not make the splits repeatable.
random.seed(1337)
np.random.seed(1337)

In [9]:
# Load the raw comment table from the CSV file, addressed relative to the
# notebook's working directory.
data = pd.read_csv(
    '../data/data.csv',
)

In [10]:
# Preview the first five rows to check the columns that were read.
data.head(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [11]:
# Replace missing comments with the placeholder token '_na_'.
# The result is assigned back to the column rather than using
# fillna(inplace=True) on the selected Series: that form is chained
# assignment, which pandas warns about and which no longer updates `data`
# once Copy-on-Write is enabled.
data['comment_text'] = data['comment_text'].fillna(value='_na_')

In [13]:
# The six binary label columns to predict, one classifier target each.
target_cols = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [40]:
# Hold out 20% of the rows as the test set; features are every non-label
# column, targets are the label columns.
# random_state pins the shuffle so the split is reproducible regardless of the
# global RNG state or the order in which cells were executed.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    data.drop(target_cols, axis=1),
    data[target_cols],
    test_size=0.2,
    random_state=1337,
)

In [41]:
# Split the non-test rows again: 25% of them become the validation set and the
# rest the training set.
# random_state pins the shuffle so the split is reproducible regardless of the
# global RNG state or the order in which cells were executed.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.25,
    random_state=1337,
)

In [42]:
# Class balance of every label in the training split.
for label in target_cols:
    label_counts = y_train[label].value_counts()
    print(label_counts)

0    86554
1     9188
Name: toxic, dtype: int64
0    94787
1      955
Name: severe_toxic, dtype: int64
0    90662
1     5080
Name: obscene, dtype: int64
0    95468
1      274
Name: threat, dtype: int64
0    91019
1     4723
Name: insult, dtype: int64
0    94908
1      834
Name: identity_hate, dtype: int64


In [43]:
# Class balance of every label in the validation split.
for label in target_cols:
    label_counts = y_val[label].value_counts()
    print(label_counts)

0    28874
1     3040
Name: toxic, dtype: int64
0    31589
1      325
Name: severe_toxic, dtype: int64
0    30253
1     1661
Name: obscene, dtype: int64
0    31817
1       97
Name: threat, dtype: int64
0    30326
1     1588
Name: insult, dtype: int64
0    31636
1      278
Name: identity_hate, dtype: int64


In [44]:
# Class balance of every label in the test split.
for label in target_cols:
    label_counts = y_test[label].value_counts()
    print(label_counts)

0    28849
1     3066
Name: toxic, dtype: int64
0    31600
1      315
Name: severe_toxic, dtype: int64
0    30207
1     1708
Name: obscene, dtype: int64
0    31808
1      107
Name: threat, dtype: int64
0    30349
1     1566
Name: insult, dtype: int64
0    31622
1      293
Name: identity_hate, dtype: int64


In [45]:
# Class balance of every label in the combined train+validation split.
for label in target_cols:
    label_counts = y_train_val[label].value_counts()
    print(label_counts)

0    115428
1     12228
Name: toxic, dtype: int64
0    126376
1      1280
Name: severe_toxic, dtype: int64
0    120915
1      6741
Name: obscene, dtype: int64
0    127285
1       371
Name: threat, dtype: int64
0    121345
1      6311
Name: insult, dtype: int64
0    126544
1      1112
Name: identity_hate, dtype: int64


In [49]:
# Trivial baseline: a frame with one column per label and one row per test
# example, filled with 0 everywhere.
n_test_rows = y_test.shape[0]
predicted = pd.DataFrame({col: [0] * n_test_rows for col in target_cols})
predicted.head(10)

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0,0,0,0,0,0
1,0,0,0,0,0,0
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,0,0,0,0,0,0
5,0,0,0,0,0,0
6,0,0,0,0,0,0
7,0,0,0,0,0,0
8,0,0,0,0,0,0
9,0,0,0,0,0,0


In [55]:
# Score the all-zero baseline: one ROC AUC per label, appended in the order of
# target_cols.
roc_auc = []
for col in target_cols:
    fpr, tpr, threshold = roc_curve(y_true=y_test[col], y_score=predicted[col])
    label_auc = auc(x=fpr, y=tpr)
    roc_auc.append(label_auc)

In [56]:
# Unweighted mean of the baseline's per-label AUCs, shown as the cell output.
mean_baseline_auc = np.mean(roc_auc)
mean_baseline_auc

0.5

In [183]:
# Learn a word-level bag-of-words vocabulary from the train+validation
# comments: text is lower-cased, English stop words are removed and the
# vocabulary is limited to 10000 features.
countvec = CountVectorizer(
    analyzer='word',
    lowercase=True,
    stop_words='english',
    max_features=10000,
)
countvec.fit(X_train_val['comment_text'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=10000, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [184]:
# Encode the train+validation and test comments with the fitted vectorizer
# (train+validation first, then test).
X_train_val_countvec, X_test_countvec = (
    countvec.transform(split['comment_text'])
    for split in (X_train_val, X_test)
)

In [None]:
def _fit_and_predict_proba(base_estimator):
    """Wrap `base_estimator` in BinaryRelevance, fit it on the train+validation
    count features and return the fitted wrapper together with its
    predict_proba output for the test count features."""
    model = BinaryRelevance(base_estimator)
    model.fit(X_train_val_countvec, y_train_val)
    return model, model.predict_proba(X_test_countvec)


# Logistic regression first, then Bernoulli naive Bayes, both with default
# hyper-parameters.
classifier_lr, predictions_lr = _fit_and_predict_proba(LogisticRegression())
classifier_nb, predictions_nb = _fit_and_predict_proba(BernoulliNB())

In [None]:
# Convert the sparse prediction outputs into plain NumPy arrays (with any
# length-1 axes squeezed away) so their columns can be sliced per label.
probabilities_lr = np.squeeze(predictions_lr.toarray())
probabilities_nb = np.squeeze(predictions_nb.toarray())

In [179]:
# Per-label ROC AUC for the logistic-regression scores: column i of
# probabilities_lr is evaluated against the test labels of target_cols[i].
roc_auc_lr = []
for i, col in enumerate(target_cols):
    label_scores = probabilities_lr[:, i]
    fpr, tpr, threshold = roc_curve(y_true=y_test[col], y_score=label_scores)
    roc_auc_lr.append(auc(x=fpr, y=tpr))

In [180]:
# Per-label ROC AUC for the naive-Bayes scores: column i of probabilities_nb
# is evaluated against the test labels of target_cols[i].
roc_auc_nb = []
for i, col in enumerate(target_cols):
    label_scores = probabilities_nb[:, i]
    fpr, tpr, threshold = roc_curve(y_true=y_test[col], y_score=label_scores)
    roc_auc_nb.append(auc(x=fpr, y=tpr))

In [181]:
# Unweighted mean of the logistic-regression per-label AUCs, shown as the cell output.
mean_auc_lr = np.mean(roc_auc_lr)
mean_auc_lr

0.7929808876340697

In [182]:
# Unweighted mean of the naive-Bayes per-label AUCs, shown as the cell output.
mean_auc_nb = np.mean(roc_auc_nb)
mean_auc_nb

0.7986508534870755