In [1]:
import nltk
import sys
import time
import numpy as np
import pandas as pd
import sklearn.feature_extraction.text as text
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn import svm
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from IPython.display import display

In [2]:
data = pd.read_csv('../data/train.csv')
data

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0


In [3]:
data = pd.read_csv('../data/train.csv')
sample = data.sample(frac=0.5)
sample.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
102791,2615cdb92fed6a59,"Pelasgian , pre-greek origin of Zeus \n\nHomer...",0,0,0,0,0,0
111713,559a1b26f6c25d95,"""\n\n I really need clarification!!!!!!!!!!! \...",0,0,0,0,0,0
90305,f1aca9fe42e59381,"""Ahoerstemeier|andy]] 17:23, 12 Feb 2005 (UTC)...",0,0,0,0,0,0
75673,ca7a8203fd38fe97,By distruptive you literally just mean 'disagr...,0,0,0,0,0,0
46877,7d48af38b2f62cfd,"Bot \n\nHi there, 2 things\n\n how do you make...",0,0,0,0,0,0


In [4]:
TARGET_LABELS = ['toxic', 'severe_toxic', 'obscene', 'threat','insult',	'identity_hate']

In [5]:
sample[TARGET_LABELS].values

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [6]:
sample.describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
count,79786.0,79786.0,79786.0,79786.0,79786.0,79786.0
mean,0.096671,0.010064,0.054471,0.002933,0.050974,0.008761
std,0.295511,0.099816,0.226946,0.054077,0.219946,0.09319
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0


In [7]:
import re, string
re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenize(s): return re_tok.sub(r' \1 ', s).split()

In [67]:
tf = text.TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize,
               min_df=3, max_df=0.9, strip_accents='unicode', use_idf=1,
               smooth_idf=1, sublinear_tf=1 )
X = tf.fit_transform(sample['comment_text'])

In [68]:
tf.vocabulary_

{'pelasgian': 163366,
 ',': 6805,
 'pre': 168472,
 '-': 10475,
 'greek': 102882,
 'origin': 159019,
 'of': 150798,
 'zeus': 244951,
 'homer': 110417,
 'refers': 176790,
 'to': 218675,
 'him': 108826,
 'as': 46958,
 '.': 12847,
 'why': 236584,
 'there': 214552,
 'is': 120231,
 'no': 146317,
 'mention': 137799,
 'the': 206558,
 'much': 141853,
 'earlier': 83662,
 'than': 204528,
 'arrival': 46091,
 'tribes': 223096,
 'in': 114566,
 'nowadays': 149720,
 'greece': 102864,
 '?': 23667,
 ', pre': 9381,
 'pre -': 168473,
 '- greek': 11484,
 'greek origin': 102924,
 'origin of': 159032,
 'of zeus': 154189,
 'refers to': 176793,
 'to him': 219870,
 'him as': 108857,
 'zeus .': 244953,
 '. why': 16032,
 'why there': 236683,
 'there is': 214632,
 'is no': 121262,
 'no mention': 146623,
 'mention of': 137830,
 'of the': 153822,
 '. much': 14902,
 'much earlier': 141907,
 'earlier than': 83686,
 'than the': 204746,
 'the arrival': 207079,
 'arrival of': 46093,
 'tribes in': 223102,
 'greece ?': 102

In [69]:
len(tf.vocabulary_)

246309

In [80]:
%%time
X_one = tf.transform(pd.Series(data=['Abasda fuck asdasd']))

CPU times: user 1.7 ms, sys: 504 µs, total: 2.2 ms
Wall time: 1.38 ms


In [44]:
sample['comment_text']

102791    Pelasgian , pre-greek origin of Zeus \n\nHomer...
111713    "\n\n I really need clarification!!!!!!!!!!! \...
90305     "Ahoerstemeier|andy]] 17:23, 12 Feb 2005 (UTC)...
75673     By distruptive you literally just mean 'disagr...
46877     Bot \n\nHi there, 2 things\n\n how do you make...
145117    Thank you for experimenting with the page Bob ...
5031                   Thanks for the detailed explanation.
10931     Yeah, you're right. \n\nIt was just a dumb mis...
97166     "\n\n Now almost certain it is the same person...
29260     Lil Faggit  \n\nYou are a little bitch. I fuck...
17985              pathetic shitty arse mr-z u fucking cunt
57956     I suspect that they are actually the same person.
93797     The other guy is the one at fault.  I'm sure i...
13791     aircraft assessments \n\nThe aircraft articles...
158740    Yes, I saw dear old Slim Virgin on the war pat...
124334    BerksGuy \n\nI'm want you to know that if this...
120723    "\n\nAgain, arguing about some

In [15]:
y = sample[TARGET_LABELS].values

In [9]:
type(sample['comment_text'])

pandas.core.series.Series

In [16]:
(X_train, X_test, y_train, y_test) = train_test_split(X, y, test_size=.2)

# Naive Bayes

In [20]:
%%time
bnb = GridSearchCV(
    BernoulliNB(),
    param_grid={'alpha': np.logspace(-2., 2., 50)})
bnb.fit(X_train, y_train)



ValueError: bad input shape (42552, 6)

In [13]:
confusion_matrix(bnb.predict(X_test), y_test)

NotFittedError: This GridSearchCV instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.

In [19]:
bnb.score(X_test)

NotFittedError: This GridSearchCV instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.

# Multinomial Naive Bayes

In [21]:
%%time
mnb = GridSearchCV(
    MultinomialNB(),
    param_grid={'alpha': np.logspace(-2., 2., 50)})
mnb.fit(X_train, y_train)

ValueError: bad input shape (42552, 6)

In [None]:
confusion_matrix(mnb.predict(X_test), y_test)

In [None]:
f1_score(mnb.predict(X_test), y_test)

# SVM Linear

In [24]:
%%time
import random
random.seed(6)
# svm_svc = svm.LinearSVC()
svm_svc = OneVsRestClassifier(svm.LinearSVC())
svm_svc.fit(X_train, y_train)

CPU times: user 4.07 s, sys: 252 ms, total: 4.33 s
Wall time: 3.3 s


In [None]:
confusion_matrix(svm_svc.predict(X_test), y_test)

In [26]:
svm_svc.score(X_test, y_test)

0.9185361574132097

# Logistic Regression

In [None]:
%%time
log_reg = GridSearchCV(
    LogisticRegression(),
    param_grid={'C': [5, 10, 15, 20, 30]})
log_reg.fit(X_train, y_train)

In [None]:
log_reg.best_params_

In [None]:
confusion_matrix(log_reg.predict(X_test), y_test)

In [None]:
f1_score(log_reg.predict(X_test), y_test)

# Naive Bayes SVM

In [11]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted
from sklearn.linear_model import LogisticRegression
from scipy import sparse
class NbSvmClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, C=1.0, dual=False, n_jobs=1):
        self.C = C
        self.dual = dual
        self.n_jobs = n_jobs

    def predict(self, x):
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict(x.multiply(self._r))

    def predict_proba(self, x):
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict_proba(x.multiply(self._r))

    def fit(self, x, y):
        # Check that X and y have correct shape
        x, y = check_X_y(x, y, accept_sparse=True)

        def pr(x, y_i, y):
            p = x[y==y_i].sum(0)
            return (p+1) / ((y==y_i).sum()+1)

        self._r = sparse.csr_matrix(np.log(pr(x,1,y) / pr(x,0,y)))
        x_nb = x.multiply(self._r)
        self._clf = LogisticRegression(C=self.C, dual=self.dual, n_jobs=self.n_jobs).fit(x_nb, y)
        return self

In [None]:
y

In [17]:
%%time
nbsvm = OneVsRestClassifier(NbSvmClassifier(C=4, dual=True, n_jobs=-1))
nbsvm.fit(X_train, y_train)

  " = {}.".format(effective_n_jobs(self.n_jobs)))


CPU times: user 29.9 s, sys: 447 ms, total: 30.4 s
Wall time: 22.4 s


In [18]:
confusion_matrix(nbsvm.predict(X_test), y_test)

ValueError: multilabel-indicator is not supported

In [75]:
X_one

<1x246309 sparse matrix of type '<class 'numpy.float64'>'
	with 0 stored elements in Compressed Sparse Row format>

In [37]:
X_test

<15958x246309 sparse matrix of type '<class 'numpy.float64'>'
	with 1732492 stored elements in Compressed Sparse Row format>

In [76]:
sparse.vstack([X_one, X_test])

<15959x246309 sparse matrix of type '<class 'numpy.float64'>'
	with 1732492 stored elements in Compressed Sparse Row format>

In [81]:
nbsvm.predict(X_one)

array([[1, 1, 1, 0, 1, 0]])

In [None]:
nbsvm.predict(pd.Series(['']))

In [53]:
nbsvm.score(X_test, y_test)

0.9214187241508961

In [56]:
X_test[0]

<1x245186 sparse matrix of type '<class 'numpy.float64'>'
	with 109 stored elements in Compressed Sparse Row format>

In [54]:
nbsvm.predict('Fuck you')

AttributeError: 'str' object has no attribute 'multiply'

In [44]:
f1_score(nbsvm.predict(X_test), y_test)

ValueError: Target is multilabel-indicator but average='binary'. Please choose another average setting.

In [None]:
%%time
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=200, n_jobs=-1)
rf.fit(X_train, y_train)

In [None]:
f1_score(rf.predict(X_test), y_test)