In [1]:
%matplotlib inline
%load_ext autoreload

In [2]:
import os
import sys

src_dir = os.path.join(os.getcwd(), os.pardir, 'src')
sys.path.append(src_dir)

In [4]:
import pandas as pd
import re
import string
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import log_loss, roc_auc_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.naive_bayes import MultinomialNB

In [5]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

In [35]:
train.shape

(159571, 8)

In [36]:
test.shape

(153164, 2)

In [6]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [7]:
train.columns[2:]

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

In [8]:
def tokenize(s):
    pattern = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
    return pattern.sub(r' \1 ', s).split()

In [10]:
train['comment_text'].fillna("unknown", inplace=True)
test['comment_text'].fillna("unknown", inplace=True)

In [9]:
X = train.comment_text.values
idx = np.arange(len(X))

In [11]:
X_train, X_test, idx_train, idx_test = train_test_split(X, idx, test_size=0.2)

In [12]:
tfidf = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize,
                        min_df=3, max_df=0.9, strip_accents='unicode',
                        use_idf=1, smooth_idf=1, sublinear_tf=1, stop_words='english')

In [13]:
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)

In [14]:
probs = np.zeros(shape=(len(X_test), 6))
for i, col in enumerate(train.columns[2:]):
    print("Training ", col)
    y_train = train.loc[idx_train, col].values
    y_test = train.loc[idx_test, col].values
    nb = MultinomialNB()
    nb.fit(X_train_vec, y_train)
    y_pred = nb.predict_proba(X_test_vec)[:,1]
    probs[:, i] = y_pred
    auc = roc_auc_score(y_test, y_pred)
    lg_loss = log_loss(y_test, y_pred)
    print("{} auc: {}".format(col, auc))
    print("{} log loss: {}".format(col, lg_loss))

Training  toxic
toxic auc: 0.9045449370040869
toxic log loss: 0.25907257022402497
Training  severe_toxic
severe_toxic auc: 0.9015865109088244
severe_toxic log loss: 0.07155859496510268
Training  obscene
obscene auc: 0.9036493893380705
obscene log loss: 0.18775410418218458
Training  threat
threat auc: 0.8218476450584689
threat log loss: 0.031428487614458905
Training  insult
insult auc: 0.8934371692909379
insult log loss: 0.20032718423225088
Training  identity_hate
identity_hate auc: 0.816352672040768
identity_hate log loss: 0.08138294618182527


# Full Version

In [22]:
X_vec = tfidf.fit_transform(X)

In [23]:
X_test_full = test.comment_text.values
X_test_full_vec = tfidf.transform(X_test_full)

In [38]:
train_probs = np.zeros(shape=(len(X), 6))
probs = np.zeros(shape=(len(X_test_full), 6))
for i, col in enumerate(train.columns[2:]):
    print("Training ", col)
    y = train.loc[:, col].values
    nb = MultinomialNB()
    nb.fit(X_vec, y)
    y_pred = nb.predict_proba(X_test_full_vec)[:,1]
    y_pred_train = nb.predict_proba(X_vec)[:,1]
    probs[:, i] = y_pred
    train_probs[:, i] = y_pred_train

Training  toxic
Training  severe_toxic
Training  obscene
Training  threat
Training  insult
Training  identity_hate


In [39]:
probs

array([[8.64126815e-01, 2.79122018e-05, 2.28309174e-01, 4.92949627e-07,
        8.34413892e-02, 3.30632191e-05],
       [2.96341008e-03, 2.05874197e-05, 7.86882549e-04, 1.68418789e-06,
        5.54108142e-04, 1.44910358e-05],
       [1.79275890e-02, 1.45812807e-04, 5.41318010e-03, 8.96626140e-06,
        3.74251017e-03, 8.72366659e-05],
       ...,
       [4.73906978e-05, 1.60980140e-08, 5.12136636e-06, 1.06145426e-09,
        3.40805073e-06, 2.04289137e-08],
       [3.14868177e-04, 9.99826604e-08, 2.28355167e-05, 9.95185721e-09,
        1.63887585e-05, 2.00304743e-07],
       [2.54685373e-02, 1.12739218e-06, 1.25760987e-03, 1.32915933e-07,
        6.98633692e-04, 1.67723511e-06]])

In [41]:
sample_submission = pd.read_csv('../data/sample_submission.csv')

In [51]:
features = train.drop('comment_text', axis=1)

In [52]:
sample_submission.iloc[:, 1:] = probs

In [53]:
sample_submission.to_csv('../submissions/naive_bayes.csv', index=False)

In [54]:
features.iloc[:, 1:] = train_probs

In [58]:
features.to_csv('../data/features/features_naive_bayes.csv', index=False)

In [59]:
features

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,5.620702e-04,1.432006e-07,4.491052e-05,1.436473e-08,3.505679e-05,1.161590e-07
1,000103f0d9cfb60f,1.121860e-04,1.040154e-07,1.416079e-05,1.052035e-08,9.193756e-06,8.438176e-08
2,000113f07ec002fd,1.814918e-03,1.644432e-07,1.399152e-04,1.133954e-08,9.087814e-05,1.558329e-07
3,0001b41b1c6bb37e,6.070643e-05,5.493114e-08,8.166827e-06,5.335631e-09,5.967435e-06,5.941972e-08
4,0001d958c54c6e35,1.034332e-02,1.547489e-05,1.911940e-03,2.863731e-06,1.583446e-03,1.785943e-05
5,00025465d4725e87,5.055774e-04,3.490369e-06,1.261156e-04,8.349567e-07,9.733848e-05,3.673594e-06
6,0002bcb3da6cb337,7.969113e-01,4.163987e-03,4.528069e-01,1.304982e-04,2.837519e-01,6.302855e-04
7,00031b1e95af7921,3.575774e-03,6.955148e-07,3.297472e-04,5.973536e-08,2.733917e-04,6.002205e-07
8,00037261f536c51d,6.256962e-04,8.099963e-08,4.059459e-05,1.275722e-08,2.469271e-05,1.155592e-07
9,00040093b2687caa,2.445870e-02,1.539329e-03,1.088458e-02,4.542491e-04,9.916890e-03,1.323108e-03


In [60]:
train

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0
