In [1]:
%matplotlib inline
%load_ext autoreload

In [2]:
import os
import sys

src_dir = os.path.join(os.getcwd(), os.pardir, 'src')
sys.path.append(src_dir)

In [4]:
import pandas as pd
import re
import string
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import log_loss, roc_auc_score, confusion_matrix
from sklearn.pipeline import Pipeline
%aimport nbsvm
from nbsvm import NbSvmClassifier

In [5]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

In [6]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [7]:
def tokenize(s):
    pattern = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
    return pattern.sub(r' \1 ', s).split()

In [8]:
X = train.comment_text.values
idx = np.arange(len(X))

In [9]:
train['comment_text'].fillna("unknown", inplace=True)
test['comment_text'].fillna("unknown", inplace=True)

In [10]:
X_train, X_test, idx_train, idx_test = train_test_split(X, idx, test_size=0.2)

In [11]:
tfidf = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize,
                        min_df=3, max_df=0.9, strip_accents='unicode',
                        use_idf=1, smooth_idf=1, sublinear_tf=1, stop_words='english')

In [12]:
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)

In [13]:
probs = np.zeros(shape=(len(X_test), 6))
for i, col in enumerate(train.columns[2:]):
    print("Training ", col)
    y_train = train.loc[idx_train, col].values
    y_test = train.loc[idx_test, col].values
    nbsvm = NbSvmClassifier()
    nbsvm.fit(X_train_vec, y_train)
    y_pred = nbsvm.predict_proba(X_test_vec)[:,1]
    probs[:, i] = y_pred
    auc = roc_auc_score(y_test, y_pred)
    lg_loss = log_loss(y_test, y_pred)
    print("{} auc: {}".format(col, auc))
    print("{} log loss: {}".format(col, lg_loss))

Training  toxic
toxic auc: 0.9750872968412777
toxic log loss: 0.10867338672620185
Training  severe_toxic
severe_toxic auc: 0.9860659021557826
severe_toxic log loss: 0.02707380385838993
Training  obscene
obscene auc: 0.990278792415265
obscene log loss: 0.05874267446166544
Training  threat
threat auc: 0.9869174469108342
threat log loss: 0.009703095452555967
Training  insult
insult auc: 0.9830332421226242
insult log loss: 0.07614472798260788
Training  identity_hate
identity_hate auc: 0.9821444098092523
identity_hate log loss: 0.025974602426269807


In [14]:
preds = pd.DataFrame()
preds['id'] = test.id

In [15]:
X_train_vec = tfidf.fit_transform(X)
X_test = test.comment_text.values
X_test_vec = tfidf.transform(X_test)

In [16]:
for col in train.columns[2:]:
    print("Training ", col)
    y_train = train.loc[:, col].values
    nbsvm = NbSvmClassifier(n_jobs=-1)
    nbsvm.fit(X_train_vec, y_train)
    preds[col] = nbsvm.predict_proba(X_test_vec)[:, 1]

Training  toxic


  " = {}.".format(self.n_jobs))


Training  severe_toxic
Training  obscene
Training  threat
Training  insult
Training  identity_hate


In [17]:
preds.to_csv('../submissions/nbsvm-baseline.csv', index=False)

In [20]:
sample = pd.read_csv('../data/sample_submission.csv').head()

In [19]:
pd.read_csv('../submissions/nbsvm-baseline.csv').head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.999961,0.089285,0.999784,0.005767,0.980014,0.309978
1,0000247867823ef7,0.015709,0.002279,0.007859,0.000439,0.010349,0.00137
2,00013b17ad220c46,0.020649,0.001613,0.009098,0.000367,0.008493,0.001062
3,00017563c3f7919a,0.00413,0.00097,0.003088,0.00068,0.003585,0.000906
4,00017695ad8997eb,0.027654,0.00126,0.004645,0.000483,0.00677,0.001086
