In [1]:
import re
import sys
import string
import time
import pickle
import pandas as pd
import numpy as np
import NbSvmClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

In [2]:
# loading training data
train = pd.read_csv('data/train.csv')
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,22256635,"Nonsense? kiss off, geek. what I said is true...",1,0,0,0,0,0
1,27450690,"""\n\n Please do not vandalize pages, as you di...",0,0,0,0,0,0
2,54037174,"""\n\n """"Points of interest"""" \n\nI removed the...",0,0,0,0,0,0
3,77493077,Asking some his nationality is a Racial offenc...,0,0,0,0,0,0
4,79357270,The reader here is not going by my say so for ...,0,0,0,0,0,0


### Data preprocessing

In [3]:
def comments_preprocessing(series):
    series.fillna("unknown", inplace=True)
    series = series.apply(lambda x: str(x))
    series = series.apply(lambda x: x.lower())
    # series = series.apply(lambda x: x.translate(str.maketrans('','',string.punctuation)))
    series = series.apply(lambda x: x.replace('\n',''))
    series = series.apply(lambda x: x.replace('\r',''))
    return series

In [4]:
train['comment_text'] = comments_preprocessing(train['comment_text'])

In [5]:
categories = train.columns.values.tolist()[2:]
toxic_dataframes = [train[train[x] == 1] for x in categories]
print(categories)

['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']


In [6]:
re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenize(s): 
    output = re_tok.sub(r' \1 ', s).split()
    return output

### Naive Bayes SVM (TF-IDF trained with all text)

In [17]:
word_vec = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize,
               min_df=3, max_df=0.9, strip_accents='unicode', use_idf=True,
               smooth_idf=True, sublinear_tf=True)
X_nbsvm = word_vec.fit_transform(train['comment_text'])
Y_nbsvm = train.drop(columns=['comment_text'])

In [18]:
scores_nbsvm = pd.DataFrame()
clf_nbsvm = NbSvmClassifier.NbSvmClassifier()
for i, column in enumerate(categories):
    print('scoring', column)
    scores_nbsvm[column] = -1 * cross_val_score(clf_nbsvm, X_nbsvm, Y_nbsvm[column], cv=5, scoring='neg_log_loss')
scores_nbsvm['avg'] = scores_nbsvm.mean(axis=1)
scores_nbsvm.head()

scoring toxic
scoring severe_toxic
scoring obscene
scoring threat
scoring insult
scoring identity_hate


Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,avg
0,0.11292,0.031116,0.060784,0.01241,0.07684,0.02839,0.053743
1,0.114074,0.031234,0.059799,0.012346,0.080078,0.027426,0.054159
2,0.110993,0.031741,0.065447,0.01009,0.077035,0.027725,0.053838
3,0.108864,0.028352,0.066117,0.01135,0.081266,0.02931,0.05421
4,0.107909,0.027271,0.061639,0.01136,0.079025,0.028637,0.05264


### Character N-Grams (TF-IDF trained with train's comments only)

In [12]:
scores_chars = pd.DataFrame()
X_chars = train['comment_text']
Y_chars = train.drop(columns=['comment_text'])
clf_chars = Pipeline([('tfidf', TfidfVectorizer(sublinear_tf=True,analyzer='char', ngram_range=(1,4), 
                                                max_features=20000, min_df=2)), 
                      ('LR', LogisticRegression(C=10.0, solver='sag', max_iter=200, n_jobs=1))])
for column in categories:
    print('scoring', column)
    scores_chars[column] = -1 * cross_val_score(clf_chars, X_chars, Y_chars[column], cv=5, scoring='neg_log_loss')
scores_chars['avg'] = scores_chars.mean(axis=1)
scores_chars.head()

scoring toxic
scoring severe_toxic
scoring obscene
scoring threat




scoring insult
scoring identity_hate




Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,avg
0,0.109293,0.027826,0.055597,0.010075,0.074861,0.024372,0.050337
1,0.110451,0.025755,0.055782,0.010618,0.07395,0.02223,0.049797
2,0.107113,0.027857,0.062183,0.008863,0.075983,0.02391,0.050985
3,0.105024,0.025472,0.061823,0.01008,0.077696,0.024324,0.050737
4,0.107918,0.025592,0.059196,0.010148,0.079212,0.024973,0.051173


In [None]:
char_vec = TfidfVectorizer(sublinear_tf=True,analyzer='char', ngram_range=(1,4), 
                          max_features=20000, min_df=2)
X_chars = char_vec.fit_transform(train['comment_text'])
Y_chars = train.drop(columns=['comment_text'])

print('beginning regression')
scores_chars = pd.DataFrame()
clf_chars = LogisticRegression(C=10.0, solver='sag', max_iter=200, n_jobs=1)
for column in categories:
    print('scoring', column)
    scores_chars[column] = -1 * cross_val_score(clf_chars, X_chars, Y_chars[column], cv=5, scoring='neg_log_loss')
scores_chars['avg'] = scores_chars.mean(axis=1)
scores_chars.head()

beginning regression


In [48]:
mean_nbsvm = scores_nbsvm.mean()
mean_chars = scores_chars.mean()
mean_df = pd.DataFrame({'category': pd.Series(categories), 
                        'nbsvm': pd.Series(mean_nbsvm.tolist()[:6]),
                        'chars': pd.Series(mean_chars.tolist()[:6])})
mean_df.head(7)

Unnamed: 0,category,chars,nbsvm
0,toxic,0.10796,0.109777
1,severe_toxic,0.0265,0.02969
2,obscene,0.058916,0.062111
3,threat,0.009957,0.011384
4,insult,0.07634,0.078019
5,identity_hate,0.023962,0.027819


### Using trained model to predict test data

In [9]:
# load prediction dataframes
char_pred = pickle.load(open('pickles/predictions_char_grams.p', 'rb'))
word_pred = pickle.load(open('pickles/predictions_word_grams.p', 'rb'))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


In [10]:
char_pred.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,6044863,0.004611,0.000486,0.005925,0.000143,0.001889,0.000216
1,6102620,0.001717,0.000638,0.003004,0.000214,0.002436,0.000659
2,14563293,0.000869,8.1e-05,0.000484,5.1e-05,0.000561,9.3e-05
3,21086297,0.118261,0.000676,0.002028,0.000316,0.002507,0.001171
4,22982444,0.000257,0.000222,0.000243,0.000451,0.000215,0.000161


In [11]:
word_pred.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,6044863,0.012396,0.001535,0.005828,0.000462,0.005548,0.001601
1,6102620,0.008652,0.000812,0.00617,0.000401,0.004417,0.001327
2,14563293,0.00381,0.001084,0.003675,0.000427,0.003168,0.00093
3,21086297,0.037713,0.002595,0.011549,0.00054,0.011352,0.001166
4,22982444,0.010104,0.00205,0.00567,0.000501,0.003938,0.001761


In [32]:
# average
average = char_pred / 3 + word_pred * 2 / 3
average.to_csv('submissions/average_word_char_grams.csv', index=False)
average.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,6044863.0,0.009801,0.001185,0.00586,0.000356,0.004328,0.001139
1,6102620.0,0.00634,0.000754,0.005114,0.000338,0.003756,0.001104
2,14563293.0,0.00283,0.00075,0.002611,0.000302,0.002299,0.000651
3,21086297.0,0.064562,0.001956,0.008375,0.000465,0.008404,0.001168
4,22982444.0,0.006822,0.001441,0.003861,0.000484,0.002697,0.001228


In [20]:
# drop id axis
char_in = char_pred.drop(columns=['id'], axis=1).as_matrix()
word_in = word_pred.drop(columns=['id'], axis=1).as_matrix()
print(char_in.shape)
print(word_in.shape)

(226998, 6)
(226998, 6)


In [28]:
# create combined input
combined_in = np.stack((char_in, word_in), axis=2)
print(combined_in.shape)
combined_in

(226998, 6, 2)


array([[[  4.61145227e-03,   1.23960687e-02],
        [  4.85827058e-04,   1.53512666e-03],
        [  5.92546642e-03,   5.82782948e-03],
        [  1.42705090e-04,   4.62361831e-04],
        [  1.88851363e-03,   5.54815478e-03],
        [  2.16290094e-04,   1.60091793e-03]],

       [[  1.71722187e-03,   8.65170368e-03],
        [  6.37835703e-04,   8.12495562e-04],
        [  3.00379929e-03,   6.16983731e-03],
        [  2.13747812e-04,   4.00730180e-04],
        [  2.43562093e-03,   4.41653520e-03],
        [  6.59349177e-04,   1.32653059e-03]],

       [[  8.69225169e-04,   3.81006830e-03],
        [  8.09007869e-05,   1.08449343e-03],
        [  4.83937949e-04,   3.67478586e-03],
        [  5.07653878e-05,   4.27094120e-04],
        [  5.60740597e-04,   3.16821045e-03],
        [  9.27498159e-05,   9.29882111e-04]],

       ..., 
       [[  1.20054532e-04,   1.14204496e-02],
        [  1.76249233e-04,   2.19099957e-03],
        [  7.76473994e-04,   7.36185328e-03],
        [  2.99