# Comparing Word N-Grams with Char N-Grams

In [14]:
import re
import sys
import string
import time
import pickle
import pandas as pd
import numpy as np
import NbSvmClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import log_loss

In [2]:
# loading training data
train = pd.read_csv('data/train.csv')
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,22256635,"Nonsense? kiss off, geek. what I said is true...",1,0,0,0,0,0
1,27450690,"""\n\n Please do not vandalize pages, as you di...",0,0,0,0,0,0
2,54037174,"""\n\n """"Points of interest"""" \n\nI removed the...",0,0,0,0,0,0
3,77493077,Asking some his nationality is a Racial offenc...,0,0,0,0,0,0
4,79357270,The reader here is not going by my say so for ...,0,0,0,0,0,0


### Data Preprocessing

In [3]:
def comments_preprocessing(series):
    series.fillna("unknown", inplace=True)
    series = series.apply(lambda x: str(x))
    series = series.apply(lambda x: x.lower())
    # series = series.apply(lambda x: x.translate(str.maketrans('','',string.punctuation)))
    series = series.apply(lambda x: x.replace('\n',''))
    series = series.apply(lambda x: x.replace('\r',''))
    return series

In [4]:
train['comment_text'] = comments_preprocessing(train['comment_text'])

In [5]:
categories = train.columns.values.tolist()[2:]
toxic_dataframes = [train[train[x] == 1] for x in categories]
print(categories)

['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']


In [6]:
re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenize(s): 
    output = re_tok.sub(r' \1 ', s).split()
    return output

### Comparing Scores of Wgrams and Cgrams
#### Word N-Grams with Naive Bayes SVM 

In [17]:
word_vec = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize,
               min_df=3, max_df=0.9, strip_accents='unicode', use_idf=True,
               smooth_idf=True, sublinear_tf=True)
X_nbsvm = word_vec.fit_transform(train['comment_text'])
Y_nbsvm = train.drop(columns=['comment_text'])

In [18]:
scores_nbsvm = pd.DataFrame()
clf_nbsvm = NbSvmClassifier.NbSvmClassifier()
for i, column in enumerate(categories):
    print('scoring', column)
    scores_nbsvm[column] = -1 * cross_val_score(clf_nbsvm, X_nbsvm, Y_nbsvm[column], cv=5, scoring='neg_log_loss')
scores_nbsvm['avg'] = scores_nbsvm.mean(axis=1)
scores_nbsvm.head()

scoring toxic
scoring severe_toxic
scoring obscene
scoring threat
scoring insult
scoring identity_hate


Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,avg
0,0.11292,0.031116,0.060784,0.01241,0.07684,0.02839,0.053743
1,0.114074,0.031234,0.059799,0.012346,0.080078,0.027426,0.054159
2,0.110993,0.031741,0.065447,0.01009,0.077035,0.027725,0.053838
3,0.108864,0.028352,0.066117,0.01135,0.081266,0.02931,0.05421
4,0.107909,0.027271,0.061639,0.01136,0.079025,0.028637,0.05264


#### Character N-Grams with Logistic Regression

In [51]:
char_vec = TfidfVectorizer(sublinear_tf=True,analyzer='char', ngram_range=(1,4), 
                           max_features=20000, min_df=2)
X_chars = char_vec.fit_transform(train['comment_text'])
Y_chars = train.drop(columns=['comment_text'])

beginning regression
scoring toxic
scoring severe_toxic
scoring obscene
scoring threat
scoring insult
scoring identity_hate


Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,avg
0,0.1091,0.027835,0.055575,0.010072,0.074661,0.024292,0.050256
1,0.11033,0.025767,0.055634,0.01059,0.07396,0.022187,0.049745
2,0.107164,0.028056,0.062286,0.008883,0.076058,0.024113,0.051093
3,0.104959,0.02546,0.061643,0.010085,0.077827,0.024332,0.050718
4,0.107618,0.025626,0.058841,0.010132,0.07888,0.024859,0.050993


In [None]:
scores_chars = pd.DataFrame()
clf_chars = LogisticRegression(C=10.0, solver='sag', max_iter=200, n_jobs=1)
for column in categories:
    print('scoring', column)
    scores_chars[column] = -1 * cross_val_score(clf_chars, X_chars, Y_chars[column], cv=5, scoring='neg_log_loss')
scores_chars['avg'] = scores_chars.mean(axis=1)
scores_chars.head()

#### Comparing LogLoss Scores

In [80]:
mean_nbsvm = scores_nbsvm.mean()
mean_chars = scores_chars.mean()
mean_df = pd.DataFrame({'category': categories + ['overall'], 
                        'nbsvm': mean_nbsvm.tolist(),
                        'chars': mean_chars.tolist()})
mean_df

Unnamed: 0,category,chars,nbsvm
0,toxic,0.107834,0.109777
1,severe_toxic,0.026549,0.02969
2,obscene,0.058796,0.062111
3,threat,0.009952,0.011384
4,insult,0.076277,0.078019
5,identity_hate,0.023956,0.027819
6,overall,0.050561,0.053133


Cgrams outpreforms Wgrams in all categories. The difference is more noticable in the following categories: severe_toxic, obscene, and identity_hate.

### Differences between Wgrams and Cgrams (incomplete)

In [7]:
# creating train/test split 
X = train['comment_text']
y = train.drop(columns=['comment_text'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print('num in each category')
temp = pd.DataFrame()
for column in categories:
    temp[column] = [y_train[column].sum(), y_test[column].sum()]
temp.head()

num in each category


Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,7401,773,4083,249,3791,666
1,1836,192,1026,56,974,148


#### Word N-Grams with Naive Bayes SVM 

In [34]:
word_vec = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize,
               min_df=3, max_df=0.9, strip_accents='unicode', use_idf=True,
               smooth_idf=True, sublinear_tf=True)
X_train_nbsvm = word_vec.fit_transform(X_train)
X_test_nbsvm = word_vec.transform(X_test)

In [42]:
preds_nbsvm = pd.DataFrame({'id': y_test['id']})
clf_nbsvm = NbSvmClassifier.NbSvmClassifier()
for column in categories:
    clf_nbsvm.fit(X_train_nbsvm, y_train[column])
    preds_nbsvm[column] = clf_nbsvm.predict_proba(X_test_nbsvm)[:,1]
preds_nbsvm.describe()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
count,19171.0,19171.0,19171.0,19171.0,19171.0,19171.0,19171.0
mean,498822600000.0,0.091748,0.008298,0.049299,0.002029,0.045254,0.005603
std,286539800000.0,0.224168,0.058494,0.177519,0.027837,0.156093,0.042895
min,138560500.0,5.8e-05,0.000201,0.000545,0.000232,5.7e-05,0.000341
25%,250357300000.0,0.00937,0.001009,0.005055,0.000484,0.004823,0.001257
50%,498275300000.0,0.018212,0.001293,0.007156,0.000562,0.007604,0.001543
75%,745108900000.0,0.038121,0.001707,0.010806,0.000719,0.012904,0.002069
max,999988200000.0,1.0,0.99736,1.0,0.999248,1.0,0.99997


In [36]:
# scoring
score_nbsvm = pd.DataFrame()
for column in categories:
    score_nbsvm[column] = [log_loss(y_test[column], preds_nbsvm[column])]
score_nbsvm['overall'] = score_nbsvm.mean(axis=1)
score_nbsvm.head()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,overall
0,0.107459,0.028484,0.057845,0.011196,0.080018,0.026509,0.051918


#### Character N-Grams with Logistic Regression

In [37]:
char_vec = TfidfVectorizer(sublinear_tf=True,analyzer='char', ngram_range=(1,4), 
                           max_features=20000, min_df=2)
X_train_chars = char_vec.fit_transform(X_train)
X_test_chars = char_vec.transform(X_test)

In [41]:
preds_chars = pd.DataFrame({'id': y_test['id']})
clf_chars = LogisticRegression(C=10.0, solver='sag', max_iter=200, n_jobs=1)
for column in categories:
    clf_chars.fit(X_train_chars, y_train[column])
    preds_chars[column] = clf_chars.predict_proba(X_test_chars)[:,1]
preds_chars.describe()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
count,19171.0,19171.0,19171.0,19171.0,19171.0,19171.0,19171.0
mean,498822600000.0,0.094772,0.009089,0.05002,0.002639,0.046572,0.006918
std,286539800000.0,0.243265,0.062058,0.185726,0.030532,0.167522,0.05175
min,138560500.0,1e-05,6e-06,1.7e-05,3e-06,8e-06,5e-06
25%,250357300000.0,0.001381,0.00018,0.000934,9.6e-05,0.000741,0.000189
50%,498275300000.0,0.005403,0.000462,0.002488,0.000231,0.002255,0.000493
75%,745108900000.0,0.027516,0.001288,0.007982,0.000632,0.008614,0.001511
max,999988200000.0,1.0,0.974806,1.0,0.994349,0.999998,0.99964


In [40]:
# scoring
score_chars = pd.DataFrame()
for column in categories:
    score_chars[column] = [log_loss(y_test[column], preds_chars[column])]
score_chars['overall'] = score_chars.mean(axis=1)
score_chars.head()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,overall
0,0.106328,0.026515,0.055661,0.009578,0.080722,0.023537,0.05039


#### Ratio Combination of Wgrams and Cgrams

In [85]:
ratio = 2/5
preds_combo = pd.DataFrame({'id': y_test['id']})
for column in categories:
    preds_combo[column] = preds_nbsvm[column] * ratio + preds_chars[column] * (1 - ratio)

In [86]:
# scoring
score_combo = pd.DataFrame()
for column in categories:
    score_combo[column] = [log_loss(y_test[column], preds_combo[column])]
score_combo['overall'] = score_combo.mean(axis=1)
score_combo.head()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,overall
0,0.099451,0.025654,0.052231,0.009662,0.075705,0.023157,0.047643


#### Performance Comparison

In [125]:
# if both predictions were wrong 
wrong_ids = {'both wrong, ans=0': set(),
             'both wrong, ans=1': set(),
             'nbsvm wrong, ans=0': set(),
             'nbsvm wrong, ans=1': set(),
             'chars wrong, ans=0': set(),
             'chars wrong, ans=1': set()}
performance_df = pd.DataFrame(columns=['a category', 'total wrong', 
                                       'both wrong', 'one wrong', 
                                       'both wrong, ans=0', 'both wrong, ans=1',
                                       'nbsvm wrong, ans=0', 'nbsvm wrong, ans=1', 
                                       'chars wrong, ans=0', 'chars wrong, ans=1'])
for column in categories:
    temp_df = pd.DataFrame({'nbsvm': preds_nbsvm[column], 
                            'chars': preds_chars[column]})
    total_count = 0
    one_wrong_count = 0
    nbsvm_wrong_true_count = 0
    nbsvm_wrong_false_count = 0
    chars_wrong_true_count = 0
    chars_wrong_false_count = 0
    both_wrong_count = 0
    both_wrong_true_count = 0
    both_wrong_false_count = 0
    for i, row in temp_df.iterrows():
        answer = y_test.loc[[i]][column][i]
        dif_nbsvm = abs(row['nbsvm'] - answer)
        dif_chars = abs(row['chars'] - answer)
        if dif_nbsvm > 0.5 and dif_chars > 0.5:
            both_wrong_count += 1
            if answer > 0.5:
                both_wrong_true_count += 1
                wrong_ids['both wrong, ans=1'].add(i)
            else:
                both_wrong_false_count += 1
                wrong_ids['both wrong, ans=0'].add(i)
        if abs(row['chars'] - row['nbsvm']) > 0.5:
            one_wrong_count += 1
            if dif_nbsvm > dif_chars:
                if answer > 0.5:
                    nbsvm_wrong_true_count += 1
                    wrong_ids['nbsvm wrong, ans=1'].add(i)
                else:
                    nbsvm_wrong_false_count += 1
                    wrong_ids['nbsvm wrong, ans=0'].add(i)
            else:
                if answer > 0.5:
                    chars_wrong_true_count += 1
                    wrong_ids['chars wrong, ans=1'].add(i)
                else:
                    chars_wrong_false_count += 1
                    wrong_ids['chars wrong, ans=0'].add(i)
        # print(row['chars'], row['nbsvm'], y_test.index.get_loc(i))
    total_count = one_wrong_count + both_wrong_count
    col_df = pd.DataFrame({'a category': [column], 
                           'total wrong': [total_count],
                           'both wrong': [both_wrong_count],
                           'one wrong': [one_wrong_count],
                           'both wrong, ans=0': [both_wrong_false_count],
                           'both wrong, ans=1': [both_wrong_true_count],
                           'nbsvm wrong, ans=0': [nbsvm_wrong_false_count],
                           'nbsvm wrong, ans=1': [nbsvm_wrong_true_count],
                           'chars wrong, ans=0': [chars_wrong_false_count],
                           'chars wrong, ans=1': [chars_wrong_true_count]})
    performance_df = performance_df.append(col_df)
performance_df

Unnamed: 0,a category,both wrong,"both wrong, ans=0","both wrong, ans=1","chars wrong, ans=0","chars wrong, ans=1","nbsvm wrong, ans=0","nbsvm wrong, ans=1",one wrong,total wrong
0,toxic,532,64,468,47,16,7,80,150,682
0,severe_toxic,151,24,127,9,4,2,6,21,172
0,obscene,290,34,256,12,14,3,37,66,356
0,threat,45,5,40,0,0,0,2,2,47
0,insult,452,84,368,14,9,6,32,61,513
0,identity_hate,111,8,103,5,1,0,9,15,126


to be continued

### Using trained model to predict test data (incomplete)

In [9]:
# load prediction dataframes
char_pred = pickle.load(open('pickles/predictions_char_grams.p', 'rb'))
word_pred = pickle.load(open('pickles/predictions_word_grams.p', 'rb'))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


In [10]:
char_pred.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,6044863,0.004611,0.000486,0.005925,0.000143,0.001889,0.000216
1,6102620,0.001717,0.000638,0.003004,0.000214,0.002436,0.000659
2,14563293,0.000869,8.1e-05,0.000484,5.1e-05,0.000561,9.3e-05
3,21086297,0.118261,0.000676,0.002028,0.000316,0.002507,0.001171
4,22982444,0.000257,0.000222,0.000243,0.000451,0.000215,0.000161


In [11]:
word_pred.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,6044863,0.012396,0.001535,0.005828,0.000462,0.005548,0.001601
1,6102620,0.008652,0.000812,0.00617,0.000401,0.004417,0.001327
2,14563293,0.00381,0.001084,0.003675,0.000427,0.003168,0.00093
3,21086297,0.037713,0.002595,0.011549,0.00054,0.011352,0.001166
4,22982444,0.010104,0.00205,0.00567,0.000501,0.003938,0.001761


In [32]:
# average
average = char_pred / 3 + word_pred * 2 / 3
average.to_csv('submissions/average_word_char_grams.csv', index=False)
average.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,6044863.0,0.009801,0.001185,0.00586,0.000356,0.004328,0.001139
1,6102620.0,0.00634,0.000754,0.005114,0.000338,0.003756,0.001104
2,14563293.0,0.00283,0.00075,0.002611,0.000302,0.002299,0.000651
3,21086297.0,0.064562,0.001956,0.008375,0.000465,0.008404,0.001168
4,22982444.0,0.006822,0.001441,0.003861,0.000484,0.002697,0.001228


In [20]:
# drop id axis
char_in = char_pred.drop(columns=['id'], axis=1).as_matrix()
word_in = word_pred.drop(columns=['id'], axis=1).as_matrix()
print(char_in.shape)
print(word_in.shape)

(226998, 6)
(226998, 6)


In [28]:
# create combined input
combined_in = np.stack((char_in, word_in), axis=2)
print(combined_in.shape)
combined_in

(226998, 6, 2)


array([[[  4.61145227e-03,   1.23960687e-02],
        [  4.85827058e-04,   1.53512666e-03],
        [  5.92546642e-03,   5.82782948e-03],
        [  1.42705090e-04,   4.62361831e-04],
        [  1.88851363e-03,   5.54815478e-03],
        [  2.16290094e-04,   1.60091793e-03]],

       [[  1.71722187e-03,   8.65170368e-03],
        [  6.37835703e-04,   8.12495562e-04],
        [  3.00379929e-03,   6.16983731e-03],
        [  2.13747812e-04,   4.00730180e-04],
        [  2.43562093e-03,   4.41653520e-03],
        [  6.59349177e-04,   1.32653059e-03]],

       [[  8.69225169e-04,   3.81006830e-03],
        [  8.09007869e-05,   1.08449343e-03],
        [  4.83937949e-04,   3.67478586e-03],
        [  5.07653878e-05,   4.27094120e-04],
        [  5.60740597e-04,   3.16821045e-03],
        [  9.27498159e-05,   9.29882111e-04]],

       ..., 
       [[  1.20054532e-04,   1.14204496e-02],
        [  1.76249233e-04,   2.19099957e-03],
        [  7.76473994e-04,   7.36185328e-03],
        [  2.99

to be continued