In [2]:
import pandas as pd
import numpy as np
import string
import time
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

In [3]:
def comments_preprocessing(df):
    df['comment_text'] = df['comment_text'].apply(lambda x: str(x))
    df['comment_text'] = df['comment_text'].apply(lambda x: x.lower())
    df['comment_text'] = df['comment_text'].apply(lambda x: x.translate(str.maketrans('','',string.punctuation)))
    df['comment_text'] = df['comment_text'].apply(lambda x: x.replace('\n',''))
    df['comment_text'] = df['comment_text'].apply(lambda x: x.replace('\r',''))
    return df

In [4]:
train = pd.read_csv('data/train.csv')
train = comments_preprocessing(train)
test = pd.read_csv('data/test.csv')
test = comments_preprocessing(test)

print('train shape', train.shape)
print('test shape', test.shape)
train.head()
test.head()

train shape (95851, 8)
test shape (226998, 2)


Unnamed: 0,id,comment_text
0,6044863,orphaned nonfree media image41cd1jboevl ss500 jpg
1,6102620,kentuckiana is colloquial even though the are...
2,14563293,hello fellow wikipediansi have just modified ...
3,21086297,akc suspensions the morning call feb 24 2001 ...
4,22982444,wikilink talkcelts


In [5]:
categories = train.columns.values.tolist()[2:]
toxic_dataframes = [train[train[x] == 1] for x in categories]
print(categories)

['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']


In [13]:
predictions = pd.DataFrame({'id': test['id']})
prob_predictions = pd.DataFrame({'id':test['id']})

for i, name in enumerate(categories):
    print(name, toxic_dataframes[i].shape)
    clf = Pipeline([('tfidf', TfidfVectorizer(stop_words='english')), ('mnb', MultinomialNB(alpha=1.0))])
    clf.fit(train['comment_text'], train[name])
    pred = clf.predict(test['comment_text'])
    pred2 = clf.predict_proba(test['comment_text'])
    predictions[name] = pred
    prob_predictions[name] = pred2.max(axis=1)

print('predictions shape', predictions.shape)
predictions.head(20)
print('prob_predictions shape', prob_predictions.shape)
prob_predictions.head(20)

toxic (9237, 8)
severe_toxic (965, 8)
obscene (5109, 8)
threat (305, 8)
insult (4765, 8)
identity_hate (814, 8)
predictions shape (226998, 7)
prob_predictions shape (226998, 7)


Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,6044863,0.996024,0.999721,0.997932,0.999919,0.998408,0.999783
1,6102620,0.999912,1.0,0.99999,1.0,0.999991,1.0
2,14563293,0.999995,1.0,1.0,1.0,1.0,1.0
3,21086297,0.983193,0.999541,0.995921,0.999863,0.996742,0.999653
4,22982444,0.975993,0.997222,0.986045,0.999116,0.986924,0.997658
5,24388733,0.998508,0.999962,0.999447,0.99999,0.999544,0.999967
6,26195914,0.999991,1.0,0.999999,1.0,0.999999,1.0
7,31769073,0.995183,0.999948,0.998262,0.999988,0.99876,0.999946
8,35289443,0.995577,0.999971,0.99902,0.999993,0.999163,0.999976
9,38393350,1.0,1.0,1.0,1.0,1.0,1.0


In [14]:
print(prob_predictions)

                  id     toxic  severe_toxic   obscene    threat    insult  \
0            6044863  0.996024      0.999721  0.997932  0.999919  0.998408   
1            6102620  0.999912      1.000000  0.999990  1.000000  0.999991   
2           14563293  0.999995      1.000000  1.000000  1.000000  1.000000   
3           21086297  0.983193      0.999541  0.995921  0.999863  0.996742   
4           22982444  0.975993      0.997222  0.986045  0.999116  0.986924   
5           24388733  0.998508      0.999962  0.999447  0.999990  0.999544   
6           26195914  0.999991      1.000000  0.999999  1.000000  0.999999   
7           31769073  0.995183      0.999948  0.998262  0.999988  0.998760   
8           35289443  0.995577      0.999971  0.999020  0.999993  0.999163   
9           38393350  1.000000      1.000000  1.000000  1.000000  1.000000   
10          51720630  0.998675      0.999999  0.999818  1.000000  0.999821   
11          52808210  0.998659      1.000000  0.999854  1.000000