In [1]:
import pandas as pd
import numpy as np
import string
import time
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

def comments_preprocessing(df):
    df['comment_text'] = df['comment_text'].apply(lambda x: str(x))
    df['comment_text'] = df['comment_text'].apply(lambda x: x.lower())
    df['comment_text'] = df['comment_text'].apply(lambda x: x.translate(str.maketrans('','',string.punctuation)))
    df['comment_text'] = df['comment_text'].apply(lambda x: x.replace('\n',''))
    df['comment_text'] = df['comment_text'].apply(lambda x: x.replace('\r',''))
    df['comment_text'] = df['comment_text'].apply(lambda x: x.translate(str.maketrans('','',string.digits)))
    
    return df

train = pd.read_csv('data/train.csv')
train = comments_preprocessing(train)
test = pd.read_csv('data/test.csv')
test = comments_preprocessing(test)

print('train shape', train.shape)
print('test shape', test.shape)
train.head()
test.head()

categories = train.columns.values.tolist()[2:]
toxic_dataframes = [train[train[x] == 1] for x in categories]
print(categories)

train shape (95851, 8)
test shape (226998, 2)
['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']


In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

predictions = pd.DataFrame({'id': test['id']})
prob_predictions = pd.DataFrame({'id':test['id']})



for i, name in enumerate(categories):
    print(name, toxic_dataframes[i].shape)
    clf = Pipeline([('tfidf', TfidfVectorizer(sublinear_tf=True,analyzer='char', ngram_range=(1,4), max_features=20000, min_df=2)), ('LR', LogisticRegression(C=10.0, solver='sag', n_jobs=-1))])
    #param_grid = {'LR__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000] }
    clf.fit(train['comment_text'], train[name])
    #gs_clf = GridSearchCV(clf, param_grid, n_jobs=-1)
    #gs_clf.fit(train['comment_text'], train[name])
    pred = clf.predict(test['comment_text'])
    pred2 = clf.predict_proba(test['comment_text'])
    predictions[name] = pred
    prob_predictions[name] = pred2[:,1]
    #print(gs_clf.best_params_)

print('predictions shape', predictions.shape)
predictions.head(20)
print('prob_predictions shape', prob_predictions.shape)
prob_predictions.head(20)

toxic (9237, 8)
severe_toxic (965, 8)
obscene (5109, 8)
threat (305, 8)




insult (4765, 8)
identity_hate (814, 8)
predictions shape (226998, 7)
prob_predictions shape (226998, 7)


Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,6044863,0.004611,0.000486,0.005925,0.000143,0.001889,0.000216
1,6102620,0.001717,0.000638,0.003004,0.000214,0.002436,0.000659
2,14563293,0.000869,8.1e-05,0.000484,5.1e-05,0.000561,9.3e-05
3,21086297,0.118263,0.000676,0.002028,0.000316,0.002507,0.001171
4,22982444,0.000257,0.000222,0.000243,0.000451,0.000215,0.000161
5,24388733,9.5e-05,0.000189,0.000179,7.9e-05,1.6e-05,2.9e-05
6,26195914,0.000559,1.5e-05,0.001516,0.000191,0.000279,0.000241
7,31769073,0.00279,0.000255,0.000547,8.7e-05,0.000979,0.000602
8,35289443,0.027298,0.001202,0.028075,0.000342,0.067038,0.000273
9,38393350,4.4e-05,2.4e-05,0.000151,0.000286,0.000275,0.000123


In [3]:
print('amount marked true in each category')
for name in categories:
    total = predictions[name].sum()
    print(name, total)

amount marked true in each category
toxic 6184
severe_toxic 435
obscene 3260
threat 73
insult 2791
identity_hate 285


In [4]:
for name in categories:
    print('---', name.upper(), '---')
    cond = predictions[name] == True
    df = predictions[cond]
    count = 0
    for index, row in df.iterrows():
        print('- ', test[test['id'] == row['id']]['comment_text'].tolist()[0])
        count += 1
        if count > 5:
            break


--- TOXIC ---
-  stop being a foolif hes going to post up racist lies and claim hes following policy im going to call that fool feldspar what he is a racist fool
-  what right do you have to block the whites people including myself when you ethnic european and ethnic african descents are the most racist and belicuse creatures on my planet which planet you come from said chooyooo was rude and impolite than anything i wrote who or what ethnicity is he trying to insult with his ridiculous usertalkyou bizarre stubborn and desperate act of ethnic european ironfist censorship thats what it is will only succeed to increase my determination and drive to break the back of western racism paraphrase the negro or ethnic african doctor kings in the sif you continue to block whites as myself youre only succeeding in demonstrating your ethnic european desperate cunnings and or western despotism why do you show what kind blond yoke you are with your lowly blond perils and socalled white maggots
-  i w

In [5]:
# output to csv
prob_predictions.to_csv('lg_submission.csv', index=False)

In [6]:
with open('lg_submission.csv', 'rb') as f:
    print(f)

<_io.BufferedReader name='lg_submission.csv'>
