In [6]:
import pandas as pd
import numpy as np
import string
import time
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

def comments_preprocessing(df):
    df['comment_text'] = df['comment_text'].apply(lambda x: str(x))
    df['comment_text'] = df['comment_text'].apply(lambda x: x.lower())
    df['comment_text'] = df['comment_text'].apply(lambda x: x.translate(str.maketrans('','',string.punctuation)))
    df['comment_text'] = df['comment_text'].apply(lambda x: x.replace('\n',''))
    df['comment_text'] = df['comment_text'].apply(lambda x: x.replace('\r',''))
    return df

train = pd.read_csv('data/train.csv')
train = comments_preprocessing(train)
test = pd.read_csv('data/test.csv')
test = comments_preprocessing(test)

print('train shape', train.shape)
print('test shape', test.shape)
train.head()
test.head()

categories = train.columns.values.tolist()[2:]
toxic_dataframes = [train[train[x] == 1] for x in categories]
print(categories)

train shape (95851, 8)
test shape (226998, 2)
['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']


In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

predictions = pd.DataFrame({'id': test['id']})
prob_predictions = pd.DataFrame({'id':test['id']})



for i, name in enumerate(categories):
    print(name, toxic_dataframes[i].shape)
    clf = Pipeline([('tfidf', TfidfVectorizer(stop_words='english')), ('LR', LogisticRegression(C=10.0, solver='sag', n_jobs=-1))])
    #param_grid = {'LR__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000] }
    clf.fit(train['comment_text'], train[name])
    #gs_clf = GridSearchCV(clf, param_grid, n_jobs=-1)
    #gs_clf.fit(train['comment_text'], train[name])
    pred = clf.predict(test['comment_text'])
    pred2 = clf.predict_proba(test['comment_text'])
    predictions[name] = pred
    prob_predictions[name] = pred2[:,1]
    #print(gs_clf.best_params_)

print('predictions shape', predictions.shape)
predictions.head(20)
print('prob_predictions shape', prob_predictions.shape)
prob_predictions.head(20)

toxic (9237, 8)
severe_toxic (965, 8)
obscene (5109, 8)
threat (305, 8)
insult (4765, 8)
identity_hate (814, 8)
predictions shape (226998, 7)
prob_predictions shape (226998, 7)


Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,6044863,0.012599,0.003571,0.016542,0.001034,0.006584,0.001763
1,6102620,0.008636,0.000365,0.000979,0.000341,0.006334,0.000945
2,14563293,0.001118,0.000631,0.001094,0.000333,0.002492,0.000296
3,21086297,0.093734,0.002331,0.003511,0.000768,0.002744,0.001235
4,22982444,0.019767,0.003109,0.008489,0.001067,0.010722,0.002712
5,24388733,0.006987,0.001229,0.003981,0.000625,0.003885,0.001297
6,26195914,0.000662,0.000961,0.001181,0.00027,0.000916,0.000851
7,31769073,0.002864,0.000654,0.006189,0.000442,0.003683,0.001882
8,35289443,0.025662,0.003174,0.006336,0.002329,0.016744,0.001156
9,38393350,0.00021,9.8e-05,0.000653,6.2e-05,0.000579,0.000135


In [8]:
print('amount marked true in each category')
for name in categories:
    total = predictions[name].sum()
    print(name, total)

amount marked true in each category
toxic 5806
severe_toxic 421
obscene 3108
threat 88
insult 2577
identity_hate 227


In [9]:
for name in categories:
    print('---', name.upper(), '---')
    cond = predictions[name] == True
    df = predictions[cond]
    count = 0
    for index, row in df.iterrows():
        print('- ', test[test['id'] == row['id']]['comment_text'].tolist()[0])
        count += 1
        if count > 5:
            break


--- TOXIC ---
-  stop being a foolif hes going to post up racist lies and claim hes following policy im going to call that fool feldspar what he is a racist fool
-  i wish the euro74rash would stop fucking up jewish articles only jews should be allowed to edit jewish articles because the fucking antisemites who run wikipedia cant be trustedjust look at the garbage this dutchman is doing httpenwikipediaorgwindexphptitlekatzrindiff607502596oldid607494715
-  unblockmaterial scientist isa dum dum and so is pbass607 yolo swag they are both gay with each other
-  i reckon you should die
-  your agendayou are the one going around assuming peoples ethnicity who are you to say this is this and that is that im on to your racist agenda
-  lmao what weak threats i havent been banned yet bitch go vandalism lmao what weak threats i havent been banned yet bitch go vandalism lmao what weak threats i havent been banned yet bitch go vandalism lmao what weak threats i havent been banned yet bitch go vand

In [10]:
# output to csv
prob_predictions.to_csv('lg_submission.csv', index=False)

In [11]:
with open('lg_submission.csv', 'rb') as f:
    print(f)

<_io.BufferedReader name='lg_submission.csv'>
