In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import log_loss

import warnings
# Silence every warning from here on: no category, message or module filter
# is given, so the "ignore" action applies to all of them.
warnings.filterwarnings("ignore")
# Seed NumPy's global random number generator with a fixed value.
np.random.seed(29)

In [2]:
# Header of the results table: for each label prefix the four metrics
# (auc, precision, recall, f1) in that order, then the overall 'final_auc'.
columns = [
    '{0}_{1}'.format(prefix, metric)
    for prefix in ('toxic', 'severe', 'obscene', 'threat', 'insult', 'identity')
    for metric in ('auc', 'precision', 'recall', 'f1')
] + ['final_auc']

In [3]:
def run_predictions(data, on='comment_text', verbose=0, results_path='results/baseline.csv'):
    """Fit a TF-IDF + logistic-regression baseline for each label and score it.

    The text column ``on`` is vectorised with ``TfidfVectorizer``; rows where
    ``data['split']`` is truthy form the training set and the remaining rows
    form the evaluation set. One ``LogisticRegression`` is trained per label
    and evaluated with ROC AUC, precision, recall and F1.

    Relies on the module-level ``columns`` list for the layout of the results
    table (four metrics per label followed by the averaged AUC).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the ``on`` column, a ``split`` column and the six label
        columns listed in ``labels`` below.
    on : str
        Name of the text column fed to the vectorizer.
    verbose : int or bool
        When truthy, print the feature-matrix shape and per-label metrics.
    results_path : str
        Where the one-row metrics CSV is written. The parent directory is
        created if it does not exist.

    Returns
    -------
    bad : dict
        label -> the tuple returned by ``np.where`` holding the positions
        (within the evaluation rows) where the hard prediction is wrong.
    models : dict
        label -> fitted ``LogisticRegression``.
    vec : TfidfVectorizer
        The fitted vectorizer.
    baseline_res : dict
        Metric name (from ``columns``) -> value.

    Raises
    ------
    ValueError
        If the ``split`` column leaves the training or evaluation side empty.

    Side effects
    ------------
    Writes ``results_path`` and prints the final (mean) ROC AUC.
    """
    import os  # function-scope import keeps this cell runnable on its own

    labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

    # NOTE(review): the vectorizer is fitted on every row, evaluation rows
    # included, before the split is applied. Kept as-is so reported scores
    # stay comparable; confirm this is intended.
    vec = TfidfVectorizer()
    X = vec.fit_transform(data[on])

    if verbose:
        print(X.shape)

    # Force a boolean mask: with a 0/1 integer column, X[msk] would be
    # positional indexing and ~msk would yield negative row indices.
    msk = np.asarray(data['split'].values, dtype=bool)
    if not msk.any() or msk.all():
        raise ValueError("the 'split' column must select both training and evaluation rows")
    X_train = X[msk]
    X_test = X[~msk]

    scores = []
    models = {}
    bad = {}
    baseline_res = []

    for label in labels:
        y = data[label]
        y_train = y[msk]
        y_test = y[~msk]

        model = LogisticRegression()
        model.fit(X_train, y_train)
        models[label] = model

        # Hard class predictions drive precision / recall / F1.
        y_pred = model.predict(X_test)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        # Positions, relative to the evaluation rows, of the misclassified samples.
        bad[label] = np.where(np.asarray(y_test) != y_pred)

        # ROC AUC is computed from the probability of the positive class.
        y_proba = model.predict_proba(X_test)
        score = roc_auc_score(y_test, y_proba[:, 1])
        scores.append(score)

        # Order must match the per-label layout of ``columns``.
        baseline_res += [score, precision, recall, f1]

        if verbose:
            print('roc_auc_score score on {0}: {1}'.format(label, score))
            print('Precision/Recall on {0}: {1}/{2}'.format(label, precision,recall))
            print('F1 score on {0}: {1}'.format(label, f1))
            print('')

    final_score = sum(scores) / len(scores)
    baseline_res.append(final_score)
    baseline_res = dict(zip(columns, baseline_res))

    # Build the one-row table directly; DataFrame.append no longer exists in
    # pandas >= 2.0.
    results = pd.DataFrame([baseline_res], columns=columns)
    out_dir = os.path.dirname(results_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    results.to_csv(results_path, index=False)

    print('Final roc_auc_score score: {0}'.format(final_score))
    return bad, models, vec, baseline_res

**Default dataset**

In [4]:
# Baseline input is the non-augmented dataset: read the preprocessed file,
# blank out missing values, then keep only the rows whose `lang` is 'en'.
train = pd.read_csv('data/train_pre.csv').replace(np.nan, '', regex=True)
train = train.loc[train['lang'] == 'en']
print('Size of the dataset: {}'.format(train.shape[0]))

Size of the dataset: 159571


In [5]:
# Run the baseline with the 'tokenized_text_upper' column as the text input.
# The four names receive, in order, the function's returned misclassified
# positions, fitted models, fitted vectorizer and metrics dict; the call also
# prints the final score and writes the results CSV.
bad, models, vec, res = run_predictions(train, on='tokenized_text_upper')

Final roc_auc_score score: 0.9761572006464002
