In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score



In [2]:
import xgboost as xgb

In [3]:
#Testing
df = pd.read_csv('./train.csv')
train, test = train_test_split(df, test_size=0.2)
classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
train_target = train[classes]
test_target = test[classes]
#Kaggle Submission
#train = pd.read_csv('./train.csv')
#test = pd.read_csv('./test.csv')

In [4]:
import re, string
re_tok = re.compile('([' + string.punctuation + '“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenize(s): return re_tok.sub(r' \1 ', s).split()

In [38]:
vec = TfidfVectorizer(ngram_range=(1,5), tokenizer=tokenize, min_df=3, max_df=0.8, strip_accents='unicode', use_idf=1, smooth_idf=1, sublinear_tf=1, analyzer='word')
train_doc = vec.fit_transform(train['comment_text'])
test_doc = vec.transform(test['comment_text'])


In [39]:
from sklearn.metrics import roc_auc_score
def calc_auc(y_true, y_pred):
    return np.mean([roc_auc_score(y_true[:, i], y_pred[:, i]) 
                    for i in range(y_true.shape[1])])

In [40]:
model = xgb.XGBClassifier(max_depth=5, learning_rate=.3, nthread=-1)
xgbpred = np.zeros((len(test), len(classes)))
for i, j in enumerate(classes):
    print('fitting ', j)
    model.fit(train_doc, train[j])
    print('predicting ', j)
    xgbpred[:,i] = model.predict_proba(test_doc)[:,1]

fitting  toxic
predicting  toxic
fitting  severe_toxic
predicting  severe_toxic
fitting  obscene
predicting  obscene
fitting  threat
predicting  threat
fitting  insult
predicting  insult
fitting  identity_hate
predicting  identity_hate


In [41]:
print (calc_auc(test[classes].values, xgbpred))

0.971392673731


In [9]:
from __future__ import print_function

from pprint import pprint
from time import time
import logging

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

print(__doc__)

# Display progress logs on stdout
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')

# #############################################################################
# Define a pipeline combining a text feature extractor with a simple
# classifier
pipeline = Pipeline([
    ('clf', xgb.XGBClassifier(n_jobs=-1)),
])
# uncommenting more parameters will give better exploring power but will
# increase processing time in a combinatorial way
parameters = {
    #'vect__max_df': (.8),
    #'vect__ngram_range': ((1, 2)),  # unigrams or bigrams
    'clf__learning_rate': (.01, .1,.2),
    'clf__max_depth': (3,4,5),
    #'clf__subsample': (.3, .6, .8, 1),
}

if __name__ == "__main__":
    # multiprocessing requires the fork to happen in a __main__ protected
    # block

    # find the best parameters for both the feature extraction and the
    # classifier
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=0)

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)
    t0 = time()
    grid_search.fit(train_doc, train['toxic'])
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

Automatically created module for IPython interactive environment
Performing grid search...
pipeline: ['clf']
parameters:
{'clf__learning_rate': (0.01, 0.1, 0.2), 'clf__max_depth': (3, 4, 5)}
done in 695.435s

Best score: 0.953
Best parameters set:
	clf__learning_rate: 0.2
	clf__max_depth: 5
