In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cross_validation import train_test_split



In [2]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted
from sklearn.linear_model import LogisticRegression
from scipy import sparse
class NbSvmClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, C=1.0, dual=False, n_jobs=1):
        self.C = C
        self.dual = dual
        self.n_jobs = n_jobs

    def predict(self, x):
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict(x.multiply(self._r))

    def predict_proba(self, x):
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict_proba(x.multiply(self._r))

    def fit(self, x, y):
        # Check that X and y have correct shape
        y = y.values
        x, y = check_X_y(x, y, accept_sparse=True)

        def pr(x, y_i, y):
            p = x[y==y_i].sum(0)
            return (p+1) / ((y==y_i).sum()+1)

        self._r = sparse.csr_matrix(np.log(pr(x,1,y) / pr(x,0,y)))
        x_nb = x.multiply(self._r)
        self._clf = LogisticRegression(C=self.C, dual=self.dual, n_jobs=self.n_jobs).fit(x_nb, y)
        return self

In [3]:
df = pd.read_csv('./train.csv')
train, test = train_test_split(df, test_size=0.2)
#train = pd.read_csv('./train.csv')
#test = pd.read_csv('./test.csv')


In [4]:
classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [5]:
train['comment_text'].fillna("cbarcelon", inplace=True)
test['comment_text'].fillna("cbarcelon", inplace=True)
classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
train_ratings = train[classes].values
test_ratings = test[classes].values

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [6]:
import re, string
re_tok = re.compile('([' + string.punctuation + '“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenize(s): return re_tok.sub(r' \1 ', s).split()

In [None]:
vec = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize, min_df=11, max_df=0.8, strip_accents='unicode', use_idf=1, smooth_idf=1, sublinear_tf=1, analyzer='word')
trn_term_doc = vec.fit_transform(train['comment_text'])
test_term_doc = vec.transform(test['comment_text'])

In [28]:
from sklearn.model_selection import Kfold
clf = NbSvmClassifier(C=1, dual=True, n_jobs=-1)
nbsvmpred = np.zeros((len(test), len(classes)))


for i, j in enumerate(classes):
    nbsvmpred[:,i] = cross_val_predict(clf, )
    #clf.fit(trn_term_doc, train[j])
    #nbsvmpred[:,i] = clf.predict_proba(test_term_doc)[:,1]

In [19]:
for c_value in Values:
    model = NbSvmClassifier(C=c_value, dual=True, n_jobs=-1)
    nbsvmpred = np.zeros((len(test), len(label_cols)))
    for i, j in enumerate(label_cols):
        model.fit(trn_term_doc, train[j])
        nbsvmpred[:,i] = model.predict_proba(test_term_doc)[:,1]
     
    auc = calc_auc(test_ratings, nbsvmpred)
    print('auc_roc ' , c_value, ' ', auc)

('auc_roc ', 1, ' ', 0.98719821780243)
('auc_roc ', 2, ' ', 0.98683902330862594)
('auc_roc ', 3, ' ', 0.98643985742463103)
('auc_roc ', 4, ' ', 0.98607463476628643)
('auc_roc ', 5, ' ', 0.98575495490256981)
('auc_roc ', 6, ' ', 0.98547187893932486)
('auc_roc ', 7, ' ', 0.98521712810686968)
('auc_roc ', 8, ' ', 0.98498909112226529)
('auc_roc ', 9, ' ', 0.98478306268918214)
('auc_roc ', 10, ' ', 0.98459768202759601)
('auc_roc ', 50, ' ', 0.98150291562394232)
('auc_roc ', 100, ' ', 0.98011387042940201)
('auc_roc ', 500, ' ', 0.97727500163175696)
('auc_roc ', 1000, ' ', 0.97630011659776761)


In [13]:
from sklearn.metrics import roc_auc_score
def calc_auc(y_true, y_pred):
    return np.mean([roc_auc_score(y_true[:, i], y_pred[:, i]) 
                    for i in range(y_true.shape[1])])

In [14]:
auc = calc_auc(test_ratings, nbsvmpred)
print('auc_roc ', auc)

('auc_roc ', 0.98719821780243)


In [29]:
subm =pd.read_csv('./sample_submission.csv')
submid = pd.DataFrame({'id': subm["id"]})
submission = pd.concat([submid, pd.DataFrame(nbsvmpred, columns = classes)], axis=1)
submission.to_csv('NBSVM_char_submission.csv', index=False)

In [None]:
from __future__ import print_function

from pprint import pprint
from time import time
import logging

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

print(__doc__)

# Display progress logs on stdout
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')

# #############################################################################
# Define a pipeline combining a text feature extractor with a simple
# classifier
pipeline = Pipeline([
    ('clf', NbSvmClassifier(n_jobs=-1)),
])
# uncommenting more parameters will give better exploring power but will
# increase processing time in a combinatorial way
parameters = {
    #'vect__max_df': (.8),
    #'vect__ngram_range': ((1, 2)),  # unigrams or bigrams
    'clf__C': (1,10),
    'clf__dual': (True, False),
    #'clf__subsample': (.3, .6, .8, 1),
}

if __name__ == "__main__":
    # multiprocessing requires the fork to happen in a __main__ protected
    # block

    # find the best parameters for both the feature extraction and the
    # classifier
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=0)

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)
    t0 = time()
    grid_search.fit(trn_term_doc, train['toxic'])
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

Automatically created module for IPython interactive environment
Performing grid search...
pipeline: ['clf']
parameters:
{'clf__C': (1, 10), 'clf__dual': (True, False)}
