In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cross_validation import train_test_split

In [2]:
df = pd.read_csv('./train.csv')
train, test = train_test_split(df, test_size=0.2)


In [3]:
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
train['none'] = 1-train[label_cols].max(axis=1)
train.describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate,none
count,76680.0,76680.0,76680.0,76680.0,76680.0,76680.0,76680.0,76680.0
mean,500118300000.0,0.095527,0.009781,0.052921,0.003091,0.049165,0.008359,0.898722
std,289129700000.0,0.293943,0.098414,0.223878,0.055509,0.216215,0.091047,0.301698
min,22256640.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,248048300000.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,501249800000.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,750806800000.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,999988200000.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [4]:
train['comment_text'].fillna("cbarcelon", inplace=True)
test['comment_text'].fillna("cbarcelon", inplace=True)
classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
train_ratings = train[classes].values
test_ratings = test[classes].values

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [5]:
import re, string
re_tok = re.compile('([' + string.punctuation + '“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenize(s): return re_tok.sub(r' \1 ', s).split()

In [6]:
n = train.shape[0]
vec = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize, min_df=20, max_df=0.7, strip_accents='unicode', use_idf=1, smooth_idf=1, sublinear_tf=1, analyzer='word')
trn_term_doc = vec.fit_transform(train['comment_text'])
test_term_doc = vec.transform(test['comment_text'])

In [7]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted
from sklearn.linear_model import LogisticRegression
from scipy import sparse
class NbSvmClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, C=1.0, dual=False, n_jobs=1):
        self.C = C
        self.dual = dual
        self.n_jobs = n_jobs

    def predict(self, x):
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict(x.multiply(self._r))

    def predict_proba(self, x):
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict_proba(x.multiply(self._r))

    def fit(self, x, y):
        # Check that X and y have correct shape
        y = y.values
        x, y = check_X_y(x, y, accept_sparse=True)

        def pr(x, y_i, y):
            p = x[y==y_i].sum(0)
            return (p+1) / ((y==y_i).sum()+1)

        self._r = sparse.csr_matrix(np.log(pr(x,1,y) / pr(x,0,y)))
        x_nb = x.multiply(self._r)
        self._clf = LogisticRegression(C=self.C, dual=self.dual, n_jobs=self.n_jobs).fit(x_nb, y)
        return self

In [9]:
#log loss score function
from sklearn.metrics import log_loss
def calc_loss(y_true, y_pred):
    return np.mean([log_loss(y_true[:, i], y_pred[:, i]) 
                    for i in range(y_true.shape[1])])

In [8]:
model = NbSvmClassifier(C=1, dual=True, n_jobs=-1)
preds = np.zeros((len(test), len(label_cols)))
for i, j in enumerate(label_cols):
    print('fit', j)
    model.fit(trn_term_doc, train[j])
    preds[:,i] = model.predict_proba(test_term_doc)[:,1]

('fit', 'toxic')
('fit', 'severe_toxic')
('fit', 'obscene')
('fit', 'threat')
('fit', 'insult')
('fit', 'identity_hate')


In [10]:
score = calc_loss(test_ratings, preds)
print("test text", score)

('test text', 0.053913745602110791)


In [20]:
ngram_ranges = [(2,2),(2,3),(2,4),(2,5),(2,6),(2,7),(2,8),(2,9),(2,10),(3,3),(3,4),(3,5),(3,6),(3,7),(4,4),(4,5),(4,6),(4,7),(5,5),(5,6),(5,7),(5,7),(6,7),(6,8)]
n = train.shape[0]
for ngram in ngram_ranges:
    vec = TfidfVectorizer(ngram_range=(2,8), tokenizer=tokenize, min_df=11, max_df=0.7, strip_accents='unicode', use_idf=1, smooth_idf=1, sublinear_tf=1, analyzer='char')
    trn_term_doc = vec.fit_transform(train['comment_text'])
    test_term_doc = vec.transform(test['comment_text'])
    model = NbSvmClassifier(C=1, dual=True, n_jobs=-1)
    preds = np.zeros((len(test), len(label_cols)))
    for i, j in enumerate(label_cols):
    #    print('fit', j)
        model.fit(trn_term_doc, train[j])
        preds[:,i] = model.predict_proba(test_term_doc)[:,1]
    score = calc_loss(test_ratings, preds)
    print(ngram, score)

((2, 2), 0.051821200445514463)
((2, 3), 0.05182120092594824)
((2, 4), 0.051821199900999114)
((2, 5), 0.051821200849504805)
((2, 6), 0.05182119877801477)
((2, 7), 0.051821199423204983)
((2, 8), 0.051821201218196562)
((2, 9), 0.051821200818894235)
((2, 10), 0.051821200007301005)
((3, 3), 0.051821200881658765)
((3, 4), 0.051821200179876327)
((3, 5), 0.051821200808329998)
((3, 6), 0.051821200093848503)


KeyboardInterrupt: 

In [49]:
print(ngrams[1])

(1, 3)


In [53]:
model.get_params()

{'C': 4, 'dual': True, 'n_jobs': -1}

In [18]:
preds[0][1]

0.0013252675065700934