In [1]:
import pandas as pd
import re
import string
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted
from sklearn.linear_model import LogisticRegression
from scipy import sparse
from sklearn.metrics import confusion_matrix
from sklearn.metrics import log_loss, roc_auc_score

In [2]:
# Load the train and test CSVs; the paths are relative to the working directory.
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

In [3]:
# Preview the first rows of the training frame (id, comment text, label columns).
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,22256635,"Nonsense? kiss off, geek. what I said is true...",1,0,0,0,0,0
1,27450690,"""\n\n Please do not vandalize pages, as you di...",0,0,0,0,0,0
2,54037174,"""\n\n """"Points of interest"""" \n\nI removed the...",0,0,0,0,0,0
3,77493077,Asking some his nationality is a Racial offenc...,0,0,0,0,0,0
4,79357270,The reader here is not going by my say so for ...,0,0,0,0,0,0


In [4]:
class NbSvmClassifier(BaseEstimator, ClassifierMixin):
    """Binary classifier: logistic regression on naive-Bayes-scaled features.

    ``fit`` computes a per-feature log-count ratio between the rows labelled 1
    and the rows labelled 0, multiplies every feature column by its ratio, and
    trains a ``LogisticRegression`` on the scaled matrix. ``predict`` and
    ``predict_proba`` apply the same scaling before delegating to that model.

    Labels are compared against the literals 1 and 0, so ``y`` is expected to
    be binary 0/1.

    Parameters
    ----------
    C : float, default=1.0
        Passed to ``LogisticRegression(C=...)``.
    dual : bool, default=False
        Passed to ``LogisticRegression(dual=...)``.
    n_jobs : int, default=1
        Passed to ``LogisticRegression(n_jobs=...)``.

    Attributes
    ----------
    _r : scipy.sparse.csr_matrix
        Single-row matrix holding the log-count ratio of every feature.
    _clf : LogisticRegression
        Model fitted on the ratio-scaled features.
    classes_ : ndarray
        Class labels as reported by the fitted ``LogisticRegression``.
    """

    def __init__(self, C=1.0, dual=False, n_jobs=1):
        self.C = C
        self.dual = dual
        self.n_jobs = n_jobs

    @staticmethod
    def _class_feature_rate(X, mask):
        """Return (column sums of ``X[mask]`` + 1) / (number of masked rows + 1)."""
        return (X[mask].sum(0) + 1) / (mask.sum() + 1)

    def _scale(self, X):
        """Check the model is fitted and return ``X`` scaled by the stored ratio."""
        check_is_fitted(self, ['_r', '_clf'])
        # Coerce to CSR so dense arrays (which have no .multiply method) and
        # other sparse formats are scaled the same way as CSR input.
        return sparse.csr_matrix(X).multiply(self._r)

    def predict(self, X):
        """Predict class labels for ``X`` using the fitted model."""
        return self._clf.predict(self._scale(X))

    def predict_proba(self, X):
        """Return the class-probability matrix for ``X`` from the fitted model."""
        return self._clf.predict_proba(self._scale(X))

    def fit(self, X, y):
        """Compute the log-count ratio from ``X``/``y`` and fit the scaled model.

        Parameters
        ----------
        X : array-like or sparse matrix of shape (n_samples, n_features)
        y : array-like of shape (n_samples,), expected to hold 0/1 labels.

        Returns
        -------
        self
        """
        # Check that X and y have correct shape
        X, y = check_X_y(X, y, accept_sparse=True)
        # CSR supports the boolean row indexing and .multiply used below;
        # check_X_y may hand back a dense ndarray or another sparse format.
        X = sparse.csr_matrix(X)

        positive_rate = self._class_feature_rate(X, y == 1)
        negative_rate = self._class_feature_rate(X, y == 0)
        self._r = sparse.csr_matrix(np.log(positive_rate / negative_rate))

        X_nb = X.multiply(self._r)
        self._clf = LogisticRegression(
            C=self.C, dual=self.dual, n_jobs=self.n_jobs
        ).fit(X_nb, y)
        # Expose the label set under the name scikit-learn uses for classifiers.
        self.classes_ = self._clf.classes_
        return self

In [5]:
# Compiled once: tokenize() is called for every document by the vectorizer.
# string.punctuation is escaped before being placed in the character class;
# unescaped, its `\]` is read as an escaped `]` and a literal backslash is
# never matched as punctuation.
_PUNCT_RE = re.compile(f'([{re.escape(string.punctuation)}“”¨«»®´·º½¾¿¡§£₤‘’])')


def tokenize(s):
    """Split ``s`` into whitespace-separated tokens, isolating punctuation.

    Every character from ``string.punctuation`` (plus the extra typographic
    symbols listed in the pattern) is surrounded by spaces so that it becomes
    its own token; the result is then split on whitespace.

    Parameters
    ----------
    s : str
        Text to tokenize.

    Returns
    -------
    list of str
        Tokens in order of appearance; empty list for empty/blank input.
    """
    return _PUNCT_RE.sub(r' \1 ', s).split()

In [6]:
# Raw training comments as a NumPy array, plus a positional index of the
# same length (one entry per training row).
X = train['comment_text'].values
idx = np.arange(X.shape[0])

In [7]:
# Replace missing comments with the literal "unknown" so the vectorizer only
# ever sees strings. The filled column is assigned back explicitly: calling
# fillna(inplace=True) on a column selection is chained in-place mutation,
# which may leave the frame itself unchanged under pandas Copy-on-Write.
train['comment_text'] = train['comment_text'].fillna("unknown")
test['comment_text'] = test['comment_text'].fillna("unknown")

# X was captured from train.comment_text before this fill; rebuild it so the
# array passed to the vectorizer cannot still contain NaN.
X = train['comment_text'].values

In [8]:
# X_train, X_test, idx_train, idx_test = train_test_split(X, idx, test_size=0.2)

In [9]:
# TF-IDF over unigrams and bigrams produced by the custom tokenize() function.
# Terms must appear in at least 3 documents and in at most 90% of them.
# The three flags are documented as booleans, so pass True rather than the
# integer 1, which strict parameter validation in newer scikit-learn rejects.
tfidf = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize,
                        min_df=3, max_df=0.9, strip_accents='unicode',
                        use_idf=True, smooth_idf=True, sublinear_tf=True,
                        stop_words='english')

In [10]:
# X_train_vec = tfidf.fit_transform(X_train)
# X_test_vec = tfidf.transform(X_test)

In [11]:
# probs = np.zeros(shape=(len(X_test), 6))
# for i, col in enumerate(train.columns[2:]):
#     print("Training ", col)
#     y_train = train.loc[idx_train, col].values
#     y_test = train.loc[idx_test, col].values
#     nbsvm = NbSvmClassifier()
#     nbsvm.fit(X_train_vec, y_train)
#     y_pred = nbsvm.predict_proba(X_test_vec)[:,1]
#     probs[:, i] = y_pred
#     auc = roc_auc_score(y_test, y_pred)
#     lg_loss = log_loss(y_test, y_pred)
#     print("{} auc: {}".format(col, auc))
#     print("{} log loss: {}".format(col, lg_loss))

In [12]:
# Submission frame: starts with the test ids; one score column per label is
# added later.
preds = pd.DataFrame({'id': test.id})

In [13]:
# Fit the TF-IDF vectorizer on the training comments only, then reuse the
# fitted vectorizer to transform the test comments into the same feature space.
X_train_vec = tfidf.fit_transform(X)
X_test = test.comment_text.values
X_test_vec = tfidf.transform(X_test)

In [14]:
# Fit one independent NbSvmClassifier per column from position 2 onward
# (the label columns after id and comment_text) and store its test scores in
# preds under the same column name.
for col in train.columns[2:]:
    print("Training ", col)
    y_train = train.loc[:, col].values
    # n_jobs is forwarded to the LogisticRegression inside the classifier.
    nbsvm = NbSvmClassifier(n_jobs=-1)
    nbsvm.fit(X_train_vec, y_train)
    # Keep column 1 of predict_proba — presumably P(label == 1) given 0/1
    # labels; confirm against the fitted model's class order.
    preds[col] = nbsvm.predict_proba(X_test_vec)[:, 1]

Training  toxic


  " = {}.".format(self.n_jobs))


Training  severe_toxic
Training  obscene
Training  threat
Training  insult
Training  identity_hate


In [15]:
# Show the test rows whose predicted toxic score exceeds 0.5.
preds[preds.toxic > 0.5]

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
58,183006597,0.961123,0.002865,0.078256,0.001627,0.633589,0.061537
109,404313935,0.982104,0.005122,0.731802,0.002747,0.080953,0.039362
138,553531363,0.823787,0.036916,0.062688,0.544601,0.084131,0.004633
231,957753792,0.858749,0.011579,0.577243,0.001060,0.326341,0.004454
258,1076484652,0.746105,0.032405,0.839003,0.018041,0.323707,0.008781
262,1096536986,0.910828,0.011371,0.070387,0.001453,0.253790,0.002939
459,1984985554,0.937835,0.004545,0.730015,0.000754,0.056345,0.003639
463,1992027545,0.999999,0.160569,0.999997,0.001288,0.998185,0.979768
468,2008477639,0.933678,0.002184,0.238794,0.000669,0.669571,0.007752
480,2086856127,0.590558,0.009104,0.064511,0.004638,0.047301,0.011155


In [17]:
# Write the predictions to the submissions directory; index=False keeps the
# row index out of the file.
preds.to_csv('../submissions/nbsvm-baseline.csv', index=False)

In [18]:
# Preview the sample submission to compare its columns with the file written above.
pd.read_csv('../data/sample_submission.csv').head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,6044863,0.5,0.5,0.5,0.5,0.5,0.5
1,6102620,0.5,0.5,0.5,0.5,0.5,0.5
2,14563293,0.5,0.5,0.5,0.5,0.5,0.5
3,21086297,0.5,0.5,0.5,0.5,0.5,0.5
4,22982444,0.5,0.5,0.5,0.5,0.5,0.5


In [19]:
# Read the written submission back and preview its first rows.
pd.read_csv('../submissions/nbsvm-baseline.csv').head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,6044863,0.009025,0.001267,0.004971,0.000436,0.005066,0.001514
1,6102620,0.012725,0.000929,0.00642,0.000398,0.007306,0.00113
2,14563293,0.003849,0.000987,0.003595,0.000387,0.002797,0.000877
3,21086297,0.042515,0.002469,0.010394,0.000555,0.011235,0.001013
4,22982444,0.010157,0.0018,0.005487,0.00048,0.004195,0.00159
