In [1]:
%matplotlib inline
%load_ext autoreload

In [2]:
import os
import sys

# Make the project's `src` directory (sibling of the notebook's working
# directory) importable. The path is normalised so it contains no `..`
# component, and it is added only once so re-running this cell does not keep
# growing `sys.path`.
src_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir, 'src'))
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [3]:
import pandas as pd
import re
import string
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import log_loss, roc_auc_score, confusion_matrix
from sklearn.pipeline import Pipeline
%aimport nbmodels
from nbmodels import NbLogisticClassifier
from nbextractor import NBExtractor
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.multiclass import OneVsRestClassifier

In [4]:
# Load the labelled training comments and the unlabelled test comments.
train, test = (
    pd.read_csv('../data/train.csv'),
    pd.read_csv('../data/test.csv'),
)

In [5]:
# Preview the first rows of the training frame to check that the columns
# (id, comment text, then the label columns) loaded as expected.
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [6]:
# Everything after the first two columns (`id`, `comment_text`) is a label
# column; this same slice is what the training loops in this notebook iterate over.
train.columns[2:]

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

In [7]:
def tokenize(s):
    """Split text into tokens, turning each punctuation mark into its own token.

    Every character matched by the punctuation class (ASCII punctuation plus a
    few typographic quotes and symbols) is padded with a space on both sides,
    then the text is split on whitespace.

    Note: inside the character class the backslash from `string.punctuation`
    acts as an escape for the following `]`, so a literal backslash is not
    itself treated as punctuation.

    Args:
        s: The text to tokenize.

    Returns:
        A list of token strings; empty when `s` is empty or whitespace-only.
    """
    spaced = re.sub(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])', r' \1 ', s)
    return spaced.split()

In [8]:
# Raw comment texts plus their positional row numbers. The row numbers are
# split alongside the texts so the matching label rows can be looked up later.
# Missing comments are filled while extracting, so `X` never contains NaN and
# does not depend on a later in-place mutation of `train` reaching this array.
X = train['comment_text'].fillna("unknown").values
idx = np.arange(len(X))

In [9]:
# Replace missing comments with a placeholder token. The result is assigned
# back to the column: calling `fillna(inplace=True)` on a column selection is
# chained assignment, which pandas deprecates and which does not update the
# frame when Copy-on-Write is active. Arrays extracted from these columns
# before this cell runs are not updated by this assignment.
train['comment_text'] = train['comment_text'].fillna("unknown")
test['comment_text'] = test['comment_text'].fillna("unknown")

In [10]:
# Hold out 20% of the comments for evaluation. The row positions are split in
# lockstep with the texts so labels can be looked up per split. The seed is
# fixed so the split, and therefore the reported metrics, are reproducible
# across kernel restarts.
X_train, X_test, idx_train, idx_test = train_test_split(
    X, idx, test_size=0.2, random_state=42
)

In [11]:
# Word uni- and bi-gram TF-IDF features built with the custom `tokenize`.
# Terms seen in fewer than 3 documents or in more than 90% of them are dropped.
# The boolean options are passed as real booleans: recent scikit-learn
# releases validate parameter types and reject the integer `1`.
tfidf = TfidfVectorizer(ngram_range=(1, 2), tokenizer=tokenize,
                        min_df=3, max_df=0.9, strip_accents='unicode',
                        use_idf=True, smooth_idf=True, sublinear_tf=True,
                        stop_words='english')

In [None]:
# Fit the vectorizer on the training split only, then apply that fitted
# vectorizer to the held-out split so no held-out text influences the features.
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)

In [None]:
# Hold-out evaluation: fit one NB-logistic model per label column and report
# ROC AUC and log loss on the held-out split. `probs` collects the predicted
# positive-class probabilities, one column per label.
label_cols = train.columns[2:]
# Size the result by the actual number of labels instead of a hard-coded 6.
probs = np.zeros(shape=(len(X_test), len(label_cols)))
for i, col in enumerate(label_cols):
    print("Training ", col)
    # idx_train / idx_test are positional row numbers, so index the label
    # array positionally rather than through the label-based `.loc`.
    labels = train[col].values
    y_train = labels[idx_train]
    y_test = labels[idx_test]
    nblog = NbLogisticClassifier()
    nblog.fit(X_train_vec, y_train)
    # Column 1 of predict_proba is taken as the positive-class probability.
    y_pred = nblog.predict_proba(X_test_vec)[:, 1]
    probs[:, i] = y_pred
    auc = roc_auc_score(y_test, y_pred)
    lg_loss = log_loss(y_test, y_pred)
    print("{} auc: {}".format(col, auc))
    print("{} log loss: {}".format(col, lg_loss))


# Pipelined version

In [14]:
# Search space for the per-label logistic regression: both penalty types
# crossed with eight regularisation strengths (2 x 8 = 16 candidates).
logistic_params = dict(
    penalty=['l1', 'l2'],
    C=[100, 10, 1, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5],
)

In [15]:
# Inner model search: one logistic regression per label, tuned over
# `logistic_params`.
label_search = GridSearchCV(
    LogisticRegression(n_jobs=2, solver='saga'),
    logistic_params,
    verbose=True,
)

# Per-label model: NBExtractor step followed by the tuned logistic regression.
per_label_model = Pipeline([
    ('naive_bayes_extractor', NBExtractor()),
    ('logistic_regression', label_search),
])

# Text features shared by all labels; the remaining TF-IDF options are
# supplied through the `tfidf__*` search grid.
text_vectorizer = TfidfVectorizer(tokenizer=tokenize, strip_accents='unicode', stop_words='english')

# Full pipeline: vectorise once, then fit one per-label model for each target column.
pipeline = Pipeline([
    ('tfidf', text_vectorizer),
    ('one_vs_rest', OneVsRestClassifier(per_label_model)),
])

# Search space for the pipeline's `tfidf` step. Only the n-gram range varies;
# the other entries pin a single value. The on/off options are real booleans
# because recent scikit-learn releases validate parameter types and reject
# the integer `1`.
tfidf_params = {
    'tfidf__ngram_range': [(1, 2), (1, 3)],
    'tfidf__min_df': [3],
    'tfidf__max_df': [0.9],
    'tfidf__use_idf': [True],
    'tfidf__smooth_idf': [True],
    'tfidf__sublinear_tf': [True],
}


In [17]:
# Outer search over the TF-IDF settings in `tfidf_params`; each candidate
# fits the whole pipeline, including its inner per-label search.
clf = GridSearchCV(
    estimator=pipeline,
    param_grid=tfidf_params,
    n_jobs=3,
    verbose=True,
)

In [None]:
# Target matrix for the training split: one column per label, with rows
# selected by the split's row numbers.
Y_train_multilabel = train.loc[idx_train, train.columns[2:]].values
# Run the outer grid search on the training texts.
best_model = clf.fit(X_train, Y_train_multilabel)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 16 candidates, totalling 48 fits
Fitting 3 folds for each of 16 candidates, totalling 48 fits
Fitting 3 folds for each of 16 candidates, totalling 48 fits


In [None]:
# Target matrix for the training split: one column per label.
Y_train_multilabel = train.loc[idx_train, train.columns[2:]].values
# Fit the pipeline object directly. This call does not go through `clf`,
# so the outer TF-IDF search is not involved here.
full_model = pipeline.fit(X_train, Y_train_multilabel)

In [None]:
# Predictions of the fitted pipeline for `X_test`; as the cell's last
# expression, the returned value is shown as the cell output.
full_model.predict(X_test)

In [None]:
# Submission frame: start with the test ids; one probability column per label
# is added to it later.
preds = pd.DataFrame().assign(id=test['id'])

In [None]:
# Final features for the submission: refit the vectorizer on all training
# comments and transform the real test comments. This rebinds `X_test`,
# `X_train_vec` and `X_test_vec`, which earlier cells used for the hold-out split.
X_test = test['comment_text'].values
X_train_vec = tfidf.fit_transform(X)
X_test_vec = tfidf.transform(X_test)

In [None]:
# Fit one NB-logistic model per label on the full training set and store the
# predicted positive-class probability for every test comment in `preds`.
label_cols = train.columns[2:]
for col in label_cols:
    print("Training ", col)
    y_train = train[col].values
    nblog = NbLogisticClassifier(n_jobs=-1)
    nblog.fit(X_train_vec, y_train)
    test_scores = nblog.predict_proba(X_test_vec)
    preds[col] = test_scores[:, 1]

In [None]:
# Write the submission file without the frame's row index.
submission_path = '../submissions/nbsvm-baseline.csv'
preds.to_csv(submission_path, index=False)

In [None]:
# Keep the first rows of the provided sample submission for reference.
sample_path = '../data/sample_submission.csv'
sample = pd.read_csv(sample_path).head()

In [None]:
# Read the written submission back from disk and show its first rows as a
# sanity check of the saved columns.
pd.read_csv('../submissions/nbsvm-baseline.csv').head()