In [46]:
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.externals.joblib import dump, load
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.externals.joblib import load, dump
from scipy.sparse import hstack
import xgboost as xgb
from common import SEED, TARGETS

In [7]:
CORPUS_CLEANED_PATH = "cache/corpus_cleaned.pkl"
CHAR_VECTORIZER_PATH = "cache/char_vect.pkl"
WORD_VECTORIZER_PATH = "cache/word_vect.pkl"
CHAR_TRAIN_PATH = "cache/char_train.pkl"
WORD_TRAIN_PATH = "cache/word_train.pkl"
CHAR_VAL_PATH = "cache/char_val.pkl"
WORD_VAL_PATH = "cache/word_val.pkl"

K_FOLDS = 10
VALIDATION_PRED_FILE = "cache/xgb_validation_pred_fold_%s.pkl"

TRAIN_FILE = "data/train_clean.csv"
TEST_FILE = "data/test_clean.csv"
SUBMISSION_SAMPLE_FILE = "data/sample_submission.csv"
SUBMISSION_FILE = "submissions/submission_xgb.csv.gz"

In [8]:
%matplotlib inline
np.random.seed(SEED)
np.set_printoptions(suppress=True)
pd.set_option("display.max_colwidth", -1)

In [9]:
train = pd.read_csv(TRAIN_FILE, encoding="utf-8")
test = pd.read_csv(TEST_FILE, encoding="utf-8")
submission = pd.read_csv(SUBMISSION_SAMPLE_FILE, encoding="utf-8")

In [10]:
corpus = pd.concat([train["comment_text"], test["comment_text"]]).values

In [11]:
%%time
try:
    char_vectorizer = load(CHAR_VECTORIZER_PATH)
except IOError:
    char_vectorizer = TfidfVectorizer(
        analyzer="char",
        ngram_range=(1, 4),
        max_features=25000,
        sublinear_tf=True
    )
    char_vectorizer.fit(corpus)
    dump(char_vectorizer, CHAR_VECTORIZER_PATH)

Wall time: 3.97 s


In [12]:
%%time
try:
    corpus_cleaned = load(CORPUS_CLEANED_PATH)
except IOError:
    corpus_cleaned = pd.concat([train["comment_text"], test["comment_text"]]).values
    dump(corpus_cleaned, CORPUS_CLEANED_PATH)

Wall time: 357 ms


In [13]:
%%time
try:
    word_vectorizer = load(WORD_VECTORIZER_PATH)
except IOError:
    word_vectorizer = TfidfVectorizer(
        analyzer="word",
        max_features=25000,
        min_df=10,
        stop_words="english"
    )
    word_vectorizer.fit(corpus_cleaned)
    dump(word_vectorizer, WORD_VECTORIZER_PATH)

Wall time: 1.32 s


In [14]:
%%time
x_train_ngram_vect = char_vectorizer.transform(train["comment_text"])
x_train_word_vect = word_vectorizer.transform(train["comment_text"])
x_train_vect = hstack([
    x_train_ngram_vect,
    x_train_word_vect,
    sparse.csr_matrix(train[["exclamation_mark_ratio", "upper_ratio", "symbols_ratio"]].values)
])

Wall time: 2min 14s


In [15]:
%%time
x_test_ngram_vect = char_vectorizer.transform(test["comment_text"])
x_test_word_vect = word_vectorizer.transform(test["comment_text"])
x_test_vect = hstack([
    x_test_ngram_vect,
    x_test_word_vect,
    sparse.csr_matrix(test[["exclamation_mark_ratio", "upper_ratio", "symbols_ratio"]].values)
])

Wall time: 2min 2s


In [16]:
kfold = KFold(n_splits=K_FOLDS, random_state=SEED)

In [17]:
x_train_vect = x_train_vect.tocsr()

In [42]:
num_epochs = 500
early_stopping_rounds = 10
param = {
    "max_depth": 8,
    "colsample_bytree": 0.7,
    "subsample": 0.6,
    "min_child_weight": 1,
    "eta": 0.2,
    "silent": 0,
    "objective": "binary:logistic",
    "eval_metric": "auc"
}

In [43]:
%%time
for fold_index, (train_index, test_index) in enumerate(kfold.split(x_train_vect, train[TARGETS].values)):
    
    x_train, x_val = x_train_vect[train_index], x_train_vect[test_index]
    y_train, y_val = train[TARGETS].values[train_index], train[TARGETS].values[test_index]
    
    scores = []
    y_preds = []
    
    for index, target in enumerate(TARGETS):
        print("Training model for y = %s on fold %d" % (target, fold_index))
        
        xgtrain = xgb.DMatrix(x_train, label=y_train[:, index])
        xgval = xgb.DMatrix(x_val, label=y_val[:,index])
        watchlist = [(xgtrain, "train"), (xgval, "test")]
        model = xgb.train(param, xgtrain, num_epochs, watchlist, early_stopping_rounds=early_stopping_rounds)
        y_pred = model.predict(xgval)
        y_preds.append(y_pred)
        
    score = roc_auc_score(y_val, np.array(y_preds).T)
    print("Validation ROC-AUC score for fold %d: %0.4f" % (fold_index, score))
    scores.append(score)
    
    dump(np.array(y_preds).T, VALIDATION_PRED_FILE % fold_index)

Training model for y = toxic on fold 0
[0]	train-auc:0.745156	test-auc:0.743462
Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 10 rounds.
[1]	train-auc:0.829846	test-auc:0.82337
[2]	train-auc:0.853838	test-auc:0.845732
[3]	train-auc:0.860656	test-auc:0.854444
[4]	train-auc:0.881332	test-auc:0.875846
[5]	train-auc:0.90572	test-auc:0.901007
[6]	train-auc:0.916042	test-auc:0.908372
[7]	train-auc:0.921681	test-auc:0.91354
[8]	train-auc:0.925681	test-auc:0.918524
[9]	train-auc:0.934276	test-auc:0.927062
[10]	train-auc:0.939891	test-auc:0.930278
[11]	train-auc:0.942985	test-auc:0.932728
[12]	train-auc:0.947782	test-auc:0.935815
[13]	train-auc:0.952964	test-auc:0.939126
[14]	train-auc:0.955773	test-auc:0.941154
[15]	train-auc:0.958365	test-auc:0.942137
[16]	train-auc:0.961256	test-auc:0.943971
[17]	train-auc:0.963623	test-auc:0.946078
[18]	train-auc:0.965061	test-auc:0.947349
[19]	train-auc:0.96616	test-auc:0.9

In [45]:
print("Average ROC-AUC score: %0.4f" % np.mean(scores))

Average ROC-AUC score: 0.9808


In [47]:
X_train, X_test, y_train, y_test = train_test_split(x_train_vect, train[TARGETS].values, test_size=0.05)

In [48]:
models = []
watchlist = [(xgtrain, "train"), (xgval, "test")]
for index, target in enumerate(TARGETS):
    print("Re-training on full dataset for y = %s" % target)
    xgtrain = xgb.DMatrix(X_train, label=y_train[:, index])
    xgval = xgb.DMatrix(X_test, label=y_test[:,index])
    watchlist = [(xgtrain, "train"), (xgval, "test")]
    model = xgb.train(param, xgtrain, num_epochs, watchlist, early_stopping_rounds=early_stopping_rounds)
    models.append(model)

Re-training on full dataset for y = toxic
[0]	train-auc:0.742413	test-auc:0.75046
Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 10 rounds.
[1]	train-auc:0.775638	test-auc:0.777103
[2]	train-auc:0.838533	test-auc:0.840005
[3]	train-auc:0.875905	test-auc:0.879947
[4]	train-auc:0.891571	test-auc:0.89056
[5]	train-auc:0.901998	test-auc:0.899146
[6]	train-auc:0.917981	test-auc:0.917364
[7]	train-auc:0.92409	test-auc:0.922062
[8]	train-auc:0.929925	test-auc:0.927194
[9]	train-auc:0.934531	test-auc:0.931491
[10]	train-auc:0.940131	test-auc:0.935157
[11]	train-auc:0.945812	test-auc:0.936201
[12]	train-auc:0.948546	test-auc:0.937142
[13]	train-auc:0.952238	test-auc:0.940275
[14]	train-auc:0.955869	test-auc:0.941851
[15]	train-auc:0.958447	test-auc:0.943702
[16]	train-auc:0.961551	test-auc:0.945068
[17]	train-auc:0.963829	test-auc:0.947684
[18]	train-auc:0.964469	test-auc:0.947657
[19]	train-auc:0.9663	test-auc:0

In [51]:
%%time
xgtest = xgb.DMatrix(x_test_vect)
for index, target in enumerate(TARGETS):
    submission[target] = models[index].predict(xgtest)

Wall time: 59.9 s


In [52]:
submission.to_csv(SUBMISSION_FILE, index=False, encoding="utf-8", compression="gzip") # 0.9767