In [13]:
import json
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.externals.joblib import dump, load
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import make_scorer, roc_auc_score, log_loss
from sklearn.externals.joblib import load, dump
from scipy.sparse import hstack
from common import SEED, TARGETS

In [41]:
# --- Cache locations -------------------------------------------------------
# Paths handed to joblib's load()/dump() by the cells below so that expensive
# artifacts are only computed on a cache miss.
CORPUS_CLEANED_PATH = "cache/corpus_cleaned.pkl"
CHAR_VECTORIZER_PATH = "cache/char_vect.pkl"
WORD_VECTORIZER_PATH = "cache/word_vect.pkl"
# NOTE(review): the four *_TRAIN_PATH / *_VAL_PATH constants are not referenced
# by any cell visible in this notebook -- confirm whether they are still needed.
CHAR_TRAIN_PATH = "cache/char_train.pkl"
WORD_TRAIN_PATH = "cache/word_train.pkl"
CHAR_VAL_PATH = "cache/char_val.pkl"
WORD_VAL_PATH = "cache/word_val.pkl"

# JSON file holding the hyper-parameters selected by the grid search.
LOGREG_BEST_PARAMS_CACHE = "cache/logreg_best_params.json"

# Passed as `test_size` to train_test_split (fraction held out for validation).
TRAIN_TEST_SPLIT = 0.3
# Number of cross-validation folds (KFold n_splits).
K_FOLDS = 10
# Template for per-fold validation predictions; "%s" is filled with the fold index.
VALIDATION_PRED_FILE = "cache/logreg_validation_pred_fold_%s.pkl"

# --- Input / output data ---------------------------------------------------
TRAIN_FILE = "data/train_clean.csv"
TEST_FILE = "data/test_clean.csv"
SUBMISSION_SAMPLE_FILE = "data/sample_submission.csv"
SUBMISSION_FILE = "submissions/submission_logistic_regression.csv.gz"

In [3]:
%matplotlib inline
# Seed NumPy's global RNG with the project-wide SEED imported from `common`.
np.random.seed(SEED)
# suppress=True: print floats in fixed-point notation instead of scientific.
np.set_printoptions(suppress=True)
# Do not truncate long text cells when displaying DataFrames.
# `None` is the supported spelling for "no limit" since pandas 1.0 (negative
# values were deprecated there and are rejected by later releases); older
# pandas only accepts integers, so fall back to the legacy -1 in that case.
try:
    pd.set_option("display.max_colwidth", None)
except ValueError:
    pd.set_option("display.max_colwidth", -1)

In [4]:
# Read the train set, the test set and the sample submission (all UTF-8 CSVs)
# in one pass; the order of the paths matches the order of the targets.
train, test, submission = (
    pd.read_csv(csv_path, encoding="utf-8")
    for csv_path in (TRAIN_FILE, TEST_FILE, SUBMISSION_SAMPLE_FILE)
)

In [5]:
# All comment texts (train first, then test) as one NumPy array; the
# vectorizers are fitted on this combined corpus.
comment_series = (train["comment_text"], test["comment_text"])
corpus = pd.concat(comment_series).values

In [6]:
%%time
# Character-level TF-IDF vectorizer: reuse the cached, already fitted instance
# when it exists, otherwise fit a new one on `corpus` and cache it.
try:
    char_vectorizer = load(CHAR_VECTORIZER_PATH)
except IOError:
    # Cache miss: character 1- to 4-grams, capped at 25000 features, with
    # sublinear term-frequency scaling.
    char_tfidf_options = dict(
        analyzer="char",
        ngram_range=(1, 4),
        max_features=25000,
        sublinear_tf=True,
    )
    char_vectorizer = TfidfVectorizer(**char_tfidf_options)
    char_vectorizer.fit(corpus)
    dump(char_vectorizer, CHAR_VECTORIZER_PATH)

Wall time: 3.89 s


In [7]:
%%time
# Corpus used to fit the word-level vectorizer: taken from the joblib cache if
# present, otherwise rebuilt from the train + test comments and cached.
try:
    corpus_cleaned = load(CORPUS_CLEANED_PATH)
except IOError:
    cleaned_series = (train["comment_text"], test["comment_text"])
    corpus_cleaned = pd.concat(cleaned_series).values
    dump(corpus_cleaned, CORPUS_CLEANED_PATH)

Wall time: 294 ms


In [8]:
%%time
# Word-level TF-IDF vectorizer: reuse the cached, already fitted instance when
# it exists, otherwise fit a new one on `corpus_cleaned` and cache it.
try:
    word_vectorizer = load(WORD_VECTORIZER_PATH)
except IOError:
    # Cache miss: word tokens, English stop words removed, terms must appear
    # in at least 10 documents, capped at 25000 features.
    word_tfidf_options = dict(
        analyzer="word",
        max_features=25000,
        min_df=10,
        stop_words="english",
    )
    word_vectorizer = TfidfVectorizer(**word_tfidf_options)
    word_vectorizer.fit(corpus_cleaned)
    dump(word_vectorizer, WORD_VECTORIZER_PATH)

Wall time: 1.2 s


In [9]:
%%time
# Training feature matrix: char n-gram TF-IDF | word TF-IDF | three
# hand-crafted ratio columns, stacked side by side.
x_train_ngram_vect = char_vectorizer.transform(train["comment_text"])
x_train_word_vect = word_vectorizer.transform(train["comment_text"])
x_train_vect = hstack(
    [
        x_train_ngram_vect,
        x_train_word_vect,
        sparse.csr_matrix(train[["exclamation_mark_ratio", "upper_ratio", "symbols_ratio"]].values),
    ],
    # hstack returns COO by default, which does not support row indexing and
    # gets re-converted by every estimator call; build CSR once here instead.
    format="csr",
)

Wall time: 2min 24s


In [10]:
%%time
# Test feature matrix, built with the same column layout as the training
# matrix: char n-gram TF-IDF | word TF-IDF | three hand-crafted ratio columns.
x_test_ngram_vect = char_vectorizer.transform(test["comment_text"])
x_test_word_vect = word_vectorizer.transform(test["comment_text"])
x_test_vect = hstack(
    [
        x_test_ngram_vect,
        x_test_word_vect,
        sparse.csr_matrix(test[["exclamation_mark_ratio", "upper_ratio", "symbols_ratio"]].values),
    ],
    # hstack returns COO by default; this matrix is later passed to
    # predict_proba once per target, so convert to CSR a single time here.
    format="csr",
)

Wall time: 2min 5s


In [11]:
# Hold-out split used by the hyper-parameter search below. Rows are not
# shuffled, so the validation part is taken in the original row order.
x_train, x_val, y_train, y_val = train_test_split(
    x_train_vect,
    train[TARGETS].values,
    shuffle=False,
    random_state=SEED,
    test_size=TRAIN_TEST_SPLIT,
)

In [12]:
# Hyper-parameter selection for the logistic regression.
# The selected parameters are cached as JSON; the grid search only runs when
# that cache file cannot be opened.
try:
    with open(LOGREG_BEST_PARAMS_CACHE) as fread:
        logreg_best_params = json.load(fread)
except IOError:
    # Candidate values for the inverse regularization strength C.
    grid = {
       "C": [3.5, 3.75, 4.0, 4.25, 4.5]
    }
    model = LogisticRegression(random_state=SEED, verbose=0)
    # Use the built-in "neg_log_loss" scorer: it evaluates log loss on
    # predict_proba() output. The previous make_scorer(log_loss,
    # greater_is_better=False) lacked needs_proba=True and therefore scored the
    # hard 0/1 labels from predict(), which is not a meaningful log loss.
    # NOTE: a params cache produced with the old scorer must be deleted for the
    # search to be re-run.
    rsearch = GridSearchCV(estimator=model, verbose=2, param_grid=grid, scoring="neg_log_loss", n_jobs=-1)
    # The search is run on the first target column only; the chosen C is then
    # reused for every target.
    rsearch.fit(x_train, y_train[:, 0])
    logreg_best_params = rsearch.best_params_
    with open(LOGREG_BEST_PARAMS_CACHE, 'w') as fwrite:
        json.dump(logreg_best_params, fwrite, indent=2)
    # best_score_ is the negated loss, so flip the sign back for display.
    print(-rsearch.best_score_)

print(logreg_best_params)

{'C': 4.25}


In [43]:
# Contiguous, unshuffled K-fold splitter. `random_state` is intentionally not
# passed: without shuffle=True it has no effect on the folds, and recent
# scikit-learn releases raise a ValueError for that combination.
kfold = KFold(n_splits=K_FOLDS)

In [44]:
# Make sure the training matrix is CSR so the cross-validation loop can select
# rows with integer index arrays (x_train_vect[train_index]).
x_train_vect = x_train_vect.tocsr()

In [46]:
%%time
# K-fold cross-validation: for each fold, fit one logistic regression per
# target, score the fold with ROC-AUC over all target columns and store the
# out-of-fold predictions on disk.

# Label matrix, extracted once instead of on every use inside the loop.
y_all = train[TARGETS].values

# One ROC-AUC value per fold. This list must be created OUTSIDE the fold loop:
# re-creating it per fold would leave only the last fold's score, making the
# "average" reported afterwards wrong.
scores = []

for fold_index, (train_index, test_index) in enumerate(kfold.split(x_train_vect, y_all)):

    x_train, x_val = x_train_vect[train_index], x_train_vect[test_index]
    y_train, y_val = y_all[train_index], y_all[test_index]

    # predict_proba output of each per-target model, in TARGETS order.
    y_preds_proba = []

    for index, target in enumerate(TARGETS):
        print("Training model for y = %s on fold %d" % (target, fold_index))
        model = LogisticRegression(random_state=SEED, verbose=0)
        model.set_params(**logreg_best_params)
        model.fit(x_train, y_train[:, index])

        y_pred_proba = model.predict_proba(x_val)
        y_preds_proba.append(y_pred_proba)

    # (targets, rows, classes) -> (rows, targets), keeping probability column 1
    # of every model. Computed once and reused for scoring and caching.
    y_val_pred = np.array(y_preds_proba).transpose(1, 0, 2)[:, :, 1]

    score = roc_auc_score(y_val, y_val_pred)
    print("Validation ROC-AUC score for fold %d: %0.4f" % (fold_index, score))
    scores.append(score)

    dump(y_val_pred, VALIDATION_PRED_FILE % fold_index)

Training model for y = toxic on fold 0
Training model for y = severe_toxic on fold 0
Training model for y = obscene on fold 0
Training model for y = threat on fold 0
Training model for y = insult on fold 0
Training model for y = identity_hate on fold 0
Validation ROC-AUC score for fold 0: 0.9860
Training model for y = toxic on fold 1
Training model for y = severe_toxic on fold 1
Training model for y = obscene on fold 1
Training model for y = threat on fold 1
Training model for y = insult on fold 1
Training model for y = identity_hate on fold 1
Validation ROC-AUC score for fold 1: 0.9847
Training model for y = toxic on fold 2
Training model for y = severe_toxic on fold 2
Training model for y = obscene on fold 2
Training model for y = threat on fold 2
Training model for y = insult on fold 2
Training model for y = identity_hate on fold 2
Validation ROC-AUC score for fold 2: 0.9863
Training model for y = toxic on fold 3
Training model for y = severe_toxic on fold 3
Training model for y = o

In [50]:
# Mean of the values collected in `scores` by the cross-validation cell.
# NOTE(review): this is only a true cross-fold average if `scores` accumulates
# one entry per fold -- verify it is not re-initialized inside the fold loop.
print("Average ROC-AUC score: %0.4f" % np.mean(scores))

Average ROC-AUC score: 0.9850


In [None]:
# Final models: one logistic regression per target, fitted on the complete
# training matrix with the selected hyper-parameters. `models` keeps them in
# TARGETS order.
models = []
for target in TARGETS:
    print("Re-training on full dataset for y = %s" % target)
    model = LogisticRegression(random_state=SEED, verbose=0)
    model.set_params(**logreg_best_params)
    full_labels = train[target].values
    model.fit(x_train_vect, full_labels)
    models.append(model)

In [None]:
%%time
# Work on a copy so the sample-submission frame loaded earlier is not mutated
# through an alias (keeps this cell safe to re-run and `submission` pristine).
submission_logreg = submission.copy()
# models[i] was fitted for TARGETS[i]; store probability column 1 of each
# model's predict_proba output under the matching target column.
for index, target in enumerate(TARGETS):
    submission_logreg[target] = models[index].predict_proba(x_test_vect)[:, 1]

In [None]:
# Write the predictions as a gzip-compressed UTF-8 CSV, without the index column.
submission_logreg.to_csv(SUBMISSION_FILE, index=False, encoding="utf-8", compression="gzip")
# 0.9764 -- NOTE(review): presumably the score obtained by this submission; confirm which metric/leaderboard.