In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import multiprocessing
import os
import matplotlib.pyplot as plt
import numpy as np
import skopt
import xlearn as xl
from sklearn.metrics import (accuracy_score, f1_score, precision_score, recall_score)
from skopt.plots import (plot_convergence, plot_evaluations, plot_objective)
from utils.measuring_performance import *
from utils.misc import *



In [3]:
# Directory holding the pre-processed .libsvm datasets. The relative path is
# resolved against the kernel's current working directory.
DATA_DIR = os.path.abspath('../../Data/display_advertising_challenge/processed')
# Directory that receives trained models, score files and optimisation results.
MODEL_DIR = os.path.abspath('model')
# Selects the model family used below: True -> 'ffm', False -> 'fm'.
USE_FIELD = False

In [4]:
# Create the output directory (and any missing parents). exist_ok=True makes
# the call a no-op when the directory is already present, which avoids the
# check-then-create race of a separate os.path.exists() test. If MODEL_DIR
# exists but is a regular file this now raises immediately instead of failing
# later when the first artefact is written.
os.makedirs(MODEL_DIR, exist_ok=True)

In [5]:
# The model family name prefixes every dataset and artefact file name.
if USE_FIELD:
    model_type = 'ffm'
else:
    model_type = 'fm'

# Hyper-parameter tuning fits on the 'train' split and scores on 'valid'.
train_dataset_type, test_dataset_type = 'train', 'valid'

# Input datasets live in DATA_DIR: <model>_dataset_<split>.libsvm
train_dataset_path = os.path.join(
    DATA_DIR, '_'.join((model_type, 'dataset', train_dataset_type)) + '.libsvm')
test_dataset_path = os.path.join(
    DATA_DIR, '_'.join((model_type, 'dataset', test_dataset_type)) + '.libsvm')

# Outputs live in MODEL_DIR: the model is named after the training split,
# the score file after the split being scored.
model_path = os.path.join(
    MODEL_DIR, '_'.join((model_type, 'model', train_dataset_type)) + '.out')
score_path = os.path.join(
    MODEL_DIR, '_'.join((model_type, 'score', test_dataset_type)) + '.txt')

In [6]:
# Instantiate the xLearn model matching the USE_FIELD switch.
if USE_FIELD:
    model = xl.create_ffm()
else:
    model = xl.create_fm()

# NOTE(review): setOnDisk presumably enables xLearn's out-of-core training and
# setSigmoid presumably maps raw predictions into (0, 1) -- confirm against
# the xLearn documentation.
model.setOnDisk()
model.setSigmoid()

# The same file is registered both as the validation set and as the test set.
model.setTrain(train_dataset_path)
model.setValidate(test_dataset_path)
model.setTest(test_dataset_path)

In [7]:
def update_params(params):
    """Return the default training parameters overridden by ``params``.

    Args:
        params: Mapping of parameter names to values that take precedence
            over the defaults below. ``None`` is treated as "no overrides".
            The mapping itself is not modified.

    Returns:
        A new dict containing the defaults updated with ``params``; it is the
        dict handed to ``model.fit`` elsewhere in this notebook.
    """
    default_params = {
        'task': 'binary',
        # Half of the reported cores, but never fewer than one:
        # int(0.5 * cpu_count()) evaluates to 0 on a single-core host.
        # NOTE(review): how xLearn treats nthread=0 is not visible here.
        'nthread': max(1, multiprocessing.cpu_count() // 2),
        'opt': 'adagrad',
        'epoch': 100,
        'stop_window': 3,
        'metric': 'auc',
    }
    if params:
        default_params.update(params)
    return default_params


def set_args(model, space, model_path, score_path, y_true):
    """Build the objective function that the skopt optimiser minimises.

    Args:
        model: Object exposing ``fit(params, model_path)`` and
            ``predict(model_path, score_path)``.
        space: List of named skopt dimensions; their names become the keyword
            arguments of the objective via ``skopt.utils.use_named_args``.
        model_path: File the fitted model is written to on every evaluation.
        score_path: File the predictions are written to and read back from.
        y_true: Ground-truth labels that the predictions are compared with.

    Returns:
        A callable taking one point of ``space`` and returning the negated
        ``roc_auc_score`` of the resulting predictions.

    NOTE(review): ``roc_auc_score`` is not among the names imported explicitly
    in the import cell; presumably it arrives through one of the star
    imports -- verify.
    """

    @skopt.utils.use_named_args(space)
    def _objective(**params):
        # Sampled values are layered over the defaults before training.
        model.fit(update_params(params), model_path)
        model.predict(model_path, score_path)

        # Each line of the score file is parsed as one float.
        with open(score_path, 'r') as score_file:
            y_score = np.array(list(map(float, score_file)))

        # Negated because the optimiser minimises its objective.
        return -roc_auc_score(y_true, y_score)

    return _objective

In [8]:
# Ground-truth labels of the evaluation split: the label is the first
# whitespace-delimited token of every line. Parsing the whole token (rather
# than only the first character) keeps labels such as "-1", "+1" or "1.0" and
# lines with leading whitespace from being mis-read or rejected.
with open(test_dataset_path, 'r') as file:
    y_true = np.array([int(float(line.split(None, 1)[0])) for line in file])

In [9]:
# Search space of the optimiser. The dimension names are the keyword names
# that the objective receives; 'lr' and 'lambda' are sampled log-uniformly.
space = [
    skopt.space.Real(0.0125, 0.4, prior='log-uniform', name='lr'),
    skopt.space.Real(0.0005, 0.016, prior='log-uniform', name='lambda'),
    skopt.space.Integer(2, 32, name='k'),
]

# Objective bound to the tuning model, its output paths and the labels.
objective = set_args(model, space, model_path, score_path, y_true)

In [None]:
# Run the search: 30 objective evaluations with a fixed random_state.
# get_elapsed_time comes from the project's utils star imports.
with get_elapsed_time():
    results = skopt.forest_minimize(
        objective,
        space,
        base_estimator='ET',
        acq_func='EI',
        xi=0.01,
        n_calls=30,
        n_jobs=1,
        random_state=42,
        verbose=True,
    )

Iteration No: 1 started. Evaluating function at random point.
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 117.2049
Function value obtained: -0.7154
Current minimum: -0.7154
Iteration No: 2 started. Evaluating function at random point.
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 142.9001
Function value obtained: -0.7219
Current minimum: -0.7219
Iteration No: 3 started. Evaluating function at random point.
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 250.6551
Function value obtained: -0.7239
Current minimum: -0.7239
Iteration No: 4 started. Evaluating function at random point.
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 128.3695
Function value obtained: -0.7124
Current minimum: -0.7239
Iteration No: 5 started. Evaluating function at random point.
Iteration No: 5 ended. Evaluation done at random point.
Time taken: 324.0491
Function value obtained: -0.7203
Current minimum: -0.7239
Iteration No: 6

In [None]:
# Persist the optimisation result as <model>_opt_<train split>.pkl in
# MODEL_DIR, with store_objective=False passed to skopt.dump.
skopt.dump(
    results,
    os.path.join(MODEL_DIR, '_'.join((model_type, 'opt', train_dataset_type)) + '.pkl'),
    store_objective=False,
)

In [None]:
# Diagnostic plots for the finished search. Assigning each return value to
# `_` keeps the cell from echoing it as output.
_ = skopt.plots.plot_convergence(results)
_ = skopt.plots.plot_evaluations(results)
_ = skopt.plots.plot_objective(results)

In [None]:
# Reload the stored optimisation result. This must run BEFORE
# train_dataset_type is reassigned below, because the file name is built from
# its current value; re-running this cell after it has executed once therefore
# looks for a differently named file.
# NOTE(review): skopt.load presumably unpickles the file -- only load result
# files produced by this notebook.
results = skopt.load(
    os.path.join(MODEL_DIR, '_'.join((model_type, 'opt', train_dataset_type)) + '.pkl'))

# The final model is fitted on 'train+valid' and evaluated on 'test'.
train_dataset_type, test_dataset_type = 'train+valid', 'test'

train_dataset_path = os.path.join(
    DATA_DIR, '_'.join((model_type, 'dataset', train_dataset_type)) + '.libsvm')
test_dataset_path = os.path.join(
    DATA_DIR, '_'.join((model_type, 'dataset', test_dataset_type)) + '.libsvm')
model_path = os.path.join(
    MODEL_DIR, '_'.join((model_type, 'model', train_dataset_type)) + '.out')
score_path = os.path.join(
    MODEL_DIR, '_'.join((model_type, 'score', test_dataset_type)) + '.txt')

In [None]:
# Point the existing model object at the datasets of the final run.
model.setTrain(train_dataset_path)
# NOTE(review): the validation file is the same file as the test set here. If
# xLearn uses the validation set for early stopping (the defaults set
# 'stop_window'), the test metrics reported below may be optimistic -- confirm.
model.setValidate(test_dataset_path)
model.setTest(test_dataset_path)

In [None]:
# Ground-truth labels of the current test split: the label is the first
# whitespace-delimited token of every line. Parsing the whole token (rather
# than only the first character) keeps labels such as "-1", "+1" or "1.0" and
# lines with leading whitespace from being mis-read or rejected.
with open(test_dataset_path, 'r') as file:
    y_true = np.array([int(float(line.split(None, 1)[0])) for line in file])
# Mean label of the split; used below as the decision threshold and as the
# denominator of the calibration ratio.
ctr = y_true.mean()

In [None]:
# Fit the final model with the best point found by the search and score the
# test split. The names must stay in the same order as the dimensions of
# `space` ('lr', 'lambda', 'k'), because results.x is positional.
with get_elapsed_time():
    model.fit(update_params(dict(zip(['lr', 'lambda', 'k'], results.x))), model_path)
    model.predict(model_path, score_path)

In [None]:
# Read the predictions back: each line of the score file is parsed as a float.
with open(score_path, 'r') as file:
    y_score = np.array(list(map(float, file)))

In [None]:
# Hard predictions from the scores; get_y_pred is a project helper that
# presumably thresholds the scores at ctr (verify against utils).
y_pred = get_y_pred(y_score, ctr)

# Score-based summaries: calibration is mean predicted score over ctr.
norm_entropy = get_norm_entropy(y_true, y_score)
calibration = y_score.mean() / ctr

# Classification metrics computed on the hard predictions.
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

# Plot helpers from the project's utils; the returned values of the first
# three are kept so that they can be persisted afterwards.
confusion_matrix = plot_confusion_matrix(y_true, y_pred)
auroc = plot_roc_curve(y_true, y_score)
auprc = plot_pr_curve(y_true, y_score)
_ = plot_lift_curve(y_true, y_score)
_ = plot_class_density(y_true, y_score, threshold=ctr)

In [None]:
# Persist the evaluation metrics as one tuple in
# <model>_metric_<train split>.pkl inside MODEL_DIR.
dump_pickle(
    os.path.join(MODEL_DIR, '_'.join((model_type, 'metric', train_dataset_type)) + '.pkl'),
    (
        norm_entropy,
        calibration,
        accuracy,
        precision,
        recall,
        f1,
        confusion_matrix,
        auroc,
        auprc,
    ),
)

In [None]:
# Switch the scoring target to the 'quiz' split; model_path is left unchanged,
# so the model fitted above is the one that gets applied.
test_dataset_type = 'quiz'
test_dataset_path = os.path.join(
    DATA_DIR, '_'.join((model_type, 'dataset', test_dataset_type)) + '.libsvm')
score_path = os.path.join(
    MODEL_DIR, '_'.join((model_type, 'score', test_dataset_type)) + '.txt')

In [None]:
# Register the quiz split as the test set, then write its predictions to
# score_path using the model stored at model_path. Only the prediction is timed.
model.setTest(test_dataset_path)
with get_elapsed_time():
    model.predict(model_path, score_path)