In [1]:
import numpy as np
from scipy import linalg, optimize

MAX_ITER = 100


def group_lasso(X, y, alpha, groups, max_iter=MAX_ITER, rtol=1e-6,
                verbose=False):
    """
    Linear least-squares with l2/l1 regularization solver.

    Solves problems of the form:

        .5 * |Xb - y|^2 + n_samples * alpha * Sum(|b_j|)

    where |.| is the l2-norm and b_j is the coefficients of b in the
    j-th group. This is commonly known as the `group lasso`. The same
    penalty ``alpha`` is applied to every group (no per-group weights).

    Parameters
    ----------
    X : array of shape (n_samples, n_features)
        Design Matrix.
    y : array of shape (n_samples,)
        Target vector.
    alpha : float
        Amount of penalization to use. It is multiplied by n_samples
        internally.
    groups : array of shape (n_features,)
        Group label. For each column, it indicates the group it
        belongs to.
    max_iter : int
        Maximum number of passes over all the groups.
    rtol : float
        Relative tolerance on the duality gap: iteration stops as soon
        as ``|dual_gap / dual_obj| < rtol``.
    verbose : bool
        If True, print the relative duality gap after every pass.

    Returns
    -------
    x : array of shape (n_features,)
        vector of coefficients

    Raises
    ------
    ValueError
        If ``groups`` does not have one entry per column of ``X``.

    References
    ----------
    "Efficient Block-coordinate Descent Algorithms for the Group Lasso",
    Qin, Scheinberg, Goldfarb
    """

    # .. local variables ..
    X, y, groups, alpha = map(np.asanyarray, (X, y, groups, alpha))
    if len(groups) != X.shape[1]:
        raise ValueError("Incorrect shape for groups")
    # Coefficients are real-valued even for an integer design matrix;
    # inheriting an integer dtype would silently truncate them.
    coef_dtype = X.dtype if np.issubdtype(X.dtype, np.inexact) else np.float64
    w_new = np.zeros(X.shape[1], dtype=coef_dtype)
    alpha = alpha * X.shape[0]

    # .. use integer indices for groups ..
    group_labels = [np.where(groups == i)[0] for i in np.unique(groups)]
    H_groups = [np.dot(X[:, g].T, X[:, g]) for g in group_labels]
    eig = list(map(linalg.eigh, H_groups))
    Xy = np.dot(X.T, y)
    # Warm start (one value per group) for the Newton root search below.
    initial_guess = np.zeros(len(group_labels))

    def f(x, qp2, eigvals, alpha):
        # .. secular equation whose root is the norm of the group block ..
        return 1 - np.sum(qp2 / ((x * eigvals + alpha) ** 2))

    def df(x, qp2, eigvals, penalty):
        # .. first derivative ..
        return np.sum((2 * qp2 * eigvals) / ((penalty + x * eigvals) ** 3))

    # Only form the full Gram matrix when it is the smaller object.
    if X.shape[0] > X.shape[1]:
        H = np.dot(X.T, X)
    else:
        H = None

    def group_gradient(g, w):
        # .. X_g' (X w - y), with or without the precomputed Gram matrix ..
        if H is not None:
            return np.dot(H[g], w) - Xy[g]
        return np.dot(X[:, g].T, np.dot(X, w)) - Xy[g]

    for _ in range(max_iter):
        for i, g in enumerate(group_labels):
            # .. shrinkage operator ..
            eigvals, eigvects = eig[i]
            w_i = w_new.copy()
            w_i[g] = 0.
            X_residual = group_gradient(g, w_i)
            if len(g) < 2:
                # for single groups we know a closed form solution:
                # soft-thresholding, scaled by x_g' x_g (== eigvals[0])
                r = X_residual[0]
                excess = abs(r) - alpha
                if excess > 0:
                    # excess > 0 implies r != 0, hence a non-zero column
                    w_new[g] = - np.sign(r) * excess / eigvals[0]
                else:
                    w_new[g] = 0.
            else:
                if alpha < linalg.norm(X_residual, 2):
                    qp = np.dot(eigvects.T, X_residual)
                    # The loose tolerance is compensated by the outer
                    # loop, which restarts from the previous root.
                    initial_guess[i] = optimize.newton(
                        f, initial_guess[i], df, tol=.5,
                        args=(qp ** 2, eigvals, alpha))
                    w_new[g] = - initial_guess[i] * np.dot(
                        eigvects / (eigvals * initial_guess[i] + alpha), qp)
                else:
                    w_new[g] = 0.

        # .. dual gap ..
        residual = np.dot(X, w_new) - y
        group_norm = alpha * np.sum([linalg.norm(w_new[g], 2)
                                     for g in group_labels])
        norm_Anu = np.array([linalg.norm(group_gradient(g, w_new))
                             for g in group_labels])
        if np.any(norm_Anu > alpha):
            # scale the residual so that it is dual feasible
            nnu = residual * (alpha / norm_Anu.max())
        else:
            nnu = residual
        primal_obj = .5 * np.dot(residual, residual) + group_norm
        dual_obj = -.5 * np.dot(nnu, nnu) - np.dot(nnu, y)
        dual_gap = primal_obj - dual_obj
        if verbose:
            print('Relative error: %s' % (dual_gap / dual_obj))
        if np.abs(dual_gap / dual_obj) < rtol:
            break

    return w_new


def check_kkt(A, b, x, penalty, groups):
    """Verify the group-lasso optimality (KKT) conditions for ``x``.

    ``penalty`` is scaled by the number of rows of ``A`` before use.
    For every group, the gradient block ``A_g' (A x - b)`` must either
    have norm below the scaled penalty (when the coefficient block is
    exactly zero) or be close to ``-penalty * x_g / |x_g|``.

    Returns True if conditions are satisfied, False otherwise
    """
    scaled_penalty = penalty * A.shape[0]
    gradient = np.dot(A.T, np.dot(A, x) - b)
    # Loose tolerance: absolute slack for zero blocks, relative
    # tolerance (rtol of np.allclose) for non-zero blocks.
    tolerance = 1e-1
    for label in np.unique(groups):
        members = groups == label
        block_norm = linalg.norm(x[members])
        if block_norm == 0:
            satisfied = linalg.norm(gradient[members]) < scaled_penalty + tolerance
        else:
            expected = - scaled_penalty * x[members] / block_norm
            satisfied = np.allclose(gradient[members], expected, tolerance)
        if not satisfied:
            return False
    return True




In [2]:
def tensor_to_matrix_group_preds(preds):
    """Flatten a 3-d prediction tensor into a 2-d design matrix.

    The first axis is kept; the second and third axes are merged, so
    an array of shape (a, b, c) becomes one of shape (a, b * c).
    """
    rows = preds.shape[0]
    columns = preds.shape[1] * preds.shape[2]
    return preds.reshape(rows, columns)


In [6]:
import numpy
from ensemble import backward_greedy_ensemble_search
from ensemble import collect_predictions_from_dirs

from dataset import load_data_frame
from dataset import get_answers

from evaluation import compute_accuracy
from evaluation import hard_abs_correlation_of_preds

from ensemble import majority_vote, hard_averaging_predictions
from ensemble import forward_metric_pred_selection

# Directories scanned for per-model score files (project helper below).
dirs = ['../scores/fg',
        '../scores/word2vec/'
        # '../grid-scores/'
       #'../models/word2vec/exp_2015-12-09_06-59-30/',
       #'../models/word2vec/exp_2015-12-09_09-32-45/',
       #'../models/word2vec/exp_2015-12-09_11-02-01/'
       ]
preds = collect_predictions_from_dirs(dirs,
                                    'train.scores')

# preds is indexed as (item, feature, predictor).
n_items = preds.shape[0]
n_features = preds.shape[1]
n_predictors = preds.shape[2]
n_answers = 4    
    
print(preds.shape)
print(preds[:2, :, :])

correct_answers = get_answers(load_data_frame('../data/training_set.tsv'), numeric=True)

# One row per item, one column per (feature, predictor) pair.
X = tensor_to_matrix_group_preds(preds)
print(X.shape)
print(X[:2])

y = correct_answers

alpha = .1
# One group per predictor: column f * n_predictors + p gets label p.
# NOTE(review): this uses n_answers rather than n_features, so the length
# only matches X.shape[1] when preds.shape[1] == 4 -- confirm.
groups = numpy.array([i for j in range(n_answers) for i in range(n_predictors)])
print(groups)

coefs = group_lasso(X, y, alpha, groups, verbose=True)
print( 'KKT conditions verified:', check_kkt(X, y, coefs, alpha, groups))
print(coefs)

*** Processing dir: ../scores/fg
Processing file: ../scores/fg/1002-1004-1017-1019-1022-1024/1024/1024_fg_std_dflt_merge_only_flashcards_top3_train.scores
Processing file: ../scores/fg/1002-1004-1017-1019-1022-1024/1002/1002_fg_eng_dflt_merge.corpus-b_c_s_o_wtv10_top3_train.scores
Processing file: ../scores/fg/1002-1004-1017-1019-1022-1024/1004/1004_fg_eng_dflt_merge.corpus_all_wv10_train.scores
Processing file: ../scores/fg/1002-1004-1017-1019-1022-1024/1017/1017_fg_eng_dflt_merge_only_flashcards_top3_train.scores
Processing file: ../scores/fg/1002-1004-1017-1019-1022-1024/1022/1022_fg_eng_dflt_merge_only_flashcards_top8_train.scores
Processing file: ../scores/fg/1002-1004-1017-1019-1022-1024/1019/1019_fg_eng_dflt_merge_only_flashcards_top5_train.scores
Processing file: ../scores/fg/060-062-067-070-071-072-074-075/062/062_fg_eng_dflt_concepts+ext-studystack+openstax_train.scores
Processing file: ../scores/fg/060-062-067-070-071-072-074-075/071/071_fg_std_dflt_merge.corpus.txt (b,cp,ss

In [7]:
import numpy
from ensemble import backward_greedy_ensemble_search
from ensemble import collect_predictions_from_dirs

from dataset import load_data_frame
from dataset import get_answers

from evaluation import compute_accuracy
from evaluation import hard_abs_correlation_of_preds
from evaluation import hard_preds

from ensemble import majority_vote, hard_averaging_predictions
from ensemble import forward_metric_pred_selection

# Directories scanned for per-model score files (project helper below).
dirs = ['../scores/fg',
        '../scores/word2vec/'
        # '../grid-scores/'
       #'../models/word2vec/exp_2015-12-09_06-59-30/',
       #'../models/word2vec/exp_2015-12-09_09-32-45/',
       #'../models/word2vec/exp_2015-12-09_11-02-01/'
       ]
preds = collect_predictions_from_dirs(dirs,
                                    'train.scores')

# preds is indexed as (item, feature, predictor).
n_items = preds.shape[0]
n_features = preds.shape[1]
n_predictors = preds.shape[2]
n_answers = 4    
    
print(preds.shape)
print(preds[:2, :, :])

# Ground-truth labels for the training set, requested in numeric form.
correct_answers = get_answers(load_data_frame('../data/training_set.tsv'), numeric=True)

def w_avg_f(w):
    """Objective for the weight search: 1 - accuracy of the weighted ensemble.

    ``w`` holds one weight per predictor (last axis of the global ``preds``);
    it is normalised to sum to one before being applied. Relies on the
    notebook globals ``preds`` and ``correct_answers`` and on the project
    helpers ``hard_preds`` / ``compute_accuracy``.
    """
    w = w / sum(w)
    # Broadcast the weights over the item and answer axes.
    w_preds = preds * w[numpy.newaxis, numpy.newaxis, :]
    # Score the *weighted* predictions; scoring the raw ``preds`` would make
    # the objective independent of ``w``.
    h_w_preds = hard_preds(w_preds)
    acc = compute_accuracy(correct_answers, h_w_preds)
    return 1.0 - acc

from scipy.optimize import minimize

# Random starting weights, one per predictor, scaled by the predictor count.
# NOTE(review): no random seed is set, so the starting point changes per run.
w_0 = numpy.random.rand(preds.shape[2]) / preds.shape[2]
print('initial weights', w_0)
# Derivative-free search over the ensemble weights.
minimize(w_avg_f, w_0, method='Powell')

*** Processing dir: ../scores/fg
Processing file: ../scores/fg/1002-1004-1017-1019-1022-1024/1024/1024_fg_std_dflt_merge_only_flashcards_top3_train.scores
Processing file: ../scores/fg/1002-1004-1017-1019-1022-1024/1002/1002_fg_eng_dflt_merge.corpus-b_c_s_o_wtv10_top3_train.scores
Processing file: ../scores/fg/1002-1004-1017-1019-1022-1024/1004/1004_fg_eng_dflt_merge.corpus_all_wv10_train.scores
Processing file: ../scores/fg/1002-1004-1017-1019-1022-1024/1017/1017_fg_eng_dflt_merge_only_flashcards_top3_train.scores
Processing file: ../scores/fg/1002-1004-1017-1019-1022-1024/1022/1022_fg_eng_dflt_merge_only_flashcards_top8_train.scores
Processing file: ../scores/fg/1002-1004-1017-1019-1022-1024/1019/1019_fg_eng_dflt_merge_only_flashcards_top5_train.scores
Processing file: ../scores/fg/060-062-067-070-071-072-074-075/062/062_fg_eng_dflt_concepts+ext-studystack+openstax_train.scores
Processing file: ../scores/fg/060-062-067-070-071-072-074-075/071/071_fg_std_dflt_merge.corpus.txt (b,cp,ss

     fun: 1.0
 message: 'Optimization terminated successfully.'
     nit: 1
 success: True
  status: 0
    nfev: 241
   direc: array([[ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  

In [9]:
import numpy
from ensemble import backward_greedy_ensemble_search
from ensemble import collect_predictions_from_dirs

from dataset import load_data_frame
from dataset import get_answers

from evaluation import compute_accuracy
from evaluation import hard_abs_correlation_of_preds
from evaluation import hard_preds

from ensemble import majority_vote, hard_averaging_predictions
from ensemble import forward_metric_pred_selection

# Directories scanned for per-model score files (project helper below).
dirs = ['../models/xgboost/1127/1127',
        '../models/onevs/1164/stacking-1164/'
        # '../grid-scores/'
       #'../models/word2vec/exp_2015-12-09_06-59-30/',
       #'../models/word2vec/exp_2015-12-09_09-32-45/',
       #'../models/word2vec/exp_2015-12-09_11-02-01/'
       ]
preds = collect_predictions_from_dirs(dirs,
                                    'train.scores')

# preds is indexed as (item, feature, predictor).
n_items = preds.shape[0]
n_features = preds.shape[1]
n_predictors = preds.shape[2]
n_answers = 4    
    
print(preds.shape)
print(preds[:2, :, :])

# Ground-truth labels for the training set, requested in numeric form.
correct_answers = get_answers(load_data_frame('../data/training_set.tsv'), numeric=True)

def w_avg_f(w):
    """Return 1 - accuracy of the ensemble obtained by weighting ``preds``.

    The weights are normalised to sum to one and broadcast along the last
    (predictor) axis of the global ``preds`` tensor before scoring against
    the global ``correct_answers``.
    """
    normalised = w / sum(w)
    weighted = preds * normalised[numpy.newaxis, numpy.newaxis, :]
    accuracy = compute_accuracy(correct_answers, hard_preds(weighted))
    return 1.0 - accuracy

def pos_cons(w):
    """Equality-constraint residual: zero exactly when the weights sum to one."""
    total = sum(w)
    return total - 1

# Equality constraint for the optimiser: pos_cons(w) == 0, i.e. sum(w) == 1.
constraint = {'type': 'eq', 'fun':pos_cons}

from scipy.optimize import minimize

# Fixed starting weights, one per prediction source in `dirs`.
w_0 = numpy.array([0.2, 0.8])
print('initial weights', w_0)
# NOTE(review): the recorded output shows the optimiser stopping after one
# iteration at the initial point -- presumably because the accuracy-based
# objective is flat under small changes of w; confirm before relying on it.
minimize(w_avg_f, w_0,method='SLSQP', constraints=constraint)

*** Processing dir: ../models/xgboost/1127/1127
Processing file: ../models/xgboost/1127/1127/xgb_0.1_10_0.6_0.3_0.8_5_2_0.0_15_50.train.scores
*** Processing dir: ../models/onevs/1164/stacking-1164/
Processing file: ../models/onevs/1164/stacking-1164/train.scores
(2500, 4, 2)
[[[ 0.22804396  0.19613358]
  [ 0.18050843  0.27808684]
  [ 0.25182936  0.25237604]
  [ 0.33961827  0.27340355]]

 [[ 0.09854157  0.14441347]
  [ 0.20936847  0.27017365]
  [ 0.58981222  0.40545447]
  [ 0.10227774  0.17995841]]]
Loaded datasets with 2500 X 7 in 0.020415234999745735 secs

initial weights [ 0.2  0.8]


     fun: 1.0
     nit: 1
  status: 0
       x: array([ 0.2,  0.8])
    njev: 1
 message: 'Optimization terminated successfully.'
     jac: array([ 0.,  0.,  0.])
    nfev: 4
 success: True

In [8]:
# Hard-coded weight vector (20 values), pasted in as a literal.
x = numpy.array([ 2.62944815,  2.5896517 ,  2.60614448,  2.61829134,  2.63387744,
        2.63647428,  2.61069725,  2.63760291,  2.62560941,  2.62649569,
        2.60217856,  2.63397974,  2.60350483,  2.58887998,  2.62419212,
        2.61737595,  2.63287385,  2.62419227,  2.61591753,  2.62999499])
# Rescale so the weights sum to one.
print(x / sum(x))

[ 0.05019239  0.04943274  0.04974756  0.04997943  0.05027694  0.05032651
  0.04983447  0.05034806  0.05011912  0.05013604  0.04967186  0.0502789
  0.04969717  0.04941801  0.05009206  0.04996195  0.05025779  0.05009207
  0.04993411  0.05020283]


In [2]:
import numpy
from ensemble import preds_tensor_to_group
from ensemble import GroupEnsemble
from dataset import get_ids

from ensemble import majority_vote, collect_predictions_from_dir, collect_predictions_from_dirs


from sklearn import linear_model

# Score directories to combine.
# NOTE(review): these are absolute, machine-specific paths; the notebook will
# not run elsewhere without editing them.
dirs = [
    '/media/valerio/formalità/scores/ir_baseline/1201/noneg/noneg_1201/',
    '/media/valerio/formalità/scores/ir_baseline/1201/neg/neg_1201/',
    '/media/valerio/formalità/scores/ir_baseline/1201/nowiki_neg/',
    '/media/valerio/formalità/scores/word2vec-s/ck12/3/',
    '/media/valerio/formalità/scores/word2vec-s/concepts/3/',
    '/media/valerio/formalità/scores/word2vec-s/studystack/3/',
    '/media/valerio/formalità/scores/ir_baseline/1201/nowiki_noneg/',
    '/media/valerio/formalità/scores/ir_baseline/studystack/neg/',
    '/media/valerio/formalità/scores/ir_baseline/studystack/noneg/',
    '/media/valerio/formalità/scores/ir_baseline/quizlet/neg/',
    '/media/valerio/formalità/scores/ir_baseline/quizlet/noneg/',
    #'/media/valerio/formalità/scores/ir_baseline/ck12/neg/',
    #'/media/valerio/formalità/scores/ir_baseline/ck12/noneg/',
    #'/media/valerio/formalità/scores/ir_baseline/concepts/neg/',
    #'/media/valerio/formalità/scores/ir_baseline/concepts/noneg/',
    
         
       ]
# dirs = ['../models/word2vec/041/3/',
#        '../models/word2vec/051/1/',
#       '../models/word2vec/065/1/',
#       '../models/word2vec/066/1/']

# File-name pattern for training score files: either '*train.scores' or
# 'tr..._<bm25|vsm|lm2|dfr2>..._<1-3>.scores'. The commented variants widen
# the trailing index range.
# NOTE(review): the dots are unescaped, so as a regex they match any character.
# train_pattern = 'train.scores|^tr.*_(bm25|vsm|lm2|dfr2).*_([1-5]).scores'
train_pattern = 'train.scores|^tr.*_(bm25|vsm|lm2|dfr2).*_([1-3]).scores'
# train_pattern = 'train.scores|^tr.*_(bm25|vsm|lm2|dfr2).*_([1-9]|10).scores'


train_preds = collect_predictions_from_dirs(dirs,
                                    train_pattern)

# Same selection for the validation score files ('valid' / 'ts' prefix).
# valid_pattern = 'valid.scores|^ts.*_(bm25|vsm|lm2|dfr2).*_([1-5]).scores'
valid_pattern = 'valid.scores|^ts.*_(bm25|vsm|lm2|dfr2).*_([1-3]).scores'
# valid_pattern = 'valid.scores|^ts.*_(bm25|vsm|lm2|dfr2).*_([1-9]|10).scores'

valid_preds = collect_predictions_from_dirs(dirs, valid_pattern)

from ensemble import augment_predictions

# Optional feature augmentation through the project helper (disabled).
augment = False
if augment:
    train_preds = augment_predictions(train_preds, logify=True, aggr_funcs=[])
    valid_preds = augment_predictions(valid_preds, logify=True, aggr_funcs=[])
    
# Optional in-place log(1 + x) transform of the raw scores (disabled).
logify_only = False
if logify_only:
    train_preds = numpy.log1p(train_preds)
    valid_preds = numpy.log1p(valid_preds)

print(train_preds.shape)
print(valid_preds.shape)

from dataset import load_data_frame
from dataset import get_answers

# Training questions: ids and numeric ground-truth answers.
train_frame = load_data_frame('../data/training_set.tsv')
train_frame_ids = get_ids(train_frame)
correct_answers = get_answers(train_frame, numeric=True)

*** Processing dir: /media/valerio/formalità/scores/ir_baseline/1201/noneg/noneg_1201/
Processing file: /media/valerio/formalità/scores/ir_baseline/1201/noneg/noneg_1201/tr_bm25_en_1.scores
Processing file: /media/valerio/formalità/scores/ir_baseline/1201/noneg/noneg_1201/tr_bm25_en_2.scores
Processing file: /media/valerio/formalità/scores/ir_baseline/1201/noneg/noneg_1201/tr_bm25_en_3.scores
Processing file: /media/valerio/formalità/scores/ir_baseline/1201/noneg/noneg_1201/tr_bm25_st_1.scores
Processing file: /media/valerio/formalità/scores/ir_baseline/1201/noneg/noneg_1201/tr_bm25_st_2.scores
Processing file: /media/valerio/formalità/scores/ir_baseline/1201/noneg/noneg_1201/tr_bm25_st_3.scores
Processing file: /media/valerio/formalità/scores/ir_baseline/1201/noneg/noneg_1201/tr_dfr2_en_1.scores
Processing file: /media/valerio/formalità/scores/ir_baseline/1201/noneg/noneg_1201/tr_dfr2_en_2.scores
Processing file: /media/valerio/formalità/scores/ir_baseline/1201/noneg/noneg_1201/tr_dfr

In [3]:
from ensemble import backward_greedy_preds_search_cv

# Cross-validated backward greedy predictor search (project helper), run with
# three seeds and 3 folds. The recorded run was interrupted by hand
# (KeyboardInterrupt in the output), so `best_acc` may not have been assigned.
best_acc = backward_greedy_preds_search_cv(train_preds, correct_answers, seeds=[1337, 6666, 7777], n_folds=3)
print('BEST', best_acc)

Considering all predictions: acc 0.5719862495896563
Found an improvement by removing 122	 [acc:0.5731846525267561]
Found an improvement by removing 144	 [acc:0.5739844928451959]
Found an improvement by removing 72	 [acc:0.5745202794767531]
Found an improvement by removing 18	 [acc:0.5753233181419352]
Found an improvement by removing 75	 [acc:0.5761234802534556]
Found an improvement by removing 51	 [acc:0.5763896146513265]
Found an improvement by removing 142	 [acc:0.576790415997506]
Found an improvement by removing 9	 [acc:0.577189776762847]
Found an improvement by removing 60	 [acc:0.5778570319959705]
Found an improvement by removing 16	 [acc:0.5779907381730521]
Found an improvement by removing 151	 [acc:0.5781241248610606]
Found an improvement by removing 15	 [acc:0.5782579913586808]
Found an improvement by removing 58	 [acc:0.5786584720637834]
Found an improvement by removing 11	 [acc:0.5786584720637833]
Found an improvement by removing 90	 [acc:0.5786576727650986]
Found an improvem

KeyboardInterrupt: 

In [4]:
# Hard-coded boolean mask over the predictor axis (True = keep).
# NOTE(review): presumably pasted from a previous selection run -- its
# provenance is not recorded here.
feature_ids = numpy.array([ True, False, False, False, False, False, False, False,  True,
       False, False, False,  True,  True,  True,  True, False, False,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False, False, False, False,  True, False, False, False,
       False, False,  True, False, False,  True, False,  True,  True,
       False,  True,  True,  True,  True,  True, False, False, False,
        True,  True,  True,  True, False,  True, False, False,  True,
       False,  True,  True, False,  True, False,  True, False, False,
       False, False, False,  True,  True, False,  True,  True,  True,
        True,  True,  True, False, False, False,  True, False, False,
       False,  True,  True, False, False, False,  True,  True, False,
       False, False, False, False,  True, False, False, False, False,
        True, False,  True, False, False, False,  True,  True, False,
       False,  True, False, False, False,  True, False, False, False,
       False, False,  True, False, False], dtype=bool)

# Keep only the selected predictors. This overwrites train_preds/valid_preds,
# so re-running the cell on the already-reduced arrays will fail on the mask
# length.
train_preds = train_preds[:, :, feature_ids]
valid_preds = valid_preds[:, :, feature_ids]
print(train_preds.shape)
print(valid_preds.shape)

(2500, 4, 56)
(8132, 4, 56)


In [1]:
from dataset import load_data_frame
from dataset import get_answers
from dataset import get_ids

from evaluation import normalize_preds

# Training questions: ids and numeric ground-truth answers.
train_frame = load_data_frame('../data/training_set.tsv')
train_frame_ids = get_ids(train_frame)
correct_answers = get_answers(train_frame, numeric=True)

import numpy

# Stacked prediction tensors saved by an earlier run (model 1142 / 1143).
train_preds = numpy.load('../models/onevs/1142/stacking-1143/train.stacking.npy')
print(train_preds.shape)

valid_preds = numpy.load('../models/onevs/1142/stacking-1143/valid.stacking.npy')
print(valid_preds.shape)

# Optional normalisation through the project helper (disabled). The flag is
# read again by the cell that writes the submission files.
normalize = False
if normalize:
    print('Before normalizing', train_preds[0, :, 0], valid_preds[0, :, 0])
    train_preds = normalize_preds(train_preds, cos_sim=False, eps=1e-10)
    valid_preds = normalize_preds(valid_preds, cos_sim=False, eps=1e-10)
    print('After normalizing', train_preds[0, :, 0], valid_preds[0, :, 0])

Loaded datasets with 2500 X 7 in 0.01579357499986145 secs

(2500, 4, 4)
(8132, 4, 4)


In [8]:

from sklearn import linear_model
from sklearn import svm
from sklearn import naive_bayes
from sklearn import ensemble

from sklearn import cross_validation

from time import perf_counter

from evaluation import hard_preds
from evaluation import compute_accuracy
from evaluation import create_submission
from dataset import numbers_to_letters
from dataset import save_predictions
from dataset import get_ids
from dataset import load_data_frame

from ensemble import OneVsRestClassifier
from ensemble import OneVsOneClassifier
from ensemble import averaging_predictions

from ensemble import Calibrator

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import SelectFpr
from sklearn.feature_selection import chi2
import sklearn
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn import neighbors

from sklearn.preprocessing import StandardScaler

from evaluation import min_max_normalize_preds
from evaluation import softmax_normalize_preds

# Validation questions; their ids are needed when writing submissions.
valid_frame = load_data_frame('../data/validation_set.tsv')
valid_ids = get_ids(valid_frame)

n_answers = 4
n_items = train_preds.shape[0]
n_folds = 10

# One full cross-validation is run per seed.
random_states = [1337, 6666, 7777, 5555]

# Mean test-fold accuracy of each seed's cross-validation.
valid_accs = []

# Repeated stratified k-fold evaluation of a one-vs-rest stacking model.
# The body reassigns `base_model` / `base_model_params` many times; only the
# LAST assignment before the fold loop (LogisticRegression, C=10.0) is used.
# The earlier ones are leftovers of previous experiments.
for r, rand_s in enumerate(random_states):
    print('\nRAND SEED {0}:\n'.format(rand_s))
    # kfold = cross_validation.KFold(n_items, n_folds, shuffle=True, random_state=1337)
    kfold = cross_validation.StratifiedKFold(correct_answers, n_folds, shuffle=True, random_state=rand_s)

    # Test-fold accuracies for this seed.
    accs = []

    #
    # transform labels

    

    base_model = ensemble.RandomForestClassifier
    base_model_params = {'n_estimators':60, 'max_depth':4, 'criterion':'gini', 'n_jobs':-1, 'class_weight':'balanced'}

    base_model = ensemble.AdaBoostClassifier
    base_model_params = {'n_estimators':60, 'learning_rate':0.1}
    
    base_model = ensemble.ExtraTreesClassifier
    base_model_params = {'n_estimators':240, 'max_depth':4, 'criterion':'gini', 'n_jobs':-1, 'class_weight':'balanced'}
    
    base_model = ensemble.GradientBoostingClassifier
    base_model_params = {'loss':'deviance',
                    'learning_rate':0.1,
                    'n_estimators':30,
                    'subsample':1.0,
                    'min_samples_split':2,
                    'min_samples_leaf':1,
                    'min_weight_fraction_leaf':0.0,
                    'max_depth':3}

    
    
    # Feature selector / transformer settings. They are only referenced from
    # commented-out arguments below, so they currently have no effect.
    base_feature_sel = SelectFpr# SelectKBest
    base_feature_sel_params = {'score_func':f_classif, 
                               #'k':100
                              }
    
    base_feature_tra = StandardScaler
    base_feature_tra_params = {}
    base_feature_tra_dict = {'base_transformer':base_feature_tra,
                            'base_transformer_params':base_feature_tra_params}
    
    base_feature_sel_dict = {'base_selector':base_feature_sel,
                            'base_selector_params':base_feature_sel_params
                            }
    
    
    
    base_model = sklearn.naive_bayes.GaussianNB
    base_model_params = {}
    
    
    
    base_model = QuadraticDiscriminantAnalysis
    base_model_params = {'reg_param':0.01}
    
    base_model = sklearn.svm.SVC
    base_model_params = {'probability':True, 'kernel':'linear', 'class_weight':'balanced', 'C':3.0}
    
    # base_model = neighbors.KNeighborsClassifier
    # base_model_params = {'n_neighbors':101, 'weights':'distance', 'p':2, 'metric':'minkowski', 'metric_params':None, 'n_jobs':-1}
    
    caliber_base_model = linear_model.LogisticRegression
    caliber_base_model_params = {'fit_intercept':True, 'class_weight':'balanced', 'penalty':'l2', 'C':10.0}

    base_model = linear_model.LogisticRegression
    base_model_params = {'fit_intercept':True, 'class_weight':'balanced', 'penalty':'l2', 'C':0.8, 'max_iter':200}
    
    
    base_model = LinearDiscriminantAnalysis
    base_model_params = {'solver':'lsqr', 'shrinkage':0.3, }
    
    base_model = ensemble.AdaBoostClassifier
    base_model_params = {'n_estimators':100, 'learning_rate':0.15}
    
    base_model = ensemble.ExtraTreesClassifier
    base_model_params = {'n_estimators':150, 'max_depth':6, 'criterion':'gini', 'n_jobs':-1, 'class_weight':'balanced'}
    
    # Calibration settings; the calibration branch is disabled below.
    normalizers = [min_max_normalize_preds, softmax_normalize_preds]
    calibration = False

    # Effective configuration: these are the values the fold loop sees.
    base_model = linear_model.LogisticRegression
    base_model_params = {'fit_intercept':True, 'class_weight':'balanced', 'penalty':'l2', 'C':10.0, 'max_iter':200}
    
    caliber_base_model = linear_model.LogisticRegression
    caliber_base_model_params = {'fit_intercept':True, 'class_weight':'balanced', 'penalty':'l2', 'C':10.0}
    
    for k, (train_ids, test_ids) in enumerate(kfold):
        print('Fold', k)
    
        train_x = train_preds[train_ids]
        train_y = correct_answers[train_ids]
        test_x = train_preds[test_ids]
        test_y = correct_answers[test_ids]
        
        # NOTE(review): when enabled, only train_x is passed through the
        # calibrator; test_x is left untouched, so the model would be
        # evaluated on differently-processed features -- confirm intent.
        if calibration:
            caliber = Calibrator(caliber_base_model,
                                 normalizers=normalizers,
                                 **caliber_base_model_params)
            caliber.fit(train_x, train_y)
            train_x = caliber.predict(train_x)
    
        model = OneVsRestClassifier(base_model, 
                                    #feature_transformer=base_feature_tra_dict,
        #model = OneVsOneClassifier(base_model, aggr_func=None,
                                    feature_selector=None,#base_feature_sel_dict,
                                    **base_model_params)
    
        #
        # fitting
        # print('Fitting')
        model.fit(train_x, train_y)
    
        #
        # predicting
        # print('Predicting on train')
        train_pred_probs = model.predict(train_x)
    
        hard_train_preds = hard_preds(train_pred_probs)

        train_acc = compute_accuracy(train_y, hard_train_preds)
        print('ON TRAIN', train_acc)
    
        #
        # predicting
        # print('Predicting on test')
        test_pred_probs = model.predict(test_x)
    
        hard_test_preds = hard_preds(test_pred_probs)

        test_acc = compute_accuracy(test_y, hard_test_preds)
        print('ON TEST',test_acc)
        accs.append(test_acc)

    # Mean held-out accuracy over the folds of this seed.
    print(accs)
    avg_acc = sum(accs)/n_folds
    valid_accs.append(avg_acc)
    print('AVG VALID', avg_acc)

# Average of the per-seed cross-validation accuracies.
avg_all_valid = sum(valid_accs) / len(valid_accs)
print('AVG ALL VALID', avg_all_valid)

# Refit on the whole training set with the configuration left over from the
# loop above; `model` and `mp` are reused by the submission cell.
model = OneVsRestClassifier(base_model, **base_model_params)
model.fit(train_preds, correct_answers)

mp = model.predict(train_preds)
# `train_ids` here is the index array left over from the last CV fold.
print(mp.shape, len(train_ids))
print('TRAIN', compute_accuracy(correct_answers, hard_preds(mp)))


Loaded datasets with 8132 X 6 in 0.025421775004360825 secs


RAND SEED 1337:

Fold 0
ON TRAIN 0.589857651246
ON TEST 0.571428571429
Fold 1
ON TRAIN 0.585409252669
ON TEST 0.630952380952
Fold 2
ON TRAIN 0.588261449533
ON TEST 0.601593625498
Fold 3
ON TRAIN 0.590484659849
ON TEST 0.545816733068
Fold 4
ON TRAIN 0.593958240782
ON TEST 0.550200803213
Fold 5
ON TRAIN 0.586406041759
ON TEST 0.582329317269
Fold 6
ON TRAIN 0.594402487783
ON TEST 0.578313253012
Fold 7
ON TRAIN 0.591292758774
ON TEST 0.558232931727
Fold 8
ON TRAIN 0.58996001777
ON TEST 0.578313253012
Fold 9
ON TRAIN 0.58996001777
ON TEST 0.566265060241
[0.5714285714285714, 0.63095238095238093, 0.60159362549800799, 0.54581673306772904, 0.55020080321285136, 0.58232931726907633, 0.57831325301204817, 0.55823293172690758, 0.57831325301204817, 0.5662650602409639]
AVG VALID 0.576344592942

RAND SEED 6666:

Fold 0
ON TRAIN 0.588523131673
ON TEST 0.607142857143
Fold 1
ON TRAIN 0.590302491103
ON TEST 0.56746031746
Fold 2
ON TRAIN 0.5913739

In [12]:
import os

# Identifiers used to build the output directory and file names.
sub_id = '1144.norm'
method_str = 'logistic_stacking_on_1142_10_norm.0'

pred_dir = '../submissions/{0}/'.format(sub_id)
os.makedirs(pred_dir, exist_ok=True)

train_pred_file = '../submissions/{0}/{0}_{1}_train.scores'.format(sub_id, method_str)

# `mp`, `model` and `normalize` come from earlier cells (hidden state).
if normalize:
    mp = normalize_preds(mp, cos_sim=False, eps=1e-10)
save_predictions(mp, train_pred_file, ids=train_frame_ids)

valid_mp = model.predict(valid_preds)

if normalize:
    valid_mp = normalize_preds(valid_mp, cos_sim=False, eps=1e-10)
    
valid_pred_file = '../submissions/{0}/{0}_{1}_valid.scores'.format(sub_id, method_str)
save_predictions(valid_mp, valid_pred_file, ids=valid_ids)

# The same variable is reused for the submission file path.
valid_pred_file = '../submissions/{0}/{0}_{1}_valid.submission'.format(sub_id, method_str)
create_submission(numbers_to_letters(hard_preds(valid_mp)),
                              output=valid_pred_file,
                              ids=valid_ids)

Unnamed: 0,correctAnswer,id
0,C,102501
1,D,102502
2,A,102503
3,B,102504
4,A,102505
5,A,102506
6,C,102507
7,A,102508
8,B,102509
9,C,102510


In [3]:
from sklearn import linear_model
from sklearn import svm
from sklearn import naive_bayes
from sklearn import ensemble

from sklearn import cross_validation

from time import perf_counter

from evaluation import hard_preds
from evaluation import compute_accuracy



n_answers = 4
n_items = train_preds.shape[0]
n_folds = 10

kfold = cross_validation.KFold(n_items, n_folds, shuffle=True, random_state=1337)

# Test-fold accuracies.
accs = []

#
# transform labels
# One-hot encoding: label_matrix[i, a] == 1 iff answer a is correct for item i.
label_matrix = numpy.zeros((n_items, n_answers), dtype=int)
for i in range(n_items):
    label_matrix[i, correct_answers[i]] = 1

# K-fold evaluation with one binary classifier per answer slot: model j sees
# only the scores of answer j and predicts whether answer j is the correct one.
for k, (train_ids, test_ids) in enumerate(kfold):
    print('Fold', k)
    
    train_x = train_preds[train_ids]
    train_y = label_matrix[train_ids]
    test_x = train_preds[test_ids]
    test_y = label_matrix[test_ids]
    
    print('y', train_y[:20])
    n_items_fold = len(train_x)
    # models = [linear_model.LogisticRegression(fit_intercept=True) for i in range(n_answers)]
    # models = [svm.SVC() for i in range(n_answers)]
    # models = [naive_bayes.GaussianNB() for i in range(n_answers)]
    models = [ensemble.GradientBoostingClassifier(loss='deviance', 
                                                  learning_rate=0.1, 
                                                  n_estimators=200, 
                                                  subsample=1.0, 
                                                  min_samples_split=2, 
                                                  min_samples_leaf=1, 
                                                  min_weight_fraction_leaf=0.0, 
                                                  max_depth=3) 
               for i in range(n_answers)]
    
    

    #
    # fitting
    print('Fitting')
    for j in range(len(models)):
        models[j].fit(train_x[:, j, :], train_y[:, j])
    
    #
    # predicting
    print('Predicting on train')
    train_pred_probs = numpy.zeros((n_items_fold, n_answers))
    for j in range(len(models)):
        # Column 1 of predict_proba: probability of the positive class.
        train_pred_probs[:, j] = models[j].predict_proba(train_x[:, j, :])[:, 1]
        # train_pred_probs[:, j] = models[j].predict(train_preds[:, j, :])
    
    hard_train_preds = hard_preds(train_pred_probs)

    train_acc = compute_accuracy(correct_answers[train_ids], hard_train_preds)
    print('ON TRAIN', train_acc)
    
    #
    # predicting
    print('Predicting on test')
    test_pred_probs = numpy.zeros((len(test_x), n_answers))
    for j in range(len(models)):
        test_pred_probs[:, j] = models[j].predict_proba(test_x[:, j, :])[:, 1]
        # train_pred_probs[:, j] = models[j].predict(train_preds[:, j, :])
    
    hard_test_preds = hard_preds(test_pred_probs)

    test_acc = compute_accuracy(correct_answers[test_ids], hard_test_preds)
    print('ON TEST',test_acc)
    accs.append(test_acc)

# Per-fold accuracies and their mean.
print(accs)
print('AVG', sum(accs)/n_folds)

Fold 0
y [[0 0 1 0]
 [0 0 1 0]
 [0 0 0 1]
 [0 1 0 0]
 [0 1 0 0]
 [0 1 0 0]
 [1 0 0 0]
 [0 0 0 1]
 [0 0 1 0]
 [0 0 0 1]
 [0 0 0 1]
 [0 0 1 0]
 [0 1 0 0]
 [0 1 0 0]
 [0 0 1 0]
 [1 0 0 0]
 [0 1 0 0]
 [0 1 0 0]
 [0 0 1 0]
 [0 0 1 0]]
Fitting
Predicting on train
ON TRAIN 0.562666666667
Predicting on test
ON TEST 0.6
Fold 1
y [[0 0 1 0]
 [0 0 1 0]
 [0 0 0 1]
 [0 1 0 0]
 [0 1 0 0]
 [0 1 0 0]
 [1 0 0 0]
 [1 0 0 0]
 [0 0 0 1]
 [0 0 0 1]
 [0 0 0 1]
 [1 0 0 0]
 [0 1 0 0]
 [0 1 0 0]
 [0 0 1 0]
 [1 0 0 0]
 [0 1 0 0]
 [0 1 0 0]
 [0 0 1 0]
 [0 0 1 0]]
Fitting
Predicting on train
ON TRAIN 0.564444444444
Predicting on test
ON TEST 0.58
Fold 2
y [[0 0 1 0]
 [0 0 1 0]
 [0 1 0 0]
 [0 1 0 0]
 [0 1 0 0]
 [1 0 0 0]
 [1 0 0 0]
 [0 0 0 1]
 [0 0 1 0]
 [0 0 0 1]
 [0 0 0 1]
 [1 0 0 0]
 [0 0 1 0]
 [0 1 0 0]
 [0 1 0 0]
 [0 0 1 0]
 [0 1 0 0]
 [0 1 0 0]
 [0 0 1 0]
 [0 0 1 0]]
Fitting
Predicting on train
ON TRAIN 0.571111111111
Predicting on test
ON TEST 0.496
Fold 3
y [[0 0 1 0]
 [0 0 0 1]
 [0 1 0 0]
 [0 1 0 0]
 [0 1

In [3]:
import numpy
from dataset import load_predictions

# Stacked prediction tensors from model 1163.
train_preds = numpy.load('../models/onevs/1163/stacking-1163/train.stacking.npy')
valid_preds = numpy.load('../models/onevs/1163/stacking-1163/valid.stacking.npy')
print(train_preds.shape)
print(valid_preds.shape)

# Scores of the xgboost model 1127, loaded through the project helper.
xgb_train_preds = load_predictions('../models/xgboost/1127/1127/xgb_0.1_10_0.6_0.3_0.8_5_2_0.0_15_50.train.scores')
xgb_valid_preds = load_predictions('../models/xgboost/1127/1127/xgb_0.1_10_0.6_0.3_0.8_5_2_0.0_15_50.valid.scores')

# Append the xgboost scores along the third (predictor) axis; the recorded
# output shows the last dimension growing from 3 to 4.
train_preds = numpy.dstack((train_preds, xgb_train_preds))
valid_preds = numpy.dstack((valid_preds, xgb_valid_preds))
print(train_preds.shape)
print(valid_preds.shape)

# numpy.save appends '.npy' to these names. The target directory must
# already exist.
numpy.save('../models/onevs/1169-aug-stacking/train.stacking', train_preds)
numpy.save('../models/onevs/1169-aug-stacking/valid.stacking', valid_preds)

(2500, 4, 3)
(8132, 4, 3)
(2500, 4, 4)
(8132, 4, 4)
