# We illustrate the use of SlackMinimizer in two ways.

## First, the script `slack_minimizer.py` can be run directly, showing the results on a random forest (see the source comments around `main()` for details).

In [10]:
# Execute slack_minimizer.py as a standalone script inside this kernel.
# NOTE(review): the meaning of the positional arguments and of the -f/-k/-d flags is not
# visible here; see the comments around main() in slack_minimizer.py.
%run slack_minimizer.py data/a1a_all.csv 1000 20000 -f 0.01 -k -1 -d 100

Trial 0:	Time = 0.000204086303711
Data loaded. 	Time = 1.35821700096
Random forest trained. 	Time = 1.5430328846
Classifier 1 done.
Classifier 2 done.
Classifier 3 done.
Classifier 4 done.
Classifier 5 done.
Classifier 6 done.
Classifier 7 done.
Classifier 8 done.
Classifier 9 done.
Classifier 10 done.
Classifier 11 done.
Classifier 12 done.
Classifier 13 done.
Classifier 14 done.
Classifier 15 done.
Classifier 16 done.
Classifier 17 done.
Classifier 18 done.
Classifier 19 done.
Classifier 20 done.
Featurizing done. 	Time = 4.03397393227
After iteration  0:	 Time = 0.04558801651
Holdout: 	 Error = 0.217487202559	 AUC: 0.644603637612
Validation: 	 Error = 0.218071679651	 AUC: 0.668029220661
After iteration  5:	 Time = 0.207148075104
Holdout: 	 Error = 0.230581205342	 AUC: 0.784831846259
Validation: 	 Error = 0.224108488536	 AUC: 0.79155268629
After iteration  10:	 Time = 0.468198060989
Holdout: 	 Error = 0.235125612608	 AUC: 0.800956112255
Validation: 	 Error = 0.229871909911	 AUC: 0.80

## Second, we give an example in which many heterogeneous non-tree classifiers are combined with SlackMinimizer.

In [13]:
import time

import numpy as np
import scipy as sp
import sklearn.ensemble
import sklearn.linear_model
import sklearn.metrics
import sklearn.tree

import composite_feature
import muffled_utils

In [14]:
# Source CSV and the number of examples requested for each of the four splits.
labeled_file = 'data/a1a_all.csv'
labeled_set_size, unlabeled_set_size = 1000, 10000
holdout_set_size, validation_set_size = 500, 1000

inittime = time.time()
# The loader hands back eight arrays, unpacked as four (x, y) pairs:
# train, unlabeled, holdout ("out") and validation.
# NOTE(review): presumably each x_* is a feature matrix and each y_* a label vector -- confirm
# against muffled_utils.read_random_data_from_csv.
x_train, y_train, x_unl, y_unl, x_out, y_out, x_validate, y_validate = (
    muffled_utils.read_random_data_from_csv(
        labeled_file,
        labeled_set_size,
        unlabeled_set_size,
        holdout_set_size,
        validation_set_size,
    )
)
print('Data loaded. \tTime = ' + str(time.time() - inittime))

Data loaded. 	Time = 0.768465995789


In [15]:
# Now train a few different base classifiers.
# `skcl` collects (display name, estimator) pairs; `classifier_list` ends up holding just the
# fitted estimators, in the same order.
inittime = time.time()
skcl = []
clrf = sklearn.ensemble.RandomForestClassifier(n_estimators=50, n_jobs=-1)
skcl.append(('Plain RF', clrf))
cldt = sklearn.tree.DecisionTreeClassifier()
skcl.append(('DT', cldt))
# Note: despite its name, `cletf` holds the AdaBoost model.
cletf = sklearn.ensemble.AdaBoostClassifier(n_estimators=50, algorithm='SAMME')
skcl.append(('AdaBoost', cletf))
clgb = sklearn.ensemble.GradientBoostingClassifier(n_estimators=50)#, loss='deviance')
skcl.append(('LogitBoost', clgb))
cllogistic = sklearn.linear_model.LogisticRegression()#(loss='log')
skcl.append(('Logistic regression', cllogistic))
#clgp = sklearn.gaussian_process.GaussianProcessClassifier()
#skcl.append(('Gaussian process', clgp))
# Now x_train is a (LABELED_SET_SIZE x d) matrix, and y_train a vector of size LABELED_SET_SIZE.
for label, estimator in skcl:
    estimator.fit(x_train, y_train)
    # The reported time is cumulative: seconds elapsed since `inittime` was set above.
    print(label + ' trained', time.time() - inittime)
# Collect the estimators directly rather than subscripting zip(*skcl): on Python 3 zip()
# returns an iterator, so zip(*skcl)[1] raises TypeError.
classifier_list = [estimator for (label, estimator) in skcl]

('Plain RF trained', 0.279616117477417)
('DT trained', 0.28757500648498535)
('AdaBoost trained', 0.42799901962280273)
('LogitBoost trained', 0.5897660255432129)
('Logistic regression trained', 0.6009221076965332)


In [16]:
# ROC AUC of every base classifier on the validation split, in classifier_list order.
# Written as a print(...) call with a single argument so it behaves identically under the
# Python 2 print statement and the Python 3 print function.
# NOTE(review): the AUC is computed from c.predict(...) output rather than from continuous
# scores (e.g. predict_proba) -- confirm this is the intended comparison.
print([sklearn.metrics.roc_auc_score(y_validate, c.predict(x_validate)) for c in classifier_list])

[0.73201784024868233, 0.69135018245708879, 0.74608730909582355, 0.73031490741992167, 0.75761589403973517]


In [24]:
# Settings forwarded to composite_feature.predict_multiple below.
# NOTE(review): the exact semantics of `k` and `failure_prob` live in composite_feature -- see there.
k = 0
failure_prob = 0.0005

inittime = time.time()
# classifier_list mixes several model types rather than being a single scikit-learn random
# forest, so from_sklearn_rf and use_tree_partition are both switched off.
b_vector, allfeats_out, allfeats_unl, allfeats_val = composite_feature.predict_multiple(
    classifier_list,
    x_out,
    x_unl,
    x_validate,
    y_out=y_out,
    k=k,
    failure_prob=failure_prob,
    from_sklearn_rf=False,
    use_tree_partition=False,
)
print('Featurizing done. \tTime = ' + str(time.time() - inittime))

Classifier 1 done.
Classifier 2 done.
Classifier 3 done.
Classifier 4 done.
Classifier 5 done.
Featurizing done. 	Time = 1.45948219299


In [25]:
import slack_minimizer

# Build the minimizer from the featurized holdout/unlabeled data produced above; the true
# unlabeled labels and the validation split are supplied as well.
# NOTE(review): presumably those extra labels are used only for the progress report -- confirm
# in slack_minimizer.SlackMinimizer.
gradh = slack_minimizer.SlackMinimizer(
    b_vector,
    allfeats_unl,
    allfeats_out,
    y_out,
    unlabeled_labels=y_unl,
    validation_set=allfeats_val,
    validation_labels=y_validate,
)

# Run SGD with line search; progress is reported every 5 iterations (see the output below).
# NOTE(review): the first positional argument (50) is presumably the iteration count.
statauc = gradh.sgd(
    50,
    unl_stride_size=100,
    linesearch=True,
    logging_interval=5,
)

After iteration  0:	 Time = 0.00942802429199
Holdout: 	 Error = 0.317332968464	 AUC: 0.749843674008
Validation: 	 Error = 0.325109400266	 AUC: 0.747871333964
After iteration  5:	 Time = 0.0521280765533
Holdout: 	 Error = 0.19193955162	 AUC: 0.806121031057
Validation: 	 Error = 0.202221024037	 AUC: 0.793231517773
After iteration  10:	 Time = 0.111564159393
Holdout: 	 Error = 0.191931702334	 AUC: 0.806121031057
Validation: 	 Error = 0.202212941786	 AUC: 0.793231517773
After iteration  15:	 Time = 0.121046066284
Holdout: 	 Error = 0.191909829871	 AUC: 0.806121031057
Validation: 	 Error = 0.202191181511	 AUC: 0.793231517773
After iteration  20:	 Time = 0.129214048386
Holdout: 	 Error = 0.191903260385	 AUC: 0.806121031057
Validation: 	 Error = 0.20218490245	 AUC: 0.793231517773
After iteration  25:	 Time = 0.137324094772
Holdout: 	 Error = 0.191911789491	 AUC: 0.806121031057
Validation: 	 Error = 0.20219210679	 AUC: 0.793231517773
After iteration  30:	 Time = 0.146389961243
Holdout: 	 Error