/***********************************************************************/
/* */
/* svm_learn.c */
/* */
/* Learning module of Support Vector Machine. */
/* */
/* Author: Thorsten Joachims */
/* Date: 02.07.02 */
/* */
/* Copyright (c) 2002 Thorsten Joachims - All rights reserved */
/* */
/* This software is available for non-commercial use only. It must */
/* not be modified and distributed without prior permission of the */
/* author. The author is not responsible for implications from the */
/* use of this software. */
/* */
/***********************************************************************/
# include "svm_common.h"
# include "svm_learn.h"
/* interface to QP-solver */
double *optimize_qp(QP *, double *, long, double *, LEARN_PARM *);
/*---------------------------------------------------------------------------*/
/* Learns an SVM classification model based on the training data in
docs/label. The resulting model is returned in the structure
model. */
void svm_learn_classification(DOC **docs, double *class, long int
totdoc, long int totwords,
LEARN_PARM *learn_parm,
KERNEL_PARM *kernel_parm,
KERNEL_CACHE *kernel_cache,
MODEL *model,
double *alpha)
/* docs: Training vectors (x-part) */
/* class: Training labels (y-part, zero if test example for
transduction) */
/* totdoc: Number of examples in docs/label */
/* totwords: Number of features (i.e. highest feature index) */
/* learn_parm: Learning parameters */
/* kernel_parm: Kernel parameters */
/* kernel_cache:Initialized Cache of size totdoc, if using a kernel.
NULL if linear.*/
/* model: Returns learning result (assumed empty before called) */
/* alpha: Start values for the alpha variables or NULL
pointer. The new alpha values are returned after
optimization if not NULL. Array must be of size totdoc. */
{
long *inconsistent,i,*label;
long inconsistentnum;
long misclassified,upsupvecnum;
double loss,model_length,example_length;
double maxdiff,*lin,*a,*c;
long runtime_start,runtime_end;
long iterations;
long *unlabeled,transduction;
long heldout;
long loo_count=0,loo_count_pos=0,loo_count_neg=0,trainpos=0,trainneg=0;
long loocomputed=0,runtime_start_loo=0,runtime_start_xa=0;
double heldout_c=0,r_delta_sq=0,r_delta,r_delta_avg;
long *index,*index2dnum;
double *weights;
CFLOAT *aicache; /* buffer to keep one row of hessian */
double *xi_fullset; /* buffer for storing xi on full sample in loo */
double *a_fullset; /* buffer for storing alpha on full sample in loo */
TIMING timing_profile;
SHRINK_STATE shrink_state;
runtime_start=get_runtime();
timing_profile.time_kernel=0;
timing_profile.time_opti=0;
timing_profile.time_shrink=0;
timing_profile.time_update=0;
timing_profile.time_model=0;
timing_profile.time_check=0;
timing_profile.time_select=0;
kernel_cache_statistic=0;
learn_parm->totwords=totwords;
/* make sure -n value is reasonable */
if((learn_parm->svm_newvarsinqp < 2)
|| (learn_parm->svm_newvarsinqp > learn_parm->svm_maxqpsize)) {
learn_parm->svm_newvarsinqp=learn_parm->svm_maxqpsize;
}
init_shrink_state(&shrink_state,totdoc,(long)MAXSHRINK);
label = (long *)my_malloc(sizeof(long)*totdoc);
inconsistent = (long *)my_malloc(sizeof(long)*totdoc);
unlabeled = (long *)my_malloc(sizeof(long)*totdoc);
c = (double *)my_malloc(sizeof(double)*totdoc);
a = (double *)my_malloc(sizeof(double)*totdoc);
a_fullset = (double *)my_malloc(sizeof(double)*totdoc);
xi_fullset = (double *)my_malloc(sizeof(double)*totdoc);
lin = (double *)my_malloc(sizeof(double)*totdoc);
learn_parm->svm_cost = (double *)my_malloc(sizeof(double)*totdoc);
model->supvec = (DOC **)my_malloc(sizeof(DOC *)*(totdoc+2));
model->alpha = (double *)my_malloc(sizeof(double)*(totdoc+2));
model->index = (long *)my_malloc(sizeof(long)*(totdoc+2));
model->at_upper_bound=0;
model->b=0;
model->supvec[0]=0; /* element 0 reserved and empty for now */
model->alpha[0]=0;
model->lin_weights=NULL;
model->totwords=totwords;
model->totdoc=totdoc;
model->kernel_parm=(*kernel_parm);
model->sv_num=1;
model->loo_error=-1;
model->loo_recall=-1;
model->loo_precision=-1;
model->xa_error=-1;
model->xa_recall=-1;
model->xa_precision=-1;
inconsistentnum=0;
transduction=0;
r_delta=estimate_r_delta(docs,totdoc,kernel_parm);
r_delta_sq=r_delta*r_delta;
r_delta_avg=estimate_r_delta_average(docs,totdoc,kernel_parm);
if(learn_parm->svm_c == 0.0) { /* default value for C */
learn_parm->svm_c=1.0/(r_delta_avg*r_delta_avg);
if(verbosity>=1)
printf("Setting default regularization parameter C=%.4f\n",
learn_parm->svm_c);
}
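/* Note: this default corresponds to the documented SVM-light heuristic
   C = [avg. x*x]^(-1); r_delta_avg is the average example norm in
   feature space as computed by estimate_r_delta_average(). */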
learn_parm->eps=-1.0; /* equivalent regression epsilon for
classification */
for(i=0;i<totdoc;i++) { /* various inits */
docs[i]->docnum=i;
inconsistent[i]=0;
a[i]=0;
lin[i]=0;
c[i]=0.0;
unlabeled[i]=0;
if(class[i] == 0) {
unlabeled[i]=1;
label[i]=0;
transduction=1;
}
if(class[i] > 0) {
learn_parm->svm_cost[i]=learn_parm->svm_c*learn_parm->svm_costratio*
docs[i]->costfactor;
label[i]=1;
trainpos++;
}
else if(class[i] < 0) {
learn_parm->svm_cost[i]=learn_parm->svm_c*docs[i]->costfactor;
label[i]=-1;
trainneg++;
}
else {
learn_parm->svm_cost[i]=0;
}
}
if(verbosity>=2) {
printf("%ld positive, %ld negative, and %ld unlabeled examples.\n",trainpos,trainneg,totdoc-trainpos-trainneg); fflush(stdout);
}
/* caching makes no sense for linear kernel */
if(kernel_parm->kernel_type == LINEAR) {
kernel_cache = NULL;
}
/* compute starting state for initial alpha values */
if(alpha) {
if(verbosity>=1) {
printf("Computing starting state..."); fflush(stdout);
}
index = (long *)my_malloc(sizeof(long)*totdoc);
index2dnum = (long *)my_malloc(sizeof(long)*(totdoc+11));
weights=(double *)my_malloc(sizeof(double)*(totwords+1));
aicache = (CFLOAT *)my_malloc(sizeof(CFLOAT)*totdoc);
for(i=0;i<totdoc;i++) { /* create full index and clip alphas */
index[i]=1;
alpha[i]=fabs(alpha[i]);
if(alpha[i]<0) alpha[i]=0;
if(alpha[i]>learn_parm->svm_cost[i]) alpha[i]=learn_parm->svm_cost[i];
}
if(kernel_parm->kernel_type != LINEAR) {
for(i=0;i<totdoc;i++) /* fill kernel cache with unbounded SV */
if((alpha[i]>0) && (alpha[i]<learn_parm->svm_cost[i])
&& (kernel_cache_space_available(kernel_cache)))
cache_kernel_row(kernel_cache,docs,i,kernel_parm);
for(i=0;i<totdoc;i++) /* fill rest of kernel cache with bounded SV */
if((alpha[i]==learn_parm->svm_cost[i])
&& (kernel_cache_space_available(kernel_cache)))
cache_kernel_row(kernel_cache,docs,i,kernel_parm);
}
(void)compute_index(index,totdoc,index2dnum);
update_linear_component(docs,label,index2dnum,alpha,a,index2dnum,totdoc,
totwords,kernel_parm,kernel_cache,lin,aicache,
weights);
(void)calculate_svm_model(docs,label,unlabeled,lin,alpha,a,c,
learn_parm,index2dnum,index2dnum,model);
for(i=0;i<totdoc;i++) { /* copy initial alphas */
a[i]=alpha[i];
}
free(index);
free(index2dnum);
free(weights);
free(aicache);
if(verbosity>=1) {
printf("done.\n"); fflush(stdout);
}
}
if(transduction) {
learn_parm->svm_iter_to_shrink=99999999;
if(verbosity >= 1)
printf("\nDeactivating Shrinking due to an incompatibility with the transductive \nlearner in the current version.\n\n");
}
if(transduction && learn_parm->compute_loo) {
learn_parm->compute_loo=0;
if(verbosity >= 1)
printf("\nCannot compute leave-one-out estimates for transductive learner.\n\n");
}
if(learn_parm->remove_inconsistent && learn_parm->compute_loo) {
learn_parm->compute_loo=0;
printf("\nCannot compute leave-one-out estimates when removing inconsistent examples.\n\n");
}
if(learn_parm->compute_loo && ((trainpos == 1) || (trainneg == 1))) {
learn_parm->compute_loo=0;
printf("\nCannot compute leave-one-out with only one example in one class.\n\n");
}
if(verbosity==1) {
printf("Optimizing"); fflush(stdout);
}
/* train the svm */
iterations=optimize_to_convergence(docs,label,totdoc,totwords,learn_parm,
kernel_parm,kernel_cache,&shrink_state,model,
inconsistent,unlabeled,a,lin,
c,&timing_profile,
&maxdiff,(long)-1,
(long)1);
if(verbosity>=1) {
if(verbosity==1) printf("done. (%ld iterations)\n",iterations);
misclassified=0;
for(i=0;(i<totdoc);i++) { /* get final statistic */
if((lin[i]-model->b)*(double)label[i] <= 0.0)
misclassified++;
}
printf("Optimization finished (%ld misclassified, maxdiff=%.5f).\n",
misclassified,maxdiff);
runtime_end=get_runtime();
if(verbosity>=2) {
printf("Runtime in cpu-seconds: %.2f (%.2f%% for kernel/%.2f%% for optimizer/%.2f%% for final/%.2f%% for update/%.2f%% for model/%.2f%% for check/%.2f%% for select)\n",
((float)runtime_end-(float)runtime_start)/100.0,
(100.0*timing_profile.time_kernel)/(float)(runtime_end-runtime_start),
(100.0*timing_profile.time_opti)/(float)(runtime_end-runtime_start),
(100.0*timing_profile.time_shrink)/(float)(runtime_end-runtime_start),
(100.0*timing_profile.time_update)/(float)(runtime_end-runtime_start),
(100.0*timing_profile.time_model)/(float)(runtime_end-runtime_start),
(100.0*timing_profile.time_check)/(float)(runtime_end-runtime_start),
(100.0*timing_profile.time_select)/(float)(runtime_end-runtime_start));
}
else {
printf("Runtime in cpu-seconds: %.2f\n",
(runtime_end-runtime_start)/100.0);
}
if(learn_parm->remove_inconsistent) {
inconsistentnum=0;
for(i=0;i<totdoc;i++)
if(inconsistent[i])
inconsistentnum++;
printf("Number of SV: %ld (plus %ld inconsistent examples)\n",
model->sv_num-1,inconsistentnum);
}
else {
upsupvecnum=0;
for(i=1;i<model->sv_num;i++) {
if(fabs(model->alpha[i]) >=
(learn_parm->svm_cost[(model->supvec[i])->docnum]-
learn_parm->epsilon_a))
upsupvecnum++;
}
printf("Number of SV: %ld (including %ld at upper bound)\n",
model->sv_num-1,upsupvecnum);
}
if((verbosity>=1) && (!learn_parm->skip_final_opt_check)) {
loss=0;
model_length=0;
for(i=0;i<totdoc;i++) {
if((lin[i]-model->b)*(double)label[i] < 1.0-learn_parm->epsilon_crit)
loss+=1.0-(lin[i]-model->b)*(double)label[i];
model_length+=a[i]*label[i]*lin[i];
}
model_length=sqrt(model_length);
fprintf(stdout,"L1 loss: loss=%.5f\n",loss);
fprintf(stdout,"Norm of weight vector: |w|=%.5f\n",model_length);
example_length=estimate_sphere(model,kernel_parm);
fprintf(stdout,"Norm of longest example vector: |x|=%.5f\n",
length_of_longest_document_vector(docs,totdoc,kernel_parm));
fprintf(stdout,"Estimated VCdim of classifier: VCdim<=%.5f\n",
estimate_margin_vcdim(model,model_length,example_length,
kernel_parm));
if((!learn_parm->remove_inconsistent) && (!transduction)) {
runtime_start_xa=get_runtime();
if(verbosity>=1) {
printf("Computing XiAlpha-estimates..."); fflush(stdout);
}
compute_xa_estimates(model,label,unlabeled,totdoc,docs,lin,a,
kernel_parm,learn_parm,&(model->xa_error),
&(model->xa_recall),&(model->xa_precision));
if(verbosity>=1) {
printf("done\n");
}
printf("Runtime for XiAlpha-estimates in cpu-seconds: %.2f\n",
(get_runtime()-runtime_start_xa)/100.0);
fprintf(stdout,"XiAlpha-estimate of the error: error<=%.2f%% (rho=%.2f,depth=%ld)\n",
model->xa_error,learn_parm->rho,learn_parm->xa_depth);
fprintf(stdout,"XiAlpha-estimate of the recall: recall=>%.2f%% (rho=%.2f,depth=%ld)\n",
model->xa_recall,learn_parm->rho,learn_parm->xa_depth);
fprintf(stdout,"XiAlpha-estimate of the precision: precision=>%.2f%% (rho=%.2f,depth=%ld)\n",
model->xa_precision,learn_parm->rho,learn_parm->xa_depth);
}
else if(!learn_parm->remove_inconsistent) {
estimate_transduction_quality(model,label,unlabeled,totdoc,docs,lin);
}
}
if(verbosity>=1) {
printf("Number of kernel evaluations: %ld\n",kernel_cache_statistic);
}
}
/* leave-one-out testing starts now */
if(learn_parm->compute_loo) {
/* save results of training on full dataset for leave-one-out */
runtime_start_loo=get_runtime();
for(i=0;i<totdoc;i++) {
xi_fullset[i]=1.0-((lin[i]-model->b)*(double)label[i]);
if(xi_fullset[i]<0) xi_fullset[i]=0;
a_fullset[i]=a[i];
}
if(verbosity>=1) {
printf("Computing leave-one-out");
}
/* repeat this loop for every held-out example */
for(heldout=0;(heldout<totdoc);heldout++) {
if(learn_parm->rho*a_fullset[heldout]*r_delta_sq+xi_fullset[heldout]
< 1.0) {
/* guaranteed to not produce a leave-one-out error */
if(verbosity==1) {
printf("+"); fflush(stdout);
}
}
else if(xi_fullset[heldout] > 1.0) {
/* guaranteed to produce a leave-one-out error */
loo_count++;
if(label[heldout] > 0) loo_count_pos++; else loo_count_neg++;
if(verbosity==1) {
printf("-"); fflush(stdout);
}
}
else {
loocomputed++;
heldout_c=learn_parm->svm_cost[heldout]; /* set upper bound to zero */
learn_parm->svm_cost[heldout]=0;
/* make sure heldout example is not currently */
/* shrunk away. Assumes that lin is up to date! */
shrink_state.active[heldout]=1;
if(verbosity>=2)
printf("\nLeave-One-Out test on example %ld\n",heldout);
if(verbosity>=1) {
printf("(?[%ld]",heldout); fflush(stdout);
}
optimize_to_convergence(docs,label,totdoc,totwords,learn_parm,
kernel_parm,
kernel_cache,&shrink_state,model,inconsistent,unlabeled,
a,lin,c,&timing_profile,
&maxdiff,heldout,(long)2);
/* printf("%.20f\n",(lin[heldout]-model->b)*(double)label[heldout]); */
if(((lin[heldout]-model->b)*(double)label[heldout]) <= 0.0) {
loo_count++; /* there was a loo-error */
if(label[heldout] > 0) loo_count_pos++; else loo_count_neg++;
if(verbosity>=1) {
printf("-)"); fflush(stdout);
}
}
else {
if(verbosity>=1) {
printf("+)"); fflush(stdout);
}
}
/* now we need to restore the original data set*/
learn_parm->svm_cost[heldout]=heldout_c; /* restore upper bound */
}
} /* end of leave-one-out loop */
if(verbosity>=1) {
printf("\nRetrain on full problem"); fflush(stdout);
}
optimize_to_convergence(docs,label,totdoc,totwords,learn_parm,
kernel_parm,
kernel_cache,&shrink_state,model,inconsistent,unlabeled,
a,lin,c,&timing_profile,
&maxdiff,(long)-1,(long)1);
if(verbosity >= 1)
printf("done.\n");
/* after all leave-one-out computed */
model->loo_error=100.0*loo_count/(double)totdoc;
model->loo_recall=(1.0-(double)loo_count_pos/(double)trainpos)*100.0;
model->loo_precision=(trainpos-loo_count_pos)/
(double)(trainpos-loo_count_pos+loo_count_neg)*100.0;
if(verbosity >= 1) {
fprintf(stdout,"Leave-one-out estimate of the error: error=%.2f%%\n",
model->loo_error);
fprintf(stdout,"Leave-one-out estimate of the recall: recall=%.2f%%\n",
model->loo_recall);
fprintf(stdout,"Leave-one-out estimate of the precision: precision=%.2f%%\n",
model->loo_precision);
fprintf(stdout,"Actual leave-one-outs computed: %ld (rho=%.2f)\n",
loocomputed,learn_parm->rho);
printf("Runtime for leave-one-out in cpu-seconds: %.2f\n",
(double)(get_runtime()-runtime_start_loo)/100.0);
}
}
if(learn_parm->alphafile[0])
write_alphas(learn_parm->alphafile,a,label,totdoc);
shrink_state_cleanup(&shrink_state);
free(label);
free(inconsistent);
free(unlabeled);
free(c);
free(a);
free(a_fullset);
free(xi_fullset);
free(lin);
free(learn_parm->svm_cost);
}
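/* A minimal usage sketch for svm_learn_classification() (editor's note,
   not part of the original source; modeled on svm_learn_main.c and
   assuming learn_parm/kernel_parm were already filled with valid
   defaults, e.g. by the command-line parser):

     DOC **docs;       // training examples (x-part)
     double *target;   // +1/-1 labels; 0 marks transduction examples
     long totwords,totdoc;
     MODEL *model=(MODEL *)my_malloc(sizeof(MODEL));
     KERNEL_CACHE *kernel_cache=NULL;

     read_documents("train.dat",&docs,&target,&totwords,&totdoc);
     if(kernel_parm.kernel_type != LINEAR)  // cache only for nonlinear kernels
       kernel_cache=kernel_cache_init(totdoc,learn_parm.kernel_cache_size);
     svm_learn_classification(docs,target,totdoc,totwords,&learn_parm,
                              &kernel_parm,kernel_cache,model,NULL);
*/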
/* Learns an SVM regression model based on the training data in
docs/label. The resulting model is returned in the structure
model. */
void svm_learn_regression(DOC **docs, double *value, long int totdoc,
long int totwords, LEARN_PARM *learn_parm,
KERNEL_PARM *kernel_parm,
KERNEL_CACHE **kernel_cache, MODEL *model)
/* docs: Training vectors (x-part) */
/* value: Training values (y-part) */
/* totdoc: Number of examples in docs/label */
/* totwords: Number of features (i.e. highest feature index) */
/* learn_parm: Learning parameters */
/* kernel_parm: Kernel parameters */
/* kernel_cache:Initialized Cache, if using a kernel. NULL if
linear. Note that it will be freed and reassigned */
/* model: Returns learning result (assumed empty before called) */
{
long *inconsistent,i,j;
long inconsistentnum;
long upsupvecnum;
double loss,model_length,example_length;
double maxdiff,*lin,*a,*c;
long runtime_start,runtime_end;
long iterations,kernel_cache_size;
long *unlabeled;
double r_delta_sq=0,r_delta,r_delta_avg;
double *xi_fullset; /* buffer for storing xi on full sample in loo */
double *a_fullset; /* buffer for storing alpha on full sample in loo */
TIMING timing_profile;
SHRINK_STATE shrink_state;
DOC **docs_org;
long *label;
/* set up regression problem in standard form */
docs_org=docs;
docs = (DOC **)my_malloc(sizeof(DOC *)*2*totdoc);
label = (long *)my_malloc(sizeof(long)*2*totdoc);
c = (double *)my_malloc(sizeof(double)*2*totdoc);
for(i=0;i<totdoc;i++) {
j=2*totdoc-1-i;
docs[i]=create_example(i,0,0,docs_org[i]->costfactor,docs_org[i]->fvec);
label[i]=+1;
c[i]=value[i];
docs[j]=create_example(j,0,0,docs_org[i]->costfactor,docs_org[i]->fvec);
label[j]=-1;
c[j]=value[i];
}
totdoc*=2;
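/* Each training value y_i now appears twice: once with label +1 and
   c[i]=y_i, once with label -1 and c[j]=y_i. Together with the shared
   tube width eps, the two classification-style constraints
     label=+1:  (w*x_i - b) >= y_i - eps - xi_i
     label=-1:  (w*x_i - b) <= y_i + eps + xi_i'
   bound |w*x_i - b - y_i| by eps up to slack, which is the standard
   epsilon-insensitive SVR formulation. */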
/* need to get a bigger kernel cache */
if(*kernel_cache) {
kernel_cache_size=(*kernel_cache)->buffsize*sizeof(CFLOAT)/(1024*1024);
kernel_cache_cleanup(*kernel_cache);
(*kernel_cache)=kernel_cache_init(totdoc,kernel_cache_size);
}
runtime_start=get_runtime();
timing_profile.time_kernel=0;
timing_profile.time_opti=0;
timing_profile.time_shrink=0;
timing_profile.time_update=0;
timing_profile.time_model=0;
timing_profile.time_check=0;
timing_profile.time_select=0;
kernel_cache_statistic=0;
learn_parm->totwords=totwords;
/* make sure -n value is reasonable */
if((learn_parm->svm_newvarsinqp < 2)
|| (learn_parm->svm_newvarsinqp > learn_parm->svm_maxqpsize)) {
learn_parm->svm_newvarsinqp=learn_parm->svm_maxqpsize;
}
init_shrink_state(&shrink_state,totdoc,(long)MAXSHRINK);
inconsistent = (long *)my_malloc(sizeof(long)*totdoc);
unlabeled = (long *)my_malloc(sizeof(long)*totdoc);
a = (double *)my_malloc(sizeof(double)*totdoc);
a_fullset = (double *)my_malloc(sizeof(double)*totdoc);
xi_fullset = (double *)my_malloc(sizeof(double)*totdoc);
lin = (double *)my_malloc(sizeof(double)*totdoc);
learn_parm->svm_cost = (double *)my_malloc(sizeof(double)*totdoc);
model->supvec = (DOC **)my_malloc(sizeof(DOC *)*(totdoc+2));
model->alpha = (double *)my_malloc(sizeof(double)*(totdoc+2));
model->index = (long *)my_malloc(sizeof(long)*(totdoc+2));
model->at_upper_bound=0;
model->b=0;
model->supvec[0]=0; /* element 0 reserved and empty for now */
model->alpha[0]=0;
model->lin_weights=NULL;
model->totwords=totwords;
model->totdoc=totdoc;
model->kernel_parm=(*kernel_parm);
model->sv_num=1;
model->loo_error=-1;
model->loo_recall=-1;
model->loo_precision=-1;
model->xa_error=-1;
model->xa_recall=-1;
model->xa_precision=-1;
inconsistentnum=0;
r_delta=estimate_r_delta(docs,totdoc,kernel_parm);
r_delta_sq=r_delta*r_delta;
r_delta_avg=estimate_r_delta_average(docs,totdoc,kernel_parm);
if(learn_parm->svm_c == 0.0) { /* default value for C */
learn_parm->svm_c=1.0/(r_delta_avg*r_delta_avg);
if(verbosity>=1)
printf("Setting default regularization parameter C=%.4f\n",
learn_parm->svm_c);
}
for(i=0;i<totdoc;i++) { /* various inits */
inconsistent[i]=0;
a[i]=0;
lin[i]=0;
unlabeled[i]=0;
if(label[i] > 0) {
learn_parm->svm_cost[i]=learn_parm->svm_c*learn_parm->svm_costratio*
docs[i]->costfactor;
}
else if(label[i] < 0) {
learn_parm->svm_cost[i]=learn_parm->svm_c*docs[i]->costfactor;
}
}
/* caching makes no sense for linear kernel */
if((kernel_parm->kernel_type == LINEAR) && (*kernel_cache)) {
printf("WARNING: Using a kernel cache for linear case will slow optimization down!\n");
}
if(verbosity==1) {
printf("Optimizing"); fflush(stdout);
}
/* train the svm */
iterations=optimize_to_convergence(docs,label,totdoc,totwords,learn_parm,
kernel_parm,*kernel_cache,&shrink_state,
model,inconsistent,unlabeled,a,lin,c,
&timing_profile,&maxdiff,(long)-1,
(long)1);
if(verbosity>=1) {
if(verbosity==1) printf("done. (%ld iterations)\n",iterations);
printf("Optimization finished (maxdiff=%.5f).\n",maxdiff);
runtime_end=get_runtime();
if(verbosity>=2) {
printf("Runtime in cpu-seconds: %.2f (%.2f%% for kernel/%.2f%% for optimizer/%.2f%% for final/%.2f%% for update/%.2f%% for model/%.2f%% for check/%.2f%% for select)\n",
((float)runtime_end-(float)runtime_start)/100.0,
(100.0*timing_profile.time_kernel)/(float)(runtime_end-runtime_start),
(100.0*timing_profile.time_opti)/(float)(runtime_end-runtime_start),
(100.0*timing_profile.time_shrink)/(float)(runtime_end-runtime_start),
(100.0*timing_profile.time_update)/(float)(runtime_end-runtime_start),
(100.0*timing_profile.time_model)/(float)(runtime_end-runtime_start),
(100.0*timing_profile.time_check)/(float)(runtime_end-runtime_start),
(100.0*timing_profile.time_select)/(float)(runtime_end-runtime_start));
}
else {
printf("Runtime in cpu-seconds: %.2f\n",
(runtime_end-runtime_start)/100.0);
}
if(learn_parm->remove_inconsistent) {
inconsistentnum=0;
for(i=0;i<totdoc;i++)
if(inconsistent[i])
inconsistentnum++;
printf("Number of SV: %ld (plus %ld inconsistent examples)\n",
model->sv_num-1,inconsistentnum);
}
else {
upsupvecnum=0;
for(i=1;i<model->sv_num;i++) {
if(fabs(model->alpha[i]) >=
(learn_parm->svm_cost[(model->supvec[i])->docnum]-
learn_parm->epsilon_a))
upsupvecnum++;
}
printf("Number of SV: %ld (including %ld at upper bound)\n",
model->sv_num-1,upsupvecnum);
}
if((verbosity>=1) && (!learn_parm->skip_final_opt_check)) {
loss=0;
model_length=0;
for(i=0;i<totdoc;i++) {
if((lin[i]-model->b)*(double)label[i] < (-learn_parm->eps+(double)label[i]*c[i])-learn_parm->epsilon_crit)
loss+=-learn_parm->eps+(double)label[i]*c[i]-(lin[i]-model->b)*(double)label[i];
model_length+=a[i]*label[i]*lin[i];
}
model_length=sqrt(model_length);
fprintf(stdout,"L1 loss: loss=%.5f\n",loss);
fprintf(stdout,"Norm of weight vector: |w|=%.5f\n",model_length);
example_length=estimate_sphere(model,kernel_parm);
fprintf(stdout,"Norm of longest example vector: |x|=%.5f\n",
length_of_longest_document_vector(docs,totdoc,kernel_parm));
}
if(verbosity>=1) {
printf("Number of kernel evaluations: %ld\n",kernel_cache_statistic);
}
}
if(learn_parm->alphafile[0])
write_alphas(learn_parm->alphafile,a,label,totdoc);
/* this makes sure the model we return does not contain pointers to the
temporary documents */
for(i=1;i<model->sv_num;i++) {
j=model->supvec[i]->docnum;
if(j >= (totdoc/2)) {
j=totdoc-j-1;
}
model->supvec[i]=docs_org[j];
}
shrink_state_cleanup(&shrink_state);
for(i=0;i<totdoc;i++)
free_example(docs[i],0);
free(docs);
free(label);
free(inconsistent);
free(unlabeled);
free(c);
free(a);
free(a_fullset);
free(xi_fullset);
free(lin);
free(learn_parm->svm_cost);
}
void svm_learn_ranking(DOC **docs, double *rankvalue, long int totdoc,
long int totwords, LEARN_PARM *learn_parm,
KERNEL_PARM *kernel_parm, KERNEL_CACHE **kernel_cache,
MODEL *model)
/* docs: Training vectors (x-part) */
/* rankvalue: Training target values that determine the ranking */
/* totdoc: Number of examples in docs/label */
/* totwords: Number of features (i.e. highest feature index) */
/* learn_parm: Learning parameters */
/* kernel_parm: Kernel parameters */
/* kernel_cache:Initialized pointer to Cache of size 1*totdoc, if
using a kernel. NULL if linear. NOTE: Cache is
getting reinitialized in this function */
/* model: Returns learning result (assumed empty before called) */
{
DOC **docdiff;
long i,j,k,totpair,kernel_cache_size;
double *target,*alpha,cost;
long *greater,*lesser;
MODEL *pairmodel;
SVECTOR *flow,*fhigh;
totpair=0;
for(i=0;i<totdoc;i++) {
for(j=i+1;j<totdoc;j++) {
if((docs[i]->queryid==docs[j]->queryid) && (rankvalue[i] != rankvalue[j])) {
totpair++;
}
}
}
printf("Constructing %ld rank constraints...",totpair); fflush(stdout);
docdiff=(DOC **)my_malloc(sizeof(DOC *)*totpair);
target=(double *)my_malloc(sizeof(double)*totpair);
greater=(long *)my_malloc(sizeof(long)*totpair);
lesser=(long *)my_malloc(sizeof(long)*totpair);
k=0;
for(i=0;i<totdoc;i++) {
for(j=i+1;j<totdoc;j++) {
if(docs[i]->queryid == docs[j]->queryid) {
cost=(docs[i]->costfactor+docs[j]->costfactor)/2.0;
if(rankvalue[i] > rankvalue[j]) {
if(kernel_parm->kernel_type == LINEAR)
docdiff[k]=create_example(k,0,0,cost,
sub_ss(docs[i]->fvec,docs[j]->fvec));
else {
flow=copy_svector(docs[j]->fvec);
flow->factor=-1.0;
flow->next=NULL;
fhigh=copy_svector(docs[i]->fvec);
fhigh->factor=1.0;
fhigh->next=flow;
docdiff[k]=create_example(k,0,0,cost,fhigh);
}
target[k]=1;
greater[k]=i;
lesser[k]=j;
k++;
}
else if(rankvalue[i] < rankvalue[j]) {
if(kernel_parm->kernel_type == LINEAR)
docdiff[k]=create_example(k,0,0,cost,
sub_ss(docs[i]->fvec,docs[j]->fvec));
else {
flow=copy_svector(docs[j]->fvec);
flow->factor=-1.0;
flow->next=NULL;
fhigh=copy_svector(docs[i]->fvec);
fhigh->factor=1.0;
fhigh->next=flow;
docdiff[k]=create_example(k,0,0,cost,fhigh);
}
target[k]=-1;
greater[k]=i;
lesser[k]=j;
k++;
}
}
}
}
printf("done.\n"); fflush(stdout);
/* need to get a bigger kernel cache */
if(*kernel_cache) {
kernel_cache_size=(*kernel_cache)->buffsize*sizeof(CFLOAT)/(1024*1024);
kernel_cache_cleanup(*kernel_cache);
(*kernel_cache)=kernel_cache_init(totpair,kernel_cache_size);
}
/* must use unbiased hyperplane on difference vectors */
learn_parm->biased_hyperplane=0;
pairmodel=(MODEL *)my_malloc(sizeof(MODEL));
svm_learn_classification(docdiff,target,totpair,totwords,learn_parm,
kernel_parm,(*kernel_cache),pairmodel,NULL);
/* Transfer the result into a more compact model. If you would like
to output the original model on pairs of documents, see below. */
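/* Since every support vector of pairmodel is a difference x_g - x_l,
   its dual weight alpha contributes +alpha to the 'greater' document g
   and -alpha to the 'lesser' document l. Folding these contributions
   onto single documents below yields an equivalent ranking function
   with at most totdoc support vectors. */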
alpha=(double *)my_malloc(sizeof(double)*totdoc);
for(i=0;i<totdoc;i++) {
alpha[i]=0;
}
for(i=1;i<pairmodel->sv_num;i++) {
alpha[lesser[(pairmodel->supvec[i])->docnum]]-=pairmodel->alpha[i];
alpha[greater[(pairmodel->supvec[i])->docnum]]+=pairmodel->alpha[i];
}
model->supvec = (DOC **)my_malloc(sizeof(DOC *)*(totdoc+2));
model->alpha = (double *)my_malloc(sizeof(double)*(totdoc+2));
model->index = (long *)my_malloc(sizeof(long)*(totdoc+2));
model->supvec[0]=0; /* element 0 reserved and empty for now */
model->alpha[0]=0;
model->sv_num=1;
for(i=0;i<totdoc;i++) {
if(alpha[i]) {
model->supvec[model->sv_num]=docs[i];
model->alpha[model->sv_num]=alpha[i];
model->index[i]=model->sv_num;
model->sv_num++;
}
else {
model->index[i]=-1;
}
}
model->at_upper_bound=0;
model->b=0;
model->lin_weights=NULL;
model->totwords=totwords;
model->totdoc=totdoc;
model->kernel_parm=(*kernel_parm);
model->loo_error=-1;
model->loo_recall=-1;
model->loo_precision=-1;
model->xa_error=-1;
model->xa_recall=-1;
model->xa_precision=-1;
free(alpha);
free(greater);
free(lesser);
free(target);
/* If you would like to output the original model on pairs of
document, replace the following lines with '(*model)=(*pairmodel);' */
for(i=0;i<totpair;i++)
free_example(docdiff[i],1);
free(docdiff);
free_model(pairmodel,0);
}
/* The following solves a freely defined and given set of
inequalities. The optimization problem is of the following form:
min 0.5 w*w + C sum_i C_i \xi_i
s.t. x_i * w > rhs_i - \xi_i
This corresponds to the -z o option. */
void svm_learn_optimization(DOC **docs, double *rhs, long int
totdoc, long int totwords,
LEARN_PARM *learn_parm,
KERNEL_PARM *kernel_parm,
KERNEL_CACHE *kernel_cache, MODEL *model,
double *alpha)
/* docs: Left-hand side of inequalities (x-part) */
/* rhs: Right-hand side of inequalities */
/* totdoc: Number of examples in docs/label */
/* totwords: Number of features (i.e. highest feature index) */
/* learn_parm: Learning parameters */
/* kernel_parm: Kernel parameters */
/* kernel_cache:Initialized Cache of size 1*totdoc, if using a kernel.
NULL if linear.*/
/* model: Returns solution as SV expansion (assumed empty before called) */
/* alpha: Start values for the alpha variables or NULL
pointer. The new alpha values are returned after
optimization if not NULL. Array must be of size totdoc. */
{
long i,*label;
long misclassified,upsupvecnum;
double loss,model_length,example_length;
double maxdiff,*lin,*a,*c;
long runtime_start,runtime_end;
long iterations,maxslackid,svsetnum;
long *unlabeled,*inconsistent;
double r_delta_sq=0,r_delta,r_delta_avg;
long *index,*index2dnum;
double *weights,*slack,*alphaslack;
CFLOAT *aicache; /* buffer to keep one row of hessian */
TIMING timing_profile;
SHRINK_STATE shrink_state;
runtime_start=get_runtime();
timing_profile.time_kernel=0;
timing_profile.time_opti=0;
timing_profile.time_shrink=0;
timing_profile.time_update=0;
timing_profile.time_model=0;
timing_profile.time_check=0;
timing_profile.time_select=0;
kernel_cache_statistic=0;
learn_parm->totwords=totwords;
/* make sure -n value is reasonable */
if((learn_parm->svm_newvarsinqp < 2)
|| (learn_parm->svm_newvarsinqp > learn_parm->svm_maxqpsize)) {
learn_parm->svm_newvarsinqp=learn_parm->svm_maxqpsize;
}
init_shrink_state(&shrink_state,totdoc,(long)MAXSHRINK);
label = (long *)my_malloc(sizeof(long)*totdoc);
unlabeled = (long *)my_malloc(sizeof(long)*totdoc);
inconsistent = (long *)my_malloc(sizeof(long)*totdoc);
c = (double *)my_malloc(sizeof(double)*totdoc);
a = (double *)my_malloc(sizeof(double)*totdoc);
lin = (double *)my_malloc(sizeof(double)*totdoc);
learn_parm->svm_cost = (double *)my_malloc(sizeof(double)*totdoc);
model->supvec = (DOC **)my_malloc(sizeof(DOC *)*(totdoc+2));
model->alpha = (double *)my_malloc(sizeof(double)*(totdoc+2));
model->index = (long *)my_malloc(sizeof(long)*(totdoc+2));
model->at_upper_bound=0;
model->b=0;
model->supvec[0]=0; /* element 0 reserved and empty for now */
model->alpha[0]=0;
model->lin_weights=NULL;
model->totwords=totwords;
model->totdoc=totdoc;
model->kernel_parm=(*kernel_parm);
model->sv_num=1;
model->loo_error=-1;
model->loo_recall=-1;
model->loo_precision=-1;
model->xa_error=-1;
model->xa_recall=-1;
model->xa_precision=-1;
r_delta=estimate_r_delta(docs,totdoc,kernel_parm);
r_delta_sq=r_delta*r_delta;
r_delta_avg=estimate_r_delta_average(docs,totdoc,kernel_parm);
if(learn_parm->svm_c == 0.0) { /* default value for C */
learn_parm->svm_c=1.0/(r_delta_avg*r_delta_avg);
if(verbosity>=1)
printf("Setting default regularization parameter C=%.4f\n",
learn_parm->svm_c);
}
learn_parm->biased_hyperplane=0; /* learn an unbiased hyperplane */
learn_parm->eps=0.0; /* No margin, unless explicitly handcoded
in the right-hand side in the training
set. */
for(i=0;i<totdoc;i++) { /* various inits */
docs[i]->docnum=i;
a[i]=0;
lin[i]=0;
c[i]=rhs[i]; /* set right-hand side */
unlabeled[i]=0;
inconsistent[i]=0;
learn_parm->svm_cost[i]=learn_parm->svm_c*learn_parm->svm_costratio*
docs[i]->costfactor;
label[i]=1;
}
if(learn_parm->sharedslack) /* if shared slacks are used, they must */
for(i=0;i<totdoc;i++) /* be used on every constraint */
if(!docs[i]->slackid) {
perror("Error: Missing shared slacks definitions in some of the examples.");
exit(0);
}
/* compute starting state for initial alpha values */
if(alpha) {
if(verbosity>=1) {
printf("Computing starting state..."); fflush(stdout);
}
index = (long *)my_malloc(sizeof(long)*totdoc);
index2dnum = (long *)my_malloc(sizeof(long)*(totdoc+11));
weights=(double *)my_malloc(sizeof(double)*(totwords+1));
aicache = (CFLOAT *)my_malloc(sizeof(CFLOAT)*totdoc);
for(i=0;i<totdoc;i++) { /* create full index and clip alphas */
index[i]=1;
alpha[i]=fabs(alpha[i]);
if(alpha[i]<0) alpha[i]=0;
if(alpha[i]>learn_parm->svm_cost[i]) alpha[i]=learn_parm->svm_cost[i];
}
if(kernel_parm->kernel_type != LINEAR) {
for(i=0;i<totdoc;i++) /* fill kernel cache with unbounded SV */
if((alpha[i]>0) && (alpha[i]<learn_parm->svm_cost[i])
&& (kernel_cache_space_available(kernel_cache)))
cache_kernel_row(kernel_cache,docs,i,kernel_parm);
for(i=0;i<totdoc;i++) /* fill rest of kernel cache with bounded SV */
if((alpha[i]==learn_parm->svm_cost[i])
&& (kernel_cache_space_available(kernel_cache)))
cache_kernel_row(kernel_cache,docs,i,kernel_parm);
}
(void)compute_index(index,totdoc,index2dnum);
update_linear_component(docs,label,index2dnum,alpha,a,index2dnum,totdoc,
totwords,kernel_parm,kernel_cache,lin,aicache,
weights);
(void)calculate_svm_model(docs,label,unlabeled,lin,alpha,a,c,
learn_parm,index2dnum,index2dnum,model);
for(i=0;i<totdoc;i++) { /* copy initial alphas */
a[i]=alpha[i];
}
free(index);
free(index2dnum);
free(weights);
free(aicache);
if(verbosity>=1) {
printf("done.\n"); fflush(stdout);
}
}
/* removing inconsistent does not work for general optimization problem */
if(learn_parm->remove_inconsistent) {
learn_parm->remove_inconsistent = 0;
printf("'remove inconsistent' not available in this mode. Switching option off!"); fflush(stdout);
}
/* caching makes no sense for linear kernel */
if(kernel_parm->kernel_type == LINEAR) {
kernel_cache = NULL;
}
if(verbosity==1) {
printf("Optimizing"); fflush(stdout);
}
/* train the svm */
if(learn_parm->sharedslack)
iterations=optimize_to_convergence_sharedslack(docs,label,totdoc,
totwords,learn_parm,kernel_parm,
kernel_cache,&shrink_state,model,
a,lin,c,&timing_profile,
&maxdiff);
else
iterations=optimize_to_convergence(docs,label,totdoc,
totwords,learn_parm,kernel_parm,
kernel_cache,&shrink_state,model,
inconsistent,unlabeled,
a,lin,c,&timing_profile,
&maxdiff,(long)-1,(long)1);
if(verbosity>=1) {
if(verbosity==1) printf("done. (%ld iterations)\n",iterations);
misclassified=0;
for(i=0;(i<totdoc);i++) { /* get final statistic */
if((lin[i]-model->b)*(double)label[i] <= 0.0)
misclassified++;
}
printf("Optimization finished (maxdiff=%.5f).\n",maxdiff);
runtime_end=get_runtime();
if(verbosity>=2) {
printf("Runtime in cpu-seconds: %.2f (%.2f%% for kernel/%.2f%% for optimizer/%.2f%% for final/%.2f%% for update/%.2f%% for model/%.2f%% for check/%.2f%% for select)\n",
((float)runtime_end-(float)runtime_start)/100.0,
(100.0*timing_profile.time_kernel)/(float)(runtime_end-runtime_start),
(100.0*timing_profile.time_opti)/(float)(runtime_end-runtime_start),
(100.0*timing_profile.time_shrink)/(float)(runtime_end-runtime_start),
(100.0*timing_profile.time_update)/(float)(runtime_end-runtime_start),
(100.0*timing_profile.time_model)/(float)(runtime_end-runtime_start),
(100.0*timing_profile.time_check)/(float)(runtime_end-runtime_start),
(100.0*timing_profile.time_select)/(float)(runtime_end-runtime_start));
}
else {
printf("Runtime in cpu-seconds: %.2f\n",
(runtime_end-runtime_start)/100.0);
}
}
if((verbosity>=1) && (!learn_parm->skip_final_opt_check)) {
loss=0;
model_length=0;
for(i=0;i<totdoc;i++) {
if((lin[i]-model->b)*(double)label[i] < c[i]-learn_parm->epsilon_crit)
loss+=c[i]-(lin[i]-model->b)*(double)label[i];
model_length+=a[i]*label[i]*lin[i];
}
model_length=sqrt(model_length);
fprintf(stdout,"Norm of weight vector: |w|=%.5f\n",model_length);
}
if(learn_parm->sharedslack) {
index = (long *)my_malloc(sizeof(long)*totdoc);
index2dnum = (long *)my_malloc(sizeof(long)*(totdoc+11));
maxslackid=0;
for(i=0;i<totdoc;i++) { /* create full index */
index[i]=1;
if(maxslackid<docs[i]->slackid)
maxslackid=docs[i]->slackid;
}
(void)compute_index(index,totdoc,index2dnum);
slack=(double *)my_malloc(sizeof(double)*(maxslackid+1));
alphaslack=(double *)my_malloc(sizeof(double)*(maxslackid+1));
for(i=0;i<=maxslackid;i++) { /* init shared slacks */
slack[i]=0;
alphaslack[i]=0;
}
compute_shared_slacks(docs,label,a,lin,c,index2dnum,learn_parm,
slack,alphaslack);
loss=0;
model->at_upper_bound=0;
svsetnum=0;
for(i=0;i<=maxslackid;i++) { /* create full index */
loss+=slack[i];
if(alphaslack[i] > (learn_parm->svm_c - learn_parm->epsilon_a))
model->at_upper_bound++;
if(alphaslack[i] > learn_parm->epsilon_a)
svsetnum++;
}
free(index);
free(index2dnum);
free(slack);
free(alphaslack);
}
if((verbosity>=1) && (!learn_parm->skip_final_opt_check)) {
if(learn_parm->sharedslack) {
printf("Number of SV: %ld\n",
model->sv_num-1);
printf("Number of non-zero slack variables: %ld (out of %ld)\n",
model->at_upper_bound,svsetnum);
fprintf(stdout,"L1 loss: loss=%.5f\n",loss);
}
else {
upsupvecnum=0;
for(i=1;i<model->sv_num;i++) {
if(fabs(model->alpha[i]) >=
(learn_parm->svm_cost[(model->supvec[i])->docnum]-
learn_parm->epsilon_a))
upsupvecnum++;
}
printf("Number of SV: %ld (including %ld at upper bound)\n",
model->sv_num-1,upsupvecnum);
fprintf(stdout,"L1 loss: loss=%.5f\n",loss);
}
example_length=estimate_sphere(model,kernel_parm);
fprintf(stdout,"Norm of longest example vector: |x|=%.5f\n",
length_of_longest_document_vector(docs,totdoc,kernel_parm));
}
if(verbosity>=1) {
printf("Number of kernel evaluations: %ld\n",kernel_cache_statistic);
}
if(alpha) {
for(i=0;i<totdoc;i++) { /* copy final alphas */
alpha[i]=a[i];
}
}
if(learn_parm->alphafile[0])
write_alphas(learn_parm->alphafile,a,label,totdoc);
shrink_state_cleanup(&shrink_state);
free(label);
free(unlabeled);
free(inconsistent);
free(c);
free(a);
free(lin);
free(learn_parm->svm_cost);
}
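/* A hedged usage sketch for svm_learn_optimization() (editor's note, not
   part of the original source): every document is the left-hand side x_i
   of one inequality x_i*w >= rhs[i] - xi_i, so a uniform margin of 1 is
   encoded by setting every rhs[i] to 1.0:

     double *rhs=(double *)my_malloc(sizeof(double)*totdoc);
     for(i=0;i<totdoc;i++)
       rhs[i]=1.0;               // assumed margin target for each constraint
     svm_learn_optimization(docs,rhs,totdoc,totwords,&learn_parm,
                            &kernel_parm,kernel_cache,model,NULL);
*/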
long optimize_to_convergence(DOC **docs, long int *label, long int totdoc,
long int totwords, LEARN_PARM *learn_parm,
KERNEL_PARM *kernel_parm,
KERNEL_CACHE *kernel_cache,
SHRINK_STATE *shrink_state, MODEL *model,
long int *inconsistent, long int *unlabeled,
double *a, double *lin, double *c,
TIMING *timing_profile, double *maxdiff,
long int heldout, long int retrain)
/* docs: Training vectors (x-part) */
/* label: Training labels/value (y-part, zero if test example for
transduction) */
/* totdoc: Number of examples in docs/label */
/* totwords: Number of features (i.e. highest feature index) */
/* learn_parm: Learning parameters */
/* kernel_parm: Kernel parameters */
/* kernel_cache: Initialized/partly filled Cache, if using a kernel.
NULL if linear. */
/* shrink_state: State of active variables */
/* model: Returns learning result */
/* inconsistent: examples thrown out as inconsistent */
/* unlabeled: test examples for transduction */
/* a: alphas */
/* lin: linear component of gradient */
/* c: right hand side of inequalities (margin) */
/* maxdiff: returns maximum violation of KT-conditions */
/* heldout: marks held-out example for leave-one-out (or -1) */
/* retrain: selects training mode (1=regular / 2=holdout) */
{
long *chosen,*key,i,j,jj,*last_suboptimal_at,noshrink;
long inconsistentnum,choosenum,already_chosen=0,iteration;
long misclassified,supvecnum=0,*active2dnum,inactivenum;
long *working2dnum,*selexam;
long activenum;
double criterion,eq;
double *a_old;
long t0=0,t1=0,t2=0,t3=0,t4=0,t5=0,t6=0; /* timing */
long transductcycle;
long transduction;
double epsilon_crit_org;
double bestmaxdiff;
long bestmaxdiffiter,terminate;
double *selcrit; /* buffer for sorting */
CFLOAT *aicache; /* buffer to keep one row of hessian */
double *weights; /* buffer for weight vector in linear case */
QP qp; /* buffer for one quadratic program */
epsilon_crit_org=learn_parm->epsilon_crit; /* save org */
if(kernel_parm->kernel_type == LINEAR) {
learn_parm->epsilon_crit=2.0;
kernel_cache=NULL; /* caching makes no sense for linear kernel */
}
learn_parm->epsilon_shrink=2;
(*maxdiff)=1;
learn_parm->totwords=totwords;
chosen = (long *)my_malloc(sizeof(long)*totdoc);
last_suboptimal_at = (long *)my_malloc(sizeof(long)*totdoc);
key = (long *)my_malloc(sizeof(long)*(totdoc+11));
selcrit = (double *)my_malloc(sizeof(double)*totdoc);
selexam = (long *)my_malloc(sizeof(long)*totdoc);
a_old = (double *)my_malloc(sizeof(double)*totdoc);
aicache = (CFLOAT *)my_malloc(sizeof(CFLOAT)*totdoc);
working2dnum = (long *)my_malloc(sizeof(long)*(totdoc+11));
active2dnum = (long *)my_malloc(sizeof(long)*(totdoc+11));
qp.opt_ce = (double *)my_malloc(sizeof(double)*learn_parm->svm_maxqpsize);
qp.opt_ce0 = (double *)my_malloc(sizeof(double));
qp.opt_g = (double *)my_malloc(sizeof(double)*learn_parm->svm_maxqpsize
*learn_parm->svm_maxqpsize);
qp.opt_g0 = (double *)my_malloc(sizeof(double)*learn_parm->svm_maxqpsize);
qp.opt_xinit = (double *)my_malloc(sizeof(double)*learn_parm->svm_maxqpsize);
qp.opt_low=(double *)my_malloc(sizeof(double)*learn_parm->svm_maxqpsize);
qp.opt_up=(double *)my_malloc(sizeof(double)*learn_parm->svm_maxqpsize);
weights=(double *)my_malloc(sizeof(double)*(totwords+1));
choosenum=0;
inconsistentnum=0;
transductcycle=0;
transduction=0;
if(!retrain) retrain=1;
iteration=1;
bestmaxdiffiter=1;
bestmaxdiff=999999999;
terminate=0;
if(kernel_cache) {
kernel_cache->time=iteration; /* for lru cache */
kernel_cache_reset_lru(kernel_cache);
}
for(i=0;i<totdoc;i++) { /* various inits */
chosen[i]=0;
a_old[i]=a[i];
last_suboptimal_at[i]=1;
if(inconsistent[i])
inconsistentnum++;
if(unlabeled[i]) {
transduction=1;
}
}
activenum=compute_index(shrink_state->active,totdoc,active2dnum);
inactivenum=totdoc-activenum;
clear_index(working2dnum);
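/* Each pass of the main loop below performs one SVM-light decomposition
   step: (1) select a working set of at most svm_maxqpsize variables by
   steepest feasible descent, (2) cache the corresponding kernel rows,
   (3) solve the small QP subproblem over the working set, (4) update the
   gradient vector lin[], (5) recompute b and the support vector set,
   (6) check the KT-conditions, and (7) periodically shrink variables
   that have been suboptimal for a long time. */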
/* repeat this loop until we have convergence */
for(;retrain && (!terminate);iteration++) {
if(kernel_cache)
kernel_cache->time=iteration; /* for lru cache */
if(verbosity>=2) {
printf(
"Iteration %ld: ",iteration); fflush(stdout);
}
else if(verbosity==1) {
printf("."); fflush(stdout);
}
if(verbosity>=2) t0=get_runtime();
if(verbosity>=3) {
printf("\nSelecting working set... "); fflush(stdout);
}
if(learn_parm->svm_newvarsinqp>learn_parm->svm_maxqpsize)
learn_parm->svm_newvarsinqp=learn_parm->svm_maxqpsize;
i=0;
for(jj=0;(j=working2dnum[jj])>=0;jj++) { /* clear working set */
if((chosen[j]>=(learn_parm->svm_maxqpsize/
minl(learn_parm->svm_maxqpsize,
learn_parm->svm_newvarsinqp)))
|| (inconsistent[j])
|| (j == heldout)) {
chosen[j]=0;
choosenum--;
}
else {
chosen[j]++;
working2dnum[i++]=j;
}
}
working2dnum[i]=-1;
if(retrain == 2) {
choosenum=0;
for(jj=0;(j=working2dnum[jj])>=0;jj++) { /* fully clear working set */
chosen[j]=0;
}
clear_index(working2dnum);
for(i=0;i<totdoc;i++) { /* set inconsistent examples to zero (-i 1) */
if((inconsistent[i] || (heldout==i)) && (a[i] != 0.0)) {
chosen[i]=99999;
choosenum++;
a[i]=0;
}
}
if(learn_parm->biased_hyperplane) {
eq=0;
for(i=0;i<totdoc;i++) { /* make sure we fulfill equality constraint */
eq+=a[i]*label[i];
}
for(i=0;(i<totdoc) && (fabs(eq) > learn_parm->epsilon_a);i++) {
if((eq*label[i] > 0) && (a[i] > 0)) {
chosen[i]=88888;
choosenum++;
if((eq*label[i]) > a[i]) {
eq-=(a[i]*label[i]);
a[i]=0;
}
else {
a[i]-=(eq*label[i]);
eq=0;
}
}
}
}
compute_index(chosen,totdoc,working2dnum);
}
else { /* select working set according to steepest gradient */
if(iteration % 101) {
already_chosen=0;
if((minl(learn_parm->svm_newvarsinqp,
learn_parm->svm_maxqpsize-choosenum)>=4)
&& (kernel_parm->kernel_type != LINEAR)) {
/* select part of the working set from cache */
already_chosen=select_next_qp_subproblem_grad(
label,unlabeled,a,lin,c,totdoc,
(long)(minl(learn_parm->svm_maxqpsize-choosenum,
learn_parm->svm_newvarsinqp)
/2),
learn_parm,inconsistent,active2dnum,
working2dnum,selcrit,selexam,kernel_cache,1,
key,chosen);
choosenum+=already_chosen;
}
choosenum+=select_next_qp_subproblem_grad(
label,unlabeled,a,lin,c,totdoc,
minl(learn_parm->svm_maxqpsize-choosenum,
learn_parm->svm_newvarsinqp-already_chosen),
learn_parm,inconsistent,active2dnum,
working2dnum,selcrit,selexam,kernel_cache,0,key,
chosen);
}
else { /* once in a while, select a somewhat random working set
to break out of infinite loops caused by numerical
inaccuracies in the core qp-solver */
choosenum+=select_next_qp_subproblem_rand(
label,unlabeled,a,lin,c,totdoc,
minl(learn_parm->svm_maxqpsize-choosenum,
learn_parm->svm_newvarsinqp),
learn_parm,inconsistent,active2dnum,
working2dnum,selcrit,selexam,kernel_cache,key,
chosen,iteration);
}
}
if(verbosity>=2) {
printf(" %ld vectors chosen\n",choosenum); fflush(stdout);
}
if(verbosity>=2) t1=get_runtime();
if(kernel_cache)
cache_multiple_kernel_rows(kernel_cache,docs,working2dnum,
choosenum,kernel_parm);
if(verbosity>=2) t2=get_runtime();
if(retrain != 2) {
optimize_svm(docs,label,unlabeled,inconsistent,0.0,chosen,active2dnum,
model,totdoc,working2dnum,choosenum,a,lin,c,learn_parm,
aicache,kernel_parm,&qp,&epsilon_crit_org);
}
if(verbosity>=2) t3=get_runtime();
update_linear_component(docs,label,active2dnum,a,a_old,working2dnum,totdoc,
totwords,kernel_parm,kernel_cache,lin,aicache,
weights);
if(verbosity>=2) t4=get_runtime();
supvecnum=calculate_svm_model(docs,label,unlabeled,lin,a,a_old,c,
learn_parm,working2dnum,active2dnum,model);
if(verbosity>=2) t5=get_runtime();
/* The following computation of the objective function works only */
/* relative to the active variables */
if(verbosity>=3) {
criterion=compute_objective_function(a,lin,c,learn_parm->eps,label,
active2dnum);
printf("Objective function (over active variables): %.16f\n",criterion);
fflush(stdout);
}
for(jj=0;(i=working2dnum[jj])>=0;jj++) {
a_old[i]=a[i];
}
if(retrain == 2) { /* reset inconsistent unlabeled examples */
for(i=0;(i<totdoc);i++) {
if(inconsistent[i] && unlabeled[i]) {
inconsistent[i]=0;
label[i]=0;
}
}
}
retrain=check_optimality(model,label,unlabeled,a,lin,c,totdoc,learn_parm,
maxdiff,epsilon_crit_org,&misclassified,
inconsistent,active2dnum,last_suboptimal_at,
iteration,kernel_parm);
if(verbosity>=2) {
t6=get_runtime();
timing_profile->time_select+=t1-t0;
timing_profile->time_kernel+=t2-t1;
timing_profile->time_opti+=t3-t2;
timing_profile->time_update+=t4-t3;
timing_profile->time_model+=t5-t4;
timing_profile->time_check+=t6-t5;
}
/* checking whether optimizer got stuck */
if((*maxdiff) < bestmaxdiff) {
bestmaxdiff=(*maxdiff);
bestmaxdiffiter=iteration;
}
if(iteration > (bestmaxdiffiter+learn_parm->maxiter)) {
/* long time no progress? */
terminate=1;
retrain=0;
if(verbosity>=1)
printf("\nWARNING: Relaxing KT-Conditions due to slow progress! Terminating!\n");
}
noshrink=0;
if((!retrain) && (inactivenum>0)
&& ((!learn_parm->skip_final_opt_check)
|| (kernel_parm->kernel_type == LINEAR))) {
if(((verbosity>=1) && (kernel_parm->kernel_type != LINEAR))
|| (verbosity>=2)) {
if(verbosity==1) {
printf("\n");
}
printf(" Checking optimality of inactive variables...");
fflush(stdout);
}
t1=get_runtime();
reactivate_inactive_examples(label,unlabeled,a,shrink_state,lin,c,totdoc,
totwords,iteration,learn_parm,inconsistent,
docs,kernel_parm,kernel_cache,model,aicache,
weights,maxdiff);
/* Update to new active variables. */
activenum=compute_index(shrink_state->active,totdoc,active2dnum);
inactivenum=totdoc-activenum;
/* reset watchdog */
bestmaxdiff=(*maxdiff);
bestmaxdiffiter=iteration;
/* termination criterion */
noshrink=1;
retrain=0;
if((*maxdiff) > learn_parm->epsilon_crit)
retrain=1;
timing_profile->time_shrink+=get_runtime()-t1;
if(((verbosity>=1) && (kernel_parm->kernel_type != LINEAR))
|| (verbosity>=2)) {
printf("done.\n"); fflush(stdout);
printf(" Number of inactive variables = %ld\n",inactivenum);
}
}
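/* Successive relaxation of the stopping criterion: training starts with
   a loose epsilon_crit (2.0 for linear kernels) and, whenever the
   problem is optimal at the current precision, epsilon_crit is halved
   until the user-requested accuracy epsilon_crit_org is reached. */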
if((!retrain) && (learn_parm->epsilon_crit>(*maxdiff)))
learn_parm->epsilon_crit=(*maxdiff);
if((!retrain) && (learn_parm->epsilon_crit>epsilon_crit_org)) {
learn_parm->epsilon_crit/=2.0;
retrain=1;
noshrink=1;
}
if(learn_parm->epsilon_crit<epsilon_crit_org)
learn_parm->epsilon_crit=epsilon_crit_org;
if(verbosity>=2) {
printf(" => (%ld SV (incl. %ld SV at u-bound), max violation=%.5f)\n",
supvecnum,model->at_upper_bound,(*maxdiff));
fflush(stdout);
}
if(verbosity>=3) {
printf("\n");
}
if((!retrain) && (transduction)) {
for(i=0;(i<totdoc);i++) {
shrink_state->active[i]=1;
}
activenum=compute_index(shrink_state->active,totdoc,active2dnum);
inactivenum=0;
if(verbosity==1) printf("done\n");
retrain=incorporate_unlabeled_examples(model,label,inconsistent,
unlabeled,a,lin,totdoc,
selcrit,selexam,key,
transductcycle,kernel_parm,
learn_parm);
epsilon_crit_org=learn_parm->epsilon_crit;
if(kernel_parm->kernel_type == LINEAR)
learn_parm->epsilon_crit=1;
transductcycle++;
/* reset watchdog */
bestmaxdiff=(*maxdiff);
bestmaxdiffiter=iteration;
}
else if(((iteration % 10) == 0) && (!noshrink)) {
activenum=shrink_problem(docs,learn_parm,shrink_state,kernel_parm,
active2dnum,last_suboptimal_at,iteration,totdoc,
maxl((long)(activenum/10),
maxl((long)(totdoc/500),100)),
a,inconsistent);
inactivenum=totdoc-activenum;
if((kernel_cache)
&& (supvecnum>kernel_cache->max_elems)
&& ((kernel_cache->activenum-activenum)>maxl((long)(activenum/10),500))) {
kernel_cache_shrink(kernel_cache,totdoc,
minl((kernel_cache->activenum-activenum),
(kernel_cache->activenum-supvecnum)),
shrink_state->active);
}
}
if((!retrain) && learn_parm->remove_inconsistent) {
if(verbosity>=1) {
printf(" Moving training errors to inconsistent examples...");
fflush(stdout);
}
if(learn_parm->remove_inconsistent == 1) {
retrain=identify_inconsistent(a,label,unlabeled,totdoc,learn_parm,
&inconsistentnum,inconsistent);
}
else if(learn_parm->remove_inconsistent == 2) {
retrain=identify_misclassified(lin,label,unlabeled,totdoc,
model,&inconsistentnum,inconsistent);
}
else if(learn_parm->remove_inconsistent == 3) {
retrain=identify_one_misclassified(lin,label,unlabeled,totdoc,
model,&inconsistentnum,inconsistent);
}
if(retrain) {
if(kernel_parm->kernel_type == LINEAR) { /* reinit shrinking */
learn_parm->epsilon_crit=2.0;
}
}
if(verbosity>=1) {
printf("done.\n");
if(retrain) {
printf(" Now %ld inconsistent examples.\n",inconsistentnum);
}
}
}
} /* end of loop */
free(chosen);
free(last_suboptimal_at);
free(key);
free(selcrit);
free(selexam);
free(a_old);
free(aicache);
free(working2dnum);
free(active2dnum);
free(qp.opt_ce);
free(qp.opt_ce0);
free(qp.opt_g);
free(qp.opt_g0);
free(qp.opt_xinit);
free(qp.opt_low);
free(qp.opt_up);
free(weights);
learn_parm->epsilon_crit=epsilon_crit_org; /* restore org */
model->maxdiff=(*maxdiff);
return(iteration);
}
long optimize_to_convergence_sharedslack(DOC **docs, long int *label,
long int totdoc,
long int totwords, LEARN_PARM *learn_parm,
KERNEL_PARM *kernel_parm,
KERNEL_CACHE *kernel_cache,
SHRINK_STATE *shrink_state, MODEL *model,
double *a, double *lin, double *c,
TIMING *timing_profile, double *maxdiff)
/* docs: Training vectors (x-part) */
/* label: Training labels/value (y-part, zero if test example for
transduction) */
/* totdoc: Number of examples in docs/label */
/* totwords: Number of features (i.e. highest feature index) */
/* learn_parm: Learning parameters */
/* kernel_parm: Kernel parameters */
/* kernel_cache: Initialized/partly filled Cache, if using a kernel.
NULL if linear. */
/* shrink_state: State of active variables */
/* model: Returns learning result */
/* a: alphas */
/* lin: linear component of gradient */
/* c: right hand side of inequalities (margin) */
/* maxdiff: returns maximum violation of KT-conditions */
{
long *chosen,*key,i,j,jj,*last_suboptimal_at,noshrink,*unlabeled;
long *inconsistent,choosenum,already_chosen=0,iteration;
long misclassified,supvecnum=0,*active2dnum,inactivenum;
long *working2dnum,*selexam,*ignore;
long activenum,retrain,maxslackid,slackset,jointstep;
double criterion,eq_target;
double *a_old,*alphaslack;
long t0=0,t1=0,t2=0,t3=0,t4=0,t5=0,t6=0; /* timing */
double epsilon_crit_org,maxsharedviol;
double bestmaxdiff;
long bestmaxdiffiter,terminate;
double *selcrit; /* buffer for sorting */
CFLOAT *aicache; /* buffer to keep one row of hessian */
double *weights; /* buffer for weight vector in linear case */
QP qp; /* buffer for one quadratic program */
double *slack; /* vector of slack variables for optimization with
shared slacks */
epsilon_crit_org=learn_parm->epsilon_crit; /* save org */
if(kernel_parm->kernel_type == LINEAR) {
learn_parm->epsilon_crit=2.0;
kernel_cache=NULL; /* caching makes no sense for linear kernel */
}
learn_parm->epsilon_shrink=2;
(*maxdiff)=1;
learn_parm->totwords=totwords;
chosen = (long *)my_malloc(sizeof(long)*totdoc);
unlabeled = (long *)my_malloc(sizeof(long)*totdoc);
inconsistent = (long *)my_malloc(sizeof(long)*totdoc);
ignore = (long *)my_malloc(sizeof(long)*totdoc);
key = (long *)my_malloc(sizeof(long)*(totdoc+11));
selcrit = (double *)my_malloc(sizeof(double)*totdoc);
selexam = (long *)my_malloc(sizeof(long)*totdoc);
a_old = (double *)my_malloc(sizeof(double)*totdoc);
aicache = (CFLOAT *)my_malloc(sizeof(CFLOAT)*totdoc);
working2dnum = (long *)my_malloc(sizeof(long)*(totdoc+11));
active2dnum = (long *)my_malloc(sizeof(long)*(totdoc+11));
qp.opt_ce = (double *)my_malloc(sizeof(double)*learn_parm->svm_maxqpsize);
qp.opt_ce0 = (double *)my_malloc(sizeof(double));
qp.opt_g = (double *)my_malloc(sizeof(double)*learn_parm->svm_maxqpsize
*learn_parm->svm_maxqpsize);
qp.opt_g0 = (double *)my_malloc(sizeof(double)*learn_parm->svm_maxqpsize);
qp.opt_xinit = (double *)my_malloc(sizeof(double)*learn_parm->svm_maxqpsize);
qp.opt_low=(double *)my_malloc(sizeof(double)*learn_parm->svm_maxqpsize);
qp.opt_up=(double *)my_malloc(sizeof(double)*learn_parm->svm_maxqpsize);
weights=(double *)my_malloc(sizeof(double)*(totwords+1));
maxslackid=0;
for(i=0;i<totdoc;i++) { /* determine size of slack array */
if(maxslackid<docs[i]->slackid)
maxslackid=docs[i]->slackid;
}
slack=(double *)my_malloc(sizeof(double)*(maxslackid+1));
alphaslack=(double *)my_malloc(sizeof(double)*(maxslackid+1));
last_suboptimal_at = (long *)my_malloc(sizeof(long)*(maxslackid+1));
for(i=0;i<=maxslackid;i++) { /* init shared slacks */
slack[i]=0;
alphaslack[i]=0;
last_suboptimal_at[i]=1;
}
choosenum=0;
retrain=1;
iteration=1;
bestmaxdiffiter=1;
bestmaxdiff=999999999;
terminate=0;
if(kernel_cache) {
kernel_cache->time=iteration; /* for lru cache */
kernel_cache_reset_lru(kernel_cache);
}
for(i=0;i<totdoc;i++) { /* various inits */
chosen[i]=0;
unlabeled[i]=0;
inconsistent[i]=0;
ignore[i]=0;
a_old[i]=a[i];
}
activenum=compute_index(shrink_state->active,totdoc,active2dnum);
inactivenum=totdoc-activenum;
clear_index(working2dnum);
/* call to init slack and alphaslack */
compute_shared_slacks(docs,label,a,lin,c,active2dnum,learn_parm,
slack,alphaslack);
/* repeat this loop until we have convergence */
for(;retrain && (!terminate);iteration++) {
if(kernel_cache)
kernel_cache->time=iteration; /* for lru cache */
if(verbosity>=2) {
printf(
"Iteration %ld: ",iteration); fflush(stdout);
}
else if(verbosity==1) {
printf("."); fflush(stdout);
}
if(verbosity>=2) t0=get_runtime();
if(verbosity>=3) {
printf("\nSelecting working set... "); fflush(stdout);
}
if(learn_parm->svm_newvarsinqp>learn_parm->svm_maxqpsize)
learn_parm->svm_newvarsinqp=learn_parm->svm_maxqpsize;
/* select working set according to steepest gradient */
jointstep=0;
eq_target=0;
if(iteration % 101) {
slackset=select_next_qp_slackset(docs,label,a,lin,slack,alphaslack,c,
learn_parm,active2dnum,&maxsharedviol);
if((iteration % 2)
|| (!slackset) || (maxsharedviol<learn_parm->epsilon_crit)){
/* do a step with examples from different slack sets */
if(verbosity >= 2) {
printf("(i-step)"); fflush(stdout);
}
i=0;
for(jj=0;(j=working2dnum[jj])>=0;jj++) { /* clear old part of working set */
if((chosen[j]>=(learn_parm->svm_maxqpsize/
minl(learn_parm->svm_maxqpsize,
learn_parm->svm_newvarsinqp)))) {
chosen[j]=0;
choosenum--;
}
else {
chosen[j]++;
working2dnum[i++]=j;
}
}
working2dnum[i]=-1;
already_chosen=0;
if((minl(learn_parm->svm_newvarsinqp,
learn_parm->svm_maxqpsize-choosenum)>=4)
&& (kernel_parm->kernel_type != LINEAR)) {
/* select part of the working set from cache */
already_chosen=select_next_qp_subproblem_grad(
label,unlabeled,a,lin,c,totdoc,
(long)(minl(learn_parm->svm_maxqpsize-choosenum,
learn_parm->svm_newvarsinqp)
/2),
learn_parm,inconsistent,active2dnum,
working2dnum,selcrit,selexam,kernel_cache,
(long)1,key,chosen);
choosenum+=already_chosen;
}
choosenum+=select_next_qp_subproblem_grad(
label,unlabeled,a,lin,c,totdoc,
minl(learn_parm->svm_maxqpsize-choosenum,
learn_parm->svm_newvarsinqp-already_chosen),
learn_parm,inconsistent,active2dnum,
working2dnum,selcrit,selexam,kernel_cache,
(long)0,key,chosen);
}
else { /* do a step with all examples from same slack set */
if(verbosity >= 2) {
printf("(j-step on %ld)",slackset); fflush(stdout);
}
jointstep=1;
for(jj=0;(j=working2dnum[jj])>=0;jj++) { /* clear working set */
chosen[j]=0;
}
working2dnum[0]=-1;
eq_target=alphaslack[slackset];
for(j=0;j<totdoc;j++) { /* mask all but slackset */
/* for(jj=0;(j=active2dnum[jj])>=0;jj++) { */
if(docs[j]->slackid != slackset)
ignore[j]=1;
else {
ignore[j]=0;
learn_parm->svm_cost[j]=learn_parm->svm_c;
/* printf("Inslackset(%ld,%ld)",j,shrink_state->active[j]); */
}
}
learn_parm->biased_hyperplane=1;
choosenum=select_next_qp_subproblem_grad(
label,unlabeled,a,lin,c,totdoc,
learn_parm->svm_maxqpsize,
learn_parm,ignore,active2dnum,
working2dnum,selcrit,selexam,kernel_cache,
(long)0,key,chosen);
learn_parm->biased_hyperplane=0;
}
}
else { /* once in a while, select a somewhat random working set
to break out of infinite loops caused by numerical
inaccuracies in the core qp-solver */
choosenum+=select_next_qp_subproblem_rand(
label,unlabeled,a,lin,c,totdoc,
minl(learn_parm->svm_maxqpsize-choosenum,
learn_parm->svm_newvarsinqp),
learn_parm,inconsistent,active2dnum,
working2dnum,selcrit,selexam,kernel_cache,key,
chosen,iteration);
}
if(verbosity>=2) {
printf(" %ld vectors chosen\n",choosenum); fflush(stdout);
}
if(verbosity>=2) t1=get_runtime();
if(kernel_cache)
cache_multiple_kernel_rows(kernel_cache,docs,working2dnum,
choosenum,kernel_parm);
if(verbosity>=2) t2=get_runtime();
if(jointstep) learn_parm->biased_hyperplane=1;
optimize_svm(docs,label,unlabeled,ignore,eq_target,chosen,active2dnum,
model,totdoc,working2dnum,choosenum,a,lin,c,learn_parm,
aicache,kernel_parm,&qp,&epsilon_crit_org);
learn_parm->biased_hyperplane=0;
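/* With shared slacks, all alphas belonging to the same slack id must
   jointly satisfy sum_i a[i] <= C. After the QP step the per-set sums
   are recomputed, and any excess over C is projected off the variables
   of the violating slack set. */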
for(jj=0;(i=working2dnum[jj])>=0;jj++) /* recompute sums of alphas */
alphaslack[docs[i]->slackid]+=(a[i]-a_old[i]);
for(jj=0;(i=working2dnum[jj])>=0;jj++) { /* reduce alpha to fulfill
constraints */
if(alphaslack[docs[i]->slackid] > learn_parm->svm_c) {
if(a[i] < (alphaslack[docs[i]->slackid]-learn_parm->svm_c)) {
alphaslack[docs[i]->slackid]-=a[i];
a[i]=0;
}
else {
a[i]-=(alphaslack[docs[i]->slackid]-learn_parm->svm_c);
alphaslack[docs[i]->slackid]=learn_parm->svm_c;
}
}
}
for(jj=0;(i=active2dnum[jj])>=0;jj++)
learn_parm->svm_cost[i]=a[i]+(learn_parm->svm_c
-alphaslack[docs[i]->slackid]);
if(verbosity>=2) t3=get_runtime();
update_linear_component(docs,label,active2dnum,a,a_old,working2dnum,totdoc,
totwords,kernel_parm,kernel_cache,lin,aicache,
weights);
compute_shared_slacks(docs,label,a,lin,c,active2dnum,learn_parm,
slack,alphaslack);
if(verbosity>=2) t4=get_runtime();
supvecnum=calculate_svm_model(docs,label,unlabeled,lin,a,a_old,c,
learn_parm,working2dnum,active2dnum,model);
if(verbosity>=2) t5=get_runtime();
/* The following computation of the objective function works only */
/* relative to the active variables */
if(verbosity>=3) {
criterion=compute_objective_function(a,lin,c,learn_parm->eps,label,
active2dnum);
printf("Objective function (over active variables): %.16f\n",criterion);
fflush(stdout);
}
for(jj=0;(i=working2dnum[jj])>=0;jj++) {
a_old[i]=a[i];
}
retrain=check_optimality_sharedslack(docs,model,label,a,lin,c,
slack,alphaslack,totdoc,learn_parm,
maxdiff,epsilon_crit_org,&misclassified,
active2dnum,last_suboptimal_at,
iteration,kernel_parm);
if(verbosity>=2) {
t6=get_runtime();
timing_profile->time_select+=t1-t0;
timing_profile->time_kernel+=t2-t1;
timing_profile->time_opti+=t3-t2;
timing_profile->time_update+=t4-t3;
timing_profile->time_model+=t5-t4;
timing_profile->time_check+=t6-t5;
}
/* checking whether optimizer got stuck */
if((*maxdiff) < bestmaxdiff) {
bestmaxdiff=(*maxdiff);
bestmaxdiffiter=iteration;
}
if(iteration > (bestmaxdiffiter+learn_parm->maxiter)) {
/* no progress for a long time? */
terminate=1;
retrain=0;
if(verbosity>=1)
printf("\nWARNING: Relaxing KT-Conditions due to slow progress! Terminating!\n");
}
noshrink=0;
if((!retrain) && (inactivenum>0)
&& ((!learn_parm->skip_final_opt_check)
|| (kernel_parm->kernel_type == LINEAR))) {
if(((verbosity>=1) && (kernel_parm->kernel_type != LINEAR))
|| (verbosity>=2)) {
if(verbosity==1) {
printf("\n");
}
printf(" Checking optimality of inactive variables...");
fflush(stdout);
}
t1=get_runtime();
reactivate_inactive_examples(label,unlabeled,a,shrink_state,lin,c,totdoc,
totwords,iteration,learn_parm,inconsistent,
docs,kernel_parm,kernel_cache,model,aicache,
weights,maxdiff);
/* Update to new active variables. */
activenum=compute_index(shrink_state->active,totdoc,active2dnum);
inactivenum=totdoc-activenum;
/* check optimality, since check in reactivate does not work for
sharedslacks */
retrain=check_optimality_sharedslack(docs,model,label,a,lin,c,
slack,alphaslack,totdoc,learn_parm,
maxdiff,epsilon_crit_org,&misclassified,
active2dnum,last_suboptimal_at,
iteration,kernel_parm);
/* reset watchdog */
bestmaxdiff=(*maxdiff);
bestmaxdiffiter=iteration;
/* termination criterion */
noshrink=1;
retrain=0;
if((*maxdiff) > learn_parm->epsilon_crit)
retrain=1;
timing_profile->time_shrink+=get_runtime()-t1;
if(((verbosity>=1) && (kernel_parm->kernel_type != LINEAR))
|| (verbosity>=2)) {
printf("done.\n"); fflush(stdout);
printf(" Number of inactive variables = %ld\n",inactivenum);
}
}
if((!retrain) && (learn_parm->epsilon_crit>(*maxdiff)))
learn_parm->epsilon_crit=(*maxdiff);
if((!retrain) && (learn_parm->epsilon_crit>epsilon_crit_org)) {
learn_parm->epsilon_crit/=2.0;
retrain=1;
noshrink=1;
}
if(learn_parm->epsilon_crit<epsilon_crit_org)
learn_parm->epsilon_crit=epsilon_crit_org;
if(verbosity>=2) {
printf(" => (%ld SV (incl. %ld SV at u-bound), max violation=%.5f)\n",
supvecnum,model->at_upper_bound,(*maxdiff));
fflush(stdout);
}
if(verbosity>=3) {
printf("\n");
}
if(((iteration % 10) == 0) && (!noshrink)) {
activenum=shrink_problem(docs,learn_parm,shrink_state,
kernel_parm,active2dnum,
last_suboptimal_at,iteration,totdoc,
maxl((long)(activenum/10),
maxl((long)(totdoc/500),100)),
a,inconsistent);
inactivenum=totdoc-activenum;
if((kernel_cache)
&& (supvecnum>kernel_cache->max_elems)
&& ((kernel_cache->activenum-activenum)>maxl((long)(activenum/10),500))) {
kernel_cache_shrink(kernel_cache,totdoc,
minl((kernel_cache->activenum-activenum),
(kernel_cache->activenum-supvecnum)),
shrink_state->active);
}
}
} /* end of loop */
free(alphaslack);
free(slack);
free(chosen);
free(unlabeled);
free(inconsistent);
free(ignore);
free(last_suboptimal_at);
free(key);
free(selcrit);
free(selexam);
free(a_old);
free(aicache);
free(working2dnum);
free(active2dnum);
free(qp.opt_ce);
free(qp.opt_ce0);
free(qp.opt_g);
free(qp.opt_g0);
free(qp.opt_xinit);
free(qp.opt_low);
free(qp.opt_up);
free(weights);
learn_parm->epsilon_crit=epsilon_crit_org; /* restore org */
model->maxdiff=(*maxdiff);
return(iteration);
}
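/* The function below evaluates, over the active variables only,

       W(a) = sum_i [ (eps - y_i*c_i)*a_i + 0.5*a_i*y_i*lin_i ]

   where lin_i = sum_j a_j*y_j*K(x_i,x_j) is the linear component
   maintained incrementally by update_linear_component. Since inactive
   variables are skipped, the value is only meaningful relative to
   other evaluations on the same active set. */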
double compute_objective_function(double *a, double *lin, double *c,
double eps, long int *label,
long int *active2dnum)
/* Return value of objective function. */
/* Works only relative to the active variables! */
{
long i,ii;
double criterion;
/* calculate value of objective function */
criterion=0;
for(ii=0;active2dnum[ii]>=0;ii++) {
i=active2dnum[ii];
criterion=criterion+(eps-(double)label[i]*c[i])*a[i]+0.5*a[i]*label[i]*lin[i];
}
return(criterion);
}
void clear_index(long int *index)
/* initializes and empties index */
{
index[0]=-1;
}
void add_to_index(long int *index, long int elem)
/* appends elem to the -1 terminated index */
{
register long i;
for(i=0;index[i] != -1;i++);
index[i]=elem;
index[i+1]=-1;
}
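/* The various *2dnum arrays used throughout this file are -1 terminated
   lists of document numbers. Illustration: for binfeature = {0,1,1,0,1}
   and range = 5, compute_index below fills index = {1,2,4,-1,...} and
   returns 3. */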
long compute_index(long int *binfeature, long int range, long int *index)
/* create an inverted index of binfeature */
{
register long i,ii;
ii=0;
for(i=0;i<range;i++) {
if(binfeature[i]) {
index[ii]=i;
ii++;
}
}
for(i=0;i<4;i++) {
index[ii+i]=-1;
}
return(ii);
}
void optimize_svm(DOC **docs, long int *label, long int *unlabeled,
long int *exclude_from_eq_const, double eq_target,
long int *chosen, long int *active2dnum, MODEL *model,
long int totdoc, long int *working2dnum, long int varnum,
double *a, double *lin, double *c, LEARN_PARM *learn_parm,
CFLOAT *aicache, KERNEL_PARM *kernel_parm, QP *qp,
double *epsilon_crit_target)
/* Do optimization on the working set. */
{
long i;
double *a_v;
compute_matrices_for_optimization(docs,label,unlabeled,
exclude_from_eq_const,eq_target,chosen,
active2dnum,working2dnum,model,a,lin,c,
varnum,totdoc,learn_parm,aicache,
kernel_parm,qp);
if(verbosity>=3) {
printf("Running optimizer..."); fflush(stdout);
}
/* call the qp-subsolver */
a_v=optimize_qp(qp,epsilon_crit_target,
learn_parm->svm_maxqpsize,
&(model->b), /* in case the optimizer gives us */
/* the threshold for free. otherwise */
/* b is calculated in calculate_model. */
learn_parm);
if(verbosity>=3) {
printf("done\n");
}
for(i=0;i<varnum;i++) {
a[working2dnum[i]]=a_v[i];
/*
if(a_v[i]<=(0+learn_parm->epsilon_a)) {
a[working2dnum[i]]=0;
}
else if(a_v[i]>=(learn_parm->svm_cost[working2dnum[i]]-learn_parm->epsilon_a)) {
a[working2dnum[i]]=learn_parm->svm_cost[working2dnum[i]];
}
*/
}
}
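/* Layout of the QP subproblem assembled below (qp->opt_n = varnum
   working set variables, addressed through key[]):
     opt_g[i*varnum+j]  dense symmetric Hessian y_i*y_j*K(x_i,x_j)
     opt_g0[i]          linear part, corrected for the contribution of
                        the variables that stay fixed outside the set
     opt_ce, opt_ce0    coefficients and constant of the equality
                        constraint over the working set (used only when
                        learn_parm->biased_hyperplane, i.e. opt_m=1)
     opt_low, opt_up    box constraints 0 <= a_i <= svm_cost[i]
     opt_xinit          current alphas, a feasible starting point */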
void compute_matrices_for_optimization(DOC **docs, long int *label,
long int *unlabeled, long *exclude_from_eq_const, double eq_target,
long int *chosen, long int *active2dnum,
long int *key, MODEL *model, double *a, double *lin, double *c,
long int varnum, long int totdoc, LEARN_PARM *learn_parm,
CFLOAT *aicache, KERNEL_PARM *kernel_parm, QP *qp)
{
register long ki,kj,i,j;
register double kernel_temp;
if(verbosity>=3) {
fprintf(stdout,"Computing qp-matrices (type %ld kernel [degree %ld, rbf_gamma %f, coef_lin %f, coef_const %f])...",kernel_parm->kernel_type,kernel_parm->poly_degree,kernel_parm->rbf_gamma,kernel_parm->coef_lin,kernel_parm->coef_const);
fflush(stdout);
}
qp->opt_n=varnum;
qp->opt_ce0[0]=-eq_target; /* compute the constant for equality constraint */
for(j=1;j<model->sv_num;j++) { /* start at 1 */
if((!chosen[(model->supvec[j])->docnum])
&& (!exclude_from_eq_const[(model->supvec[j])->docnum])) {
qp->opt_ce0[0]+=model->alpha[j];
}
}
if(learn_parm->biased_hyperplane)
qp->opt_m=1;
else
qp->opt_m=0; /* eq-constraint will be ignored */
/* init linear part of objective function */
for(i=0;i<varnum;i++) {
qp->opt_g0[i]=lin[key[i]];
}
for(i=0;i<varnum;i++) {
ki=key[i];
/* Compute the matrix for equality constraints */
qp->opt_ce[i]=label[ki];
qp->opt_low[i]=0;
qp->opt_up[i]=learn_parm->svm_cost[ki];
kernel_temp=(double)kernel(kernel_parm,docs[ki],docs[ki]);
/* compute linear part of objective function */
qp->opt_g0[i]-=(kernel_temp*a[ki]*(double)label[ki]);
/* compute quadratic part of objective function */
qp->opt_g[varnum*i+i]=kernel_temp;
for(j=i+1;j<varnum;j++) {
kj=key[j];
kernel_temp=(double)kernel(kernel_parm,docs[ki],docs[kj]);
/* compute linear part of objective function */
qp->opt_g0[i]-=(kernel_temp*a[kj]*(double)label[kj]);
qp->opt_g0[j]-=(kernel_temp*a[ki]*(double)label[ki]);
/* compute quadratic part of objective function */
qp->opt_g[varnum*i+j]=(double)label[ki]*(double)label[kj]*kernel_temp;
qp->opt_g[varnum*j+i]=(double)label[ki]*(double)label[kj]*kernel_temp;
}
if(verbosity>=3) {
if(i % 20 == 0) {
fprintf(stdout,"%ld..",i); fflush(stdout);
}
}
}
for(i=0;i<varnum;i++) {
/* ensure we start at a feasible point */
qp->opt_xinit[i]=a[key[i]];
/* set linear part of objective function */
qp->opt_g0[i]=(learn_parm->eps-(double)label[key[i]]*c[key[i]])+qp->opt_g0[i]*(double)label[key[i]];
}
if(verbosity>=3) {
fprintf(stdout,"done\n");
}
}
long calculate_svm_model(DOC **docs, long int *label, long int *unlabeled,
double *lin, double *a, double *a_old, double *c,
LEARN_PARM *learn_parm, long int *working2dnum,
long int *active2dnum, MODEL *model)
/* Compute decision function based on current values */
/* of alpha. */
{
long i,ii,pos,b_calculated=0,first_low,first_high;
double ex_c,b_temp,b_low,b_high;
if(verbosity>=3) {
printf("Calculating model..."); fflush(stdout);
}
if(!learn_parm->biased_hyperplane) {
model->b=0;
b_calculated=1;
}
for(ii=0;(i=working2dnum[ii])>=0;ii++) {
if((a_old[i]>0) && (a[i]==0)) { /* remove from model */
pos=model->index[i];
model->index[i]=-1;
(model->sv_num)--;
model->supvec[pos]=model->supvec[model->sv_num];
model->alpha[pos]=model->alpha[model->sv_num];
model->index[(model->supvec[pos])->docnum]=pos;
}
else if((a_old[i]==0) && (a[i]>0)) { /* add to model */
model->supvec[model->sv_num]=docs[i];
model->alpha[model->sv_num]=a[i]*(double)label[i];
model->index[i]=model->sv_num;
(model->sv_num)++;
}
else if(a_old[i]==a[i]) { /* nothing to do */
}
else { /* just update alpha */
model->alpha[model->index[i]]=a[i]*(double)label[i];
}
ex_c=learn_parm->svm_cost[i]-learn_parm->epsilon_a;
if((a_old[i]>=ex_c) && (a[i]<ex_c)) {
(model->at_upper_bound)--;
}
else if((a_old[i]<ex_c) && (a[i]>=ex_c)) {
(model->at_upper_bound)++;
}
if((!b_calculated)
&& (a[i]>learn_parm->epsilon_a) && (a[i]<ex_c)) { /* calculate b */
model->b=((double)label[i]*learn_parm->eps-c[i]+lin[i]);
/* model->b=(-(double)label[i]+lin[i]); */
b_calculated=1;
}
}
/* If no alpha in the working set is strictly between the bounds,
then b could not be calculated in the usual way above. The
following handles this special case. */
if(learn_parm->biased_hyperplane
&& (!b_calculated)
&& (model->sv_num-1 == model->at_upper_bound)) {
first_low=1;
first_high=1;
b_low=0;
b_high=0;
for(ii=0;(i=active2dnum[ii])>=0;ii++) {
ex_c=learn_parm->svm_cost[i]-learn_parm->epsilon_a;
if(a[i]<ex_c) {
if(label[i]>0) {
b_temp=-(learn_parm->eps-c[i]+lin[i]);
if((b_temp>b_low) || (first_low)) {
b_low=b_temp;
first_low=0;
}
}
else {
b_temp=-(-learn_parm->eps-c[i]+lin[i]);
if((b_temp<b_high) || (first_high)) {
b_high=b_temp;
first_high=0;
}
}
}
else {
if(label[i]<0) {
b_temp=-(-learn_parm->eps-c[i]+lin[i]);
if((b_temp>b_low) || (first_low)) {
b_low=b_temp;
first_low=0;
}
}
else {
b_temp=-(learn_parm->eps-c[i]+lin[i]);
if((b_temp<b_high) || (first_high)) {
b_high=b_temp;
first_high=0;
}
}
}
}
if(first_high) {
model->b=-b_low;
}
else if(first_low) {
model->b=-b_high;
}
else {
model->b=-(b_high+b_low)/2.0; /* select b as the middle of range */
/* printf("\nb_low=%f, b_high=%f,b=%f\n",b_low,b_high,model->b); */
}
}
if(verbosity>=3) {
printf("done\n"); fflush(stdout);
}
return(model->sv_num-1); /* have to subtract one, since element 0 is empty*/
}
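/* The KT (Kuhn-Tucker) conditions checked below, writing
   dist = y_i*(lin_i - b) and target = -(eps - y_i*c_i):
     a_i = 0         requires  dist >= target
     0 < a_i < C_i   requires  dist  = target
     a_i = C_i       requires  dist <= target
   (*maxdiff) records the largest violation found; the caller keeps
   training while it exceeds epsilon_crit. */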
long check_optimality(MODEL *model, long int *label, long int *unlabeled,
double *a, double *lin, double *c, long int totdoc,
LEARN_PARM *learn_parm, double *maxdiff,
double epsilon_crit_org, long int *misclassified,
long int *inconsistent, long int *active2dnum,
long int *last_suboptimal_at,
long int iteration, KERNEL_PARM *kernel_parm)
/* Check KT-conditions */
{
long i,ii,retrain;
double dist,ex_c,target;
if(kernel_parm->kernel_type == LINEAR) { /* be optimistic */
learn_parm->epsilon_shrink=-learn_parm->epsilon_crit+epsilon_crit_org;
}
else { /* be conservative */
learn_parm->epsilon_shrink=learn_parm->epsilon_shrink*0.7+(*maxdiff)*0.3;
}
retrain=0;
(*maxdiff)=0;
(*misclassified)=0;
for(ii=0;(i=active2dnum[ii])>=0;ii++) {
if((!inconsistent[i]) && label[i]) {
dist=(lin[i]-model->b)*(double)label[i];/* 'distance' from
hyperplane*/
target=-(learn_parm->eps-(double)label[i]*c[i]);
ex_c=learn_parm->svm_cost[i]-learn_parm->epsilon_a;
if(dist <= 0) {
(*misclassified)++; /* count is unreliable, since deactivated (shrunk) variables are skipped */
}
if((a[i]>learn_parm->epsilon_a) && (dist > target)) {
if((dist-target)>(*maxdiff)) /* largest violation */
(*maxdiff)=dist-target;
}
else if((a[i]<ex_c) && (dist < target)) {
if((target-dist)>(*maxdiff)) /* largest violation */
(*maxdiff)=target-dist;
}
/* Count how long a variable was at the lower/upper bound (and optimal).*/
/* Variables which were at the bound and optimal for a long */
/* time are unlikely to become support vectors. In case our */
/* cache is filled up, those variables are excluded to save */
/* kernel evaluations. (See chapter 'Shrinking'.) */
if((a[i]>(learn_parm->epsilon_a))
&& (a[i]<ex_c)) {
last_suboptimal_at[i]=iteration; /* not at bound */
}
else if((a[i]<=(learn_parm->epsilon_a))
&& (dist < (target+learn_parm->epsilon_shrink))) {
last_suboptimal_at[i]=iteration; /* not likely optimal */
}
else if((a[i]>=ex_c)
&& (dist > (target-learn_parm->epsilon_shrink))) {
last_suboptimal_at[i]=iteration; /* not likely optimal */
}
}
}
/* termination criterion */
if((!retrain) && ((*maxdiff) > learn_parm->epsilon_crit)) {
retrain=1;
}
return(retrain);
}
long check_optimality_sharedslack(DOC **docs, MODEL *model, long int *label,
double *a, double *lin, double *c, double *slack,
double *alphaslack,
long int totdoc,
LEARN_PARM *learn_parm, double *maxdiff,
double epsilon_crit_org, long int *misclassified,
long int *active2dnum,
long int *last_suboptimal_at,
long int iteration, KERNEL_PARM *kernel_parm)
/* Check KT-conditions */
{
long i,ii,retrain;
double dist,ex_c=0,target;
if(kernel_parm->kernel_type == LINEAR) { /* be optimistic */
learn_parm->epsilon_shrink=-learn_parm->epsilon_crit+epsilon_crit_org;
}
else { /* be conservative */
learn_parm->epsilon_shrink=learn_parm->epsilon_shrink*0.7+(*maxdiff)*0.3;
}
retrain=0;
(*maxdiff)=0;
(*misclassified)=0;
for(ii=0;(i=active2dnum[ii])>=0;ii++) {
/* 'distance' from hyperplane*/
dist=(lin[i]-model->b)*(double)label[i]+slack[docs[i]->slackid];
target=-(learn_parm->eps-(double)label[i]*c[i]);
ex_c=learn_parm->svm_c-learn_parm->epsilon_a;
if((a[i]>learn_parm->epsilon_a) && (dist > target)) {
if((dist-target)>(*maxdiff)) { /* largest violation */
(*maxdiff)=dist-target;
if(verbosity>=5) printf("sid %ld: dist=%.2f, target=%.2f, slack=%.2f, a=%f, alphaslack=%f\n",docs[i]->slackid,dist,target,slack[docs[i]->slackid],a[i],alphaslack[docs[i]->slackid]);
if(verbosity>=5) printf(" (single %f)\n",(*maxdiff));
}
}
if((alphaslack[docs[i]->slackid]<ex_c) && (slack[docs[i]->slackid]>0)) {
if((slack[docs[i]->slackid])>(*maxdiff)) { /* largest violation */
(*maxdiff)=slack[docs[i]->slackid];
if(verbosity>=5) printf("sid %ld: dist=%.2f, target=%.2f, slack=%.2f, a=%f, alphaslack=%f\n",docs[i]->slackid,dist,target,slack[docs[i]->slackid],a[i],alphaslack[docs[i]->slackid]);
if(verbosity>=5) printf(" (joint %f)\n",(*maxdiff));
}
}
/* Count how long a variable was at the lower/upper bound (and optimal).*/
/* Variables which were at the bound and optimal for a long */
/* time are unlikely to become support vectors. In case our */
/* cache is filled up, those variables are excluded to save */
/* kernel evaluations. (See chapter 'Shrinking'.) */
if((a[i]>(learn_parm->epsilon_a))
&& (a[i]<ex_c)) {
last_suboptimal_at[docs[i]->slackid]=iteration; /* not at bound */
}
else if((a[i]<=(learn_parm->epsilon_a))
&& (dist < (target+learn_parm->epsilon_shrink))) {
last_suboptimal_at[docs[i]->slackid]=iteration; /* not likely optimal */
}
else if((a[i]>=ex_c)
&& (slack[docs[i]->slackid] < learn_parm->epsilon_shrink)) {
last_suboptimal_at[docs[i]->slackid]=iteration; /* not likely optimal */
}
}
/* termination criterion */
if((!retrain) && ((*maxdiff) > learn_parm->epsilon_crit)) {
retrain=1;
}
return(retrain);
}
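/* For shared slacks, each slack variable takes the largest violation
   within its slack set,
       slack[s] = max over {i: slackid(i)=s} of max(0, target_i - y_i*lin_i),
   and alphaslack[s] is the sum of the alphas in set s. Both are
   recomputed from scratch below. */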
void compute_shared_slacks(DOC **docs, long int *label,
double *a, double *lin,
double *c, long int *active2dnum,
LEARN_PARM *learn_parm,
double *slack, double *alphaslack)
/* compute the value of shared slacks and the joint alphas */
{
long jj,i;
double dist,target;
for(jj=0;(i=active2dnum[jj])>=0;jj++) { /* clear slack variables */
slack[docs[i]->slackid]=0.0;
alphaslack[docs[i]->slackid]=0.0;
}
for(jj=0;(i=active2dnum[jj])>=0;jj++) { /* recompute slack variables */
dist=(lin[i])*(double)label[i];
target=-(learn_parm->eps-(double)label[i]*c[i]);
if((target-dist) > slack[docs[i]->slackid])
slack[docs[i]->slackid]=target-dist;
alphaslack[docs[i]->slackid]+=a[i];
}
}
long identify_inconsistent(double *a, long int *label,
long int *unlabeled, long int totdoc,
LEARN_PARM *learn_parm,
long int *inconsistentnum, long int *inconsistent)
{
long i,retrain;
/* Throw out examples with multipliers at upper bound. This */
/* corresponds to the -i 1 option. */
/* ATTENTION: this is just a heuristic for finding a nearly */
/* minimal number of examples to exclude in order to */
/* make the problem separable with the desired margin */
retrain=0;
for(i=0;i<totdoc;i++) {
if((!inconsistent[i]) && (!unlabeled[i])
&& (a[i]>=(learn_parm->svm_cost[i]-learn_parm->epsilon_a))) {
(*inconsistentnum)++;
inconsistent[i]=1; /* never choose again */
retrain=2; /* start over */
if(verbosity>=3) {
printf("inconsistent(%ld)..",i); fflush(stdout);
}
}
}
return(retrain);
}
long identify_misclassified(double *lin, long int *label,
long int *unlabeled, long int totdoc,
MODEL *model, long int *inconsistentnum,
long int *inconsistent)
{
long i,retrain;
double dist;
/* Throw out misclassified examples. This */
/* corresponds to the -i 2 option. */
/* ATTENTION: this is just a heuristic for finding a nearly */
/* minimal number of examples to exclude in order to */
/* make the problem separable with the desired margin */
retrain=0;
for(i=0;i<totdoc;i++) {
dist=(lin[i]-model->b)*(double)label[i]; /* 'distance' from hyperplane*/
if((!inconsistent[i]) && (!unlabeled[i]) && (dist <= 0)) {
(*inconsistentnum)++;
inconsistent[i]=1; /* never choose again */
retrain=2; /* start over */
if(verbosity>=3) {
printf("inconsistent(%ld)..",i); fflush(stdout);
}
}
}
return(retrain);
}
long identify_one_misclassified(double *lin, long int *label,
long int *unlabeled,
long int totdoc, MODEL *model,
long int *inconsistentnum,
long int *inconsistent)
{
long i,retrain,maxex=-1;
double dist,maxdist=0;
/* Throw out the 'most misclassified' example. This */
/* corresponds to the -i 3 option. */
/* ATTENTION: this is just a heuristic for finding a nearly */
/* minimal number of examples to exclude in order to */
/* make the problem separable with the desired margin */
retrain=0;
for(i=0;i<totdoc;i++) {
if((!inconsistent[i]) && (!unlabeled[i])) {
dist=(lin[i]-model->b)*(double)label[i];/* 'distance' from hyperplane*/
if(dist<maxdist) {
maxdist=dist;
maxex=i;
}
}
}
if(maxex>=0) {
(*inconsistentnum)++;
inconsistent[maxex]=1; /* never choose again */
retrain=2; /* start over */
if(verbosity>=3) {
printf("inconsistent(%ld)..",i); fflush(stdout);
}
}
return(retrain);
}
void update_linear_component(DOC **docs, long int *label,
long int *active2dnum, double *a,
double *a_old, long int *working2dnum,
long int totdoc, long int totwords,
KERNEL_PARM *kernel_parm,
KERNEL_CACHE *kernel_cache,
double *lin, CFLOAT *aicache, double *weights)
/* keep track of the linear component */
/* lin of the gradient etc. by updating */
/* based on the change of the variables */
/* in the current working set */
{
register long i,ii,j,jj;
register double tec;
SVECTOR *f;
if(kernel_parm->kernel_type==0) { /* special linear case */
clear_vector_n(weights,totwords);
for(ii=0;(i=working2dnum[ii])>=0;ii++) {
if(a[i] != a_old[i]) {
for(f=docs[i]->fvec;f;f=f->next)
add_vector_ns(weights,f,
f->factor*((a[i]-a_old[i])*(double)label[i]));
}
}
for(jj=0;(j=active2dnum[jj])>=0;jj++) {
for(f=docs[j]->fvec;f;f=f->next)
lin[j]+=f->factor*sprod_ns(weights,f);
}
}
else { /* general case */
for(jj=0;(i=working2dnum[jj])>=0;jj++) {
if(a[i] != a_old[i]) {
get_kernel_row(kernel_cache,docs,i,totdoc,active2dnum,aicache,
kernel_parm);
for(ii=0;(j=active2dnum[ii])>=0;ii++) {
tec=aicache[j];
lin[j]+=(((a[i]*tec)-(a_old[i]*tec))*(double)label[i]);
}
}
}
}
}
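/* One step of the transductive training loop. Judging from the return
   statements below, 0 terminates the transduction cycle (the final
   labeling is written via write_prediction), while the nonzero return
   values signal the caller to retrain with the updated labels and
   upper bounds on the unlabeled examples. */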
long incorporate_unlabeled_examples(MODEL *model, long int *label,
long int *inconsistent,
long int *unlabeled,
double *a, double *lin,
long int totdoc, double *selcrit,
long int *select, long int *key,
long int transductcycle,
KERNEL_PARM *kernel_parm,
LEARN_PARM *learn_parm)
{
long i,j,k,j1,j2,j3,j4,unsupaddnum1=0,unsupaddnum2=0;
long pos,neg,upos,uneg,orgpos,orgneg,nolabel,newpos,newneg,allunlab;
double dist,model_length,posratio,negratio;
long check_every=2;
double loss;
static double switchsens=0.0,switchsensorg=0.0;
double umin,umax,sumalpha;
long imin=0,imax=0;
static long switchnum=0;
switchsens/=1.2;
/* assumes that lin[] is up to date -> no inactive vars */
orgpos=0;
orgneg=0;
newpos=0;
newneg=0;
nolabel=0;
allunlab=0;
for(i=0;i<totdoc;i++) {
if(!unlabeled[i]) {
if(label[i] > 0) {
orgpos++;
}
else {
orgneg++;
}
}
else {
allunlab++;
if(unlabeled[i]) {
if(label[i] > 0) {
newpos++;
}
else if(label[i] < 0) {
newneg++;
}
}
}
if(label[i]==0) {
nolabel++;
}
}
if(learn_parm->transduction_posratio >= 0) {
posratio=learn_parm->transduction_posratio;
}
else {
posratio=(double)orgpos/(double)(orgpos+orgneg); /* use ratio of pos/neg */
} /* in training data */
negratio=1.0-posratio;
learn_parm->svm_costratio=1.0; /* global */
if(posratio>0) {
learn_parm->svm_costratio_unlab=negratio/posratio;
}
else {
learn_parm->svm_costratio_unlab=1.0;
}
pos=0;
neg=0;
upos=0;
uneg=0;
for(i=0;i<totdoc;i++) {
dist=(lin[i]-model->b); /* 'distance' from hyperplane*/
if(dist>0) {
pos++;
}
else {
neg++;
}
if(unlabeled[i]) {
if(dist>0) {
upos++;
}
else {
uneg++;
}
}
if((!unlabeled[i]) && (a[i]>(learn_parm->svm_cost[i]-learn_parm->epsilon_a))) {
/* printf("Ubounded %ld (class %ld, unlabeled %ld)\n",i,label[i],unlabeled[i]); */
}
}
if(verbosity>=2) {
printf("POS=%ld, ORGPOS=%ld, ORGNEG=%ld\n",pos,orgpos,orgneg);
printf("POS=%ld, NEWPOS=%ld, NEWNEG=%ld\n",pos,newpos,newneg);
printf("pos ratio = %f (%f).\n",(double)(upos)/(double)(allunlab),posratio);
fflush(stdout);
}
if(transductcycle == 0) {
j1=0;
j2=0;
j4=0;
for(i=0;i<totdoc;i++) {
dist=(lin[i]-model->b); /* 'distance' from hyperplane*/
if((label[i]==0) && (unlabeled[i])) {
selcrit[j4]=dist;
key[j4]=i;
j4++;
}
}
unsupaddnum1=0;
unsupaddnum2=0;
select_top_n(selcrit,j4,select,(long)(allunlab*posratio+0.5));
for(k=0;(k<(long)(allunlab*posratio+0.5));k++) {
i=key[select[k]];
label[i]=1;
unsupaddnum1++;
j1++;
}
for(i=0;i<totdoc;i++) {
if((label[i]==0) && (unlabeled[i])) {
label[i]=-1;
j2++;
unsupaddnum2++;
}
}
for(i=0;i<totdoc;i++) { /* set upper bounds on vars */
if(unlabeled[i]) {
if(label[i] == 1) {
learn_parm->svm_cost[i]=learn_parm->svm_c*
learn_parm->svm_costratio_unlab*learn_parm->svm_unlabbound;
}
else if(label[i] == -1) {
learn_parm->svm_cost[i]=learn_parm->svm_c*
learn_parm->svm_unlabbound;
}
}
}
if(verbosity>=1) {
/* printf("costratio %f, costratio_unlab %f, unlabbound %f\n",
learn_parm->svm_costratio,learn_parm->svm_costratio_unlab,
learn_parm->svm_unlabbound); */
printf("Classifying unlabeled data as %ld POS / %ld NEG.\n",
unsupaddnum1,unsupaddnum2);
fflush(stdout);
}
if(verbosity >= 1)
printf("Retraining.");
if(verbosity >= 2) printf("\n");
return((long)3);
}
if((transductcycle % check_every) == 0) {
if(verbosity >= 1)
printf("Retraining.");
if(verbosity >= 2) printf("\n");
j1=0;
j2=0;
unsupaddnum1=0;
unsupaddnum2=0;
for(i=0;i<totdoc;i++) {
if((unlabeled[i] == 2)) {
unlabeled[i]=1;
label[i]=1;
j1++;
unsupaddnum1++;
}
else if((unlabeled[i] == 3)) {
unlabeled[i]=1;
label[i]=-1;
j2++;
unsupaddnum2++;
}
}
for(i=0;i<totdoc;i++) { /* set upper bounds on vars */
if(unlabeled[i]) {
if(label[i] == 1) {
learn_parm->svm_cost[i]=learn_parm->svm_c*
learn_parm->svm_costratio_unlab*learn_parm->svm_unlabbound;
}
else if(label[i] == -1) {
learn_parm->svm_cost[i]=learn_parm->svm_c*
learn_parm->svm_unlabbound;
}
}
}
if(verbosity>=2) {
/* printf("costratio %f, costratio_unlab %f, unlabbound %f\n",
learn_parm->svm_costratio,learn_parm->svm_costratio_unlab,
learn_parm->svm_unlabbound); */
printf("%ld positive -> Added %ld POS / %ld NEG unlabeled examples.\n",
upos,unsupaddnum1,unsupaddnum2);
fflush(stdout);
}
if(learn_parm->svm_unlabbound == 1) {
learn_parm->epsilon_crit=0.001; /* do the last run right */
}
else {
learn_parm->epsilon_crit=0.01; /* otherwise, no need to be so picky */
}
return((long)3);
}
else if(((transductcycle % check_every) < check_every)) { /* always true: effectively the else branch */
model_length=0;
sumalpha=0;
loss=0;
for(i=0;i<totdoc;i++) {
model_length+=a[i]*label[i]*lin[i];
sumalpha+=a[i];
dist=(lin[i]-model->b); /* 'distance' from hyperplane*/
if((label[i]*dist)<(1.0-learn_parm->epsilon_crit)) {
loss+=(1.0-(label[i]*dist))*learn_parm->svm_cost[i];
}
}
model_length=sqrt(model_length);
if(verbosity>=2) {
printf("Model-length = %f (%f), loss = %f, objective = %f\n",
model_length,sumalpha,loss,loss+0.5*model_length*model_length);
fflush(stdout);
}
j1=0;
j2=0;
j3=0;
j4=0;
unsupaddnum1=0;
unsupaddnum2=0;
umin=99999;
umax=-99999;
j4=1;
while(j4) {
umin=99999;
umax=-99999;
for(i=0;(i<totdoc);i++) {
dist=(lin[i]-model->b);
if((label[i]>0) && (unlabeled[i]) && (!inconsistent[i])
&& (dist<umin)) {
umin=dist;
imin=i;
}
if((label[i]<0) && (unlabeled[i]) && (!inconsistent[i])
&& (dist>umax)) {
umax=dist;
imax=i;
}
}
if((umin < (umax+switchsens-1E-4))) {
j1++;
j2++;
unsupaddnum1++;
unlabeled[imin]=3;
inconsistent[imin]=1;
unsupaddnum2++;
unlabeled[imax]=2;
inconsistent[imax]=1;
}
else
j4=0;
j4=0; /* unconditional reset: the while loop makes a single pass, switching at most one pair of labels per round */
}
for(j=0;(j<totdoc);j++) {
if(unlabeled[j] && (!inconsistent[j])) {
if(label[j]>0) {
unlabeled[j]=2;
}
else if(label[j]<0) {
unlabeled[j]=3;
}
/* inconsistent[j]=1; */
j3++;
}
}
switchnum+=unsupaddnum1+unsupaddnum2;
/* stop and print out current margin
printf("switchnum %ld %ld\n",switchnum,kernel_parm->poly_degree);
if(switchnum == 2*kernel_parm->poly_degree) {
learn_parm->svm_unlabbound=1;
}
*/
if((!unsupaddnum1) && (!unsupaddnum2)) {
if((learn_parm->svm_unlabbound>=1) && ((newpos+newneg) == allunlab)) {
for(j=0;(j<totdoc);j++) {
inconsistent[j]=0;
if(unlabeled[j]) unlabeled[j]=1;
}
write_prediction(learn_parm->predfile,model,lin,a,unlabeled,label,
totdoc,learn_parm);
if(verbosity>=1)
printf("Number of switches: %ld\n",switchnum);
return((long)0);
}
switchsens=switchsensorg;
learn_parm->svm_unlabbound*=1.5;
if(learn_parm->svm_unlabbound>1) {
learn_parm->svm_unlabbound=1;
}
model->at_upper_bound=0; /* since upper bound increased */
if(verbosity>=1)
printf("Increasing influence of unlabeled examples to %f%% .",
learn_parm->svm_unlabbound*100.0);
}
else if(verbosity>=1) {
printf("%ld positive -> Switching labels of %ld POS / %ld NEG unlabeled examples.",
upos,unsupaddnum1,unsupaddnum2);
fflush(stdout);
}
if(verbosity >= 2) printf("\n");
learn_parm->epsilon_crit=0.5; /* don't need to be so picky */
for(i=0;i<totdoc;i++) { /* set upper bounds on vars */
if(unlabeled[i]) {
if(label[i] == 1) {
learn_parm->svm_cost[i]=learn_parm->svm_c*
learn_parm->svm_costratio_unlab*learn_parm->svm_unlabbound;
}
else if(label[i] == -1) {
learn_parm->svm_cost[i]=learn_parm->svm_c*
learn_parm->svm_unlabbound;
}
}
}
return((long)2);
}
return((long)0);
}
/*************************** Working set selection ***************************/
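/* The gradient-based selection below ranks each candidate i by
     selcrit_i = +/- y_i * (eps - y_i*c_i + y_i*lin_i),
   i.e. the gradient of the dual objective along the feasible direction
   s_i (s_i = -y_i in the first pass, +y_i in the second). Variables that
   sit at a bound blocking direction s_i are skipped, and each pass fills
   at most half of the working set. */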
long select_next_qp_subproblem_grad(long int *label,
long int *unlabeled,
double *a, double *lin,
double *c, long int totdoc,
long int qp_size,
LEARN_PARM *learn_parm,
long int *inconsistent,
long int *active2dnum,
long int *working2dnum,
double *selcrit,
long int *select,
KERNEL_CACHE *kernel_cache,
long int cache_only,
long int *key, long int *chosen)
/* Use the feasible direction approach to select the next
qp-subproblem (see chapter 'Selecting a good working set'). If
'cache_only' is true, then the variables are selected only among
those for which the kernel evaluations are cached. */
{
long choosenum,i,j,k,activedoc,inum,valid;
double s;
for(inum=0;working2dnum[inum]>=0;inum++); /* find end of index */
choosenum=0;
activedoc=0;
for(i=0;(j=active2dnum[i])>=0;i++) {
s=-label[j];
if(kernel_cache && cache_only)
valid=(kernel_cache->index[j]>=0);
else
valid=1;
if(valid
&& (!((a[j]<=(0+learn_parm->epsilon_a)) && (s<0)))
&& (!((a[j]>=(learn_parm->svm_cost[j]-learn_parm->epsilon_a))
&& (s>0)))
&& (!chosen[j])
&& (label[j])
&& (!inconsistent[j]))
{
selcrit[activedoc]=(double)label[j]*(learn_parm->eps-(double)label[j]*c[j]+(double)label[j]*lin[j]);
/* selcrit[activedoc]=(double)label[j]*(-1.0+(double)label[j]*lin[j]); */
key[activedoc]=j;
activedoc++;
}
}
select_top_n(selcrit,activedoc,select,(long)(qp_size/2));
for(k=0;(choosenum<(qp_size/2)) && (k<(qp_size/2)) && (k<activedoc);k++) {
/* if(learn_parm->biased_hyperplane || (selcrit[select[k]] > 0)) { */
i=key[select[k]];
chosen[i]=1;
working2dnum[inum+choosenum]=i;
choosenum+=1;
if(kernel_cache)
kernel_cache_touch(kernel_cache,i); /* make sure it does not get
kicked out of cache */
/* } */
}
activedoc=0;
for(i=0;(j=active2dnum[i])>=0;i++) {
s=label[j];
if(kernel_cache && cache_only)
valid=(kernel_cache->index[j]>=0);
else
valid=1;
if(valid
&& (!((a[j]<=(0+learn_parm->epsilon_a)) && (s<0)))
&& (!((a[j]>=(learn_parm->svm_cost[j]-learn_parm->epsilon_a))
&& (s>0)))
&& (!chosen[j])
&& (label[j])
&& (!inconsistent[j]))
{
selcrit[activedoc]=-(double)label[j]*(learn_parm->eps-(double)label[j]*c[j]+(double)label[j]*lin[j]);
/* selcrit[activedoc]=-(double)(label[j]*(-1.0+(double)label[j]*lin[j])); */
key[activedoc]=j;
activedoc++;
}
}
select_top_n(selcrit,activedoc,select,(long)(qp_size/2));
for(k=0;(choosenum<qp_size) && (k<(qp_size/2)) && (k<activedoc);k++) {
/* if(learn_parm->biased_hyperplane || (selcrit[select[k]] > 0)) { */
i=key[select[k]];
chosen[i]=1;
working2dnum[inum+choosenum]=i;
choosenum+=1;
if(kernel_cache)
kernel_cache_touch(kernel_cache,i); /* make sure it does not get
kicked out of cache */
/* } */
}
working2dnum[inum+choosenum]=-1; /* complete index */
return(choosenum);
}
long select_next_qp_subproblem_rand(long int *label,
long int *unlabeled,
double *a, double *lin,
double *c, long int totdoc,
long int qp_size,
LEARN_PARM *learn_parm,
long int *inconsistent,
long int *active2dnum,
long int *working2dnum,
double *selcrit,
long int *select,
KERNEL_CACHE *kernel_cache,
long int *key,
long int *chosen,
long int iteration)
/* Use the feasible direction approach to select the next
qp-subproblem (see section 'Selecting a good working set'). Chooses
a feasible direction at (pseudo) random to help jump over numerical
problems. */
{
long choosenum,i,j,k,activedoc,inum;
double s;
for(inum=0;working2dnum[inum]>=0;inum++); /* find end of index */
choosenum=0;
activedoc=0;
for(i=0;(j=active2dnum[i])>=0;i++) {
s=-label[j];
if((!((a[j]<=(0+learn_parm->epsilon_a)) && (s<0)))
&& (!((a[j]>=(learn_parm->svm_cost[j]-learn_parm->epsilon_a))
&& (s>0)))
&& (!inconsistent[j])
&& (label[j])
&& (!chosen[j])) {
selcrit[activedoc]=(j+iteration) % totdoc;
key[activedoc]=j;
activedoc++;
}
}
select_top_n(selcrit,activedoc,select,(long)(qp_size/2));
for(k=0;(choosenum<(qp_size/2)) && (k<(qp_size/2)) && (k<activedoc);k++) {
i=key[select[k]];
chosen[i]=1;
working2dnum[inum+choosenum]=i;
choosenum+=1;
kernel_cache_touch(kernel_cache,i); /* make sure it does not get kicked */
/* out of cache */
}
activedoc=0;
for(i=0;(j=active2dnum[i])>=0;i++) {
s=label[j];
if((!((a[j]<=(0+learn_parm->epsilon_a)) && (s<0)))
&& (!((a[j]>=(learn_parm->svm_cost[j]-learn_parm->epsilon_a))
&& (s>0)))
&& (!inconsistent[j])
&& (label[j])
&& (!chosen[j])) {
selcrit[activedoc]=(j+iteration) % totdoc;
key[activedoc]=j;
activedoc++;
}
}
select_top_n(selcrit,activedoc,select,(long)(qp_size/2));
for(k=0;(choosenum<qp_size) && (k<(qp_size/2)) && (k<activedoc);k++) {
i=key[select[k]];
chosen[i]=1;
working2dnum[inum+choosenum]=i;
choosenum+=1;
kernel_cache_touch(kernel_cache,i); /* make sure it does not get kicked */
/* out of cache */
}
working2dnum[inum+choosenum]=-1; /* complete index */
return(choosenum);
}
long select_next_qp_slackset(DOC **docs, long int *label,
double *a, double *lin,
double *slack, double *alphaslack,
double *c,
LEARN_PARM *learn_parm,
long int *active2dnum, double *maxviol)
/* returns the slackset with the largest internal violation */
{
long i,ii,maxdiffid;
double dist,target,maxdiff,ex_c;
maxdiff=0;
maxdiffid=0;
for(ii=0;(i=active2dnum[ii])>=0;ii++) {
ex_c=learn_parm->svm_c-learn_parm->epsilon_a;
if(alphaslack[docs[i]->slackid] >= ex_c) {
dist=(lin[i])*(double)label[i]+slack[docs[i]->slackid]; /* distance */
target=-(learn_parm->eps-(double)label[i]*c[i]); /* rhs of constraint */
if((a[i]>learn_parm->epsilon_a) && (dist > target)) {
if((dist-target)>maxdiff) { /* largest violation */
maxdiff=dist-target;
maxdiffid=docs[i]->slackid;
}
}
}
}
(*maxviol)=maxdiff;
return(maxdiffid);
}
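/* select_top_n below is a partial selection sort: afterwards,
   select[0..n-1] holds the indices of the n largest entries of
   selcrit[0..range-1], ordered by decreasing selcrit value. Cost is
   O(range*n), which is acceptable for the small n used here. */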
void select_top_n(double *selcrit, long int range, long int *select,
long int n)
{
register long i,j;
for(i=0;(i<n) && (i<range);i++) { /* Initialize with the first n elements */
for(j=i;j>=0;j--) {
if((j>0) && (selcrit[select[j-1]]<selcrit[i])){
select[j]=select[j-1];
}
else {
select[j]=i;
j=-1;
}
}
}
if(n>0) {
for(i=n;i<range;i++) {
if(selcrit[i]>selcrit[select[n-1]]) {
for(j=n-1;j>=0;j--) {
if((j>0) && (selcrit[select[j-1]]<selcrit[i])) {
select[j]=select[j-1];
}
else {
select[j]=i;
j=-1;
}
}
}
}
}
}
/******************************** Shrinking *********************************/
void init_shrink_state(SHRINK_STATE *shrink_state, long int totdoc,
long int maxhistory)
{
long i;
shrink_state->deactnum=0;
shrink_state->active = (long *)my_malloc(sizeof(long)*totdoc);
shrink_state->inactive_since = (long *)my_malloc(sizeof(long)*totdoc);
shrink_state->a_history = (double **)my_malloc(sizeof(double *)*maxhistory);
shrink_state->maxhistory=maxhistory;
shrink_state->last_lin = (double *)my_malloc(sizeof(double)*totdoc);
shrink_state->last_a = (double *)my_malloc(sizeof(double)*totdoc);
for(i=0;i<totdoc;i++) {
shrink_state->active[i]=1;
shrink_state->inactive_since[i]=0;
shrink_state->last_a[i]=0;
shrink_state->last_lin[i]=0;
}
}
void shrink_state_cleanup(SHRINK_STATE *shrink_state)
{
free(shrink_state->active);
free(shrink_state->inactive_since);
if(shrink_state->deactnum > 0)
free(shrink_state->a_history[shrink_state->deactnum-1]);
free(shrink_state->a_history);
free(shrink_state->last_a);
free(shrink_state->last_lin);
}
long shrink_problem(DOC **docs,
LEARN_PARM *learn_parm,
SHRINK_STATE *shrink_state,
KERNEL_PARM *kernel_parm,
long int *active2dnum,
long int *last_suboptimal_at,
long int iteration,
long int totdoc,
long int minshrink,
double *a,
long int *inconsistent)
/* Shrink some variables away. Do the shrinking only if at least
minshrink variables can be removed. */
{
long i,ii,change,activenum,lastiter;
double *a_old;
activenum=0;
change=0;
for(ii=0;active2dnum[ii]>=0;ii++) {
i=active2dnum[ii];
activenum++;
if(learn_parm->sharedslack)
lastiter=last_suboptimal_at[docs[i]->slackid];
else
lastiter=last_suboptimal_at[i];
if(((iteration-lastiter) > learn_parm->svm_iter_to_shrink)
|| (inconsistent[i])) {
change++;
}
}
if((change>=minshrink) /* shrink only if sufficiently many candidates */
&& (shrink_state->deactnum<shrink_state->maxhistory)) { /* and enough memory */
/* Shrink problem by removing those variables which are */
/* optimal at a bound for a minimum number of iterations */
if(verbosity>=2) {
printf(" Shrinking..."); fflush(stdout);
}
if(kernel_parm->kernel_type != LINEAR) { /* non-linear case: save the alphas */
a_old=(double *)my_malloc(sizeof(double)*totdoc);
shrink_state->a_history[shrink_state->deactnum]=a_old;
for(i=0;i<totdoc;i++) {
a_old[i]=a[i];
}
}
for(ii=0;active2dnum[ii]>=0;ii++) {
i=active2dnum[ii];
if(learn_parm->sharedslack)
lastiter=last_suboptimal_at[docs[i]->slackid];
else
lastiter=last_suboptimal_at[i];
if(((iteration-lastiter) > learn_parm->svm_iter_to_shrink)
|| (inconsistent[i])) {
shrink_state->active[i]=0;
shrink_state->inactive_since[i]=shrink_state->deactnum;
}
}
activenum=compute_index(shrink_state->active,totdoc,active2dnum);
shrink_state->deactnum++;
if(kernel_parm->kernel_type == LINEAR) {
shrink_state->deactnum=0;
}
if(verbosity>=2) {
printf("done.\n"); fflush(stdout);
printf(" Number of inactive variables = %ld\n",totdoc-activenum);
}
}
return(activenum);
}
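/* For non-linear kernels, shrink_problem above stores a snapshot of all
   alphas in shrink_state->a_history each time variables are deactivated.
   reactivate_inactive_examples below replays those snapshots so that
   lin[] can be brought up to date using only the kernel rows of the
   variables whose alphas changed since deactivation; in the linear case
   the weight vector difference is applied directly instead. */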
void reactivate_inactive_examples(long int *label,
long int *unlabeled,
double *a,
SHRINK_STATE *shrink_state,
double *lin,
double *c,
long int totdoc,
long int totwords,
long int iteration,
LEARN_PARM *learn_parm,
long int *inconsistent,
DOC **docs,
KERNEL_PARM *kernel_parm,
KERNEL_CACHE *kernel_cache,
MODEL *model,
CFLOAT *aicache,
double *weights,
double *maxdiff)
/* Make all variables active again which had been removed by
shrinking. */
/* Computes lin for those variables from scratch. */
{
register long i,j,ii,jj,t,*changed2dnum,*inactive2dnum;
long *changed,*inactive;
register double kernel_val,*a_old,dist;
double ex_c,target;
SVECTOR *f;
if(kernel_parm->kernel_type == LINEAR) { /* special linear case */
a_old=shrink_state->last_a;
clear_vector_n(weights,totwords);
for(i=0;i<totdoc;i++) {
if(a[i] != a_old[i]) {
for(f=docs[i]->fvec;f;f=f->next)
add_vector_ns(weights,f,
f->factor*((a[i]-a_old[i])*(double)label[i]));
a_old[i]=a[i];
}
}
for(i=0;i<totdoc;i++) {
if(!shrink_state->active[i]) {
for(f=docs[i]->fvec;f;f=f->next)
lin[i]=shrink_state->last_lin[i]+f->factor*sprod_ns(weights,f);
}
shrink_state->last_lin[i]=lin[i];
}
}
else {
changed=(long *)my_malloc(sizeof(long)*totdoc);
changed2dnum=(long *)my_malloc(sizeof(long)*(totdoc+11));
inactive=(long *)my_malloc(sizeof(long)*totdoc);
inactive2dnum=(long *)my_malloc(sizeof(long)*(totdoc+11));
for(t=shrink_state->deactnum-1;(t>=0) && shrink_state->a_history[t];t--) {
if(verbosity>=2) {
printf("%ld..",t); fflush(stdout);
}
a_old=shrink_state->a_history[t];
for(i=0;i<totdoc;i++) {
inactive[i]=((!shrink_state->active[i])
&& (shrink_state->inactive_since[i] == t));
changed[i]= (a[i] != a_old[i]);
}
compute_index(inactive,totdoc,inactive2dnum);
compute_index(changed,totdoc,changed2dnum);
for(ii=0;(i=changed2dnum[ii])>=0;ii++) {
get_kernel_row(kernel_cache,docs,i,totdoc,inactive2dnum,aicache,
kernel_parm);
for(jj=0;(j=inactive2dnum[jj])>=0;jj++) {
kernel_val=aicache[j];
lin[j]+=(((a[i]*kernel_val)-(a_old[i]*kernel_val))*(double)label[i]);
}
}
}
free(changed);
free(changed2dnum);
free(inactive);
free(inactive2dnum);
}
(*maxdiff)=0;
for(i=0;i<totdoc;i++) {
shrink_state->inactive_since[i]=shrink_state->deactnum-1;
if(!inconsistent[i]) {
dist=(lin[i]-model->b)*(double)label[i];
target=-(learn_parm->eps-(double)label[i]*c[i]);
ex_c=learn_parm->svm_cost[i]-learn_parm->epsilon_a;
if((a[i]>learn_parm->epsilon_a) && (dist > target)) {
if((dist-target)>(*maxdiff)) /* largest violation */
(*maxdiff)=dist-target;
}
else if((a[i]<ex_c) && (dist < target)) {
if((target-dist)>(*maxdiff)) /* largest violation */
(*maxdiff)=target-dist;
}
if((a[i]>(0+learn_parm->epsilon_a))
&& (a[i]<ex_c)) {
shrink_state->active[i]=1; /* not at bound */
}
else if((a[i]<=(0+learn_parm->epsilon_a)) && (dist < (target+learn_parm->epsilon_shrink))) {
shrink_state->active[i]=1;
}
else if((a[i]>=ex_c)
&& (dist > (target-learn_parm->epsilon_shrink))) {
shrink_state->active[i]=1;
}
else if(learn_parm->sharedslack) { /* make all active when sharedslack */
shrink_state->active[i]=1;
}
}
}
if(kernel_parm->kernel_type != LINEAR) { /* update history for non-linear */
for(i=0;i<totdoc;i++) {
(shrink_state->a_history[shrink_state->deactnum-1])[i]=a[i];
}
for(t=shrink_state->deactnum-2;(t>=0) && shrink_state->a_history[t];t--) {
free(shrink_state->a_history[t]);
shrink_state->a_history[t]=0;
}
}
}
/****************************** Cache handling *******************************/
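/* Cache organization: kernel_cache->buffer holds up to max_elems rows of
   the kernel matrix, each row of length activenum (columns exist only
   for active documents). index[docnum] is the row slot of a cached row,
   or -1; invindex is its inverse. active2totdoc/totdoc2active translate
   between column positions and document numbers, and lru[] timestamps
   rows for least-recently-used eviction. */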
void get_kernel_row(KERNEL_CACHE *kernel_cache, DOC **docs,
long int docnum, long int totdoc,
long int *active2dnum, CFLOAT *buffer,
KERNEL_PARM *kernel_parm)
/* Gets a row of the matrix of kernel values. This matrix has the
same form as the Hessian, except that the elements are not
multiplied by y_i * y_j * a_i * a_j. Takes the values from the
cache if available. */
{
register long i,j,start;
DOC *ex;
ex=docs[docnum];
if(kernel_cache->index[docnum] != -1) { /* row is cached? */
kernel_cache->lru[kernel_cache->index[docnum]]=kernel_cache->time; /* lru */
start=kernel_cache->activenum*kernel_cache->index[docnum];
for(i=0;(j=active2dnum[i])>=0;i++) {
if(kernel_cache->totdoc2active[j] >= 0) { /* column is cached? */
buffer[j]=kernel_cache->buffer[start+kernel_cache->totdoc2active[j]];
}
else {
buffer[j]=(CFLOAT)kernel(kernel_parm,ex,docs[j]);
}
}
}
else {
for(i=0;(j=active2dnum[i])>=0;i++) {
buffer[j]=(CFLOAT)kernel(kernel_parm,ex,docs[j]);
}
}
}
void cache_kernel_row(KERNEL_CACHE *kernel_cache, DOC **docs,
long int m, KERNEL_PARM *kernel_parm)
/* Fills cache for the row m */
{
register DOC *ex;
register long j,k,l;
register CFLOAT *cache;
if(!kernel_cache_check(kernel_cache,m)) { /* not cached yet*/
cache = kernel_cache_clean_and_malloc(kernel_cache,m);
if(cache) {
l=kernel_cache->totdoc2active[m];
ex=docs[m];
for(j=0;j<kernel_cache->activenum;j++) { /* fill cache */
k=kernel_cache->active2totdoc[j];
if((kernel_cache->index[k] != -1) && (l != -1) && (k != m)) {
cache[j]=kernel_cache->buffer[kernel_cache->activenum
*kernel_cache->index[k]+l];
}
else {
cache[j]=kernel(kernel_parm,ex,docs[k]);
}
}
}
else {
perror("Error: Kernel cache full! => increase cache size");
}
}
}
void cache_multiple_kernel_rows(KERNEL_CACHE *kernel_cache, DOC **docs,
long int *key, long int varnum,
KERNEL_PARM *kernel_parm)
/* Fills cache for the rows in key */
{
register long i;
for(i=0;i<varnum;i++) { /* fill up kernel cache */
cache_kernel_row(kernel_cache,docs,key[i],kernel_parm);
}
}
void kernel_cache_shrink(KERNEL_CACHE *kernel_cache, long int totdoc,
long int numshrink, long int *after)
/* Remove numshrink columns in the cache which correspond to
examples marked 0 in after. */
{
register long i,j,jj,from=0,to=0,scount;
long *keep;
if(verbosity>=2) {
printf(" Reorganizing cache..."); fflush(stdout);
}
keep=(long *)my_malloc(sizeof(long)*totdoc);
for(j=0;j<totdoc;j++) {
keep[j]=1;
}
scount=0;
for(jj=0;(jj<kernel_cache->activenum) && (scount<numshrink);jj++) {
j=kernel_cache->active2totdoc[jj];
if(!after[j]) {
scount++;
keep[j]=0;
}
}
for(i=0;i<kernel_cache->max_elems;i++) {
for(jj=0;jj<kernel_cache->activenum;jj++) {
j=kernel_cache->active2totdoc[jj];
if(!keep[j]) {
from++;
}
else {
kernel_cache->buffer[to]=kernel_cache->buffer[from];
to++;
from++;
}
}
}
kernel_cache->activenum=0;
for(j=0;j<totdoc;j++) {
if((keep[j]) && (kernel_cache->totdoc2active[j] != -1)) {
kernel_cache->active2totdoc[kernel_cache->activenum]=j;
kernel_cache->totdoc2active[j]=kernel_cache->activenum;
kernel_cache->activenum++;
}
else {
kernel_cache->totdoc2active[j]=-1;
}
}
kernel_cache->max_elems=(long)(kernel_cache->buffsize/kernel_cache->activenum);
if(kernel_cache->max_elems>totdoc) {
kernel_cache->max_elems=totdoc;
}
free(keep);
if(verbosity>=2) {
printf("done.\n"); fflush(stdout);
printf(" Cache-size in rows = %ld\n",kernel_cache->max_elems);
}
}
KERNEL_CACHE *kernel_cache_init(long int totdoc, long int buffsize)
{
long i;
KERNEL_CACHE *kernel_cache;
kernel_cache=(KERNEL_CACHE *)my_malloc(sizeof(KERNEL_CACHE));
kernel_cache->index = (long *)my_malloc(sizeof(long)*totdoc);
kernel_cache->occu = (long *)my_malloc(sizeof(long)*totdoc);
kernel_cache->lru = (long *)my_malloc(sizeof(long)*totdoc);
kernel_cache->invindex = (long *)my_malloc(sizeof(long)*totdoc);
kernel_cache->active2totdoc = (long *)my_malloc(sizeof(long)*totdoc);
kernel_cache->totdoc2active = (long *)my_malloc(sizeof(long)*totdoc);
kernel_cache->buffer = (CFLOAT *)my_malloc((size_t)(buffsize)*1024*1024);
kernel_cache->buffsize=(long)(buffsize/sizeof(CFLOAT)*1024*1024);
kernel_cache->max_elems=(long)(kernel_cache->buffsize/totdoc);
if(kernel_cache->max_elems>totdoc) {
kernel_cache->max_elems=totdoc;
}
if(verbosity>=2) {
printf(" Cache-size in rows = %ld\n",kernel_cache->max_elems);
printf(" Kernel evals so far: %ld\n",kernel_cache_statistic);
}
kernel_cache->elems=0; /* initialize cache */
for(i=0;i<totdoc;i++) {
kernel_cache->index[i]=-1;
kernel_cache->lru[i]=0;
}
for(i=0;i<totdoc;i++) {
kernel_cache->occu[i]=0;
kernel_cache->invindex[i]=-1;
}
kernel_cache->activenum=totdoc;
for(i=0;i<totdoc;i++) {
kernel_cache->active2totdoc[i]=i;
kernel_cache->totdoc2active[i]=i;
}
kernel_cache->time=0;
return(kernel_cache);
}
void kernel_cache_reset_lru(KERNEL_CACHE *kernel_cache)
{
long maxlru=0,k;
for(k=0;k<kernel_cache->max_elems;k++) {
if(maxlru < kernel_cache->lru[k])
maxlru=kernel_cache->lru[k];
}
for(k=0;k<kernel_cache->max_elems;k++) {
kernel_cache->lru[k]-=maxlru;
}
}
void kernel_cache_cleanup(KERNEL_CACHE *kernel_cache)
{
free(kernel_cache->index);
free(kernel_cache->occu);
free(kernel_cache->lru);
free(kernel_cache->invindex);
free(kernel_cache->active2totdoc);
free(kernel_cache->totdoc2active);
free(kernel_cache->buffer);
free(kernel_cache);
}
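/* Row allocation protocol: kernel_cache_malloc claims a free row slot
   and returns -1 when the cache is full; kernel_cache_clean_and_malloc
   (further below) then evicts the least recently used row via
   kernel_cache_free_lru and retries once. */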
long kernel_cache_malloc(KERNEL_CACHE *kernel_cache)
{
long i;
if(kernel_cache_space_available(kernel_cache)) {
for(i=0;i<kernel_cache->max_elems;i++) {
if(!kernel_cache->occu[i]) {
kernel_cache->occu[i]=1;
kernel_cache->elems++;
return(i);
}
}
}
return(-1);
}
void kernel_cache_free(KERNEL_CACHE *kernel_cache, long int i)
{
kernel_cache->occu[i]=0;
kernel_cache->elems--;
}
long kernel_cache_free_lru(KERNEL_CACHE *kernel_cache)
/* remove least recently used cache element */
{
register long k,least_elem=-1,least_time;
least_time=kernel_cache->time+1;
for(k=0;k<kernel_cache->max_elems;k++) {
if(kernel_cache->invindex[k] != -1) {
if(kernel_cache->lru[k]<least_time) {
least_time=kernel_cache->lru[k];
least_elem=k;
}
}
}
if(least_elem != -1) {
kernel_cache_free(kernel_cache,least_elem);
kernel_cache->index[kernel_cache->invindex[least_elem]]=-1;
kernel_cache->invindex[least_elem]=-1;
return(1);
}
return(0);
}
CFLOAT *kernel_cache_clean_and_malloc(KERNEL_CACHE *kernel_cache,
long int docnum)
/* Get a free cache entry. In case cache is full, the lru element
is removed. */
{
long result;
if((result = kernel_cache_malloc(kernel_cache)) == -1) {
if(kernel_cache_free_lru(kernel_cache)) {
result = kernel_cache_malloc(kernel_cache);
}
}
kernel_cache->index[docnum]=result;
if(result == -1) {
return(0);
}
kernel_cache->invindex[result]=docnum;
kernel_cache->lru[kernel_cache->index[docnum]]=kernel_cache->time; /* lru */
return((CFLOAT *)((long)kernel_cache->buffer
+(kernel_cache->activenum*sizeof(CFLOAT)*
kernel_cache->index[docnum])));
}
long kernel_cache_touch(KERNEL_CACHE *kernel_cache, long int docnum)
/* Update lru time to avoid removal from cache. */
{
if(kernel_cache && kernel_cache->index[docnum] != -1) {
kernel_cache->lru[kernel_cache->index[docnum]]=kernel_cache->time; /* lru */
return(1);
}
return(0);
}
long kernel_cache_check(KERNEL_CACHE *kernel_cache, long int docnum)
/* Is that row cached? */
{
return(kernel_cache->index[docnum] != -1);
}
long kernel_cache_space_available(KERNEL_CACHE *kernel_cache)
/* Is there room for one more row? */
{
return(kernel_cache->elems < kernel_cache->max_elems);
}
/************************** Compute estimates ******************************/
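/* Xi/alpha estimators: an example i counts as a leave-one-out error if
     rho*a_i*R_delta^2 + xi_i >= 1,
   where xi_i = max(0, 1 - y_i*(lin_i - b)) is its slack and R_delta
   bounds the length of the feature vectors (see the ICML 2000 paper
   cited below). With xa_depth > 0, the bound is tightened greedily in
   distribute_alpha_t_greedily. */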
void compute_xa_estimates(MODEL *model, long int *label,
long int *unlabeled, long int totdoc,
DOC **docs, double *lin, double *a,
KERNEL_PARM *kernel_parm,
LEARN_PARM *learn_parm, double *error,
double *recall, double *precision)
/* Computes the xa-estimate of error rate, recall, and precision. See
T. Joachims, Estimating the Generalization Performance of an SVM
Efficiently, ICML, 2000. */
{
long i,looerror,looposerror,loonegerror;
long totex,totposex;
double xi,r_delta,r_delta_sq,sim=0;
long *sv2dnum=NULL,*sv=NULL,svnum;
r_delta=estimate_r_delta(docs,totdoc,kernel_parm);
r_delta_sq=r_delta*r_delta;
looerror=0;
looposerror=0;
loonegerror=0;
totex=0;
totposex=0;
svnum=0;
if(learn_parm->xa_depth > 0) {
sv = (long *)my_malloc(sizeof(long)*(totdoc+11));
for(i=0;i<totdoc;i++)
sv[i]=0;
for(i=1;i<model->sv_num;i++)
if(a[model->supvec[i]->docnum]
< (learn_parm->svm_cost[model->supvec[i]->docnum]
-learn_parm->epsilon_a)) {
sv[model->supvec[i]->docnum]=1;
svnum++;
}
sv2dnum = (long *)my_malloc(sizeof(long)*(totdoc+11));
clear_index(sv2dnum);
compute_index(sv,totdoc,sv2dnum);
}
for(i=0;i<totdoc;i++) {
if(unlabeled[i]) {
/* ignore it */
}
else {
xi=1.0-((lin[i]-model->b)*(double)label[i]);
if(xi<0) xi=0;
if(label[i]>0) {
totposex++;
}
if((learn_parm->rho*a[i]*r_delta_sq+xi) >= 1.0) {
if(learn_parm->xa_depth > 0) { /* makes assumptions */
sim=distribute_alpha_t_greedily(sv2dnum,svnum,docs,a,i,label,
kernel_parm,learn_parm,
(double)((1.0-xi-a[i]*r_delta_sq)/(2.0*a[i])));
}
if((learn_parm->xa_depth == 0) ||
((a[i]*kernel(kernel_parm,docs[i],docs[i])+a[i]*2.0*sim+xi) >= 1.0)) {
looerror++;
if(label[i]>0) {
looposerror++;
}
else {
loonegerror++;
}
}
}
totex++;
}
}
(*error)=((double)looerror/(double)totex)*100.0;
(*recall)=(1.0-(double)looposerror/(double)totposex)*100.0;
(*precision)=(((double)totposex-(double)looposerror)
/((double)totposex-(double)looposerror+(double)loonegerror))*100.0;
free(sv);
free(sv2dnum);
}
double distribute_alpha_t_greedily(long int *sv2dnum, long int svnum,
DOC **docs, double *a,
long int docnum,
long int *label,
KERNEL_PARM *kernel_parm,
LEARN_PARM *learn_parm, double thresh)
/* Experimental code improving plain XiAlpha estimates by
computing a better bound using a greedy optimization strategy. */
{
long best_depth=0;
long i,j,k,d,skip,allskip;
double best,best_val[101],val,init_val_sq,init_val_lin;
long best_ex[101];
CFLOAT *cache,*trow;
cache=(CFLOAT *)my_malloc(sizeof(CFLOAT)*learn_parm->xa_depth*svnum);
trow = (CFLOAT *)my_malloc(sizeof(CFLOAT)*svnum);
for(k=0;k<svnum;k++) {
trow[k]=kernel(kernel_parm,docs[docnum],docs[sv2dnum[k]]);
}
init_val_sq=0;
init_val_lin=0;
best=0;
for(d=0;d<learn_parm->xa_depth;d++) {
allskip=1;
if(d>=1) {
init_val_sq+=cache[best_ex[d-1]+svnum*(d-1)];
for(k=0;k<d-1;k++) {
init_val_sq+=2.0*cache[best_ex[k]+svnum*(d-1)];
}
init_val_lin+=trow[best_ex[d-1]];
}
for(i=0;i<svnum;i++) {
skip=0;
if(sv2dnum[i] == docnum) skip=1;
for(j=0;j<d;j++) {
if(i == best_ex[j]) skip=1;
}
if(!skip) {
val=init_val_sq;
if(kernel_parm->kernel_type == LINEAR)
val+=docs[sv2dnum[i]]->fvec->twonorm_sq;
else
val+=kernel(kernel_parm,docs[sv2dnum[i]],docs[sv2dnum[i]]);
for(j=0;j<d;j++) {
val+=2.0*cache[i+j*svnum];
}
val*=(1.0/(2.0*(d+1.0)*(d+1.0)));
val-=((init_val_lin+trow[i])/(d+1.0));
if(allskip || (val < best_val[d])) {
best_val[d]=val;
best_ex[d]=i;
}
allskip=0;
if(val < thresh) {
i=svnum;
/* printf("EARLY"); */
}
}
}
if(!allskip) {
for(k=0;k<svnum;k++) {
cache[d*svnum+k]=kernel(kernel_parm,
docs[sv2dnum[best_ex[d]]],
docs[sv2dnum[k]]);
}
}
if((!allskip) && ((best_val[d] < best) || (d == 0))) {
best=best_val[d];
best_depth=d;
}
if(allskip || (best < thresh)) {
d=learn_parm->xa_depth;
}
}
free(cache);
free(trow);
/* printf("Distribute[%ld](%ld)=%f, ",docnum,best_depth,best); */
return(best);
}
void estimate_transduction_quality(MODEL *model, long int *label,
long int *unlabeled,
long int totdoc, DOC **docs, double *lin)
/* Loo-bound based on the observation that loo-errors must be
equally distributed over both the training and the test examples,
given that the test examples are classified correctly. Compare
the chapter "Constraints on the Transductive Hyperplane" in my
dissertation. */
{
long i,j,l=0,ulab=0,lab=0,labpos=0,labneg=0,ulabpos=0,ulabneg=0,totulab=0;
double totlab=0,totlabpos=0,totlabneg=0,labsum=0,ulabsum=0;
double r_delta,r_delta_sq,xi,xisum=0,asum=0;
r_delta=estimate_r_delta(docs,totdoc,&(model->kernel_parm));
r_delta_sq=r_delta*r_delta;
for(j=0;j<totdoc;j++) {
if(unlabeled[j]) {
totulab++;
}
else {
totlab++;
if(label[j] > 0)
totlabpos++;
else
totlabneg++;
}
}
for(j=1;j<model->sv_num;j++) {
i=model->supvec[j]->docnum;
xi=1.0-((lin[i]-model->b)*(double)label[i]);
if(xi<0) xi=0;
xisum+=xi;
asum+=fabs(model->alpha[j]);
if(unlabeled[i]) {
ulabsum+=(fabs(model->alpha[j])*r_delta_sq+xi);
}
else {
labsum+=(fabs(model->alpha[j])*r_delta_sq+xi);
}
if((fabs(model->alpha[j])*r_delta_sq+xi) >= 1) {
l++;
if(unlabeled[model->supvec[j]->docnum]) {
ulab++;
if(model->alpha[j] > 0)
ulabpos++;
else
ulabneg++;
}
else {
lab++;
if(model->alpha[j] > 0)
labpos++;
else
labneg++;
}
}
}
printf("xacrit>=1: labeledpos=%.5f labeledneg=%.5f default=%.5f\n",(double)labpos/(double)totlab*100.0,(double)labneg/(double)totlab*100.0,(double)totlabpos/(double)(totlab)*100.0);
printf("xacrit>=1: unlabelpos=%.5f unlabelneg=%.5f\n",(double)ulabpos/(double)totulab*100.0,(double)ulabneg/(double)totulab*100.0);
printf("xacrit>=1: labeled=%.5f unlabled=%.5f all=%.5f\n",(double)lab/(double)totlab*100.0,(double)ulab/(double)totulab*100.0,(double)l/(double)(totdoc)*100.0);
printf("xacritsum: labeled=%.5f unlabled=%.5f all=%.5f\n",(double)labsum/(double)totlab*100.0,(double)ulabsum/(double)totulab*100.0,(double)(labsum+ulabsum)/(double)(totdoc)*100.0);
printf("r_delta_sq=%.5f xisum=%.5f asum=%.5f\n",r_delta_sq,xisum,asum);
}
double estimate_margin_vcdim(MODEL *model, double w, double R,
KERNEL_PARM *kernel_parm)
/* optional: length of model vector in feature space */
/* optional: radius of ball containing the data */
{
double h;
/* follows chapter 5.6.4 in [Vapnik/95] */
if(w<0) {
w=model_length_s(model,kernel_parm);
}
if(R<0) {
R=estimate_sphere(model,kernel_parm);
}
h = w*w * R*R +1;
return(h);
}
double estimate_sphere(MODEL *model, KERNEL_PARM *kernel_parm)
/* Approximates the radius of the ball containing */
/* the support vectors by bounding it with the */
{ /* length of the longest support vector. This is */
register long j; /* pretty good for text categorization, since all */
double xlen,maxxlen=0; /* documents have feature vectors of length 1. It */
DOC *nulldoc; /* assumes that the center of the ball is at the */
WORD nullword; /* origin of the space. */
nullword.wnum=0;
nulldoc=create_example(-2,0,0,0.0,create_svector(&nullword,"",1.0));
for(j=1;j<model->sv_num;j++) {
xlen=sqrt(kernel(kernel_parm,model->supvec[j],model->supvec[j])
-2*kernel(kernel_parm,model->supvec[j],nulldoc)
+kernel(kernel_parm,nulldoc,nulldoc));
if(xlen>maxxlen) {
maxxlen=xlen;
}
}
free_example(nulldoc,1);
return(maxxlen);
}
double estimate_r_delta(DOC **docs, long int totdoc, KERNEL_PARM *kernel_parm)
{
long i;
double maxxlen,xlen;
DOC *nulldoc; /* assumes that the center of the ball is at the */
WORD nullword; /* origin of the space. */
nullword.wnum=0;
nulldoc=create_example(-2,0,0,0.0,create_svector(&nullword,"",1.0));
maxxlen=0;
for(i=0;i<totdoc;i++) {
xlen=sqrt(kernel(kernel_parm,docs[i],docs[i])
-2*kernel(kernel_parm,docs[i],nulldoc)
+kernel(kernel_parm,nulldoc,nulldoc));
if(xlen>maxxlen) {
maxxlen=xlen;
}
}
free_example(nulldoc,1);
return(maxxlen);
}
double estimate_r_delta_average(DOC **docs, long int totdoc,
KERNEL_PARM *kernel_parm)
{
long i;
double avgxlen;
DOC *nulldoc; /* assumes that the center of the ball is at the */
WORD nullword; /* origin of the space. */
nullword.wnum=0;
nulldoc=create_example(-2,0,0,0.0,create_svector(&nullword,"",1.0));
avgxlen=0;
for(i=0;i<totdoc;i++) {
avgxlen+=sqrt(kernel(kernel_parm,docs[i],docs[i])
-2*kernel(kernel_parm,docs[i],nulldoc)
+kernel(kernel_parm,nulldoc,nulldoc));
}
free_example(nulldoc,1);
return(avgxlen/totdoc);
}
double length_of_longest_document_vector(DOC **docs, long int totdoc,
KERNEL_PARM *kernel_parm)
{
long i;
double maxxlen,xlen;
maxxlen=0;
for(i=0;i<totdoc;i++) {
xlen=sqrt(kernel(kernel_parm,docs[i],docs[i]));
if(xlen>maxxlen) {
maxxlen=xlen;
}
}
return(maxxlen);
}
/****************************** IO-handling **********************************/
void write_prediction(char *predfile, MODEL *model, double *lin,
double *a, long int *unlabeled,
long