@@ -0,0 +1,165 @@
#include "cuda_runtime.h"
#include "curand.h"
#include "cublas_v2.h"

extern "C" {
#include "activations.h"
#include "cuda.h"
}


// Device-side elementwise activation functions. The matching
// *_gradient_kernel helpers below compute their derivatives.
// NOTE(review): untyped constants (.001, .1, ...) are double literals, so
// several expressions evaluate in double precision — kept as-is to
// preserve existing numerics.

// "Leaky hard tan": identity on (0,1), slope .001 outside.
__device__ float lhtan_activate_kernel(float x)
{
if(x < 0) return .001*x;
if(x > 1) return .001*(x-1) + 1;
return x;
}
// Derivative of lhtan (takes the pre-activation input).
__device__ float lhtan_gradient_kernel(float x)
{
if(x > 0 && x < 1) return 1;
return .001;
}

// Clamp to [-1, 1].
__device__ float hardtan_activate_kernel(float x)
{
if (x < -1) return -1;
if (x > 1) return 1;
return x;
}
__device__ float linear_activate_kernel(float x){return x;}
// Standard sigmoid 1/(1+e^-x).
__device__ float logistic_activate_kernel(float x){return 1./(1. + exp(-x));}
// Sigmoid rescaled to the range (-1, 1).
__device__ float loggy_activate_kernel(float x){return 2./(1. + exp(-x)) - 1;}
__device__ float relu_activate_kernel(float x){return x*(x>0);}
__device__ float elu_activate_kernel(float x){return (x >= 0)*x + (x < 0)*(exp(x)-1);}
// ReLU with slope .01 on the negative side.
__device__ float relie_activate_kernel(float x){return (x>0) ? x : .01*x;}
// ReLU plus a .1*x linear term (slope 1.1 positive, .1 negative).
__device__ float ramp_activate_kernel(float x){return x*(x>0)+.1*x;}
// ReLU with slope .1 on the negative side.
__device__ float leaky_activate_kernel(float x){return (x>0) ? x : .1*x;}
__device__ float tanh_activate_kernel(float x){return (2/(1 + exp(-2*x)) - 1);}
// Piecewise-linear sigmoid approximation: slope .125 on [-4,4], .01 outside.
__device__ float plse_activate_kernel(float x)
{
if(x < -4) return .01 * (x + 4);
if(x > 4) return .01 * (x - 4) + 1;
return .125*x + .5;
}
// Staircase: rises by 1 over every other unit interval, flat in between.
__device__ float stair_activate_kernel(float x)
{
int n = floor(x);
if (n%2 == 0) return floor(x/2.);
else return (x - n) + floor(x/2.);
}


// Device-side derivatives. For logistic/loggy/tanh/elu the argument is the
// *activated output* (callers pass the layer's output buffer), which is why
// e.g. logistic uses (1-x)*x instead of re-evaluating the sigmoid.
__device__ float hardtan_gradient_kernel(float x)
{
if (x > -1 && x < 1) return 1;
return 0;
}
__device__ float linear_gradient_kernel(float x){return 1;}
// x is logistic(x_pre); d/dz sigmoid(z) = s*(1-s).
__device__ float logistic_gradient_kernel(float x){return (1-x)*x;}
__device__ float loggy_gradient_kernel(float x)
{
// map the loggy output back to sigmoid range before differentiating
float y = (x+1.)/2.;
return 2*(1-y)*y;
}
__device__ float relu_gradient_kernel(float x){return (x>0);}
// x is the elu output; for x<0 the derivative e^z equals output+1.
__device__ float elu_gradient_kernel(float x){return (x >= 0) + (x < 0)*(x + 1);}
__device__ float relie_gradient_kernel(float x){return (x>0) ? 1 : .01;}
__device__ float ramp_gradient_kernel(float x){return (x>0)+.1;}
__device__ float leaky_gradient_kernel(float x){return (x>0) ? 1 : .1;}
// x is tanh(x_pre); d/dz tanh(z) = 1 - tanh(z)^2.
__device__ float tanh_gradient_kernel(float x){return 1-x*x;}
__device__ float plse_gradient_kernel(float x){return (x < 0 || x > 1) ? .01 : .125;}
// Zero exactly on the integer "steps", 1 on the sloped segments.
__device__ float stair_gradient_kernel(float x)
{
if (floor(x) == x) return 0;
return 1;
}

/*
** Apply activation `a` to a single value on the device.
** Cases are listed in ACTIVATION enum order; any unhandled value
** falls through and yields 0.
*/
__device__ float activate_kernel(float x, ACTIVATION a)
{
    switch(a){
        case LOGISTIC: return logistic_activate_kernel(x);
        case RELU:     return relu_activate_kernel(x);
        case RELIE:    return relie_activate_kernel(x);
        case LINEAR:   return linear_activate_kernel(x);
        case RAMP:     return ramp_activate_kernel(x);
        case TANH:     return tanh_activate_kernel(x);
        case PLSE:     return plse_activate_kernel(x);
        case LEAKY:    return leaky_activate_kernel(x);
        case ELU:      return elu_activate_kernel(x);
        case LOGGY:    return loggy_activate_kernel(x);
        case STAIR:    return stair_activate_kernel(x);
        case HARDTAN:  return hardtan_activate_kernel(x);
        case LHTAN:    return lhtan_activate_kernel(x);
    }
    return 0;
}

/*
** Derivative of activation `a` on the device. For output-based
** activations (logistic, loggy, tanh, elu) x must be the activated
** output. Unhandled enum values yield 0.
*/
__device__ float gradient_kernel(float x, ACTIVATION a)
{
    switch(a){
        case LOGISTIC: return logistic_gradient_kernel(x);
        case RELU:     return relu_gradient_kernel(x);
        case RELIE:    return relie_gradient_kernel(x);
        case LINEAR:   return linear_gradient_kernel(x);
        case RAMP:     return ramp_gradient_kernel(x);
        case TANH:     return tanh_gradient_kernel(x);
        case PLSE:     return plse_gradient_kernel(x);
        case LEAKY:    return leaky_gradient_kernel(x);
        case ELU:      return elu_gradient_kernel(x);
        case LOGGY:    return loggy_gradient_kernel(x);
        case STAIR:    return stair_gradient_kernel(x);
        case HARDTAN:  return hardtan_gradient_kernel(x);
        case LHTAN:    return lhtan_gradient_kernel(x);
    }
    return 0;
}

/* One thread per array element; a (possibly 2-D) grid is flattened to a
 * linear index, with a tail guard for the last partial block. */
__global__ void activate_array_kernel(float *x, int n, ACTIVATION a)
{
    int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if(i >= n) return;
    x[i] = activate_kernel(x[i], a);
}

/* Scale the incoming error delta[i] by the activation derivative at x[i].
 * x must hold the *activated outputs* for output-based gradients
 * (logistic, loggy, tanh, elu). */
__global__ void gradient_array_kernel(float *x, int n, ACTIVATION a, float *delta)
{
    int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if(i >= n) return;
    delta[i] *= gradient_kernel(x[i], a);
}

/* Launch activate_array_kernel over the n elements of device array x.
 * cuda_gridsize()/BLOCK come from cuda.h. Only the launch error is
 * checked here; asynchronous execution errors surface at a later sync. */
extern "C" void activate_array_ongpu(float *x, int n, ACTIVATION a)
{
activate_array_kernel<<<cuda_gridsize(n), BLOCK>>>(x, n, a);
check_error(cudaPeekAtLastError());
}

/* Launch gradient_array_kernel: delta[i] *= d(activation)/dx at x[i],
 * for n elements. x and delta are device pointers. Only the launch
 * error is checked (see activate_array_ongpu). */
extern "C" void gradient_array_ongpu(float *x, int n, ACTIVATION a, float *delta)
{
gradient_array_kernel<<<cuda_gridsize(n), BLOCK>>>(x, n, a, delta);
check_error(cudaPeekAtLastError());
}
@@ -0,0 +1,63 @@
#include "activation_layer.h"
#include "utils.h"
#include "cuda.h"
#include "blas.h"
#include "gemm.h"

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
** Build an elementwise-activation layer: output = activation(input).
** `inputs` is the per-batch-item element count; output/delta buffers
** are sized batch*inputs.
*/
layer make_activation_layer(int batch, int inputs, ACTIVATION activation)
{
    layer l = {0};
    l.type = ACTIVE;

    l.inputs = inputs;
    l.outputs = inputs;
    l.batch=batch;

    /* BUG FIX: these buffers hold floats; the old code allocated
     * sizeof(float*) per element, over-allocating on 64-bit hosts. */
    l.output = calloc(batch*inputs, sizeof(float));
    l.delta = calloc(batch*inputs, sizeof(float));

    l.forward = forward_activation_layer;
    l.backward = backward_activation_layer;
#ifdef GPU
    l.forward_gpu = forward_activation_layer_gpu;
    l.backward_gpu = backward_activation_layer_gpu;

    l.output_gpu = cuda_make_array(l.output, inputs*batch);
    l.delta_gpu = cuda_make_array(l.delta, inputs*batch);
#endif
    l.activation = activation;
    fprintf(stderr, "Activation Layer: %d inputs\n", inputs);
    return l;
}

/* CPU forward: copy the raw input into l.output, then activate it
 * in place. */
void forward_activation_layer(layer l, network_state state)
{
copy_cpu(l.outputs*l.batch, state.input, 1, l.output, 1);
activate_array(l.output, l.outputs*l.batch, l.activation);
}

/* CPU backward: scale l.delta by the activation derivative (evaluated on
 * the stored activated output), then propagate to the previous layer.
 * NOTE(review): copy_cpu *overwrites* state.delta rather than
 * accumulating into it — confirm callers rely on that. */
void backward_activation_layer(layer l, network_state state)
{
gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);
copy_cpu(l.outputs*l.batch, l.delta, 1, state.delta, 1);
}

#ifdef GPU

/* GPU forward: same as the CPU path but on device buffers. */
void forward_activation_layer_gpu(layer l, network_state state)
{
copy_ongpu(l.outputs*l.batch, state.input, 1, l.output_gpu, 1);
activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
}

/* GPU backward: mirror of backward_activation_layer on device buffers;
 * overwrites state.delta (see CPU-side note). */
void backward_activation_layer_gpu(layer l, network_state state)
{
gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
copy_ongpu(l.outputs*l.batch, l.delta_gpu, 1, state.delta, 1);
}
#endif
@@ -0,0 +1,19 @@
#ifndef ACTIVATION_LAYER_H
#define ACTIVATION_LAYER_H

#include "activations.h"
#include "layer.h"
#include "network.h"

/* Layer that applies `activation` elementwise to its input;
 * `inputs` is the per-batch-item element count. */
layer make_activation_layer(int batch, int inputs, ACTIVATION activation);

/* CPU forward/backward passes. */
void forward_activation_layer(layer l, network_state state);
void backward_activation_layer(layer l, network_state state);

#ifdef GPU
/* GPU forward/backward passes (only compiled with GPU support). */
void forward_activation_layer_gpu(layer l, network_state state);
void backward_activation_layer_gpu(layer l, network_state state);
#endif

#endif

@@ -0,0 +1,143 @@
#include "activations.h"

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
** Map an ACTIVATION enum value to its configuration-file name.
** Any unknown value maps to "relu".
*/
char *get_activation_string(ACTIVATION a)
{
    switch(a){
        case LOGISTIC: return "logistic";
        case RELU:     return "relu";
        case RELIE:    return "relie";
        case LINEAR:   return "linear";
        case RAMP:     return "ramp";
        case TANH:     return "tanh";
        case PLSE:     return "plse";
        case LEAKY:    return "leaky";
        case ELU:      return "elu";
        case LOGGY:    return "loggy";
        case STAIR:    return "stair";
        case HARDTAN:  return "hardtan";
        case LHTAN:    return "lhtan";
        default:       break;
    }
    return "relu";
}

/*
** Parse an activation name from a cfg file. Unrecognized names fall
** back to RELU with a warning on stderr.
*/
ACTIVATION get_activation(char *s)
{
    if (strcmp(s, "linear")==0)   return LINEAR;
    if (strcmp(s, "logistic")==0) return LOGISTIC;
    if (strcmp(s, "loggy")==0)    return LOGGY;
    if (strcmp(s, "relu")==0)     return RELU;
    if (strcmp(s, "relie")==0)    return RELIE;
    if (strcmp(s, "elu")==0)      return ELU;
    if (strcmp(s, "ramp")==0)     return RAMP;
    if (strcmp(s, "leaky")==0)    return LEAKY;
    if (strcmp(s, "tanh")==0)     return TANH;
    if (strcmp(s, "plse")==0)     return PLSE;
    if (strcmp(s, "stair")==0)    return STAIR;
    if (strcmp(s, "hardtan")==0)  return HARDTAN;
    if (strcmp(s, "lhtan")==0)    return LHTAN;
    fprintf(stderr, "Couldn't find activation function %s, going with ReLU\n", s);
    return RELU;
}

/* Apply activation `a` to a single value. Dispatches to the inline
 * helpers in activations.h; unhandled enum values return 0. */
float activate(float x, ACTIVATION a)
{
switch(a){
case LINEAR:
return linear_activate(x);
case LOGISTIC:
return logistic_activate(x);
case LOGGY:
return loggy_activate(x);
case RELU:
return relu_activate(x);
case ELU:
return elu_activate(x);
case RELIE:
return relie_activate(x);
case RAMP:
return ramp_activate(x);
case LEAKY:
return leaky_activate(x);
case TANH:
return tanh_activate(x);
case PLSE:
return plse_activate(x);
case STAIR:
return stair_activate(x);
case HARDTAN:
return hardtan_activate(x);
case LHTAN:
return lhtan_activate(x);
}
return 0;
}

/* Apply activation `a` in place to each of the n elements of x. */
void activate_array(float *x, const int n, const ACTIVATION a)
{
    int i = 0;
    while(i < n){
        x[i] = activate(x[i], a);
        ++i;
    }
}

/* Derivative of activation `a`. For logistic/loggy/tanh/elu the argument
 * must be the *activated output* (callers pass the layer's output
 * buffer). Unhandled enum values return 0. */
float gradient(float x, ACTIVATION a)
{
switch(a){
case LINEAR:
return linear_gradient(x);
case LOGISTIC:
return logistic_gradient(x);
case LOGGY:
return loggy_gradient(x);
case RELU:
return relu_gradient(x);
case ELU:
return elu_gradient(x);
case RELIE:
return relie_gradient(x);
case RAMP:
return ramp_gradient(x);
case LEAKY:
return leaky_gradient(x);
case TANH:
return tanh_gradient(x);
case PLSE:
return plse_gradient(x);
case STAIR:
return stair_gradient(x);
case HARDTAN:
return hardtan_gradient(x);
case LHTAN:
return lhtan_gradient(x);
}
return 0;
}

/* delta[i] *= gradient(x[i], a) for n elements; x holds activated
 * outputs for output-based gradients (logistic, loggy, tanh, elu). */
void gradient_array(const float *x, const int n, const ACTIVATION a, float *delta)
{
    int i = 0;
    while(i < n){
        delta[i] *= gradient(x[i], a);
        ++i;
    }
}

@@ -0,0 +1,88 @@
#ifndef ACTIVATIONS_H
#define ACTIVATIONS_H
#include "cuda.h"
#include "math.h"

/* Supported activations; names match the cfg-file strings handled by
 * get_activation()/get_activation_string(). */
typedef enum{
LOGISTIC, RELU, RELIE, LINEAR, RAMP, TANH, PLSE, LEAKY, ELU, LOGGY, STAIR, HARDTAN, LHTAN
}ACTIVATION;

/* Parse a cfg-file name; unknown names fall back to RELU. */
ACTIVATION get_activation(char *s);

/* Inverse mapping; unknown values yield "relu". */
char *get_activation_string(ACTIVATION a);
float activate(float x, ACTIVATION a);
float gradient(float x, ACTIVATION a);
/* delta[i] *= gradient(x[i], a); x holds the activated outputs. */
void gradient_array(const float *x, const int n, const ACTIVATION a, float *delta);
/* x[i] = activate(x[i], a), in place. */
void activate_array(float *x, const int n, const ACTIVATION a);
#ifdef GPU
/* GPU equivalents; x and delta are device pointers. */
void activate_array_ongpu(float *x, int n, ACTIVATION a);
void gradient_array_ongpu(float *x, int n, ACTIVATION a, float *delta);
#endif

static inline float stair_activate(float x)
{
int n = floor(x);
if (n%2 == 0) return floor(x/2.);
else return (x - n) + floor(x/2.);
}
static inline float hardtan_activate(float x)
{
if (x < -1) return -1;
if (x > 1) return 1;
return x;
}
static inline float linear_activate(float x){return x;}
static inline float logistic_activate(float x){return 1./(1. + exp(-x));}
static inline float loggy_activate(float x){return 2./(1. + exp(-x)) - 1;}
static inline float relu_activate(float x){return x*(x>0);}
static inline float elu_activate(float x){return (x >= 0)*x + (x < 0)*(exp(x)-1);}
static inline float relie_activate(float x){return (x>0) ? x : .01*x;}
static inline float ramp_activate(float x){return x*(x>0)+.1*x;}
static inline float leaky_activate(float x){return (x>0) ? x : .1*x;}
static inline float tanh_activate(float x){return (exp(2*x)-1)/(exp(2*x)+1);}
static inline float plse_activate(float x)
{
if(x < -4) return .01 * (x + 4);
if(x > 4) return .01 * (x - 4) + 1;
return .125*x + .5;
}

/* "Leaky hard tan": identity on (0,1), slope .001 outside. */
static inline float lhtan_activate(float x)
{
    if(x < 0) return .001*x;
    return (x > 1) ? .001*(x-1) + 1 : x;
}
/* Derivative of lhtan (takes the pre-activation input). */
static inline float lhtan_gradient(float x)
{
    return (x > 0 && x < 1) ? 1 : .001;
}

static inline float hardtan_gradient(float x)
{
if (x > -1 && x < 1) return 1;
return 0;
}
static inline float linear_gradient(float x){return 1;}
static inline float logistic_gradient(float x){return (1-x)*x;}
static inline float loggy_gradient(float x)
{
float y = (x+1.)/2.;
return 2*(1-y)*y;
}
static inline float stair_gradient(float x)
{
if (floor(x) == x) return 0;
return 1;
}
static inline float relu_gradient(float x){return (x>0);}
static inline float elu_gradient(float x){return (x >= 0) + (x < 0)*(x + 1);}
static inline float relie_gradient(float x){return (x>0) ? 1 : .01;}
static inline float ramp_gradient(float x){return (x>0)+.1;}
static inline float leaky_gradient(float x){return (x>0) ? 1 : .1;}
static inline float tanh_gradient(float x){return 1-x*x;}
static inline float plse_gradient(float x){return (x < 0 || x > 1) ? .01 : .125;}

#endif

@@ -0,0 +1,71 @@
#include "network.h"
#include "utils.h"
#include "parser.h"
#include "option_list.h"
#include "blas.h"
#include "classifier.h"

#include <stdio.h>
#include <sys/time.h>

/*
** Webcam demo: runs the classifier on each captured frame and prints an
** "art appreciation" score taken from a fixed set of class indices.
** Compiles to a no-op without OpenCV. Loops forever; never returns.
*/
void demo_art(char *cfgfile, char *weightfile, int cam_index)
{
#ifdef OPENCV
    network net = parse_network_cfg(cfgfile);
    if(weightfile){
        load_weights(&net, weightfile);
    }
    set_batch_network(&net, 1);   /* inference one frame at a time */

    srand(2222222);
    CvCapture * cap;

    cap = cvCaptureFromCAM(cam_index);

    char *window = "ArtJudgementBot9000!!!";
    if(!cap) error("Couldn't connect to webcam.\n");
    cvNamedWindow(window, CV_WINDOW_NORMAL);
    cvResizeWindow(window, 512, 512);
    int i;
    /* class indices whose max probability becomes the score;
     * NOTE(review): presumably art-related classes — confirm vs the cfg. */
    int idx[] = {37, 401, 434};
    int n = sizeof(idx)/sizeof(idx[0]);

    while(1){
        image in = get_image_from_stream(cap);
        image in_s = resize_image(in, net.w, net.h);
        show_image(in, window);

        float *p = network_predict(net, in_s.data);

        /* clear the terminal and home the cursor */
        printf("\033[2J");
        printf("\033[1;1H");

        float score = 0;
        for(i = 0; i < n; ++i){
            float s = p[idx[i]];
            if (s > score) score = s;
        }
        /* BUG FIX: removed a no-op `score = score;` self-assignment. */
        printf("I APPRECIATE THIS ARTWORK: %10.7f%%\n", score*100);
        printf("[");
        int upper = 30;
        for(i = 0; i < upper; ++i){
            printf("%c", ((i+.5) < score*upper) ? 219 : ' ');
        }
        printf("]\n");

        free_image(in_s);
        free_image(in);

        cvWaitKey(1);
    }
#endif
}


/*
** CLI entry: darknet art [cfg] [weights] [-c cam_index]
** BUG FIX: validates the argument count before dereferencing
** argv[2]/argv[3] (previously out-of-bounds on short command lines).
*/
void run_art(int argc, char **argv)
{
    if(argc < 4){
        fprintf(stderr, "usage: %s %s [cfg] [weights]\n", argv[0], argv[1]);
        return;
    }
    int cam_index = find_int_arg(argc, argv, "-c", 0);
    char *cfg = argv[2];
    char *weights = argv[3];
    demo_art(cfg, weights, cam_index);
}

@@ -0,0 +1,71 @@
#include "avgpool_layer.h"
#include "cuda.h"
#include <stdio.h>

/* Build a global average-pooling layer: each of the c channels of a
 * w*h input plane is reduced to a single value, so the output is 1x1xc
 * per batch item. */
avgpool_layer make_avgpool_layer(int batch, int w, int h, int c)
{
fprintf(stderr, "avg %4d x%4d x%4d -> %4d\n", w, h, c, c);
avgpool_layer l = {0};
l.type = AVGPOOL;
l.batch = batch;
l.h = h;
l.w = w;
l.c = c;
l.out_w = 1;
l.out_h = 1;
l.out_c = c;
l.outputs = l.out_c;
l.inputs = h*w*c;
int output_size = l.outputs * batch;
l.output = calloc(output_size, sizeof(float));
l.delta = calloc(output_size, sizeof(float));
l.forward = forward_avgpool_layer;
l.backward = backward_avgpool_layer;
#ifdef GPU
l.forward_gpu = forward_avgpool_layer_gpu;
l.backward_gpu = backward_avgpool_layer_gpu;
l.output_gpu = cuda_make_array(l.output, output_size);
l.delta_gpu = cuda_make_array(l.delta, output_size);
#endif
return l;
}

/* Update spatial dimensions after an input resize. The pooled output is
 * always 1x1xC, so only w, h and the flattened input size change. */
void resize_avgpool_layer(avgpool_layer *l, int w, int h)
{
    l->h = h;
    l->w = w;
    l->inputs = w*h*l->c;
}

/* CPU forward: average each channel's h*w activations into a single
 * output value per (batch, channel). */
void forward_avgpool_layer(const avgpool_layer l, network_state state)
{
int b,i,k;

for(b = 0; b < l.batch; ++b){
for(k = 0; k < l.c; ++k){
int out_index = k + b*l.c;
l.output[out_index] = 0;
for(i = 0; i < l.h*l.w; ++i){
int in_index = i + l.h*l.w*(k + b*l.c);
l.output[out_index] += state.input[in_index];
}
l.output[out_index] /= l.h*l.w;
}
}
}

/* CPU backward: distribute each pooled delta evenly over the h*w input
 * positions of its channel, accumulating into state.delta. */
void backward_avgpool_layer(const avgpool_layer l, network_state state)
{
int b,i,k;

for(b = 0; b < l.batch; ++b){
for(k = 0; k < l.c; ++k){
int out_index = k + b*l.c;
for(i = 0; i < l.h*l.w; ++i){
int in_index = i + l.h*l.w*(k + b*l.c);
state.delta[in_index] += l.delta[out_index] / (l.h*l.w);
}
}
}
}

@@ -0,0 +1,23 @@
#ifndef AVGPOOL_LAYER_H
#define AVGPOOL_LAYER_H

#include "image.h"
#include "cuda.h"
#include "layer.h"
#include "network.h"

/* An avgpool layer is an ordinary layer; the alias documents intent. */
typedef layer avgpool_layer;

/* NOTE(review): declared here but not defined in this file — confirm a
 * definition exists elsewhere. */
image get_avgpool_image(avgpool_layer l);
/* Global average pooling: w*h*c input -> 1x1*c output per batch item. */
avgpool_layer make_avgpool_layer(int batch, int w, int h, int c);
void resize_avgpool_layer(avgpool_layer *l, int w, int h);
void forward_avgpool_layer(const avgpool_layer l, network_state state);
void backward_avgpool_layer(const avgpool_layer l, network_state state);

#ifdef GPU
void forward_avgpool_layer_gpu(avgpool_layer l, network_state state);
void backward_avgpool_layer_gpu(avgpool_layer l, network_state state);
#endif

#endif

@@ -0,0 +1,61 @@
#include "cuda_runtime.h"
#include "curand.h"
#include "cublas_v2.h"

extern "C" {
#include "avgpool_layer.h"
#include "cuda.h"
}

/* One thread per (batch, channel) pair: the linear thread id decomposes
 * as id = b*c + k. Each thread serially averages its channel's w*h
 * inputs into one output value. n is the total thread count (b*c). */
__global__ void forward_avgpool_layer_kernel(int n, int w, int h, int c, float *input, float *output)
{
int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
if(id >= n) return;

int k = id % c;
id /= c;
int b = id;

int i;
int out_index = (k + c*b);
output[out_index] = 0;
for(i = 0; i < w*h; ++i){
int in_index = i + h*w*(k + b*c);
output[out_index] += input[in_index];
}
output[out_index] /= w*h;
}

/* Backward counterpart: each (batch, channel) thread spreads its pooled
 * delta (out_delta) evenly over the w*h input positions, accumulating
 * into in_delta. No atomics needed: threads touch disjoint channels. */
__global__ void backward_avgpool_layer_kernel(int n, int w, int h, int c, float *in_delta, float *out_delta)
{
int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
if(id >= n) return;

int k = id % c;
id /= c;
int b = id;

int i;
int out_index = (k + c*b);
for(i = 0; i < w*h; ++i){
int in_index = i + h*w*(k + b*c);
in_delta[in_index] += out_delta[out_index] / (w*h);
}
}

/* Launch the forward kernel with one thread per (batch, channel).
 * Only the launch error is checked; execution errors surface at the
 * next synchronizing call. */
extern "C" void forward_avgpool_layer_gpu(avgpool_layer layer, network_state state)
{
size_t n = layer.c*layer.batch;

forward_avgpool_layer_kernel<<<cuda_gridsize(n), BLOCK>>>(n, layer.w, layer.h, layer.c, state.input, layer.output_gpu);
check_error(cudaPeekAtLastError());
}

/* Launch the backward kernel; accumulates into state.delta (the previous
 * layer's delta buffer on the device). */
extern "C" void backward_avgpool_layer_gpu(avgpool_layer layer, network_state state)
{
size_t n = layer.c*layer.batch;

backward_avgpool_layer_kernel<<<cuda_gridsize(n), BLOCK>>>(n, layer.w, layer.h, layer.c, state.delta, layer.delta_gpu);
check_error(cudaPeekAtLastError());
}

@@ -0,0 +1,279 @@
#include "convolutional_layer.h"
#include "batchnorm_layer.h"
#include "blas.h"
#include <stdio.h>

layer make_batchnorm_layer(int batch, int w, int h, int c)
{
fprintf(stderr, "Batch Normalization Layer: %d x %d x %d image\n", w,h,c);
layer l = {0};
l.type = BATCHNORM;
l.batch = batch;
l.h = l.out_h = h;
l.w = l.out_w = w;
l.c = l.out_c = c;
l.output = calloc(h * w * c * batch, sizeof(float));
l.delta = calloc(h * w * c * batch, sizeof(float));
l.inputs = w*h*c;
l.outputs = l.inputs;

l.scales = calloc(c, sizeof(float));
l.scale_updates = calloc(c, sizeof(float));
l.biases = calloc(c, sizeof(float));
l.bias_updates = calloc(c, sizeof(float));
int i;
for(i = 0; i < c; ++i){
l.scales[i] = 1;
}

l.mean = calloc(c, sizeof(float));
l.variance = calloc(c, sizeof(float));

l.rolling_mean = calloc(c, sizeof(float));
l.rolling_variance = calloc(c, sizeof(float));

l.forward = forward_batchnorm_layer;
l.backward = backward_batchnorm_layer;
#ifdef GPU
l.forward_gpu = forward_batchnorm_layer_gpu;
l.backward_gpu = backward_batchnorm_layer_gpu;

l.output_gpu = cuda_make_array(l.output, h * w * c * batch);
l.delta_gpu = cuda_make_array(l.delta, h * w * c * batch);

l.biases_gpu = cuda_make_array(l.biases, c);
l.bias_updates_gpu = cuda_make_array(l.bias_updates, c);

l.scales_gpu = cuda_make_array(l.scales, c);
l.scale_updates_gpu = cuda_make_array(l.scale_updates, c);

l.mean_gpu = cuda_make_array(l.mean, c);
l.variance_gpu = cuda_make_array(l.variance, c);

l.rolling_mean_gpu = cuda_make_array(l.mean, c);
l.rolling_variance_gpu = cuda_make_array(l.variance, c);

l.mean_delta_gpu = cuda_make_array(l.mean, c);
l.variance_delta_gpu = cuda_make_array(l.variance, c);

l.x_gpu = cuda_make_array(l.output, l.batch*l.outputs);
l.x_norm_gpu = cuda_make_array(l.output, l.batch*l.outputs);
#ifdef CUDNN
cudnnCreateTensorDescriptor(&l.normTensorDesc);
cudnnCreateTensorDescriptor(&l.dstTensorDesc);
cudnnSetTensor4dDescriptor(l.dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l.batch, l.out_c, l.out_h, l.out_w);
cudnnSetTensor4dDescriptor(l.normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l.out_c, 1, 1);

#endif
#endif
return l;
}

/*
** Accumulate dL/dgamma for each of the n channels:
**   scale_updates[f] += sum over batch and spatial of delta * x_norm.
** Layout of x_norm/delta: [batch][n][size].
*/
void backward_scale_cpu(float *x_norm, float *delta, int batch, int n, int size, float *scale_updates)
{
    int f;
    for(f = 0; f < n; ++f){
        int b, i;
        float sum = 0;
        for(b = 0; b < batch; ++b){
            float *d = delta + size*(f + n*b);
            float *xn = x_norm + size*(f + n*b);
            for(i = 0; i < size; ++i){
                sum += d[i] * xn[i];
            }
        }
        scale_updates[f] += sum;
    }
}

void mean_delta_cpu(float *delta, float *variance, int batch, int filters, int spatial, float *mean_delta)
{

int i,j,k;
for(i = 0; i < filters; ++i){
mean_delta[i] = 0;
for (j = 0; j < batch; ++j) {
for (k = 0; k < spatial; ++k) {
int index = j*filters*spatial + i*spatial + k;
mean_delta[i] += delta[index];
}
}
mean_delta[i] *= (-1./sqrt(variance[i] + .00001f));
}
}
void variance_delta_cpu(float *x, float *delta, float *mean, float *variance, int batch, int filters, int spatial, float *variance_delta)
{

int i,j,k;
for(i = 0; i < filters; ++i){
variance_delta[i] = 0;
for(j = 0; j < batch; ++j){
for(k = 0; k < spatial; ++k){
int index = j*filters*spatial + i*spatial + k;
variance_delta[i] += delta[index]*(x[index] - mean[i]);
}
}
variance_delta[i] *= -.5 * pow(variance[i] + .00001f, (float)(-3./2.));
}
}
/* Chain rule through the normalization x_hat = (x - mean)/sqrt(var+eps):
 * delta = delta/sqrt(var+eps)
 *       + dL/dvar * 2*(x-mean)/(N)
 *       + dL/dmean / N,           with N = spatial*batch.
 * Overwrites delta in place; layout [batch][filters][spatial]. */
void normalize_delta_cpu(float *x, float *mean, float *variance, float *mean_delta, float *variance_delta, int batch, int filters, int spatial, float *delta)
{
int f, j, k;
for(j = 0; j < batch; ++j){
for(f = 0; f < filters; ++f){
for(k = 0; k < spatial; ++k){
int index = j*filters*spatial + f*spatial + k;
delta[index] = delta[index] * 1./(sqrt(variance[f] + .00001f)) + variance_delta[f] * 2. * (x[index] - mean[f]) / (spatial * batch) + mean_delta[f]/(spatial*batch);
}
}
}
}

/* Resizing a standalone batchnorm layer is not supported; this only
 * warns and leaves the layer unchanged. */
void resize_batchnorm_layer(layer *layer, int w, int h)
{
fprintf(stderr, "Not implemented\n");
}

/* CPU forward pass. A standalone BATCHNORM layer first copies its input
 * into l.output; CONNECTED layers reuse this routine with their outputs
 * treated as channels over a 1x1 spatial grid (l is passed by value, so
 * the out_c/out_h/out_w tweak is local to this call).
 * Training: normalize with batch statistics, fold them into the rolling
 * averages (momentum .99/.01), and stash the pre-norm (l.x) and
 * post-norm (l.x_norm) tensors for the backward pass.
 * Inference: normalize with the rolling statistics.
 * Finally apply the learned scale (gamma) and bias (beta). */
void forward_batchnorm_layer(layer l, network_state state)
{
if(l.type == BATCHNORM) copy_cpu(l.outputs*l.batch, state.input, 1, l.output, 1);
if(l.type == CONNECTED){
l.out_c = l.outputs;
l.out_h = l.out_w = 1;
}
if(state.train){
mean_cpu(l.output, l.batch, l.out_c, l.out_h*l.out_w, l.mean);
variance_cpu(l.output, l.mean, l.batch, l.out_c, l.out_h*l.out_w, l.variance);

scal_cpu(l.out_c, .99, l.rolling_mean, 1);
axpy_cpu(l.out_c, .01, l.mean, 1, l.rolling_mean, 1);
scal_cpu(l.out_c, .99, l.rolling_variance, 1);
axpy_cpu(l.out_c, .01, l.variance, 1, l.rolling_variance, 1);

copy_cpu(l.outputs*l.batch, l.output, 1, l.x, 1);
normalize_cpu(l.output, l.mean, l.variance, l.batch, l.out_c, l.out_h*l.out_w);
copy_cpu(l.outputs*l.batch, l.output, 1, l.x_norm, 1);
} else {
normalize_cpu(l.output, l.rolling_mean, l.rolling_variance, l.batch, l.out_c, l.out_h*l.out_w);
}
scale_bias(l.output, l.scales, l.batch, l.out_c, l.out_h*l.out_w);
add_bias(l.output, l.biases, l.batch, l.out_c, l.out_h*l.out_w);
}

/* CPU backward pass: accumulate beta/gamma gradients, undo the scale on
 * delta, then backprop through the normalization using the statistics
 * and the l.x / l.x_norm tensors stashed by the forward pass.
 * For a standalone BATCHNORM layer the result is copied (not
 * accumulated) into state.delta. */
void backward_batchnorm_layer(const layer l, network_state state)
{
backward_bias(l.bias_updates, l.delta, l.batch, l.out_c, l.out_w*l.out_h);
backward_scale_cpu(l.x_norm, l.delta, l.batch, l.out_c, l.out_w*l.out_h, l.scale_updates);

scale_bias(l.delta, l.scales, l.batch, l.out_c, l.out_h*l.out_w);

mean_delta_cpu(l.delta, l.variance, l.batch, l.out_c, l.out_w*l.out_h, l.mean_delta);
variance_delta_cpu(l.x, l.delta, l.mean, l.variance, l.batch, l.out_c, l.out_w*l.out_h, l.variance_delta);
normalize_delta_cpu(l.x, l.mean, l.variance, l.mean_delta, l.variance_delta, l.batch, l.out_c, l.out_w*l.out_h, l.delta);
if(l.type == BATCHNORM) copy_cpu(l.outputs*l.batch, l.delta, 1, state.delta, 1);
}

#ifdef GPU

/* Copy the learned scales and rolling statistics device -> host.
 * NOTE(review): sized by l.c; fine for BATCHNORM layers (c == out_c in
 * make_batchnorm_layer) — confirm if ever called for CONNECTED reuse. */
void pull_batchnorm_layer(layer l)
{
cuda_pull_array(l.scales_gpu, l.scales, l.c);
cuda_pull_array(l.rolling_mean_gpu, l.rolling_mean, l.c);
cuda_pull_array(l.rolling_variance_gpu, l.rolling_variance, l.c);
}
/* Copy the learned scales and rolling statistics host -> device
 * (e.g. after loading weights). Same l.c sizing note as pull. */
void push_batchnorm_layer(layer l)
{
cuda_push_array(l.scales_gpu, l.scales, l.c);
cuda_push_array(l.rolling_mean_gpu, l.rolling_mean, l.c);
cuda_push_array(l.rolling_variance_gpu, l.rolling_variance, l.c);
}

/* GPU forward pass. Mirrors forward_batchnorm_layer: standalone
 * BATCHNORM copies its input first; CONNECTED callers get a local
 * out_c/out_h/out_w remap (l is by value).
 * Training with CUDNN: a single fused call; exponentialAverageFactor
 * .01 matches the hand-rolled .99/.01 rolling update, epsilon .00001
 * matches the CPU path. The pre-norm input is stashed in l.x_gpu for
 * the backward pass.
 * Training without CUDNN: explicit statistics + normalize + scale/bias,
 * stashing l.x_gpu and l.x_norm_gpu.
 * Inference: normalize with rolling statistics, then scale and bias. */
void forward_batchnorm_layer_gpu(layer l, network_state state)
{
if(l.type == BATCHNORM) copy_ongpu(l.outputs*l.batch, state.input, 1, l.output_gpu, 1);
if(l.type == CONNECTED){
l.out_c = l.outputs;
l.out_h = l.out_w = 1;
}
if (state.train) {
#ifdef CUDNN
copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1);
float one = 1;
float zero = 0;
cudnnBatchNormalizationForwardTraining(cudnn_handle(),
CUDNN_BATCHNORM_SPATIAL,
&one,
&zero,
l.dstTensorDesc,
l.x_gpu,
l.dstTensorDesc,
l.output_gpu,
l.normTensorDesc,
l.scales_gpu,
l.biases_gpu,
.01,
l.rolling_mean_gpu,
l.rolling_variance_gpu,
.00001,
l.mean_gpu,
l.variance_gpu);
#else
fast_mean_gpu(l.output_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.mean_gpu);
fast_variance_gpu(l.output_gpu, l.mean_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.variance_gpu);

scal_ongpu(l.out_c, .99, l.rolling_mean_gpu, 1);
axpy_ongpu(l.out_c, .01, l.mean_gpu, 1, l.rolling_mean_gpu, 1);
scal_ongpu(l.out_c, .99, l.rolling_variance_gpu, 1);
axpy_ongpu(l.out_c, .01, l.variance_gpu, 1, l.rolling_variance_gpu, 1);

copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1);
normalize_gpu(l.output_gpu, l.mean_gpu, l.variance_gpu, l.batch, l.out_c, l.out_h*l.out_w);
copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_norm_gpu, 1);

scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.out_c, l.out_w*l.out_h);
#endif
} else {
normalize_gpu(l.output_gpu, l.rolling_mean_gpu, l.rolling_variance_gpu, l.batch, l.out_c, l.out_h*l.out_w);
scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.out_c, l.out_w*l.out_h);
}

}

/* GPU backward pass. CUDNN path: fused backward producing the input
 * gradient in l.x_norm_gpu (then copied into l.delta_gpu) plus the
 * gamma/beta update accumulators; epsilon matches the forward call.
 * Non-CUDNN path mirrors backward_batchnorm_layer with fast_* kernels.
 * Standalone BATCHNORM copies (not accumulates) the result into
 * state.delta. */
void backward_batchnorm_layer_gpu(const layer l, network_state state)
{
#ifdef CUDNN
float one = 1;
float zero = 0;
cudnnBatchNormalizationBackward(cudnn_handle(),
CUDNN_BATCHNORM_SPATIAL,
&one,
&zero,
&one,
&one,
l.dstTensorDesc,
l.x_gpu,
l.dstTensorDesc,
l.delta_gpu,
l.dstTensorDesc,
l.x_norm_gpu,
l.normTensorDesc,
l.scales_gpu,
l.scale_updates_gpu,
l.bias_updates_gpu,
.00001,
l.mean_gpu,
l.variance_gpu);
copy_ongpu(l.outputs*l.batch, l.x_norm_gpu, 1, l.delta_gpu, 1);
#else
backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.out_c, l.out_w*l.out_h);
backward_scale_gpu(l.x_norm_gpu, l.delta_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.scale_updates_gpu);

scale_bias_gpu(l.delta_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);

fast_mean_delta_gpu(l.delta_gpu, l.variance_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.mean_delta_gpu);
fast_variance_delta_gpu(l.x_gpu, l.delta_gpu, l.mean_gpu, l.variance_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.variance_delta_gpu);
normalize_delta_gpu(l.x_gpu, l.mean_gpu, l.variance_gpu, l.mean_delta_gpu, l.variance_delta_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.delta_gpu);
#endif
if(l.type == BATCHNORM) copy_ongpu(l.outputs*l.batch, l.delta_gpu, 1, state.delta, 1);
}
#endif
@@ -0,0 +1,19 @@
#ifndef BATCHNORM_LAYER_H
#define BATCHNORM_LAYER_H

#include "image.h"
#include "layer.h"
#include "network.h"

/* Standalone batch-normalization layer over a w*h*c*batch tensor. */
layer make_batchnorm_layer(int batch, int w, int h, int c);
void forward_batchnorm_layer(layer l, network_state state);
void backward_batchnorm_layer(layer l, network_state state);

#ifdef GPU
void forward_batchnorm_layer_gpu(layer l, network_state state);
void backward_batchnorm_layer_gpu(layer l, network_state state);
/* Copy scales + rolling statistics device<->host. */
void pull_batchnorm_layer(layer l);
void push_batchnorm_layer(layer l);
#endif

#endif
@@ -0,0 +1,243 @@
#include "blas.h"
#include "math.h"
#include <assert.h>
#include <float.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* Reorg between a (w, h, c) tensor and a (w*stride, h*stride, c/stride^2)
 * view; forward selects the direction of the copy.
 * NOTE(review): the out_index is computed against the *larger* spatial
 * grid (w*stride) in both directions — this matches darknet's reorg
 * semantics but is easy to misread; confirm against callers before
 * changing anything here. */
void reorg_cpu(float *x, int w, int h, int c, int batch, int stride, int forward, float *out)
{
int b,i,j,k;
int out_c = c/(stride*stride);

for(b = 0; b < batch; ++b){
for(k = 0; k < c; ++k){
for(j = 0; j < h; ++j){
for(i = 0; i < w; ++i){
int in_index = i + w*(j + h*(k + c*b));
int c2 = k % out_c;
int offset = k / out_c;
int w2 = i*stride + offset % stride;
int h2 = j*stride + offset / stride;
int out_index = w2 + w*stride*(h2 + h*stride*(c2 + out_c*b));
if(forward) out[out_index] = x[in_index];
else out[in_index] = x[out_index];
}
}
}
}
}

/*
** In-place transpose between channel-major [b][layers][size] and
** interleaved [b][size][layers] order (forward != 0 goes to interleaved).
** Uses a temporary buffer the size of the whole array.
*/
void flatten(float *x, int size, int layers, int batch, int forward)
{
    int b, c, i;
    float *tmp = calloc(size*layers*batch, sizeof(float));
    for(b = 0; b < batch; ++b){
        for(c = 0; c < layers; ++c){
            for(i = 0; i < size; ++i){
                int planar = b*layers*size + c*size + i;
                int packed = b*layers*size + i*layers + c;
                if (forward) tmp[packed] = x[planar];
                else tmp[planar] = x[packed];
            }
        }
    }
    memcpy(x, tmp, size*layers*batch*sizeof(float));
    free(tmp);
}

/*
** c[i] = s[i]*a[i] + (1 - s[i])*b[i]; a NULL b is treated as all zeros.
*/
void weighted_sum_cpu(float *a, float *b, float *s, int n, float *c)
{
    int i;
    for(i = 0; i < n; ++i){
        float second = b ? b[i] : 0;
        c[i] = s[i]*a[i] + (1-s[i])*second;
    }
}

/* Residual shortcut: accumulate `add` (w1 x h1 x c1) into `out`
 * (w2 x h2 x c2). When the shapes differ, the larger side is strided /
 * the smaller side upsample-indexed so corresponding positions line up;
 * only the overlapping min(w,h,c) region is touched.
 * The asserts require the two spatial ratios to agree per dimension. */
void shortcut_cpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float *out)
{
int stride = w1/w2;
int sample = w2/w1;
assert(stride == h1/h2);
assert(sample == h2/h1);
if(stride < 1) stride = 1;
if(sample < 1) sample = 1;
int minw = (w1 < w2) ? w1 : w2;
int minh = (h1 < h2) ? h1 : h2;
int minc = (c1 < c2) ? c1 : c2;

int i,j,k,b;
for(b = 0; b < batch; ++b){
for(k = 0; k < minc; ++k){
for(j = 0; j < minh; ++j){
for(i = 0; i < minw; ++i){
int out_index = i*sample + w2*(j*sample + h2*(k + c2*b));
int add_index = i*stride + w1*(j*stride + h1*(k + c1*b));
out[out_index] += add[add_index];
}
}
}
}
}

/*
** Per-filter mean over batch and spatial positions; x is laid out
** [batch][filters][spatial].
*/
void mean_cpu(float *x, int batch, int filters, int spatial, float *mean)
{
    float scale = 1./(batch * spatial);
    int i, j, k;
    for(i = 0; i < filters; ++i){
        float sum = 0;
        for(j = 0; j < batch; ++j){
            for(k = 0; k < spatial; ++k){
                sum += x[j*filters*spatial + i*spatial + k];
            }
        }
        mean[i] = sum * scale;
    }
}

void variance_cpu(float *x, float *mean, int batch, int filters, int spatial, float *variance)
{
float scale = 1./(batch * spatial - 1);
int i,j,k;
for(i = 0; i < filters; ++i){
variance[i] = 0;
for(j = 0; j < batch; ++j){
for(k = 0; k < spatial; ++k){
int index = j*filters*spatial + i*spatial + k;
variance[i] += pow((x[index] - mean[i]), 2);
}
}
variance[i] *= scale;
}
}

void normalize_cpu(float *x, float *mean, float *variance, int batch, int filters, int spatial)
{
int b, f, i;
for(b = 0; b < batch; ++b){
for(f = 0; f < filters; ++f){
for(i = 0; i < spatial; ++i){
int index = b*filters*spatial + f*spatial + i;
x[index] = (x[index] - mean[f])/(sqrt(variance[f]) + .000001f);
}
}
}
}

/* Set every INCX-strided element of X to ALPHA. */
void const_cpu(int N, float ALPHA, float *X, int INCX)
{
    int i = 0;
    while(i < N){
        X[i*INCX] = ALPHA;
        ++i;
    }
}

/* Elementwise strided multiply: Y[i*INCY] *= X[i*INCX]. */
void mul_cpu(int N, float *X, int INCX, float *Y, int INCY)
{
    int i = 0;
    while(i < N){
        Y[i*INCY] *= X[i*INCX];
        ++i;
    }
}

void pow_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY)
{
int i;
for(i = 0; i < N; ++i) Y[i*INCY] = pow(X[i*INCX], ALPHA);
}

/* BLAS-style axpy: Y[i*INCY] += ALPHA * X[i*INCX]. */
void axpy_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY)
{
    int i = 0;
    while(i < N){
        Y[i*INCY] += ALPHA*X[i*INCX];
        ++i;
    }
}

/* BLAS-style scal: X[i*INCX] *= ALPHA. */
void scal_cpu(int N, float ALPHA, float *X, int INCX)
{
    int i = 0;
    while(i < N){
        X[i*INCX] *= ALPHA;
        ++i;
    }
}

/* Fill every INCX-strided element of X with ALPHA. */
void fill_cpu(int N, float ALPHA, float *X, int INCX)
{
    int i = 0;
    while(i < N){
        X[i*INCX] = ALPHA;
        ++i;
    }
}

/* BLAS-style copy: Y[i*INCY] = X[i*INCX]. */
void copy_cpu(int N, float *X, int INCX, float *Y, int INCY)
{
    int i = 0;
    while(i < N){
        Y[i*INCY] = X[i*INCX];
        ++i;
    }
}

/*
** Smooth-L1 (Huber-style) loss, elementwise, with diff = truth - pred:
** quadratic (error = diff^2, delta = diff) for |diff| < 1, linear
** (error = 2|diff| - 1, delta = sign(diff)) beyond.
** BUG FIX: the saturated branch used (diff < 0) ? 1 : -1 — the opposite
** sign of both the quadratic branch (delta = diff) and l1_cpu's
** sign(diff), making the gradient discontinuous and wrongly directed
** for large errors.
*/
void smooth_l1_cpu(int n, float *pred, float *truth, float *delta, float *error)
{
    int i;
    for(i = 0; i < n; ++i){
        float diff = truth[i] - pred[i];
        float abs_val = fabs(diff);
        if(abs_val < 1) {
            error[i] = diff * diff;
            delta[i] = diff;
        }
        else {
            error[i] = 2*abs_val - 1;
            delta[i] = (diff > 0) ? 1 : -1;
        }
    }
}

/* L1 loss: error = |truth - pred|, delta = sign(truth - pred). */
void l1_cpu(int n, float *pred, float *truth, float *delta, float *error)
{
    int i;
    for(i = 0; i < n; ++i){
        float diff = truth[i] - pred[i];
        error[i] = fabs(diff);
        delta[i] = (diff > 0) ? 1 : -1;
    }
}

/* L2 loss: error = (truth - pred)^2, delta = truth - pred. */
void l2_cpu(int n, float *pred, float *truth, float *delta, float *error)
{
    int i;
    for(i = 0; i < n; ++i){
        float diff = truth[i] - pred[i];
        delta[i] = diff;
        error[i] = diff * diff;
    }
}

/* Strided dot product of X and Y over N elements. */
float dot_cpu(int N, float *X, int INCX, float *Y, int INCY)
{
    float dot = 0;
    int i = 0;
    while(i < N){
        dot += X[i*INCX] * Y[i*INCY];
        ++i;
    }
    return dot;
}

void softmax(float *input, int n, float temp, int stride, float *output)
{
int i;
float sum = 0;
float largest = -FLT_MAX;
for(i = 0; i < n; ++i){
if(input[i*stride] > largest) largest = input[i*stride];
}
for(i = 0; i < n; ++i){
float e = exp(input[i*stride]/temp - largest/temp);
sum += e;
output[i*stride] = e;
}
for(i = 0; i < n; ++i){
output[i*stride] /= sum;
}
}


/* Apply softmax independently to each (batch, group) slice of the
 * input; slices start at b*batch_offset + g*group_offset and contain n
 * elements at the given stride. */
void softmax_cpu(float *input, int n, int batch, int batch_offset, int groups, int group_offset, int stride, float temp, float *output)
{
    int b;
    for(b = 0; b < batch; ++b){
        int g;
        for(g = 0; g < groups; ++g){
            int off = b*batch_offset + g*group_offset;
            softmax(input + off, n, temp, stride, output + off);
        }
    }
}

@@ -0,0 +1,91 @@
#ifndef BLAS_H
#define BLAS_H

/* CPU and (optionally) GPU implementations of BLAS-like vector ops,
 * batch-norm statistics, loss kernels, and softmax used across the network.
 * Fix: the duplicate scale_bias_gpu declaration has been removed. */

/* Utility / debug helpers */
void flatten(float *x, int size, int layers, int batch, int forward);
void pm(int M, int N, float *A);
float *random_matrix(int rows, int cols);
void time_random_matrix(int TA, int TB, int m, int k, int n);
void reorg_cpu(float *x, int w, int h, int c, int batch, int stride, int forward, float *out);

void test_blas();

/* Elementwise strided vector ops (CPU) */
void const_cpu(int N, float ALPHA, float *X, int INCX);
void constrain_ongpu(int N, float ALPHA, float * X, int INCX);
void pow_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY);
void mul_cpu(int N, float *X, int INCX, float *Y, int INCY);

void axpy_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY);
void copy_cpu(int N, float *X, int INCX, float *Y, int INCY);
void scal_cpu(int N, float ALPHA, float *X, int INCX);
void fill_cpu(int N, float ALPHA, float * X, int INCX);
float dot_cpu(int N, float *X, int INCX, float *Y, int INCY);
void test_gpu_blas();
void shortcut_cpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float *out);

/* Batch-normalization statistics (CPU) */
void mean_cpu(float *x, int batch, int filters, int spatial, float *mean);
void variance_cpu(float *x, float *mean, int batch, int filters, int spatial, float *variance);
void normalize_cpu(float *x, float *mean, float *variance, int batch, int filters, int spatial);

void scale_bias(float *output, float *scales, int batch, int n, int size);
void backward_scale_cpu(float *x_norm, float *delta, int batch, int n, int size, float *scale_updates);
void mean_delta_cpu(float *delta, float *variance, int batch, int filters, int spatial, float *mean_delta);
void variance_delta_cpu(float *x, float *delta, float *mean, float *variance, int batch, int filters, int spatial, float *variance_delta);
void normalize_delta_cpu(float *x, float *mean, float *variance, float *mean_delta, float *variance_delta, int batch, int filters, int spatial, float *delta);

/* Loss kernels (CPU) */
void smooth_l1_cpu(int n, float *pred, float *truth, float *delta, float *error);
void l2_cpu(int n, float *pred, float *truth, float *delta, float *error);
void l1_cpu(int n, float *pred, float *truth, float *delta, float *error);
void weighted_sum_cpu(float *a, float *b, float *s, int num, float *c);

void softmax(float *input, int n, float temp, int stride, float *output);
void softmax_cpu(float *input, int n, int batch, int batch_offset, int groups, int group_offset, int stride, float temp, float *output);

#ifdef GPU
#include "cuda.h"

/* GPU counterparts of the CPU routines above */
void axpy_ongpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY);
void axpy_ongpu_offset(int N, float ALPHA, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY);
void copy_ongpu(int N, float * X, int INCX, float * Y, int INCY);
void copy_ongpu_offset(int N, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY);
void scal_ongpu(int N, float ALPHA, float * X, int INCX);
void add_ongpu(int N, float ALPHA, float * X, int INCX);
void supp_ongpu(int N, float ALPHA, float * X, int INCX);
void mask_ongpu(int N, float * X, float mask_num, float * mask);
void const_ongpu(int N, float ALPHA, float *X, int INCX);
void pow_ongpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY);
void mul_ongpu(int N, float *X, int INCX, float *Y, int INCY);
void fill_ongpu(int N, float ALPHA, float * X, int INCX);

void mean_gpu(float *x, int batch, int filters, int spatial, float *mean);
void variance_gpu(float *x, float *mean, int batch, int filters, int spatial, float *variance);
void normalize_gpu(float *x, float *mean, float *variance, int batch, int filters, int spatial);

void normalize_delta_gpu(float *x, float *mean, float *variance, float *mean_delta, float *variance_delta, int batch, int filters, int spatial, float *delta);

void fast_mean_delta_gpu(float *delta, float *variance, int batch, int filters, int spatial, float *mean_delta);
void fast_variance_delta_gpu(float *x, float *delta, float *mean, float *variance, int batch, int filters, int spatial, float *variance_delta);

void fast_variance_gpu(float *x, float *mean, int batch, int filters, int spatial, float *variance);
void fast_mean_gpu(float *x, int batch, int filters, int spatial, float *mean);
void shortcut_gpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float *out);
void scale_bias_gpu(float *output, float *biases, int batch, int n, int size);
void backward_scale_gpu(float *x_norm, float *delta, int batch, int n, int size, float *scale_updates);
void add_bias_gpu(float *output, float *biases, int batch, int n, int size);
void backward_bias_gpu(float *bias_updates, float *delta, int batch, int n, int size);

void smooth_l1_gpu(int n, float *pred, float *truth, float *delta, float *error);
void l2_gpu(int n, float *pred, float *truth, float *delta, float *error);
void l1_gpu(int n, float *pred, float *truth, float *delta, float *error);
void weighted_delta_gpu(float *a, float *b, float *s, float *da, float *db, float *ds, int num, float *dc);
void weighted_sum_gpu(float *a, float *b, float *s, int num, float *c);
void mult_add_into_gpu(int num, float *a, float *b, float *c);

void reorg_ongpu(float *x, int w, int h, int c, int batch, int stride, int forward, float *out);

void softmax_gpu(float *input, int n, int batch, int batch_offset, int groups, int group_offset, int stride, float temp, float *output);
void adam_gpu(int n, float *x, float *m, float *v, float B1, float B2, float rate, float eps, int t);

void flatten_ongpu(float *x, int spatial, int layers, int batch, int forward, float *out);

#endif
#endif

Large diffs are not rendered by default.

@@ -0,0 +1,345 @@
#include "box.h"
#include <stdio.h>
#include <math.h>
#include <stdlib.h>

/* Pack four strided floats (x, y, w, h) into a box struct. */
box float_to_box(float *f, int stride)
{
    box result;
    result.x = f[0*stride];
    result.y = f[1*stride];
    result.w = f[2*stride];
    result.h = f[3*stride];
    return result;
}

/*
 * Partial derivatives of the 1-D interval overlap of box a with box b,
 * with respect to a's center (dx, dy) and size (dw, dh).
 * Boxes are center/size format: edges are x +/- w/2, y +/- h/2.
 *
 * For each axis: the accumulated +/-1 and +/-.5 terms cover the partially
 * overlapping cases; the later unconditional assignments deliberately
 * OVERWRITE them when the intervals are fully disjoint (a entirely right
 * of b, or entirely left) — statement order matters here.
 */
dbox derivative(box a, box b)
{
    dbox d;
    /* x axis */
    d.dx = 0;
    d.dw = 0;
    float l1 = a.x - a.w/2;        /* left edges */
    float l2 = b.x - b.w/2;
    if (l1 > l2){                  /* a's left edge bounds the overlap */
        d.dx -= 1;
        d.dw += .5;
    }
    float r1 = a.x + a.w/2;        /* right edges */
    float r2 = b.x + b.w/2;
    if(r1 < r2){                   /* a's right edge bounds the overlap */
        d.dx += 1;
        d.dw += .5;
    }
    if (l1 > r2) {                 /* disjoint, a to the right: overrides above */
        d.dx = -1;
        d.dw = 0;
    }
    if (r1 < l2){                  /* disjoint, a to the left: overrides above */
        d.dx = 1;
        d.dw = 0;
    }

    /* y axis: same structure with top/bottom edges */
    d.dy = 0;
    d.dh = 0;
    float t1 = a.y - a.h/2;
    float t2 = b.y - b.h/2;
    if (t1 > t2){
        d.dy -= 1;
        d.dh += .5;
    }
    float b1 = a.y + a.h/2;
    float b2 = b.y + b.h/2;
    if(b1 < b2){
        d.dy += 1;
        d.dh += .5;
    }
    if (t1 > b2) {
        d.dy = -1;
        d.dh = 0;
    }
    if (b1 < t2){
        d.dy = 1;
        d.dh = 0;
    }
    return d;
}

/*
 * Length of the intersection of two 1-D intervals given by center and width.
 * Returns a negative value when the intervals are disjoint (the gap size,
 * negated), which callers use to detect non-overlap.
 */
float overlap(float x1, float w1, float x2, float w2)
{
    float lo1 = x1 - w1/2;
    float lo2 = x2 - w2/2;
    float hi1 = x1 + w1/2;
    float hi2 = x2 + w2/2;
    float lo = (lo1 > lo2) ? lo1 : lo2;
    float hi = (hi1 < hi2) ? hi1 : hi2;
    return hi - lo;
}

/* Intersection area of two boxes; 0 when they do not overlap on some axis. */
float box_intersection(box a, box b)
{
    float ow = overlap(a.x, a.w, b.x, b.w);
    float oh = overlap(a.y, a.h, b.y, b.h);
    if(ow < 0 || oh < 0) return 0;
    return ow*oh;
}

/* Union area via inclusion-exclusion: area(a) + area(b) - intersection. */
float box_union(box a, box b)
{
    float inter = box_intersection(a, b);
    return a.w*a.h + b.w*b.h - inter;
}

/* Intersection-over-union of two boxes. */
float box_iou(box a, box b)
{
    float inter = box_intersection(a, b);
    float uni = box_union(a, b);
    return inter/uni;
}

/* Euclidean distance between two boxes in (x, y, w, h) space. */
float box_rmse(box a, box b)
{
    float dx = a.x - b.x;
    float dy = a.y - b.y;
    float dw = a.w - b.w;
    float dh = a.h - b.h;
    return sqrt(pow(dx, 2) + pow(dy, 2) + pow(dw, 2) + pow(dh, 2));
}

/*
 * Gradient of the intersection area with respect to box a,
 * via the product rule on the per-axis overlaps.
 */
dbox dintersect(box a, box b)
{
    float ow = overlap(a.x, a.w, b.x, b.w);
    float oh = overlap(a.y, a.h, b.y, b.h);
    dbox g = derivative(a, b);

    dbox out;
    out.dw = g.dw*oh;
    out.dx = g.dx*oh;
    out.dh = g.dh*ow;
    out.dy = g.dy*ow;
    return out;
}

/*
 * Gradient of the union area with respect to box a:
 * d(union) = d(area(a)) - d(intersection).
 */
dbox dunion(box a, box b)
{
    dbox di = dintersect(a, b);

    dbox out;
    out.dw = a.h - di.dw;   /* d(a.w*a.h)/d(a.w) = a.h */
    out.dh = a.w - di.dh;   /* d(a.w*a.h)/d(a.h) = a.w */
    out.dx = -di.dx;        /* area(a) does not depend on center */
    out.dy = -di.dy;
    return out;
}


/*
 * Sanity check for dunion(): compares the analytic gradient against
 * finite differences (epsilon = 1e-4 perturbations of each component of a)
 * and prints both sets of values side by side for manual inspection.
 * NOTE: the finite differences are computed from box_union, so the printed
 * "Union Manual" row should approximate the "Union" row.
 */
void test_dunion()
{
    box a = {0, 0, 1, 1};
    box dxa= {0+.0001, 0, 1, 1};   /* a perturbed in x */
    box dya= {0, 0+.0001, 1, 1};   /* a perturbed in y */
    box dwa= {0, 0, 1+.0001, 1};   /* a perturbed in w */
    box dha= {0, 0, 1, 1+.0001};   /* a perturbed in h */

    box b = {.5, .5, .2, .2};
    dbox di = dunion(a,b);
    printf("Union: %f %f %f %f\n", di.dx, di.dy, di.dw, di.dh);
    float inter = box_union(a, b);
    float xinter = box_union(dxa, b);
    float yinter = box_union(dya, b);
    float winter = box_union(dwa, b);
    float hinter = box_union(dha, b);
    /* forward finite differences */
    xinter = (xinter - inter)/(.0001);
    yinter = (yinter - inter)/(.0001);
    winter = (winter - inter)/(.0001);
    hinter = (hinter - inter)/(.0001);
    printf("Union Manual %f %f %f %f\n", xinter, yinter, winter, hinter);
}
/*
 * Sanity check for dintersect(): compares the analytic intersection
 * gradient against finite differences (epsilon = 1e-4) and prints both
 * for manual inspection, mirroring test_dunion().
 */
void test_dintersect()
{
    box a = {0, 0, 1, 1};
    box dxa= {0+.0001, 0, 1, 1};   /* a perturbed in x */
    box dya= {0, 0+.0001, 1, 1};   /* a perturbed in y */
    box dwa= {0, 0, 1+.0001, 1};   /* a perturbed in w */
    box dha= {0, 0, 1, 1+.0001};   /* a perturbed in h */

    box b = {.5, .5, .2, .2};
    dbox di = dintersect(a,b);
    printf("Inter: %f %f %f %f\n", di.dx, di.dy, di.dw, di.dh);
    float inter = box_intersection(a, b);
    float xinter = box_intersection(dxa, b);
    float yinter = box_intersection(dya, b);
    float winter = box_intersection(dwa, b);
    float hinter = box_intersection(dha, b);
    /* forward finite differences */
    xinter = (xinter - inter)/(.0001);
    yinter = (yinter - inter)/(.0001);
    winter = (winter - inter)/(.0001);
    hinter = (hinter - inter)/(.0001);
    printf("Inter Manual %f %f %f %f\n", xinter, yinter, winter, hinter);
}

/*
 * End-to-end gradient check: runs the intersection/union checks, then
 * compares diou() (gradient of the squared-IOU-error (1-iou)^2) against
 * finite differences with epsilon = 1e-5, printing both for inspection.
 */
void test_box()
{
    test_dintersect();
    test_dunion();
    box a = {0, 0, 1, 1};
    box dxa= {0+.00001, 0, 1, 1};   /* a perturbed in x */
    box dya= {0, 0+.00001, 1, 1};   /* a perturbed in y */
    box dwa= {0, 0, 1+.00001, 1};   /* a perturbed in w */
    box dha= {0, 0, 1, 1+.00001};   /* a perturbed in h */

    box b = {.5, 0, .2, .2};

    float iou = box_iou(a,b);
    iou = (1-iou)*(1-iou);          /* squared IOU error, the quantity diou differentiates */
    printf("%f\n", iou);
    dbox d = diou(a, b);
    printf("%f %f %f %f\n", d.dx, d.dy, d.dw, d.dh);

    float xiou = box_iou(dxa, b);
    float yiou = box_iou(dya, b);
    float wiou = box_iou(dwa, b);
    float hiou = box_iou(dha, b);
    /* forward finite differences of (1-iou)^2 */
    xiou = ((1-xiou)*(1-xiou) - iou)/(.00001);
    yiou = ((1-yiou)*(1-yiou) - iou)/(.00001);
    wiou = ((1-wiou)*(1-wiou) - iou)/(.00001);
    hiou = ((1-hiou)*(1-hiou) - iou)/(.00001);
    printf("manual %f %f %f %f\n", xiou, yiou, wiou, hiou);
}

/*
 * Gradient of the squared IOU error (1 - iou)^2 with respect to box a.
 *
 * NOTE(review): the condition `i <= 0 || 1` is ALWAYS true, so the function
 * unconditionally returns the simple difference (b - a) per component and
 * the analytic-gradient code below it is dead. The `|| 1` looks like a
 * deliberate switch disabling the analytic path in favor of the
 * MSE-style gradient — confirm before "fixing" it, since removing the
 * `|| 1` would change training behavior.
 */
dbox diou(box a, box b)
{
    float u = box_union(a,b);
    float i = box_intersection(a,b);
    dbox di = dintersect(a,b);
    dbox du = dunion(a,b);
    dbox dd = {0,0,0,0};

    if(i <= 0 || 1) {
        /* fallback: pull a directly toward b */
        dd.dx = b.x - a.x;
        dd.dy = b.y - a.y;
        dd.dw = b.w - a.w;
        dd.dh = b.h - a.h;
        return dd;
    }

    /* dead code (see note above): analytic gradient via the quotient rule
     * on iou = i/u, chained through (1-iou)^2 */
    dd.dx = 2*pow((1-(i/u)),1)*(di.dx*u - du.dx*i)/(u*u);
    dd.dy = 2*pow((1-(i/u)),1)*(di.dy*u - du.dy*i)/(u*u);
    dd.dw = 2*pow((1-(i/u)),1)*(di.dw*u - du.dw*i)/(u*u);
    dd.dh = 2*pow((1-(i/u)),1)*(di.dh*u - du.dh*i)/(u*u);
    return dd;
}

/* Sort record for NMS: pairs a detection index with the probability
 * column to sort on, plus a shared pointer to the probability table. */
typedef struct{
    int index;      /* row into boxes[] / probs[] */
    int class;      /* column of probs to compare (a class id, or the objectness column) */
    float **probs;  /* shared [detection][class] probability table */
} sortable_bbox;

int nms_comparator(const void *pa, const void *pb)
{
sortable_bbox a = *(sortable_bbox *)pa;
sortable_bbox b = *(sortable_bbox *)pb;
float diff = a.probs[a.index][b.class] - b.probs[b.index][b.class];
if(diff < 0) return 1;
else if(diff > 0) return -1;
return 0;
}

/*
 * Non-max suppression sorted by the objectness column (probs[i][classes]).
 * Any detection overlapping a higher-objectness detection by more than
 * `thresh` IOU has ALL of its probability columns (classes+1 of them,
 * including objectness) zeroed in place.
 */
void do_nms_obj(box *boxes, float **probs, int total, int classes, float thresh)
{
    int i, j, k;
    sortable_bbox *order = calloc(total, sizeof(sortable_bbox));

    for(i = 0; i < total; ++i){
        order[i].index = i;
        order[i].class = classes;   /* sort on the objectness column */
        order[i].probs = probs;
    }
    qsort(order, total, sizeof(sortable_bbox), nms_comparator);

    for(i = 0; i < total; ++i){
        if(probs[order[i].index][classes] == 0) continue;   /* already suppressed */
        box keep = boxes[order[i].index];
        for(j = i+1; j < total; ++j){
            box other = boxes[order[j].index];
            if (box_iou(keep, other) > thresh){
                for(k = 0; k < classes+1; ++k){
                    probs[order[j].index][k] = 0;
                }
            }
        }
    }
    free(order);
}


/*
 * Per-class non-max suppression: for each class, sort detections by that
 * class's probability (descending) and zero the class probability of any
 * detection overlapping a higher-scoring one by more than `thresh` IOU.
 * Modifies probs in place; boxes are untouched.
 */
void do_nms_sort(box *boxes, float **probs, int total, int classes, float thresh)
{
    int i, j, k;
    sortable_bbox *order = calloc(total, sizeof(sortable_bbox));

    for(i = 0; i < total; ++i){
        order[i].index = i;
        order[i].class = 0;
        order[i].probs = probs;
    }

    for(k = 0; k < classes; ++k){
        /* re-target the comparator at class k, then re-sort */
        for(i = 0; i < total; ++i){
            order[i].class = k;
        }
        qsort(order, total, sizeof(sortable_bbox), nms_comparator);
        for(i = 0; i < total; ++i){
            if(probs[order[i].index][k] == 0) continue;   /* already suppressed */
            box keep = boxes[order[i].index];
            for(j = i+1; j < total; ++j){
                box other = boxes[order[j].index];
                if (box_iou(keep, other) > thresh){
                    probs[order[j].index][k] = 0;
                }
            }
        }
    }
    free(order);
}

/*
 * Unsorted pairwise non-max suppression: for every overlapping pair of
 * detections (IOU > thresh), keep each class probability only on the
 * detection where it is larger, zeroing the other. Modifies probs in place.
 */
void do_nms(box *boxes, float **probs, int total, int classes, float thresh)
{
    int i, j, k;
    for(i = 0; i < total; ++i){
        /* skip detections with no remaining class probability */
        int has_prob = 0;
        for(k = 0; k < classes; ++k){
            if(probs[i][k] > 0) has_prob = 1;
        }
        if(!has_prob) continue;

        for(j = i+1; j < total; ++j){
            if (box_iou(boxes[i], boxes[j]) > thresh){
                for(k = 0; k < classes; ++k){
                    if (probs[i][k] < probs[j][k]) probs[i][k] = 0;
                    else probs[j][k] = 0;
                }
            }
        }
    }
}

/*
 * Encode a box relative to an anchor: center offsets normalized by the
 * anchor size, sizes as log2 ratios. Inverse of decode_box().
 */
box encode_box(box b, box anchor)
{
    box enc;
    enc.x = (b.x - anchor.x) / anchor.w;
    enc.y = (b.y - anchor.y) / anchor.h;
    enc.w = log2(b.w / anchor.w);
    enc.h = log2(b.h / anchor.h);
    return enc;
}

/*
 * Decode an anchor-relative box back to absolute coordinates.
 * Inverse of encode_box(): offsets are scaled by the anchor size,
 * sizes recovered via 2^w, 2^h ratios.
 */
box decode_box(box b, box anchor)
{
    box dec;
    dec.x = b.x * anchor.w + anchor.x;
    dec.y = b.y * anchor.h + anchor.y;
    dec.w = pow(2., b.w) * anchor.w;
    dec.h = pow(2., b.h) * anchor.h;
    return dec;
}
@@ -0,0 +1,22 @@
#ifndef BOX_H
#define BOX_H

/* Axis-aligned bounding box in center/size format:
 * (x, y) is the center, (w, h) the full width and height. */
typedef struct{
    float x, y, w, h;
} box;

/* Per-component gradient with respect to a box's (x, y, w, h). */
typedef struct{
    float dx, dy, dw, dh;
} dbox;

/* Read 4 strided floats (x, y, w, h) into a box. */
box float_to_box(float *f, int stride);
float box_iou(box a, box b);
float box_rmse(box a, box b);
/* Gradient of the squared IOU error with respect to box a. */
dbox diou(box a, box b);
/* Non-max suppression variants; all zero entries of probs in place. */
void do_nms(box *boxes, float **probs, int total, int classes, float thresh);
void do_nms_sort(box *boxes, float **probs, int total, int classes, float thresh);
void do_nms_obj(box *boxes, float **probs, int total, int classes, float thresh);
/* Anchor-relative box encoding and its inverse. */
box decode_box(box b, box anchor);
box encode_box(box b, box anchor);

#endif
@@ -0,0 +1,364 @@
#include "network.h"
#include "utils.h"
#include "parser.h"

/*
 * Rewrite captcha label pairs in place. Labels come in (present, absent)
 * column pairs. When mask is set (unsolved data): unknown slots (present
 * flag unset) are masked out with SECRET_NUM in both columns, and slots
 * marked both present and absent have the present flag cleared. When mask
 * is clear (solved data): the "absent" column is set to the complement of
 * the "present" column.
 */
void fix_data_captcha(data d, int mask)
{
    matrix labels = d.y;
    int r, c;
    for(r = 0; r < d.y.rows; ++r){
        for(c = 0; c < d.y.cols; c += 2){
            if (mask){
                if(!labels.vals[r][c]){
                    /* unknown character: ignore both columns during training */
                    labels.vals[r][c] = SECRET_NUM;
                    labels.vals[r][c+1] = SECRET_NUM;
                }else if(labels.vals[r][c+1]){
                    /* contradictory pair: trust the "absent" flag */
                    labels.vals[r][c] = 0;
                }
            } else{
                labels.vals[r][c+1] = labels.vals[r][c] ? 0 : 1;
            }
        }
    }
}

/*
 * Train a 26-class captcha classifier from cfgfile, optionally resuming
 * from weightfile. Uses double-buffered async loading: one batch trains
 * while the next loads on a background thread. Runs forever (no exit
 * condition); checkpoints every 100 iterations to a hard-coded path.
 */
void train_captcha(char *cfgfile, char *weightfile)
{
    srand(time(0));
    float avg_loss = -1;                 /* running EMA of the loss; -1 = uninitialized */
    char *base = basecfg(cfgfile);
    printf("%s\n", base);
    network net = parse_network_cfg(cfgfile);
    if(weightfile){
        load_weights(&net, weightfile);
    }
    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
    int imgs = 1024;                     /* images per iteration */
    int i = *net.seen/imgs;              /* resume iteration count from images already seen */
    int solved = 1;                      /* hard-coded switch: train on solved captchas */
    list *plist;
    char **labels = get_labels("/data/captcha/reimgs.labels.list");
    if (solved){
        plist = get_paths("/data/captcha/reimgs.solved.list");
    }else{
        plist = get_paths("/data/captcha/reimgs.raw.list");
    }
    char **paths = (char **)list_to_array(plist);
    printf("%d\n", plist->size);
    clock_t time;
    pthread_t load_thread;
    data train;
    data buffer;                         /* background thread fills this */

    load_args args = {0};
    args.w = net.w;
    args.h = net.h;
    args.paths = paths;
    args.classes = 26;
    args.n = imgs;
    args.m = plist->size;
    args.labels = labels;
    args.d = &buffer;
    args.type = CLASSIFICATION_DATA;

    load_thread = load_data_in_thread(args);
    while(1){
        ++i;
        time=clock();
        pthread_join(load_thread, 0);    /* wait for the batch loaded in the background */
        train = buffer;
        fix_data_captcha(train, solved); /* normalize/mask the label pairs */

        /*
        image im = float_to_image(256, 256, 3, train.X.vals[114]);
        show_image(im, "training");
        cvWaitKey(0);
        */

        load_thread = load_data_in_thread(args);   /* start loading the next batch */
        printf("Loaded: %lf seconds\n", sec(clock()-time));
        time=clock();
        float loss = train_network(net, train);
        if(avg_loss == -1) avg_loss = loss;
        avg_loss = avg_loss*.9 + loss*.1;          /* exponential moving average */
        printf("%d: %f, %f avg, %lf seconds, %d images\n", i, loss, avg_loss, sec(clock()-time), *net.seen);
        free_data(train);
        if(i%100==0){
            /* periodic checkpoint */
            char buff[256];
            sprintf(buff, "/home/pjreddie/imagenet_backup/%s_%d.weights",base, i);
            save_weights(net, buff);
        }
    }
}

/*
 * Interactive captcha prediction. If filename is given, predicts on that
 * single image and exits; otherwise reads image paths from stdin in a loop
 * until EOF. Prints all 26 class scores, highest first, as "name score"
 * pairs on one line.
 */
void test_captcha(char *cfgfile, char *weightfile, char *filename)
{
    network net = parse_network_cfg(cfgfile);
    if(weightfile){
        load_weights(&net, weightfile);
    }
    set_batch_network(&net, 1);          /* single-image inference */
    srand(2222222);
    int i = 0;
    char **names = get_labels("/data/captcha/reimgs.labels.list");
    char buff[256];
    char *input = buff;
    int indexes[26];                     /* class indices sorted by score */
    while(1){
        if(filename){
            strncpy(input, filename, 256);
        }else{
            //printf("Enter Image Path: ");
            //fflush(stdout);
            input = fgets(input, 256, stdin);
            if(!input) return;           /* EOF on stdin */
            strtok(input, "\n");         /* strip trailing newline */
        }
        image im = load_image_color(input, net.w, net.h);
        float *X = im.data;
        float *predictions = network_predict(net, X);
        top_predictions(net, 26, indexes);
        //printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time));
        for(i = 0; i < 26; ++i){
            int index = indexes[i];
            if(i != 0) printf(", ");
            printf("%s %f", names[index], predictions[index]);
        }
        printf("\n");
        fflush(stdout);
        free_image(im);
        if (filename) break;             /* single-shot mode */
    }
}

/*
 * Validate a captcha network over the file list. For each image, finds the
 * ground-truth label by substring-matching the path against the label names,
 * then prints "truth, pred0, pred1, ..." as a CSV row to stdout.
 * Aborts on the first image whose path matches no label.
 */
void valid_captcha(char *cfgfile, char *weightfile, char *filename)
{
    char **labels = get_labels("/data/captcha/reimgs.labels.list");
    network net = parse_network_cfg(cfgfile);
    if(weightfile){
        load_weights(&net, weightfile);
    }
    list *plist = get_paths("/data/captcha/reimgs.fg.list");
    char **paths = (char **)list_to_array(plist);
    int N = plist->size;
    int outputs = net.outputs;

    set_batch_network(&net, 1);          /* single-image inference */
    srand(2222222);
    int i, j;
    for(i = 0; i < N; ++i){
        if (i%100 == 0) fprintf(stderr, "%d\n", i);   /* progress to stderr */
        image im = load_image_color(paths[i], net.w, net.h);
        float *X = im.data;
        float *predictions = network_predict(net, X);
        //printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time));
        int truth = -1;
        /* NOTE(review): only the first 13 labels are checked here, while
         * train/test use 26 classes — confirm whether 13 is intentional
         * for this validation list or a stale constant. */
        for(j = 0; j < 13; ++j){
            if (strstr(paths[i], labels[j])) truth = j;
        }
        if (truth == -1){
            fprintf(stderr, "bad: %s\n", paths[i]);
            return;
        }
        printf("%d, ", truth);
        for(j = 0; j < outputs; ++j){
            if (j != 0) printf(", ");
            printf("%f", predictions[j]);
        }
        printf("\n");
        fflush(stdout);
        free_image(im);
        if (filename) break;             /* single-shot when a filename was passed */
    }
}

/*
void train_captcha(char *cfgfile, char *weightfile)
{
float avg_loss = -1;
srand(time(0));
char *base = basecfg(cfgfile);
printf("%s\n", base);
network net = parse_network_cfg(cfgfile);
if(weightfile){
load_weights(&net, weightfile);
}
printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
int imgs = 1024;
int i = net.seen/imgs;
list *plist = get_paths("/data/captcha/train.auto5");
char **paths = (char **)list_to_array(plist);
printf("%d\n", plist->size);
clock_t time;
while(1){
++i;
time=clock();
data train = load_data_captcha(paths, imgs, plist->size, 10, 200, 60);
translate_data_rows(train, -128);
scale_data_rows(train, 1./128);
printf("Loaded: %lf seconds\n", sec(clock()-time));
time=clock();
float loss = train_network(net, train);
net.seen += imgs;
if(avg_loss == -1) avg_loss = loss;
avg_loss = avg_loss*.9 + loss*.1;
printf("%d: %f, %f avg, %lf seconds, %d images\n", i, loss, avg_loss, sec(clock()-time), net.seen);
free_data(train);
if(i%10==0){
char buff[256];
sprintf(buff, "/home/pjreddie/imagenet_backup/%s_%d.weights",base, i);
save_weights(net, buff);
}
}
}
void decode_captcha(char *cfgfile, char *weightfile)
{
setbuf(stdout, NULL);
srand(time(0));
network net = parse_network_cfg(cfgfile);
set_batch_network(&net, 1);
if(weightfile){
load_weights(&net, weightfile);
}
char filename[256];
while(1){
printf("Enter filename: ");
fgets(filename, 256, stdin);
strtok(filename, "\n");
image im = load_image_color(filename, 300, 57);
scale_image(im, 1./255.);
float *X = im.data;
float *predictions = network_predict(net, X);
image out = float_to_image(300, 57, 1, predictions);
show_image(out, "decoded");
#ifdef OPENCV
cvWaitKey(0);
#endif
free_image(im);
}
}
void encode_captcha(char *cfgfile, char *weightfile)
{
float avg_loss = -1;
srand(time(0));
char *base = basecfg(cfgfile);
printf("%s\n", base);
network net = parse_network_cfg(cfgfile);
if(weightfile){
load_weights(&net, weightfile);
}
printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
int imgs = 1024;
int i = net.seen/imgs;
list *plist = get_paths("/data/captcha/encode.list");
char **paths = (char **)list_to_array(plist);
printf("%d\n", plist->size);
clock_t time;
while(1){
++i;
time=clock();
data train = load_data_captcha_encode(paths, imgs, plist->size, 300, 57);
scale_data_rows(train, 1./255);
printf("Loaded: %lf seconds\n", sec(clock()-time));
time=clock();
float loss = train_network(net, train);
net.seen += imgs;
if(avg_loss == -1) avg_loss = loss;
avg_loss = avg_loss*.9 + loss*.1;
printf("%d: %f, %f avg, %lf seconds, %d images\n", i, loss, avg_loss, sec(clock()-time), net.seen);
free_matrix(train.X);
if(i%100==0){
char buff[256];
sprintf(buff, "/home/pjreddie/imagenet_backup/%s_%d.weights",base, i);
save_weights(net, buff);
}
}
}
void validate_captcha(char *cfgfile, char *weightfile)
{
srand(time(0));
char *base = basecfg(cfgfile);
printf("%s\n", base);
network net = parse_network_cfg(cfgfile);
if(weightfile){
load_weights(&net, weightfile);
}
int numchars = 37;
list *plist = get_paths("/data/captcha/solved.hard");
char **paths = (char **)list_to_array(plist);
int imgs = plist->size;
data valid = load_data_captcha(paths, imgs, 0, 10, 200, 60);
translate_data_rows(valid, -128);
scale_data_rows(valid, 1./128);
matrix pred = network_predict_data(net, valid);
int i, k;
int correct = 0;
int total = 0;
int accuracy = 0;
for(i = 0; i < imgs; ++i){
int allcorrect = 1;
for(k = 0; k < 10; ++k){
char truth = int_to_alphanum(max_index(valid.y.vals[i]+k*numchars, numchars));
char prediction = int_to_alphanum(max_index(pred.vals[i]+k*numchars, numchars));
if (truth != prediction) allcorrect=0;
if (truth != '.' && truth == prediction) ++correct;
if (truth != '.' || truth != prediction) ++total;
}
accuracy += allcorrect;
}
printf("Word Accuracy: %f, Char Accuracy %f\n", (float)accuracy/imgs, (float)correct/total);
free_data(valid);
}
void test_captcha(char *cfgfile, char *weightfile)
{
setbuf(stdout, NULL);
srand(time(0));
//char *base = basecfg(cfgfile);
//printf("%s\n", base);
network net = parse_network_cfg(cfgfile);
set_batch_network(&net, 1);
if(weightfile){
load_weights(&net, weightfile);
}
char filename[256];
while(1){
//printf("Enter filename: ");
fgets(filename, 256, stdin);
strtok(filename, "\n");
image im = load_image_color(filename, 200, 60);
translate_image(im, -128);
scale_image(im, 1/128.);
float *X = im.data;
float *predictions = network_predict(net, X);
print_letters(predictions, 10);
free_image(im);
}
}
*/
/*
 * CLI entry point for the captcha subcommands.
 * argv[2] selects the mode (train/test/valid); argv[3] is the cfg file,
 * argv[4] optional weights, argv[5] optional image filename.
 * Unknown modes are silently ignored.
 */
void run_captcha(int argc, char **argv)
{
    if(argc < 4){
        fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]);
        return;
    }

    char *mode = argv[2];
    char *cfg = argv[3];
    char *weights = (argc > 4) ? argv[4] : 0;
    char *filename = (argc > 5) ? argv[5] : 0;

    if(strcmp(mode, "train") == 0){
        train_captcha(cfg, weights);
    } else if(strcmp(mode, "test") == 0){
        test_captcha(cfg, weights, filename);
    } else if(strcmp(mode, "valid") == 0){
        valid_captcha(cfg, weights, filename);
    }
}

@@ -0,0 +1,273 @@
#include "network.h"
#include "utils.h"
#include "parser.h"
#include "option_list.h"
#include "blas.h"

/*
 * Train a CIFAR-10 classifier from cfgfile, optionally resuming from
 * weightfile. Loads the full 50k-image training set into memory once and
 * runs SGD until net.max_batches is reached (or forever if it is 0),
 * checkpointing each epoch and every 100 batches.
 */
void train_cifar(char *cfgfile, char *weightfile)
{
    srand(time(0));
    float avg_loss = -1;                 /* running EMA of the loss; -1 = uninitialized */
    char *base = basecfg(cfgfile);
    printf("%s\n", base);
    network net = parse_network_cfg(cfgfile);
    if(weightfile){
        load_weights(&net, weightfile);
    }
    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);

    char *backup_directory = "/home/pjreddie/backup/";
    int classes = 10;
    int N = 50000;                       /* CIFAR-10 training set size */

    char **labels = get_labels("data/cifar/labels.txt");
    int epoch = (*net.seen)/N;           /* resume epoch count from images already seen */
    data train = load_all_cifar10();     /* whole training set, resident in memory */
    while(get_current_batch(net) < net.max_batches || net.max_batches == 0){
        clock_t time=clock();

        float loss = train_network_sgd(net, train, 1);
        if(avg_loss == -1) avg_loss = loss;
        avg_loss = avg_loss*.95 + loss*.05;   /* exponential moving average */
        printf("%d, %.3f: %f, %f avg, %f rate, %lf seconds, %d images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen);
        if(*net.seen/N > epoch){
            /* epoch boundary: numbered checkpoint */
            epoch = *net.seen/N;
            char buff[256];
            sprintf(buff, "%s/%s_%d.weights",backup_directory,base, epoch);
            save_weights(net, buff);
        }
        if(get_current_batch(net)%100 == 0){
            /* rolling backup overwritten every 100 batches */
            char buff[256];
            sprintf(buff, "%s/%s.backup",backup_directory,base);
            save_weights(net, buff);
        }
    }
    char buff[256];
    sprintf(buff, "%s/%s.weights", backup_directory, base);
    save_weights(net, buff);

    free_network(net);
    free_ptrs((void**)labels, classes);
    free(base);
    free_data(train);
}

/*
 * Train a CIFAR-10 classifier with knowledge distillation: blends the
 * hard one-hot labels with soft targets read from results/ensemble.csv
 * (weight .9 soft / .1 hard) before running the same training loop as
 * train_cifar().
 */
void train_cifar_distill(char *cfgfile, char *weightfile)
{
    srand(time(0));
    float avg_loss = -1;                 /* running EMA of the loss; -1 = uninitialized */
    char *base = basecfg(cfgfile);
    printf("%s\n", base);
    network net = parse_network_cfg(cfgfile);
    if(weightfile){
        load_weights(&net, weightfile);
    }
    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);

    char *backup_directory = "/home/pjreddie/backup/";
    int classes = 10;
    int N = 50000;                       /* CIFAR-10 training set size */

    char **labels = get_labels("data/cifar/labels.txt");
    int epoch = (*net.seen)/N;

    data train = load_all_cifar10();
    matrix soft = csv_to_matrix("results/ensemble.csv");   /* teacher ensemble outputs */

    float weight = .9;                   /* blend: .9 soft targets + .1 hard labels */
    scale_matrix(soft, weight);
    scale_matrix(train.y, 1. - weight);
    matrix_add_matrix(soft, train.y);    /* accumulate the blend into the training labels */

    while(get_current_batch(net) < net.max_batches || net.max_batches == 0){
        clock_t time=clock();

        float loss = train_network_sgd(net, train, 1);
        if(avg_loss == -1) avg_loss = loss;
        avg_loss = avg_loss*.95 + loss*.05;
        printf("%d, %.3f: %f, %f avg, %f rate, %lf seconds, %d images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen);
        if(*net.seen/N > epoch){
            /* epoch boundary: numbered checkpoint */
            epoch = *net.seen/N;
            char buff[256];
            sprintf(buff, "%s/%s_%d.weights",backup_directory,base, epoch);
            save_weights(net, buff);
        }
        if(get_current_batch(net)%100 == 0){
            /* rolling backup */
            char buff[256];
            sprintf(buff, "%s/%s.backup",backup_directory,base);
            save_weights(net, buff);
        }
    }
    char buff[256];
    sprintf(buff, "%s/%s.weights", backup_directory, base);
    save_weights(net, buff);

    free_network(net);
    free_ptrs((void**)labels, classes);
    free(base);
    free_data(train);
}

/*
 * Evaluate on the CIFAR-10 test set with horizontal-flip test-time
 * augmentation: sums the predictions of each image and its mirror, takes
 * the argmax, and prints a running accuracy percentage per image.
 */
void test_cifar_multi(char *filename, char *weightfile)
{
    network net = parse_network_cfg(filename);
    if(weightfile){
        load_weights(&net, weightfile);
    }
    set_batch_network(&net, 1);          /* single-image inference */
    srand(time(0));

    float avg_acc = 0;                   /* running count of correct predictions */
    data test = load_cifar10_data("data/cifar/cifar-10-batches-bin/test_batch.bin");

    int i;
    for(i = 0; i < test.X.rows; ++i){
        /* wrap the row's data in an image header (shares the buffer) */
        image im = float_to_image(32, 32, 3, test.X.vals[i]);

        float pred[10] = {0};

        float *p = network_predict(net, im.data);
        axpy_cpu(10, 1, p, 1, pred, 1);  /* accumulate original prediction */
        flip_image(im);                  /* mirror in place */
        p = network_predict(net, im.data);
        axpy_cpu(10, 1, p, 1, pred, 1);  /* accumulate mirrored prediction */

        int index = max_index(pred, 10);
        int class = max_index(test.y.vals[i], 10);
        if(index == class) avg_acc += 1;
        free_image(im);
        printf("%4d: %.2f%%\n", i, 100.*avg_acc/(i+1));
    }
}

/*
 * Evaluate on the CIFAR-10 test set and print top-1 accuracy with timing.
 * NOTE(review): acc[1] (top-2 accuracy, from the k=2 argument) is
 * accumulated into avg_top5 but never printed — confirm whether top-k
 * reporting was meant to be included.
 */
void test_cifar(char *filename, char *weightfile)
{
    network net = parse_network_cfg(filename);
    if(weightfile){
        load_weights(&net, weightfile);
    }
    srand(time(0));

    clock_t time;
    float avg_acc = 0;
    float avg_top5 = 0;                  /* computed but not reported (see note) */
    data test = load_cifar10_data("data/cifar/cifar-10-batches-bin/test_batch.bin");

    time=clock();

    float *acc = network_accuracies(net, test, 2);   /* [top-1, top-2] */
    avg_acc += acc[0];
    avg_top5 += acc[1];
    printf("top1: %f, %lf seconds, %d images\n", avg_acc, sec(clock()-time), test.X.rows);
    free_data(test);
}

/*
 * Dump the CIFAR-10 train and test sets as individual PNG files under
 * data/cifar/train/ and data/cifar/test/, naming each file
 * "<index>_<classname>" based on the one-hot label's argmax.
 */
void extract_cifar()
{
    char *labels[] = {"airplane","automobile","bird","cat","deer","dog","frog","horse","ship","truck"};
    int i;
    data train = load_all_cifar10();
    data test = load_cifar10_data("data/cifar/cifar-10-batches-bin/test_batch.bin");
    for(i = 0; i < train.X.rows; ++i){
        image im = float_to_image(32, 32, 3, train.X.vals[i]);
        int class = max_index(train.y.vals[i], 10);   /* argmax of the one-hot row */
        char buff[256];
        sprintf(buff, "data/cifar/train/%d_%s",i,labels[class]);
        save_image_png(im, buff);
    }
    for(i = 0; i < test.X.rows; ++i){
        image im = float_to_image(32, 32, 3, test.X.vals[i]);
        int class = max_index(test.y.vals[i], 10);
        char buff[256];
        sprintf(buff, "data/cifar/test/%d_%s",i,labels[class]);
        save_image_png(im, buff);
    }
}

/*
 * Predict the CIFAR-10 test set, average with predictions on horizontally
 * flipped copies (flipping the in-memory data in place between passes),
 * write the averaged predictions as CSV to stdout, and report top-1
 * accuracy on stderr.
 */
void test_cifar_csv(char *filename, char *weightfile)
{
    network net = parse_network_cfg(filename);
    if(weightfile){
        load_weights(&net, weightfile);
    }
    srand(time(0));

    data test = load_cifar10_data("data/cifar/cifar-10-batches-bin/test_batch.bin");

    matrix pred = network_predict_data(net, test);

    int i;
    for(i = 0; i < test.X.rows; ++i){
        /* image header shares the row's buffer, so this flips test.X in place */
        image im = float_to_image(32, 32, 3, test.X.vals[i]);
        flip_image(im);
    }
    matrix pred2 = network_predict_data(net, test);   /* predictions on mirrored data */
    scale_matrix(pred, .5);
    scale_matrix(pred2, .5);
    matrix_add_matrix(pred2, pred);      /* pred = average of both passes */

    matrix_to_csv(pred);
    fprintf(stderr, "Accuracy: %f\n", matrix_topk_accuracy(test.y, pred, 1));
    free_data(test);
}

/*
 * Same flip-averaged CSV prediction as test_cifar_csv(), but over the full
 * 50k-image TRAINING set (used to produce distillation targets for
 * train_cifar_distill()). Writes CSV to stdout, accuracy to stderr.
 */
void test_cifar_csvtrain(char *filename, char *weightfile)
{
    network net = parse_network_cfg(filename);
    if(weightfile){
        load_weights(&net, weightfile);
    }
    srand(time(0));

    data test = load_all_cifar10();      /* note: "test" here is the training set */

    matrix pred = network_predict_data(net, test);

    int i;
    for(i = 0; i < test.X.rows; ++i){
        /* image header shares the row's buffer, so this flips the data in place */
        image im = float_to_image(32, 32, 3, test.X.vals[i]);
        flip_image(im);
    }
    matrix pred2 = network_predict_data(net, test);   /* predictions on mirrored data */
    scale_matrix(pred, .5);
    scale_matrix(pred2, .5);
    matrix_add_matrix(pred2, pred);      /* pred = average of both passes */

    matrix_to_csv(pred);
    fprintf(stderr, "Accuracy: %f\n", matrix_topk_accuracy(test.y, pred, 1));
    free_data(test);
}

/*
 * Score previously saved predictions: reads results/combined.csv as a
 * prediction matrix and prints its top-1 accuracy against the CIFAR-10
 * test labels. No network is loaded.
 */
void eval_cifar_csv()
{
    data test = load_cifar10_data("data/cifar/cifar-10-batches-bin/test_batch.bin");

    matrix pred = csv_to_matrix("results/combined.csv");
    fprintf(stderr, "%d %d\n", pred.rows, pred.cols);   /* sanity-print matrix shape */

    fprintf(stderr, "Accuracy: %f\n", matrix_topk_accuracy(test.y, pred, 1));
    free_data(test);
    free_matrix(pred);
}


/*
 * CLI entry point for the CIFAR subcommands.
 * argv[2] selects the mode; argv[3] is the cfg file, argv[4] optional
 * weights. Unknown modes are silently ignored.
 */
void run_cifar(int argc, char **argv)
{
    if(argc < 4){
        fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]);
        return;
    }

    char *mode = argv[2];
    char *cfg = argv[3];
    char *weights = (argc > 4) ? argv[4] : 0;

    if(strcmp(mode, "train") == 0){
        train_cifar(cfg, weights);
    } else if(strcmp(mode, "extract") == 0){
        extract_cifar();
    } else if(strcmp(mode, "distill") == 0){
        train_cifar_distill(cfg, weights);
    } else if(strcmp(mode, "test") == 0){
        test_cifar(cfg, weights);
    } else if(strcmp(mode, "multi") == 0){
        test_cifar_multi(cfg, weights);
    } else if(strcmp(mode, "csv") == 0){
        test_cifar_csv(cfg, weights);
    } else if(strcmp(mode, "csvtrain") == 0){
        test_cifar_csvtrain(cfg, weights);
    } else if(strcmp(mode, "eval") == 0){
        eval_cifar_csv();
    }
}


Large diffs are not rendered by default.

@@ -0,0 +1,2 @@

list *read_data_cfg(char *filename);
@@ -0,0 +1,384 @@
#include <stdio.h>

#include "network.h"
#include "detection_layer.h"
#include "cost_layer.h"
#include "utils.h"
#include "parser.h"
#include "box.h"
#include "demo.h"

/* The 80 COCO category names, indexed by the network's class output order. */
char *coco_classes[] = {"person","bicycle","car","motorcycle","airplane","bus","train","truck","boat","traffic light","fire hydrant","stop sign","parking meter","bench","bird","cat","dog","horse","sheep","cow","elephant","bear","zebra","giraffe","backpack","umbrella","handbag","tie","suitcase","frisbee","skis","snowboard","sports ball","kite","baseball bat","baseball glove","skateboard","surfboard","tennis racket","bottle","wine glass","cup","fork","knife","spoon","bowl","banana","apple","sandwich","orange","broccoli","carrot","hot dog","pizza","donut","cake","chair","couch","potted plant","bed","dining table","toilet","tv","laptop","mouse","remote","keyboard","cell phone","microwave","oven","toaster","sink","refrigerator","book","clock","vase","scissors","teddy bear","hair drier","toothbrush"};

/* Map from network class index to the official (non-contiguous) COCO category id. */
int coco_ids[] = {1,2,3,4,5,6,7,8,9,10,11,13,14,15,16,17,18,19,20,21,22,23,24,25,27,28,31,32,33,34,35,36,37,38,39,40,41,42,43,44,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,67,70,72,73,74,75,76,77,78,79,80,81,82,84,85,86,87,88,89,90};

/*
 * Train a COCO detector from cfgfile, optionally resuming from weightfile.
 * Batch size is net.batch*net.subdivisions; detection geometry (grid side,
 * classes, jitter) is read from the network's final layer. Uses
 * double-buffered async loading and runs until net.max_batches,
 * checkpointing on a fixed schedule.
 */
void train_coco(char *cfgfile, char *weightfile)
{
    //char *train_images = "/home/pjreddie/data/voc/test/train.txt";
    //char *train_images = "/home/pjreddie/data/coco/train.txt";
    char *train_images = "data/coco.trainval.txt";
    //char *train_images = "data/bags.train.list";
    char *backup_directory = "/home/pjreddie/backup/";
    srand(time(0));
    char *base = basecfg(cfgfile);
    printf("%s\n", base);
    float avg_loss = -1;                 /* running EMA of the loss; -1 = uninitialized */
    network net = parse_network_cfg(cfgfile);
    if(weightfile){
        load_weights(&net, weightfile);
    }
    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
    int imgs = net.batch*net.subdivisions;   /* images per iteration */
    int i = *net.seen/imgs;                  /* resume iteration count */
    data train, buffer;                      /* buffer is filled by the loader thread */


    layer l = net.layers[net.n - 1];         /* detection layer supplies geometry */

    int side = l.side;
    int classes = l.classes;
    float jitter = l.jitter;

    list *plist = get_paths(train_images);
    //int N = plist->size;
    char **paths = (char **)list_to_array(plist);

    load_args args = {0};
    args.w = net.w;
    args.h = net.h;
    args.paths = paths;
    args.n = imgs;
    args.m = plist->size;
    args.classes = classes;
    args.jitter = jitter;
    args.num_boxes = side;
    args.d = &buffer;
    args.type = REGION_DATA;

    /* data-augmentation ranges come from the network config */
    args.angle = net.angle;
    args.exposure = net.exposure;
    args.saturation = net.saturation;
    args.hue = net.hue;

    pthread_t load_thread = load_data_in_thread(args);
    clock_t time;
    //while(i*imgs < N*120){
    while(get_current_batch(net) < net.max_batches){
        i += 1;
        time=clock();
        pthread_join(load_thread, 0);        /* wait for the batch loading in the background */
        train = buffer;
        load_thread = load_data_in_thread(args);   /* start loading the next batch */

        printf("Loaded: %lf seconds\n", sec(clock()-time));

        /*
        image im = float_to_image(net.w, net.h, 3, train.X.vals[113]);
        image copy = copy_image(im);
        draw_coco(copy, train.y.vals[113], 7, "truth");
        cvWaitKey(0);
        free_image(copy);
        */

        time=clock();
        float loss = train_network(net, train);
        if (avg_loss < 0) avg_loss = loss;
        avg_loss = avg_loss*.9 + loss*.1;    /* exponential moving average */

        printf("%d: %f, %f avg, %f rate, %lf seconds, %d images\n", i, loss, avg_loss, get_current_rate(net), sec(clock()-time), i*imgs);
        if(i%1000==0 || (i < 1000 && i%100 == 0)){
            /* numbered checkpoint: every 100 iters early on, then every 1000 */
            char buff[256];
            sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i);
            save_weights(net, buff);
        }
        if(i%100==0){
            /* rolling backup overwritten every 100 iterations */
            char buff[256];
            sprintf(buff, "%s/%s.backup", backup_directory, base);
            save_weights(net, buff);
        }
        free_data(train);
    }
    char buff[256];
    sprintf(buff, "%s/%s_final.weights", backup_directory, base);
    save_weights(net, buff);
}

/*
 * Append one COCO-format JSON result line per (box, class) pair with a
 * non-zero probability. Boxes arrive in center/size form in pixel
 * coordinates; they are converted to [x, y, w, h] corner form and
 * clamped to the w x h image before printing.
 */
void print_cocos(FILE *fp, int image_id, box *boxes, float **probs, int num_boxes, int classes, int w, int h)
{
    int i, j;
    for(i = 0; i < num_boxes; ++i){
        box b = boxes[i];
        float left   = b.x - b.w/2.;
        float right  = b.x + b.w/2.;
        float top    = b.y - b.h/2.;
        float bottom = b.y + b.h/2.;

        // Clamp the corners to the image bounds.
        if(left < 0) left = 0;
        if(top < 0) top = 0;
        if(right > w) right = w;
        if(bottom > h) bottom = h;

        for(j = 0; j < classes; ++j){
            float p = probs[i][j];
            // coco_ids maps our class index to the official category_id.
            if(p) fprintf(fp, "{\"image_id\":%d, \"category_id\":%d, \"bbox\":[%f, %f, %f, %f], \"score\":%f},\n",
                    image_id, coco_ids[j], left, top, right - left, bottom - top, p);
        }
    }
}

/*
 * Extract the numeric COCO image id from a filename such as
 * "COCO_val2014_000000000164.jpg": the id is whatever follows the
 * last '_'. Returns 0 when the filename contains no '_' (the
 * original dereferenced the NULL from strrchr and crashed).
 */
int get_coco_image_id(char *filename)
{
    char *p = strrchr(filename, '_');
    if(!p) return 0;
    return atoi(p+1);
}

/*
 * Run the network over the COCO 5k validation list and write all
 * detections as one JSON array of {image_id, category_id, bbox, score}
 * records (the format the COCO evaluation API consumes) to
 * results/coco_results.json. Images are loaded by nthreads background
 * threads while prediction runs on the previously loaded batch.
 * Fix vs. original: per-box probability rows were allocated with
 * calloc(classes, sizeof(float *)) -- wrong element size (silent
 * over-allocation); they hold floats, so sizeof(float) is correct.
 */
void validate_coco(char *cfgfile, char *weightfile)
{
    network net = parse_network_cfg(cfgfile);
    if(weightfile){
        load_weights(&net, weightfile);
    }
    set_batch_network(&net, 1);
    fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
    srand(time(0));

    char *base = "results/";
    list *plist = get_paths("data/coco_val_5k.list");
    //list *plist = get_paths("/home/pjreddie/data/people-art/test.txt");
    //list *plist = get_paths("/home/pjreddie/data/voc/test/2007_test.txt");
    char **paths = (char **)list_to_array(plist);

    layer l = net.layers[net.n-1];
    int classes = l.classes;
    int side = l.side;

    int j;
    char buff[1024];
    snprintf(buff, 1024, "%s/coco_results.json", base);
    FILE *fp = fopen(buff, "w");
    fprintf(fp, "[\n");

    // One box + one row of class probabilities per grid cell prediction.
    box *boxes = calloc(side*side*l.n, sizeof(box));
    float **probs = calloc(side*side*l.n, sizeof(float *));
    for(j = 0; j < side*side*l.n; ++j) probs[j] = calloc(classes, sizeof(float));

    int m = plist->size;
    int i=0;
    int t;

    float thresh = .01;
    int nms = 1;
    float iou_thresh = .5;

    // Double-buffered asynchronous image loading across nthreads workers.
    int nthreads = 8;
    image *val = calloc(nthreads, sizeof(image));
    image *val_resized = calloc(nthreads, sizeof(image));
    image *buf = calloc(nthreads, sizeof(image));
    image *buf_resized = calloc(nthreads, sizeof(image));
    pthread_t *thr = calloc(nthreads, sizeof(pthread_t));

    load_args args = {0};
    args.w = net.w;
    args.h = net.h;
    args.type = IMAGE_DATA;

    // Prime the pipeline with the first nthreads images.
    for(t = 0; t < nthreads; ++t){
        args.path = paths[i+t];
        args.im = &buf[t];
        args.resized = &buf_resized[t];
        thr[t] = load_data_in_thread(args);
    }
    time_t start = time(0);
    for(i = nthreads; i < m+nthreads; i += nthreads){
        fprintf(stderr, "%d\n", i);
        // Collect the images loaded for the previous iteration.
        for(t = 0; t < nthreads && i+t-nthreads < m; ++t){
            pthread_join(thr[t], 0);
            val[t] = buf[t];
            val_resized[t] = buf_resized[t];
        }
        // Start loading the next batch while we predict on this one.
        for(t = 0; t < nthreads && i+t < m; ++t){
            args.path = paths[i+t];
            args.im = &buf[t];
            args.resized = &buf_resized[t];
            thr[t] = load_data_in_thread(args);
        }
        for(t = 0; t < nthreads && i+t-nthreads < m; ++t){
            char *path = paths[i+t-nthreads];
            int image_id = get_coco_image_id(path);
            float *X = val_resized[t].data;
            network_predict(net, X);
            int w = val[t].w;
            int h = val[t].h;
            get_detection_boxes(l, w, h, thresh, probs, boxes, 0);
            if (nms) do_nms_sort(boxes, probs, side*side*l.n, classes, iou_thresh);
            print_cocos(fp, image_id, boxes, probs, side*side*l.n, classes, w, h);
            free_image(val[t]);
            free_image(val_resized[t]);
        }
    }
    // Back over the trailing ",\n" so the JSON array closes cleanly.
    // NOTE(review): assumes at least one detection was written.
    fseek(fp, -2, SEEK_CUR);
    fprintf(fp, "\n]\n");
    fclose(fp);

    fprintf(stderr, "Total Detection Time: %f Seconds\n", (double)(time(0) - start));
}

/*
 * Measure raw proposal recall against ground-truth label files: for
 * each test image, count boxes over `thresh` as proposals and check
 * whether each ground-truth box is matched by some proposal with
 * IOU > iou_thresh. Prints a running proposals/recall/avg-IOU line.
 * Fix vs. original: per-box probability rows were allocated with
 * calloc(classes, sizeof(float *)) -- wrong element size; the rows
 * hold floats, so sizeof(float) is correct.
 */
void validate_coco_recall(char *cfgfile, char *weightfile)
{
    network net = parse_network_cfg(cfgfile);
    if(weightfile){
        load_weights(&net, weightfile);
    }
    set_batch_network(&net, 1);
    fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
    srand(time(0));

    char *base = "results/comp4_det_test_";
    list *plist = get_paths("/home/pjreddie/data/voc/test/2007_test.txt");
    char **paths = (char **)list_to_array(plist);

    layer l = net.layers[net.n-1];
    int classes = l.classes;
    int side = l.side;

    // Per-class result files (opened for side effect; recall stats
    // below are class-agnostic).
    int j, k;
    FILE **fps = calloc(classes, sizeof(FILE *));
    for(j = 0; j < classes; ++j){
        char buff[1024];
        snprintf(buff, 1024, "%s%s.txt", base, coco_classes[j]);
        fps[j] = fopen(buff, "w");
    }
    box *boxes = calloc(side*side*l.n, sizeof(box));
    float **probs = calloc(side*side*l.n, sizeof(float *));
    for(j = 0; j < side*side*l.n; ++j) probs[j] = calloc(classes, sizeof(float));

    int m = plist->size;
    int i=0;

    float thresh = .001;
    int nms = 0;
    float iou_thresh = .5;
    float nms_thresh = .5;

    int total = 0;      // ground-truth boxes seen
    int correct = 0;    // ground-truth boxes matched above iou_thresh
    int proposals = 0;  // predictions above thresh
    float avg_iou = 0;

    for(i = 0; i < m; ++i){
        char *path = paths[i];
        image orig = load_image_color(path, 0, 0);
        image sized = resize_image(orig, net.w, net.h);
        char *id = basecfg(path);
        network_predict(net, sized.data);
        // Relative (w=h=1) coordinates so IOU matches the label format.
        get_detection_boxes(l, 1, 1, thresh, probs, boxes, 1);
        if (nms) do_nms(boxes, probs, side*side*l.n, 1, nms_thresh);

        // Derive the label file path from the image path.
        char labelpath[4096];
        find_replace(path, "images", "labels", labelpath);
        find_replace(labelpath, "JPEGImages", "labels", labelpath);
        find_replace(labelpath, ".jpg", ".txt", labelpath);
        find_replace(labelpath, ".JPEG", ".txt", labelpath);

        int num_labels = 0;
        box_label *truth = read_boxes(labelpath, &num_labels);
        for(k = 0; k < side*side*l.n; ++k){
            if(probs[k][0] > thresh){
                ++proposals;
            }
        }
        for (j = 0; j < num_labels; ++j) {
            ++total;
            box t = {truth[j].x, truth[j].y, truth[j].w, truth[j].h};
            // Best IOU among confident proposals for this truth box.
            float best_iou = 0;
            for(k = 0; k < side*side*l.n; ++k){
                float iou = box_iou(boxes[k], t);
                if(probs[k][0] > thresh && iou > best_iou){
                    best_iou = iou;
                }
            }
            avg_iou += best_iou;
            if(best_iou > iou_thresh){
                ++correct;
            }
        }

        fprintf(stderr, "%5d %5d %5d\tRPs/Img: %.2f\tIOU: %.2f%%\tRecall:%.2f%%\n", i, correct, total, (float)proposals/(i+1), avg_iou*100/total, 100.*correct/total);
        free(id);
        free_image(orig);
        free_image(sized);
    }
}

/*
 * Interactive single-image detection demo: load an image (from
 * `filename` or stdin), run the network, draw and save detections as
 * "prediction". Loops prompting for paths until EOF unless a fixed
 * filename was given.
 * Fix vs. original: per-box probability rows were allocated with
 * calloc(l.classes, sizeof(float *)) -- wrong element size; the rows
 * hold floats, so sizeof(float) is correct.
 */
void test_coco(char *cfgfile, char *weightfile, char *filename, float thresh)
{
    image **alphabet = load_alphabet();
    network net = parse_network_cfg(cfgfile);
    if(weightfile){
        load_weights(&net, weightfile);
    }
    detection_layer l = net.layers[net.n-1];
    set_batch_network(&net, 1);
    srand(2222222);
    float nms = .4;
    clock_t time;
    char buff[256];
    char *input = buff;
    int j;
    box *boxes = calloc(l.side*l.side*l.n, sizeof(box));
    float **probs = calloc(l.side*l.side*l.n, sizeof(float *));
    for(j = 0; j < l.side*l.side*l.n; ++j) probs[j] = calloc(l.classes, sizeof(float));
    while(1){
        if(filename){
            strncpy(input, filename, 256);
        } else {
            printf("Enter Image Path: ");
            fflush(stdout);
            input = fgets(input, 256, stdin);
            if(!input) return;
            strtok(input, "\n");  // strip the trailing newline from fgets
        }
        image im = load_image_color(input,0,0);
        image sized = resize_image(im, net.w, net.h);
        float *X = sized.data;
        time=clock();
        network_predict(net, X);
        printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time));
        get_detection_boxes(l, 1, 1, thresh, probs, boxes, 0);
        if (nms) do_nms_sort(boxes, probs, l.side*l.side*l.n, l.classes, nms);
        draw_detections(im, l.side*l.side*l.n, thresh, boxes, probs, coco_classes, alphabet, 80);
        save_image(im, "prediction");
        show_image(im, "predictions");
        free_image(im);
        free_image(sized);
#ifdef OPENCV
        cvWaitKey(0);
        cvDestroyAllWindows();
#endif
        if (filename) break;
    }
}

/*
 * Entry point for the "coco" subcommand. argv[2] selects the mode
 * (test/train/valid/recall/demo); argv[3] is the cfg file; argv[4]
 * and argv[5] are the optional weights and image/video filename.
 */
void run_coco(int argc, char **argv)
{
    char *prefix = find_char_arg(argc, argv, "-prefix", 0);
    float thresh = find_float_arg(argc, argv, "-thresh", .2);
    int cam_index = find_int_arg(argc, argv, "-c", 0);
    int frame_skip = find_int_arg(argc, argv, "-s", 0);

    if(argc < 4){
        fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]);
        return;
    }

    char *mode = argv[2];
    char *cfg = argv[3];
    char *weights = (argc > 4) ? argv[4] : 0;
    char *filename = (argc > 5) ? argv[5] : 0;

    if(0==strcmp(mode, "test")){
        test_coco(cfg, weights, filename, thresh);
    } else if(0==strcmp(mode, "train")){
        train_coco(cfg, weights);
    } else if(0==strcmp(mode, "valid")){
        validate_coco(cfg, weights);
    } else if(0==strcmp(mode, "recall")){
        validate_coco_recall(cfg, weights);
    } else if(0==strcmp(mode, "demo")){
        demo(cfg, weights, thresh, cam_index, filename, coco_classes, 80, frame_skip, prefix, .5);
    }
}
@@ -0,0 +1,39 @@
#include <stdio.h>
#include <math.h>
/*
 * Accumulate val into the image at (row, col, channel), where row/col
 * are given in the padded coordinate frame. Locations that fall
 * outside the real height x width image after removing the padding
 * are silently dropped. `channels` is unused (kept for the caller's
 * uniform signature).
 */
void col2im_add_pixel(float *im, int height, int width, int channels,
        int row, int col, int channel, int pad, float val)
{
    int r = row - pad;
    int c = col - pad;
    int inside = (r >= 0) && (c >= 0) && (r < height) && (c < width);
    if(!inside) return;
    im[c + width*(r + height*channel)] += val;
}
//This one might be too, can't remember.
void col2im_cpu(float* data_col,
int channels, int height, int width,
int ksize, int stride, int pad, float* data_im)
{
int c,h,w;
int height_col = (height + 2*pad - ksize) / stride + 1;
int width_col = (width + 2*pad - ksize) / stride + 1;

int channels_col = channels * ksize * ksize;
for (c = 0; c < channels_col; ++c) {
int w_offset = c % ksize;
int h_offset = (c / ksize) % ksize;
int c_im = c / ksize / ksize;
for (h = 0; h < height_col; ++h) {
for (w = 0; w < width_col; ++w) {
int im_row = h_offset + h * stride;
int im_col = w_offset + w * stride;
int col_index = (c * height_col + h) * width_col + w;
double val = data_col[col_index];
col2im_add_pixel(data_im, height, width, channels,
im_row, im_col, c_im, pad, val);
}
}
}
}

@@ -0,0 +1,13 @@
#ifndef COL2IM_H
#define COL2IM_H

/* Scatter-add a column buffer (the layout produced by im2col) back
 * into image layout on the CPU. data_im is accumulated into, not
 * overwritten. */
void col2im_cpu(float* data_col,
int channels, int height, int width,
int ksize, int stride, int pad, float* data_im);

#ifdef GPU
/* GPU counterpart of col2im_cpu; both pointers are device pointers. */
void col2im_ongpu(float *data_col,
int channels, int height, int width,
int ksize, int stride, int pad, float *data_im);
#endif
#endif
@@ -0,0 +1,58 @@
#include "cuda_runtime.h"
#include "curand.h"
#include "cublas_v2.h"

extern "C" {
#include "col2im.h"
#include "cuda.h"
}

// src: https://github.com/BVLC/caffe/blob/master/src/caffe/util/im2col.cu
// You may also want to read: https://github.com/BVLC/caffe/blob/master/LICENSE

// Inverse of im2col: each thread owns one image pixel and sums every
// column-buffer entry that im2col copied that pixel into, then adds the
// sum into data_im. n is the total pixel count (channels*height*width).
// Uses a grid-stride loop, so any launch configuration covers all n.
__global__ void col2im_gpu_kernel(const int n, const float* data_col,
const int height, const int width, const int ksize,
const int pad,
const int stride,
const int height_col, const int width_col,
float *data_im) {
int index = blockIdx.x*blockDim.x+threadIdx.x;
// Grid-stride loop over all n image pixels.
for(; index < n; index += blockDim.x*gridDim.x){
float val = 0;
// Pixel coordinates in the padded input frame.
int w = index % width + pad;
int h = (index / width) % height + pad;
int c = index / (width * height);
// compute the start and end of the output
// Range of output positions (h_col, w_col) whose ksize x ksize window
// covered this pixel during im2col.
int w_col_start = (w < ksize) ? 0 : (w - ksize) / stride + 1;
int w_col_end = min(w / stride + 1, width_col);
int h_col_start = (h < ksize) ? 0 : (h - ksize) / stride + 1;
int h_col_end = min(h / stride + 1, height_col);
// equivalent implementation
// Closed-form walk of data_col: starting from `offset`, stepping by
// coeff_h_col / coeff_w_col visits exactly the entries matching each
// (h_col, w_col) above, avoiding a full 4-d index recomputation.
int offset =
(c * ksize * ksize + h * ksize + w) * height_col * width_col;
int coeff_h_col = (1 - stride * ksize * height_col) * width_col;
int coeff_w_col = (1 - stride * height_col * width_col);
for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
}
}
// Accumulate (+=) to match the CPU col2im, which also adds into data_im.
data_im[index] += val;
}
}

// Host wrapper: launches col2im_gpu_kernel with one thread per image
// pixel (num_kernels = channels * height * width; the original comment
// said height_col * width_col, which did not match the code). Both
// pointers are device pointers; data_im is accumulated into.
void col2im_ongpu(float *data_col,
int channels, int height, int width,
int ksize, int stride, int pad, float *data_im){
// Convolution output spatial dimensions for this geometry.
int height_col = (height + 2 * pad - ksize) / stride + 1;
int width_col = (width + 2 * pad - ksize) / stride + 1;
int num_kernels = channels * height * width;
// NOTE(review): no launch-error check (e.g. cudaPeekAtLastError) after
// the kernel launch -- failures surface only at a later synchronizing
// call. Consider adding the project's standard check here.
col2im_gpu_kernel<<<(num_kernels+BLOCK-1)/BLOCK,
BLOCK>>>(
num_kernels, data_col, height, width, ksize, pad,
stride, height_col,
width_col, data_im);
}