| @@ -0,0 +1,165 @@ | ||
| #include "cuda_runtime.h" | ||
| #include "curand.h" | ||
| #include "cublas_v2.h" | ||
|
|
||
| extern "C" { | ||
| #include "activations.h" | ||
| #include "cuda.h" | ||
| } | ||
|
|
||
|
|
||
/* "Leaky hard tanh": identity on [0,1], slope .001 outside that band.
   Literals are doubles, so the out-of-band math runs in double precision
   before truncating back to float. */
__device__ float lhtan_activate_kernel(float x)
{
    if(x < 0) return .001*x;
    if(x > 1) return .001*(x-1) + 1;
    return x;
}
/* Derivative of lhtan w.r.t. its input: 1 inside (0,1), .001 elsewhere. */
__device__ float lhtan_gradient_kernel(float x)
{
    if(x > 0 && x < 1) return 1;
    return .001;
}
|
|
||
/* Hard tanh: clamp x into [-1, 1]. */
__device__ float hardtan_activate_kernel(float x)
{
    if (x < -1) return -1;
    if (x > 1) return 1;
    return x;
}
/* Identity. */
__device__ float linear_activate_kernel(float x){return x;}
/* Logistic sigmoid 1/(1+e^-x); double literals promote the math to double. */
__device__ float logistic_activate_kernel(float x){return 1./(1. + exp(-x));}
/* Sigmoid rescaled to the range (-1, 1). */
__device__ float loggy_activate_kernel(float x){return 2./(1. + exp(-x)) - 1;}
/* ReLU, branchless: the comparison yields 0 or 1. */
__device__ float relu_activate_kernel(float x){return x*(x>0);}
/* ELU: x for x >= 0, e^x - 1 below; branchless via 0/1 masks. */
__device__ float elu_activate_kernel(float x){return (x >= 0)*x + (x < 0)*(exp(x)-1);}
/* Leaky variant with fixed .01 negative slope. */
__device__ float relie_activate_kernel(float x){return (x>0) ? x : .01*x;}
/* Ramp: ReLU plus a .1*x linear term (nonzero slope everywhere). */
__device__ float ramp_activate_kernel(float x){return x*(x>0)+.1*x;}
/* Leaky ReLU with .1 negative slope. */
__device__ float leaky_activate_kernel(float x){return (x>0) ? x : .1*x;}
/* tanh via the logistic identity tanh(x) = 2/(1+e^-2x) - 1. */
__device__ float tanh_activate_kernel(float x){return (2/(1 + exp(-2*x)) - 1);}
/* Piecewise-linear sigmoid approximation: slope .125 on [-4,4],
   slope .01 in the saturated tails. */
__device__ float plse_activate_kernel(float x)
{
    if(x < -4) return .01 * (x + 4);
    if(x > 4) return .01 * (x - 4) + 1;
    return .125*x + .5;
}
/* Staircase: flat at floor(x/2) on even unit intervals, ramps with unit
   slope on odd ones. */
__device__ float stair_activate_kernel(float x)
{
    int n = floor(x);
    if (n%2 == 0) return floor(x/2.);
    else return (x - n) + floor(x/2.);
}
|
|
||
|
|
||
/* Gradients of the activations above. NOTE: for logistic, loggy and tanh
   the argument is the activation OUTPUT, not the pre-activation input —
   gradient_array_kernel is fed the already-activated values (see the
   backward pass, which passes l.output). */
__device__ float hardtan_gradient_kernel(float x)
{
    if (x > -1 && x < 1) return 1;
    return 0;
}
__device__ float linear_gradient_kernel(float x){return 1;}
/* x is the sigmoid output y: dy/dz = y*(1-y). */
__device__ float logistic_gradient_kernel(float x){return (1-x)*x;}
/* x is the loggy output; map back to sigmoid space y=(x+1)/2 first. */
__device__ float loggy_gradient_kernel(float x)
{
    float y = (x+1.)/2.;
    return 2*(1-y)*y;
}
__device__ float relu_gradient_kernel(float x){return (x>0);}
/* For x < 0 the ELU output is e^x - 1, so the slope is output + 1. */
__device__ float elu_gradient_kernel(float x){return (x >= 0) + (x < 0)*(x + 1);}
__device__ float relie_gradient_kernel(float x){return (x>0) ? 1 : .01;}
__device__ float ramp_gradient_kernel(float x){return (x>0)+.1;}
__device__ float leaky_gradient_kernel(float x){return (x>0) ? 1 : .1;}
/* x is the tanh output y: dy/dz = 1 - y^2. */
__device__ float tanh_gradient_kernel(float x){return 1-x*x;}
__device__ float plse_gradient_kernel(float x){return (x < 0 || x > 1) ? .01 : .125;}
/* 0 exactly on integer treads, 1 on the risers. */
__device__ float stair_gradient_kernel(float x)
{
    if (floor(x) == x) return 0;
    return 1;
}
|
|
||
/* Dispatch a single activation evaluation by enum tag.
   Unknown tags fall through to 0. */
__device__ float activate_kernel(float x, ACTIVATION a)
{
    switch(a){
        case LINEAR:   return linear_activate_kernel(x);
        case LOGISTIC: return logistic_activate_kernel(x);
        case LOGGY:    return loggy_activate_kernel(x);
        case RELU:     return relu_activate_kernel(x);
        case ELU:      return elu_activate_kernel(x);
        case RELIE:    return relie_activate_kernel(x);
        case RAMP:     return ramp_activate_kernel(x);
        case LEAKY:    return leaky_activate_kernel(x);
        case TANH:     return tanh_activate_kernel(x);
        case PLSE:     return plse_activate_kernel(x);
        case STAIR:    return stair_activate_kernel(x);
        case HARDTAN:  return hardtan_activate_kernel(x);
        case LHTAN:    return lhtan_activate_kernel(x);
    }
    return 0;
}
|
|
||
/* Dispatch a single gradient evaluation by enum tag; mirrors
   activate_kernel. Unknown tags fall through to 0. */
__device__ float gradient_kernel(float x, ACTIVATION a)
{
    switch(a){
        case LINEAR:   return linear_gradient_kernel(x);
        case LOGISTIC: return logistic_gradient_kernel(x);
        case LOGGY:    return loggy_gradient_kernel(x);
        case RELU:     return relu_gradient_kernel(x);
        case ELU:      return elu_gradient_kernel(x);
        case RELIE:    return relie_gradient_kernel(x);
        case RAMP:     return ramp_gradient_kernel(x);
        case LEAKY:    return leaky_gradient_kernel(x);
        case TANH:     return tanh_gradient_kernel(x);
        case PLSE:     return plse_gradient_kernel(x);
        case STAIR:    return stair_gradient_kernel(x);
        case HARDTAN:  return hardtan_gradient_kernel(x);
        case LHTAN:    return lhtan_gradient_kernel(x);
    }
    return 0;
}
|
|
||
/* Apply activation a in place to each of the n floats in x.
   Flattens a 2D grid of 1D blocks into a linear index; the i < n guard
   handles the ragged final block. */
__global__ void activate_array_kernel(float *x, int n, ACTIVATION a)
{
    int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if(i < n) x[i] = activate_kernel(x[i], a);
}
|
|
||
/* Backward pass: scale delta[i] by the activation derivative evaluated at
   x[i] (which holds the forward-pass OUTPUT — see gradient_kernel notes).
   Same 2D-grid flattening and bounds guard as activate_array_kernel. */
__global__ void gradient_array_kernel(float *x, int n, ACTIVATION a, float *delta)
{
    int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if(i < n) delta[i] *= gradient_kernel(x[i], a);
}
|
|
||
/* Host wrapper: launch activate_array_kernel over n device floats.
   cuda_gridsize/BLOCK come from cuda.h; cudaPeekAtLastError catches
   launch-configuration errors without clearing the sticky error state. */
extern "C" void activate_array_ongpu(float *x, int n, ACTIVATION a)
{
    activate_array_kernel<<<cuda_gridsize(n), BLOCK>>>(x, n, a);
    check_error(cudaPeekAtLastError());
}
|
|
||
/* Host wrapper: launch gradient_array_kernel; x must hold the forward
   activation output, delta is scaled in place. */
extern "C" void gradient_array_ongpu(float *x, int n, ACTIVATION a, float *delta)
{
    gradient_array_kernel<<<cuda_gridsize(n), BLOCK>>>(x, n, a, delta);
    check_error(cudaPeekAtLastError());
}
| @@ -0,0 +1,63 @@ | ||
| #include "activation_layer.h" | ||
| #include "utils.h" | ||
| #include "cuda.h" | ||
| #include "blas.h" | ||
| #include "gemm.h" | ||
|
|
||
| #include <math.h> | ||
| #include <stdio.h> | ||
| #include <stdlib.h> | ||
| #include <string.h> | ||
|
|
||
| layer make_activation_layer(int batch, int inputs, ACTIVATION activation) | ||
| { | ||
| layer l = {0}; | ||
| l.type = ACTIVE; | ||
|
|
||
| l.inputs = inputs; | ||
| l.outputs = inputs; | ||
| l.batch=batch; | ||
|
|
||
| l.output = calloc(batch*inputs, sizeof(float*)); | ||
| l.delta = calloc(batch*inputs, sizeof(float*)); | ||
|
|
||
| l.forward = forward_activation_layer; | ||
| l.backward = backward_activation_layer; | ||
| #ifdef GPU | ||
| l.forward_gpu = forward_activation_layer_gpu; | ||
| l.backward_gpu = backward_activation_layer_gpu; | ||
|
|
||
| l.output_gpu = cuda_make_array(l.output, inputs*batch); | ||
| l.delta_gpu = cuda_make_array(l.delta, inputs*batch); | ||
| #endif | ||
| l.activation = activation; | ||
| fprintf(stderr, "Activation Layer: %d inputs\n", inputs); | ||
| return l; | ||
| } | ||
|
|
||
/* Forward pass: copy the input through, then apply the activation in place
   on l.output. */
void forward_activation_layer(layer l, network_state state)
{
    copy_cpu(l.outputs*l.batch, state.input, 1, l.output, 1);
    activate_array(l.output, l.outputs*l.batch, l.activation);
}
|
|
||
/* Backward pass: scale l.delta by the activation derivative (evaluated at
   the forward output l.output), then copy into the upstream delta.
   NOTE: copy_cpu overwrites state.delta rather than accumulating into it. */
void backward_activation_layer(layer l, network_state state)
{
    gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);
    copy_cpu(l.outputs*l.batch, l.delta, 1, state.delta, 1);
}
|
|
||
| #ifdef GPU | ||
|
|
||
/* GPU forward: same as forward_activation_layer but on device buffers. */
void forward_activation_layer_gpu(layer l, network_state state)
{
    copy_ongpu(l.outputs*l.batch, state.input, 1, l.output_gpu, 1);
    activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
}
|
|
||
/* GPU backward: mirrors backward_activation_layer on device buffers;
   state.delta is overwritten, not accumulated. */
void backward_activation_layer_gpu(layer l, network_state state)
{
    gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
    copy_ongpu(l.outputs*l.batch, l.delta_gpu, 1, state.delta, 1);
}
| #endif |
| @@ -0,0 +1,19 @@ | ||
#ifndef ACTIVATION_LAYER_H
#define ACTIVATION_LAYER_H

#include "activations.h"
#include "layer.h"
#include "network.h"

/* A weightless layer that applies an element-wise activation function.
   inputs == outputs == the element count per batch item. */
layer make_activation_layer(int batch, int inputs, ACTIVATION activation);

void forward_activation_layer(layer l, network_state state);
void backward_activation_layer(layer l, network_state state);

#ifdef GPU
void forward_activation_layer_gpu(layer l, network_state state);
void backward_activation_layer_gpu(layer l, network_state state);
#endif

#endif
|
|
| @@ -0,0 +1,143 @@ | ||
| #include "activations.h" | ||
|
|
||
| #include <math.h> | ||
| #include <stdio.h> | ||
| #include <stdlib.h> | ||
| #include <string.h> | ||
|
|
||
| char *get_activation_string(ACTIVATION a) | ||
| { | ||
| switch(a){ | ||
| case LOGISTIC: | ||
| return "logistic"; | ||
| case LOGGY: | ||
| return "loggy"; | ||
| case RELU: | ||
| return "relu"; | ||
| case ELU: | ||
| return "elu"; | ||
| case RELIE: | ||
| return "relie"; | ||
| case RAMP: | ||
| return "ramp"; | ||
| case LINEAR: | ||
| return "linear"; | ||
| case TANH: | ||
| return "tanh"; | ||
| case PLSE: | ||
| return "plse"; | ||
| case LEAKY: | ||
| return "leaky"; | ||
| case STAIR: | ||
| return "stair"; | ||
| case HARDTAN: | ||
| return "hardtan"; | ||
| case LHTAN: | ||
| return "lhtan"; | ||
| default: | ||
| break; | ||
| } | ||
| return "relu"; | ||
| } | ||
|
|
||
| ACTIVATION get_activation(char *s) | ||
| { | ||
| if (strcmp(s, "logistic")==0) return LOGISTIC; | ||
| if (strcmp(s, "loggy")==0) return LOGGY; | ||
| if (strcmp(s, "relu")==0) return RELU; | ||
| if (strcmp(s, "elu")==0) return ELU; | ||
| if (strcmp(s, "relie")==0) return RELIE; | ||
| if (strcmp(s, "plse")==0) return PLSE; | ||
| if (strcmp(s, "hardtan")==0) return HARDTAN; | ||
| if (strcmp(s, "lhtan")==0) return LHTAN; | ||
| if (strcmp(s, "linear")==0) return LINEAR; | ||
| if (strcmp(s, "ramp")==0) return RAMP; | ||
| if (strcmp(s, "leaky")==0) return LEAKY; | ||
| if (strcmp(s, "tanh")==0) return TANH; | ||
| if (strcmp(s, "stair")==0) return STAIR; | ||
| fprintf(stderr, "Couldn't find activation function %s, going with ReLU\n", s); | ||
| return RELU; | ||
| } | ||
|
|
||
/* Evaluate activation a at x; CPU twin of the CUDA activate_kernel.
   Falls through to 0 for unknown tags. */
float activate(float x, ACTIVATION a)
{
    switch(a){
        case LINEAR:
            return linear_activate(x);
        case LOGISTIC:
            return logistic_activate(x);
        case LOGGY:
            return loggy_activate(x);
        case RELU:
            return relu_activate(x);
        case ELU:
            return elu_activate(x);
        case RELIE:
            return relie_activate(x);
        case RAMP:
            return ramp_activate(x);
        case LEAKY:
            return leaky_activate(x);
        case TANH:
            return tanh_activate(x);
        case PLSE:
            return plse_activate(x);
        case STAIR:
            return stair_activate(x);
        case HARDTAN:
            return hardtan_activate(x);
        case LHTAN:
            return lhtan_activate(x);
    }
    return 0;
}
|
|
||
| void activate_array(float *x, const int n, const ACTIVATION a) | ||
| { | ||
| int i; | ||
| for(i = 0; i < n; ++i){ | ||
| x[i] = activate(x[i], a); | ||
| } | ||
| } | ||
|
|
||
/* Derivative of activation a. For LOGISTIC/LOGGY/TANH the argument must be
   the activation OUTPUT, not the pre-activation input (the backward passes
   call this with l.output). Unknown tags fall through to 0. */
float gradient(float x, ACTIVATION a)
{
    switch(a){
        case LINEAR:
            return linear_gradient(x);
        case LOGISTIC:
            return logistic_gradient(x);
        case LOGGY:
            return loggy_gradient(x);
        case RELU:
            return relu_gradient(x);
        case ELU:
            return elu_gradient(x);
        case RELIE:
            return relie_gradient(x);
        case RAMP:
            return ramp_gradient(x);
        case LEAKY:
            return leaky_gradient(x);
        case TANH:
            return tanh_gradient(x);
        case PLSE:
            return plse_gradient(x);
        case STAIR:
            return stair_gradient(x);
        case HARDTAN:
            return hardtan_gradient(x);
        case LHTAN:
            return lhtan_gradient(x);
    }
    return 0;
}
|
|
||
| void gradient_array(const float *x, const int n, const ACTIVATION a, float *delta) | ||
| { | ||
| int i; | ||
| for(i = 0; i < n; ++i){ | ||
| delta[i] *= gradient(x[i], a); | ||
| } | ||
| } | ||
|
|
| @@ -0,0 +1,88 @@ | ||
#ifndef ACTIVATIONS_H
#define ACTIVATIONS_H
#include "cuda.h"
#include "math.h"

/* Activation function tags; order is part of the serialized-weights
   compatibility surface — append new entries, never reorder. */
typedef enum{
    LOGISTIC, RELU, RELIE, LINEAR, RAMP, TANH, PLSE, LEAKY, ELU, LOGGY, STAIR, HARDTAN, LHTAN
}ACTIVATION;

/* Parse a cfg-file name into a tag (defaults to RELU with a warning). */
ACTIVATION get_activation(char *s);

/* Inverse mapping, for writing cfg files. */
char *get_activation_string(ACTIVATION a);
float activate(float x, ACTIVATION a);
float gradient(float x, ACTIVATION a);
void gradient_array(const float *x, const int n, const ACTIVATION a, float *delta);
void activate_array(float *x, const int n, const ACTIVATION a);
#ifdef GPU
void activate_array_ongpu(float *x, int n, ACTIVATION a);
void gradient_array_ongpu(float *x, int n, ACTIVATION a, float *delta);
#endif
|
|
||
| static inline float stair_activate(float x) | ||
| { | ||
| int n = floor(x); | ||
| if (n%2 == 0) return floor(x/2.); | ||
| else return (x - n) + floor(x/2.); | ||
| } | ||
| static inline float hardtan_activate(float x) | ||
| { | ||
| if (x < -1) return -1; | ||
| if (x > 1) return 1; | ||
| return x; | ||
| } | ||
| static inline float linear_activate(float x){return x;} | ||
| static inline float logistic_activate(float x){return 1./(1. + exp(-x));} | ||
| static inline float loggy_activate(float x){return 2./(1. + exp(-x)) - 1;} | ||
| static inline float relu_activate(float x){return x*(x>0);} | ||
| static inline float elu_activate(float x){return (x >= 0)*x + (x < 0)*(exp(x)-1);} | ||
| static inline float relie_activate(float x){return (x>0) ? x : .01*x;} | ||
| static inline float ramp_activate(float x){return x*(x>0)+.1*x;} | ||
| static inline float leaky_activate(float x){return (x>0) ? x : .1*x;} | ||
| static inline float tanh_activate(float x){return (exp(2*x)-1)/(exp(2*x)+1);} | ||
| static inline float plse_activate(float x) | ||
| { | ||
| if(x < -4) return .01 * (x + 4); | ||
| if(x > 4) return .01 * (x - 4) + 1; | ||
| return .125*x + .5; | ||
| } | ||
|
|
||
/* "Leaky hard tanh": identity on [0,1], slope .001 outside. */
static inline float lhtan_activate(float x)
{
    return (x < 0) ? .001*x : (x > 1) ? .001*(x-1) + 1 : x;
}
/* Derivative of lhtan: 1 inside (0,1), .001 elsewhere. */
static inline float lhtan_gradient(float x)
{
    return (x > 0 && x < 1) ? 1 : .001;
}
|
|
||
| static inline float hardtan_gradient(float x) | ||
| { | ||
| if (x > -1 && x < 1) return 1; | ||
| return 0; | ||
| } | ||
| static inline float linear_gradient(float x){return 1;} | ||
| static inline float logistic_gradient(float x){return (1-x)*x;} | ||
| static inline float loggy_gradient(float x) | ||
| { | ||
| float y = (x+1.)/2.; | ||
| return 2*(1-y)*y; | ||
| } | ||
| static inline float stair_gradient(float x) | ||
| { | ||
| if (floor(x) == x) return 0; | ||
| return 1; | ||
| } | ||
| static inline float relu_gradient(float x){return (x>0);} | ||
| static inline float elu_gradient(float x){return (x >= 0) + (x < 0)*(x + 1);} | ||
| static inline float relie_gradient(float x){return (x>0) ? 1 : .01;} | ||
| static inline float ramp_gradient(float x){return (x>0)+.1;} | ||
| static inline float leaky_gradient(float x){return (x>0) ? 1 : .1;} | ||
| static inline float tanh_gradient(float x){return 1-x*x;} | ||
| static inline float plse_gradient(float x){return (x < 0 || x > 1) ? .01 : .125;} | ||
|
|
||
| #endif | ||
|
|
| @@ -0,0 +1,71 @@ | ||
#include <stdio.h>
#include <sys/time.h>

#include "network.h"
#include "utils.h"
#include "parser.h"
#include "option_list.h"
#include "blas.h"
#include "classifier.h"
|
|
||
/* Webcam demo: classify each frame and print a max score over three fixed
   class indices as an "art appreciation" meter. No-op unless built with
   OpenCV. */
void demo_art(char *cfgfile, char *weightfile, int cam_index)
{
#ifdef OPENCV
    network net = parse_network_cfg(cfgfile);
    if(weightfile){
        load_weights(&net, weightfile);
    }
    set_batch_network(&net, 1);

    srand(2222222);
    CvCapture * cap;

    cap = cvCaptureFromCAM(cam_index);

    char *window = "ArtJudgementBot9000!!!";
    if(!cap) error("Couldn't connect to webcam.\n");
    cvNamedWindow(window, CV_WINDOW_NORMAL);
    cvResizeWindow(window, 512, 512);
    int i;
    /* Class indices whose max probability drives the meter — presumably
       art-related categories of the trained model; TODO confirm. */
    int idx[] = {37, 401, 434};
    int n = sizeof(idx)/sizeof(idx[0]);

    while(1){
        image in = get_image_from_stream(cap);
        image in_s = resize_image(in, net.w, net.h);
        show_image(in, window);

        float *p = network_predict(net, in_s.data);

        /* ANSI escapes: clear screen, home cursor. */
        printf("\033[2J");
        printf("\033[1;1H");

        float score = 0;
        for(i = 0; i < n; ++i){
            float s = p[idx[i]];
            if (s > score) score = s;
        }
        /* (removed dead self-assignment `score = score;`) */
        printf("I APPRECIATE THIS ARTWORK: %10.7f%%\n", score*100);
        printf("[");
        int upper = 30;
        for(i = 0; i < upper; ++i){
            printf("%c", ((i+.5) < score*upper) ? 219 : ' ');
        }
        printf("]\n");

        free_image(in_s);
        free_image(in);

        cvWaitKey(1);
    }
#endif
}
|
|
||
|
|
||
/* CLI entry: `darknet art [cfg] [weights] [-c cam_index]`.
   Fix: guard argc before touching argv[2]/argv[3] — previously an
   out-of-bounds read when invoked without enough arguments. */
void run_art(int argc, char **argv)
{
    if(argc < 4){
        fprintf(stderr, "usage: %s %s [cfg] [weights]\n", argv[0], argv[1]);
        return;
    }
    int cam_index = find_int_arg(argc, argv, "-c", 0);
    char *cfg = argv[2];
    char *weights = argv[3];
    demo_art(cfg, weights, cam_index);
}
|
|
| @@ -0,0 +1,71 @@ | ||
| #include "avgpool_layer.h" | ||
| #include "cuda.h" | ||
| #include <stdio.h> | ||
|
|
||
| avgpool_layer make_avgpool_layer(int batch, int w, int h, int c) | ||
| { | ||
| fprintf(stderr, "avg %4d x%4d x%4d -> %4d\n", w, h, c, c); | ||
| avgpool_layer l = {0}; | ||
| l.type = AVGPOOL; | ||
| l.batch = batch; | ||
| l.h = h; | ||
| l.w = w; | ||
| l.c = c; | ||
| l.out_w = 1; | ||
| l.out_h = 1; | ||
| l.out_c = c; | ||
| l.outputs = l.out_c; | ||
| l.inputs = h*w*c; | ||
| int output_size = l.outputs * batch; | ||
| l.output = calloc(output_size, sizeof(float)); | ||
| l.delta = calloc(output_size, sizeof(float)); | ||
| l.forward = forward_avgpool_layer; | ||
| l.backward = backward_avgpool_layer; | ||
| #ifdef GPU | ||
| l.forward_gpu = forward_avgpool_layer_gpu; | ||
| l.backward_gpu = backward_avgpool_layer_gpu; | ||
| l.output_gpu = cuda_make_array(l.output, output_size); | ||
| l.delta_gpu = cuda_make_array(l.delta, output_size); | ||
| #endif | ||
| return l; | ||
| } | ||
|
|
||
/* Resize: only the input geometry changes; the output stays 1x1xc, so
   l->outputs and the output/delta buffers need no reallocation. */
void resize_avgpool_layer(avgpool_layer *l, int w, int h)
{
    l->w = w;
    l->h = h;
    l->inputs = h*w*l->c;
}
|
|
||
| void forward_avgpool_layer(const avgpool_layer l, network_state state) | ||
| { | ||
| int b,i,k; | ||
|
|
||
| for(b = 0; b < l.batch; ++b){ | ||
| for(k = 0; k < l.c; ++k){ | ||
| int out_index = k + b*l.c; | ||
| l.output[out_index] = 0; | ||
| for(i = 0; i < l.h*l.w; ++i){ | ||
| int in_index = i + l.h*l.w*(k + b*l.c); | ||
| l.output[out_index] += state.input[in_index]; | ||
| } | ||
| l.output[out_index] /= l.h*l.w; | ||
| } | ||
| } | ||
| } | ||
|
|
||
| void backward_avgpool_layer(const avgpool_layer l, network_state state) | ||
| { | ||
| int b,i,k; | ||
|
|
||
| for(b = 0; b < l.batch; ++b){ | ||
| for(k = 0; k < l.c; ++k){ | ||
| int out_index = k + b*l.c; | ||
| for(i = 0; i < l.h*l.w; ++i){ | ||
| int in_index = i + l.h*l.w*(k + b*l.c); | ||
| state.delta[in_index] += l.delta[out_index] / (l.h*l.w); | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
| @@ -0,0 +1,23 @@ | ||
#ifndef AVGPOOL_LAYER_H
#define AVGPOOL_LAYER_H

#include "image.h"
#include "cuda.h"
#include "layer.h"
#include "network.h"

/* Global average pooling: w x h x c -> 1 x 1 x c. */
typedef layer avgpool_layer;

/* NOTE(review): get_avgpool_image is declared but no definition is visible
   in this file — confirm it exists elsewhere. */
image get_avgpool_image(avgpool_layer l);
avgpool_layer make_avgpool_layer(int batch, int w, int h, int c);
void resize_avgpool_layer(avgpool_layer *l, int w, int h);
void forward_avgpool_layer(const avgpool_layer l, network_state state);
void backward_avgpool_layer(const avgpool_layer l, network_state state);

#ifdef GPU
void forward_avgpool_layer_gpu(avgpool_layer l, network_state state);
void backward_avgpool_layer_gpu(avgpool_layer l, network_state state);
#endif

#endif
|
|
| @@ -0,0 +1,61 @@ | ||
| #include "cuda_runtime.h" | ||
| #include "curand.h" | ||
| #include "cublas_v2.h" | ||
|
|
||
| extern "C" { | ||
| #include "avgpool_layer.h" | ||
| #include "cuda.h" | ||
| } | ||
|
|
||
/* One thread per (batch, channel) pair: each thread serially sums its
   channel's w*h map and writes the mean. n = batch*c. Adjacent threads in a
   warp read consecutive channels, so the per-iteration loads are coalesced
   across the channel dimension. */
__global__ void forward_avgpool_layer_kernel(int n, int w, int h, int c, float *input, float *output)
{
    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if(id >= n) return;

    /* Decompose the flat id into (channel k, batch b). */
    int k = id % c;
    id /= c;
    int b = id;

    int i;
    int out_index = (k + c*b);
    output[out_index] = 0;
    for(i = 0; i < w*h; ++i){
        int in_index = i + h*w*(k + b*c);
        output[out_index] += input[in_index];
    }
    output[out_index] /= w*h;
}
|
|
||
/* Backward of global average pooling: each (batch, channel) thread adds its
   output delta, divided by the map area, to every input position of that
   channel. in_delta is accumulated into (+=), not overwritten. n = batch*c. */
__global__ void backward_avgpool_layer_kernel(int n, int w, int h, int c, float *in_delta, float *out_delta)
{
    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if(id >= n) return;

    /* Decompose the flat id into (channel k, batch b). */
    int k = id % c;
    id /= c;
    int b = id;

    int i;
    int out_index = (k + c*b);
    for(i = 0; i < w*h; ++i){
        int in_index = i + h*w*(k + b*c);
        in_delta[in_index] += out_delta[out_index] / (w*h);
    }
}
|
|
||
/* Host wrapper: one kernel thread per (batch, channel) pair. */
extern "C" void forward_avgpool_layer_gpu(avgpool_layer layer, network_state state)
{
    size_t n = layer.c*layer.batch;

    forward_avgpool_layer_kernel<<<cuda_gridsize(n), BLOCK>>>(n, layer.w, layer.h, layer.c, state.input, layer.output_gpu);
    check_error(cudaPeekAtLastError());
}
|
|
||
/* Host wrapper for the backward kernel; state.delta receives the
   accumulated input gradients. */
extern "C" void backward_avgpool_layer_gpu(avgpool_layer layer, network_state state)
{
    size_t n = layer.c*layer.batch;

    backward_avgpool_layer_kernel<<<cuda_gridsize(n), BLOCK>>>(n, layer.w, layer.h, layer.c, state.delta, layer.delta_gpu);
    check_error(cudaPeekAtLastError());
}
|
|
| @@ -0,0 +1,279 @@ | ||
| #include "convolutional_layer.h" | ||
| #include "batchnorm_layer.h" | ||
| #include "blas.h" | ||
| #include <stdio.h> | ||
|
|
||
| layer make_batchnorm_layer(int batch, int w, int h, int c) | ||
| { | ||
| fprintf(stderr, "Batch Normalization Layer: %d x %d x %d image\n", w,h,c); | ||
| layer l = {0}; | ||
| l.type = BATCHNORM; | ||
| l.batch = batch; | ||
| l.h = l.out_h = h; | ||
| l.w = l.out_w = w; | ||
| l.c = l.out_c = c; | ||
| l.output = calloc(h * w * c * batch, sizeof(float)); | ||
| l.delta = calloc(h * w * c * batch, sizeof(float)); | ||
| l.inputs = w*h*c; | ||
| l.outputs = l.inputs; | ||
|
|
||
| l.scales = calloc(c, sizeof(float)); | ||
| l.scale_updates = calloc(c, sizeof(float)); | ||
| l.biases = calloc(c, sizeof(float)); | ||
| l.bias_updates = calloc(c, sizeof(float)); | ||
| int i; | ||
| for(i = 0; i < c; ++i){ | ||
| l.scales[i] = 1; | ||
| } | ||
|
|
||
| l.mean = calloc(c, sizeof(float)); | ||
| l.variance = calloc(c, sizeof(float)); | ||
|
|
||
| l.rolling_mean = calloc(c, sizeof(float)); | ||
| l.rolling_variance = calloc(c, sizeof(float)); | ||
|
|
||
| l.forward = forward_batchnorm_layer; | ||
| l.backward = backward_batchnorm_layer; | ||
| #ifdef GPU | ||
| l.forward_gpu = forward_batchnorm_layer_gpu; | ||
| l.backward_gpu = backward_batchnorm_layer_gpu; | ||
|
|
||
| l.output_gpu = cuda_make_array(l.output, h * w * c * batch); | ||
| l.delta_gpu = cuda_make_array(l.delta, h * w * c * batch); | ||
|
|
||
| l.biases_gpu = cuda_make_array(l.biases, c); | ||
| l.bias_updates_gpu = cuda_make_array(l.bias_updates, c); | ||
|
|
||
| l.scales_gpu = cuda_make_array(l.scales, c); | ||
| l.scale_updates_gpu = cuda_make_array(l.scale_updates, c); | ||
|
|
||
| l.mean_gpu = cuda_make_array(l.mean, c); | ||
| l.variance_gpu = cuda_make_array(l.variance, c); | ||
|
|
||
| l.rolling_mean_gpu = cuda_make_array(l.mean, c); | ||
| l.rolling_variance_gpu = cuda_make_array(l.variance, c); | ||
|
|
||
| l.mean_delta_gpu = cuda_make_array(l.mean, c); | ||
| l.variance_delta_gpu = cuda_make_array(l.variance, c); | ||
|
|
||
| l.x_gpu = cuda_make_array(l.output, l.batch*l.outputs); | ||
| l.x_norm_gpu = cuda_make_array(l.output, l.batch*l.outputs); | ||
| #ifdef CUDNN | ||
| cudnnCreateTensorDescriptor(&l.normTensorDesc); | ||
| cudnnCreateTensorDescriptor(&l.dstTensorDesc); | ||
| cudnnSetTensor4dDescriptor(l.dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l.batch, l.out_c, l.out_h, l.out_w); | ||
| cudnnSetTensor4dDescriptor(l.normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l.out_c, 1, 1); | ||
|
|
||
| #endif | ||
| #endif | ||
| return l; | ||
| } | ||
|
|
||
/* Gradient of the loss w.r.t. each per-channel scale:
   dL/dscale_f = sum over batch and spatial positions of delta * x_norm,
   accumulated (+=) into scale_updates. Layout: index = i + size*(f + n*b). */
void backward_scale_cpu(float *x_norm, float *delta, int batch, int n, int size, float *scale_updates)
{
    int f;
    for(f = 0; f < n; ++f){
        float sum = 0;
        int b;
        for(b = 0; b < batch; ++b){
            int i;
            for(i = 0; i < size; ++i){
                int idx = i + size*(f + n*b);
                sum += delta[idx] * x_norm[idx];
            }
        }
        scale_updates[f] += sum;
    }
}
|
|
||
| void mean_delta_cpu(float *delta, float *variance, int batch, int filters, int spatial, float *mean_delta) | ||
| { | ||
|
|
||
| int i,j,k; | ||
| for(i = 0; i < filters; ++i){ | ||
| mean_delta[i] = 0; | ||
| for (j = 0; j < batch; ++j) { | ||
| for (k = 0; k < spatial; ++k) { | ||
| int index = j*filters*spatial + i*spatial + k; | ||
| mean_delta[i] += delta[index]; | ||
| } | ||
| } | ||
| mean_delta[i] *= (-1./sqrt(variance[i] + .00001f)); | ||
| } | ||
| } | ||
| void variance_delta_cpu(float *x, float *delta, float *mean, float *variance, int batch, int filters, int spatial, float *variance_delta) | ||
| { | ||
|
|
||
| int i,j,k; | ||
| for(i = 0; i < filters; ++i){ | ||
| variance_delta[i] = 0; | ||
| for(j = 0; j < batch; ++j){ | ||
| for(k = 0; k < spatial; ++k){ | ||
| int index = j*filters*spatial + i*spatial + k; | ||
| variance_delta[i] += delta[index]*(x[index] - mean[i]); | ||
| } | ||
| } | ||
| variance_delta[i] *= -.5 * pow(variance[i] + .00001f, (float)(-3./2.)); | ||
| } | ||
| } | ||
| void normalize_delta_cpu(float *x, float *mean, float *variance, float *mean_delta, float *variance_delta, int batch, int filters, int spatial, float *delta) | ||
| { | ||
| int f, j, k; | ||
| for(j = 0; j < batch; ++j){ | ||
| for(f = 0; f < filters; ++f){ | ||
| for(k = 0; k < spatial; ++k){ | ||
| int index = j*filters*spatial + f*spatial + k; | ||
| delta[index] = delta[index] * 1./(sqrt(variance[f] + .00001f)) + variance_delta[f] * 2. * (x[index] - mean[f]) / (spatial * batch) + mean_delta[f]/(spatial*batch); | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
/* Stub: resizing a batchnorm layer is not supported; reallocating the
   per-volume buffers (output, delta, x, x_norm) would be required. */
void resize_batchnorm_layer(layer *layer, int w, int h)
{
    fprintf(stderr, "Not implemented\n");
}
|
|
||
/* CPU forward batchnorm. Also called on other layer types (convolutional/
   connected embed it): a stand-alone BATCHNORM layer first copies its input
   into l.output; a CONNECTED layer is reinterpreted as out_c channels of
   1x1 spatial extent. Training: compute batch mean/variance, fold them into
   the rolling estimates (0.99/0.01 exponential average), stash the raw
   output in l.x and the normalized output in l.x_norm for the backward
   pass. Inference: normalize with the rolling statistics instead. Finally
   apply the learned per-channel scale and bias. */
void forward_batchnorm_layer(layer l, network_state state)
{
    if(l.type == BATCHNORM) copy_cpu(l.outputs*l.batch, state.input, 1, l.output, 1);
    if(l.type == CONNECTED){
        l.out_c = l.outputs;
        l.out_h = l.out_w = 1;
    }
    if(state.train){
        mean_cpu(l.output, l.batch, l.out_c, l.out_h*l.out_w, l.mean);
        variance_cpu(l.output, l.mean, l.batch, l.out_c, l.out_h*l.out_w, l.variance);

        /* rolling = .99*rolling + .01*batch statistic */
        scal_cpu(l.out_c, .99, l.rolling_mean, 1);
        axpy_cpu(l.out_c, .01, l.mean, 1, l.rolling_mean, 1);
        scal_cpu(l.out_c, .99, l.rolling_variance, 1);
        axpy_cpu(l.out_c, .01, l.variance, 1, l.rolling_variance, 1);

        copy_cpu(l.outputs*l.batch, l.output, 1, l.x, 1);
        normalize_cpu(l.output, l.mean, l.variance, l.batch, l.out_c, l.out_h*l.out_w);
        copy_cpu(l.outputs*l.batch, l.output, 1, l.x_norm, 1);
    } else {
        normalize_cpu(l.output, l.rolling_mean, l.rolling_variance, l.batch, l.out_c, l.out_h*l.out_w);
    }
    scale_bias(l.output, l.scales, l.batch, l.out_c, l.out_h*l.out_w);
    add_bias(l.output, l.biases, l.batch, l.out_c, l.out_h*l.out_w);
}
|
|
||
/* CPU backward batchnorm: accumulate bias and scale gradients, push the
   delta back through the scale, then through the normalization using the
   saved pre-normalization values (l.x) and batch statistics. A stand-alone
   BATCHNORM layer finally copies (overwrites) its delta into state.delta. */
void backward_batchnorm_layer(const layer l, network_state state)
{
    backward_bias(l.bias_updates, l.delta, l.batch, l.out_c, l.out_w*l.out_h);
    backward_scale_cpu(l.x_norm, l.delta, l.batch, l.out_c, l.out_w*l.out_h, l.scale_updates);

    scale_bias(l.delta, l.scales, l.batch, l.out_c, l.out_h*l.out_w);

    mean_delta_cpu(l.delta, l.variance, l.batch, l.out_c, l.out_w*l.out_h, l.mean_delta);
    variance_delta_cpu(l.x, l.delta, l.mean, l.variance, l.batch, l.out_c, l.out_w*l.out_h, l.variance_delta);
    normalize_delta_cpu(l.x, l.mean, l.variance, l.mean_delta, l.variance_delta, l.batch, l.out_c, l.out_w*l.out_h, l.delta);
    if(l.type == BATCHNORM) copy_cpu(l.outputs*l.batch, l.delta, 1, state.delta, 1);
}
|
|
||
| #ifdef GPU | ||
|
|
||
/* Copy the learned scales and rolling statistics device -> host
   (used when saving weights). */
void pull_batchnorm_layer(layer l)
{
    cuda_pull_array(l.scales_gpu, l.scales, l.c);
    cuda_pull_array(l.rolling_mean_gpu, l.rolling_mean, l.c);
    cuda_pull_array(l.rolling_variance_gpu, l.rolling_variance, l.c);
}
/* Copy the same arrays host -> device (used after loading weights). */
void push_batchnorm_layer(layer l)
{
    cuda_push_array(l.scales_gpu, l.scales, l.c);
    cuda_push_array(l.rolling_mean_gpu, l.rolling_mean, l.c);
    cuda_push_array(l.rolling_variance_gpu, l.rolling_variance, l.c);
}
|
|
||
/* GPU forward batchnorm; mirrors forward_batchnorm_layer. With CUDNN,
   cudnnBatchNormalizationForwardTraining does normalization, the affine
   transform, and the rolling-average update (factor .01, eps .00001) in one
   call; l.x_gpu keeps the pre-normalization values for the backward pass.
   Without CUDNN the hand-rolled kernels perform the same sequence as the
   CPU path. Inference always uses the rolling statistics. */
void forward_batchnorm_layer_gpu(layer l, network_state state)
{
    if(l.type == BATCHNORM) copy_ongpu(l.outputs*l.batch, state.input, 1, l.output_gpu, 1);
    if(l.type == CONNECTED){
        l.out_c = l.outputs;
        l.out_h = l.out_w = 1;
    }
    if (state.train) {
#ifdef CUDNN
        copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1);
        float one = 1;
        float zero = 0;
        cudnnBatchNormalizationForwardTraining(cudnn_handle(),
                CUDNN_BATCHNORM_SPATIAL,
                &one,
                &zero,
                l.dstTensorDesc,
                l.x_gpu,
                l.dstTensorDesc,
                l.output_gpu,
                l.normTensorDesc,
                l.scales_gpu,
                l.biases_gpu,
                .01,
                l.rolling_mean_gpu,
                l.rolling_variance_gpu,
                .00001,
                l.mean_gpu,
                l.variance_gpu);
#else
        fast_mean_gpu(l.output_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.mean_gpu);
        fast_variance_gpu(l.output_gpu, l.mean_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.variance_gpu);

        /* rolling = .99*rolling + .01*batch statistic */
        scal_ongpu(l.out_c, .99, l.rolling_mean_gpu, 1);
        axpy_ongpu(l.out_c, .01, l.mean_gpu, 1, l.rolling_mean_gpu, 1);
        scal_ongpu(l.out_c, .99, l.rolling_variance_gpu, 1);
        axpy_ongpu(l.out_c, .01, l.variance_gpu, 1, l.rolling_variance_gpu, 1);

        copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1);
        normalize_gpu(l.output_gpu, l.mean_gpu, l.variance_gpu, l.batch, l.out_c, l.out_h*l.out_w);
        copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_norm_gpu, 1);

        scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
        add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.out_c, l.out_w*l.out_h);
#endif
    } else {
        normalize_gpu(l.output_gpu, l.rolling_mean_gpu, l.rolling_variance_gpu, l.batch, l.out_c, l.out_h*l.out_w);
        scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
        add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.out_c, l.out_w*l.out_h);
    }

}
|
|
||
/* GPU backward batchnorm. With CUDNN, cudnnBatchNormalizationBackward
   produces the input delta (written into l.x_norm_gpu here, then copied
   over l.delta_gpu) plus the scale/bias gradients in one call, using the
   saved l.x_gpu and the batch statistics. Without CUDNN the hand-rolled
   kernels follow the same sequence as the CPU backward pass. A stand-alone
   BATCHNORM layer finally copies its delta into state.delta. */
void backward_batchnorm_layer_gpu(const layer l, network_state state)
{
#ifdef CUDNN
    float one = 1;
    float zero = 0;
    cudnnBatchNormalizationBackward(cudnn_handle(),
            CUDNN_BATCHNORM_SPATIAL,
            &one,
            &zero,
            &one,
            &one,
            l.dstTensorDesc,
            l.x_gpu,
            l.dstTensorDesc,
            l.delta_gpu,
            l.dstTensorDesc,
            l.x_norm_gpu,
            l.normTensorDesc,
            l.scales_gpu,
            l.scale_updates_gpu,
            l.bias_updates_gpu,
            .00001,
            l.mean_gpu,
            l.variance_gpu);
    copy_ongpu(l.outputs*l.batch, l.x_norm_gpu, 1, l.delta_gpu, 1);
#else
    backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.out_c, l.out_w*l.out_h);
    backward_scale_gpu(l.x_norm_gpu, l.delta_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.scale_updates_gpu);

    scale_bias_gpu(l.delta_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);

    fast_mean_delta_gpu(l.delta_gpu, l.variance_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.mean_delta_gpu);
    fast_variance_delta_gpu(l.x_gpu, l.delta_gpu, l.mean_gpu, l.variance_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.variance_delta_gpu);
    normalize_delta_gpu(l.x_gpu, l.mean_gpu, l.variance_gpu, l.mean_delta_gpu, l.variance_delta_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.delta_gpu);
#endif
    if(l.type == BATCHNORM) copy_ongpu(l.outputs*l.batch, l.delta_gpu, 1, state.delta, 1);
}
| #endif |
| @@ -0,0 +1,19 @@ | ||
#ifndef BATCHNORM_LAYER_H
#define BATCHNORM_LAYER_H

#include "image.h"
#include "layer.h"
#include "network.h"

/* Construct a stand-alone batch-normalization layer for a w x h x c input. */
layer make_batchnorm_layer(int batch, int w, int h, int c);
/* CPU forward/backward passes. */
void forward_batchnorm_layer(layer l, network_state state);
void backward_batchnorm_layer(layer l, network_state state);

#ifdef GPU
/* GPU forward/backward passes. */
void forward_batchnorm_layer_gpu(layer l, network_state state);
void backward_batchnorm_layer_gpu(layer l, network_state state);
/* Transfer layer parameters device->host (pull) / host->device (push). */
void pull_batchnorm_layer(layer l);
void push_batchnorm_layer(layer l);
#endif

#endif
| @@ -0,0 +1,243 @@ | ||
| #include "blas.h" | ||
| #include "math.h" | ||
| #include <assert.h> | ||
| #include <float.h> | ||
| #include <stdio.h> | ||
| #include <stdlib.h> | ||
| #include <string.h> | ||
/*
** Reorg rearrangement used by the reorg layer: maps between a (w,h,c)
** layout and a (w*stride, h*stride, c/(stride*stride)) layout.
** forward != 0 writes x into out using the expanded-spatial indexing;
** forward == 0 applies the inverse mapping.
** NOTE(review): out must hold w*h*c*batch floats, same as x — only the
** index arithmetic differs between the two views.
*/
void reorg_cpu(float *x, int w, int h, int c, int batch, int stride, int forward, float *out)
{
    int b,i,j,k;
    int out_c = c/(stride*stride);   // channel count of the expanded-spatial view

    for(b = 0; b < batch; ++b){
        for(k = 0; k < c; ++k){
            for(j = 0; j < h; ++j){
                for(i = 0; i < w; ++i){
                    int in_index = i + w*(j + h*(k + c*b));
                    int c2 = k % out_c;      // destination channel
                    int offset = k / out_c;  // which of the stride*stride sub-cells
                    int w2 = i*stride + offset % stride;
                    int h2 = j*stride + offset / stride;
                    int out_index = w2 + w*stride*(h2 + h*stride*(c2 + out_c*b));
                    if(forward) out[out_index] = x[in_index];
                    else out[in_index] = x[out_index];
                }
            }
        }
    }
}
|
|
||
/*
** Transpose x in place between layer-major ([layer][position]) and
** interleaved ([position][layer]) orderings, independently per batch item.
** forward != 0 converts layer-major to interleaved; forward == 0 inverts it.
*/
void flatten(float *x, int size, int layers, int batch, int forward)
{
    float *tmp = calloc(size*layers*batch, sizeof(float));
    int b, l, s;
    for(b = 0; b < batch; ++b){
        for(l = 0; l < layers; ++l){
            for(s = 0; s < size; ++s){
                int planar      = b*layers*size + l*size + s;   // [layer][position]
                int interleaved = b*layers*size + s*layers + l; // [position][layer]
                if (forward) tmp[interleaved] = x[planar];
                else tmp[planar] = x[interleaved];
            }
        }
    }
    memcpy(x, tmp, size*layers*batch*sizeof(float));
    free(tmp);
}
|
|
||
/*
** Element-wise convex blend: c[i] = s[i]*a[i] + (1-s[i])*b[i].
** b may be NULL, in which case it is treated as all zeros.
*/
void weighted_sum_cpu(float *a, float *b, float *s, int n, float *c)
{
    int k;
    for(k = 0; k < n; ++k){
        float bv = b ? b[k] : 0;
        c[k] = s[k]*a[k] + (1 - s[k])*bv;
    }
}
|
|
||
/*
** Shortcut (residual) connection: adds the `add` tensor (w1 x h1 x c1) into
** `out` (w2 x h2 x c2), sub-sampling or up-sampling by integer factors so the
** smaller spatial extent is traversed.  Channels beyond min(c1,c2) are
** left untouched.
*/
void shortcut_cpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float *out)
{
    // Integer ratios: exactly one of stride/sample exceeds 1 when sizes differ.
    int stride = w1/w2;
    int sample = w2/w1;
    assert(stride == h1/h2);
    assert(sample == h2/h1);
    // When sizes match (or the division truncated to 0), step by 1.
    if(stride < 1) stride = 1;
    if(sample < 1) sample = 1;
    int minw = (w1 < w2) ? w1 : w2;
    int minh = (h1 < h2) ? h1 : h2;
    int minc = (c1 < c2) ? c1 : c2;

    int i,j,k,b;
    for(b = 0; b < batch; ++b){
        for(k = 0; k < minc; ++k){
            for(j = 0; j < minh; ++j){
                for(i = 0; i < minw; ++i){
                    int out_index = i*sample + w2*(j*sample + h2*(k + c2*b));
                    int add_index = i*stride + w1*(j*stride + h1*(k + c1*b));
                    out[out_index] += add[add_index];
                }
            }
        }
    }
}
|
|
||
/*
** Per-filter mean over batch and spatial positions.
** x is laid out [batch][filter][spatial]; mean receives `filters` values.
*/
void mean_cpu(float *x, int batch, int filters, int spatial, float *mean)
{
    float inv = 1./(batch * spatial);
    int f, b, s;
    for(f = 0; f < filters; ++f){
        float acc = 0;
        for(b = 0; b < batch; ++b){
            for(s = 0; s < spatial; ++s){
                acc += x[(b*filters + f)*spatial + s];
            }
        }
        mean[f] = acc * inv;
    }
}
|
|
||
| void variance_cpu(float *x, float *mean, int batch, int filters, int spatial, float *variance) | ||
| { | ||
| float scale = 1./(batch * spatial - 1); | ||
| int i,j,k; | ||
| for(i = 0; i < filters; ++i){ | ||
| variance[i] = 0; | ||
| for(j = 0; j < batch; ++j){ | ||
| for(k = 0; k < spatial; ++k){ | ||
| int index = j*filters*spatial + i*spatial + k; | ||
| variance[i] += pow((x[index] - mean[i]), 2); | ||
| } | ||
| } | ||
| variance[i] *= scale; | ||
| } | ||
| } | ||
|
|
||
| void normalize_cpu(float *x, float *mean, float *variance, int batch, int filters, int spatial) | ||
| { | ||
| int b, f, i; | ||
| for(b = 0; b < batch; ++b){ | ||
| for(f = 0; f < filters; ++f){ | ||
| for(i = 0; i < spatial; ++i){ | ||
| int index = b*filters*spatial + f*spatial + i; | ||
| x[index] = (x[index] - mean[f])/(sqrt(variance[f]) + .000001f); | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
/* Set N strided elements of X to the constant ALPHA. */
void const_cpu(int N, float ALPHA, float *X, int INCX)
{
    int k = 0;
    while(k < N){
        X[k*INCX] = ALPHA;
        ++k;
    }
}
|
|
||
/* Element-wise product into Y: Y[i] *= X[i], with independent strides. */
void mul_cpu(int N, float *X, int INCX, float *Y, int INCY)
{
    int k;
    for(k = 0; k < N; ++k){
        Y[k*INCY] = Y[k*INCY] * X[k*INCX];
    }
}
|
|
||
| void pow_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY) | ||
| { | ||
| int i; | ||
| for(i = 0; i < N; ++i) Y[i*INCY] = pow(X[i*INCX], ALPHA); | ||
| } | ||
|
|
||
/* BLAS-style axpy: Y[i] += ALPHA * X[i], with independent strides. */
void axpy_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY)
{
    int k;
    for(k = 0; k < N; ++k){
        Y[k*INCY] += ALPHA * X[k*INCX];
    }
}
|
|
||
/* BLAS-style scal: multiply N strided elements of X by ALPHA in place. */
void scal_cpu(int N, float ALPHA, float *X, int INCX)
{
    int k = 0;
    while(k < N){
        X[k*INCX] *= ALPHA;
        ++k;
    }
}
|
|
||
/* Fill N strided elements of X with ALPHA (same contract as const_cpu). */
void fill_cpu(int N, float ALPHA, float * X, int INCX)
{
    int k;
    for(k = 0; k < N; ++k){
        X[k*INCX] = ALPHA;
    }
}
|
|
||
/* BLAS-style copy: Y[i] = X[i] for N elements, with independent strides. */
void copy_cpu(int N, float *X, int INCX, float *Y, int INCY)
{
    int k = 0;
    while(k < N){
        Y[k*INCY] = X[k*INCX];
        ++k;
    }
}
|
|
||
/*
** Smooth-L1 (Huber-style) loss: quadratic for |truth-pred| < 1, linear beyond.
** error[i] receives the per-element loss; delta[i] the gradient signal,
** expressed (like l1_cpu/l2_cpu in this file) in terms of diff = truth - pred.
**
** BUG FIX: the linear branch previously set delta = (diff < 0) ? 1 : -1,
** i.e. -sign(diff) — opposite in sign to both the quadratic branch
** (delta = diff) and l1_cpu (delta = diff > 0 ? 1 : -1).  It now returns
** sign(diff), so the gradient is continuous at |diff| = 1.
*/
void smooth_l1_cpu(int n, float *pred, float *truth, float *delta, float *error)
{
    int i;
    for(i = 0; i < n; ++i){
        float diff = truth[i] - pred[i];
        float abs_val = fabs(diff);
        if(abs_val < 1) {
            // quadratic region
            error[i] = diff * diff;
            delta[i] = diff;
        }
        else {
            // linear region: loss 2|diff|-1, gradient sign(diff)
            error[i] = 2*abs_val - 1;
            delta[i] = (diff < 0) ? -1 : 1;
        }
    }
}
|
|
||
/*
** L1 loss: error[i] = |truth-pred|, delta[i] = sign(truth-pred)
** (delta is -1 when the difference is exactly zero).
*/
void l1_cpu(int n, float *pred, float *truth, float *delta, float *error)
{
    int k;
    for(k = 0; k < n; ++k){
        float d = truth[k] - pred[k];
        error[k] = fabs(d);
        delta[k] = (d > 0) ? 1 : -1;
    }
}
|
|
||
/* Squared-error loss: error[i] = (truth-pred)^2, delta[i] = truth-pred. */
void l2_cpu(int n, float *pred, float *truth, float *delta, float *error)
{
    int k;
    for(k = 0; k < n; ++k){
        float d = truth[k] - pred[k];
        error[k] = d * d;
        delta[k] = d;
    }
}
|
|
||
/* BLAS-style dot product of two strided vectors of length N. */
float dot_cpu(int N, float *X, int INCX, float *Y, int INCY)
{
    float acc = 0;
    int k;
    for(k = 0; k < N; ++k){
        acc += X[k*INCX] * Y[k*INCY];
    }
    return acc;
}
|
|
||
| void softmax(float *input, int n, float temp, int stride, float *output) | ||
| { | ||
| int i; | ||
| float sum = 0; | ||
| float largest = -FLT_MAX; | ||
| for(i = 0; i < n; ++i){ | ||
| if(input[i*stride] > largest) largest = input[i*stride]; | ||
| } | ||
| for(i = 0; i < n; ++i){ | ||
| float e = exp(input[i*stride]/temp - largest/temp); | ||
| sum += e; | ||
| output[i*stride] = e; | ||
| } | ||
| for(i = 0; i < n; ++i){ | ||
| output[i*stride] /= sum; | ||
| } | ||
| } | ||
|
|
||
|
|
||
/*
** Apply softmax independently to each (batch, group) slice.
** Each slice starts at b*batch_offset + g*group_offset and holds n
** elements spaced `stride` apart.
*/
void softmax_cpu(float *input, int n, int batch, int batch_offset, int groups, int group_offset, int stride, float temp, float *output)
{
    int b, g;
    for(b = 0; b < batch; ++b){
        for(g = 0; g < groups; ++g){
            int off = b*batch_offset + g*group_offset;
            softmax(input + off, n, temp, stride, output + off);
        }
    }
}
|
|
| @@ -0,0 +1,91 @@ | ||
#ifndef BLAS_H
#define BLAS_H

/* CPU BLAS-like primitives and layer helpers. */
void flatten(float *x, int size, int layers, int batch, int forward);
void pm(int M, int N, float *A);
float *random_matrix(int rows, int cols);
void time_random_matrix(int TA, int TB, int m, int k, int n);
void reorg_cpu(float *x, int w, int h, int c, int batch, int stride, int forward, float *out);

void test_blas();

void const_cpu(int N, float ALPHA, float *X, int INCX);
void constrain_ongpu(int N, float ALPHA, float * X, int INCX);
void pow_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY);
void mul_cpu(int N, float *X, int INCX, float *Y, int INCY);

void axpy_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY);
void copy_cpu(int N, float *X, int INCX, float *Y, int INCY);
void scal_cpu(int N, float ALPHA, float *X, int INCX);
void fill_cpu(int N, float ALPHA, float * X, int INCX);
float dot_cpu(int N, float *X, int INCX, float *Y, int INCY);
void test_gpu_blas();
void shortcut_cpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float *out);

/* Batch-normalization statistics (CPU). */
void mean_cpu(float *x, int batch, int filters, int spatial, float *mean);
void variance_cpu(float *x, float *mean, int batch, int filters, int spatial, float *variance);
void normalize_cpu(float *x, float *mean, float *variance, int batch, int filters, int spatial);

void scale_bias(float *output, float *scales, int batch, int n, int size);
void backward_scale_cpu(float *x_norm, float *delta, int batch, int n, int size, float *scale_updates);
void mean_delta_cpu(float *delta, float *variance, int batch, int filters, int spatial, float *mean_delta);
void variance_delta_cpu(float *x, float *delta, float *mean, float *variance, int batch, int filters, int spatial, float *variance_delta);
void normalize_delta_cpu(float *x, float *mean, float *variance, float *mean_delta, float *variance_delta, int batch, int filters, int spatial, float *delta);

/* Loss functions (CPU). */
void smooth_l1_cpu(int n, float *pred, float *truth, float *delta, float *error);
void l2_cpu(int n, float *pred, float *truth, float *delta, float *error);
void l1_cpu(int n, float *pred, float *truth, float *delta, float *error);
void weighted_sum_cpu(float *a, float *b, float *s, int num, float *c);

void softmax(float *input, int n, float temp, int stride, float *output);
void softmax_cpu(float *input, int n, int batch, int batch_offset, int groups, int group_offset, int stride, float temp, float *output);

#ifdef GPU
#include "cuda.h"

/* GPU counterparts of the BLAS-like primitives. */
void axpy_ongpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY);
void axpy_ongpu_offset(int N, float ALPHA, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY);
void copy_ongpu(int N, float * X, int INCX, float * Y, int INCY);
void copy_ongpu_offset(int N, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY);
void scal_ongpu(int N, float ALPHA, float * X, int INCX);
void add_ongpu(int N, float ALPHA, float * X, int INCX);
void supp_ongpu(int N, float ALPHA, float * X, int INCX);
void mask_ongpu(int N, float * X, float mask_num, float * mask);
void const_ongpu(int N, float ALPHA, float *X, int INCX);
void pow_ongpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY);
void mul_ongpu(int N, float *X, int INCX, float *Y, int INCY);
void fill_ongpu(int N, float ALPHA, float * X, int INCX);

/* Batch-normalization statistics (GPU). */
void mean_gpu(float *x, int batch, int filters, int spatial, float *mean);
void variance_gpu(float *x, float *mean, int batch, int filters, int spatial, float *variance);
void normalize_gpu(float *x, float *mean, float *variance, int batch, int filters, int spatial);

void normalize_delta_gpu(float *x, float *mean, float *variance, float *mean_delta, float *variance_delta, int batch, int filters, int spatial, float *delta);

void fast_mean_delta_gpu(float *delta, float *variance, int batch, int filters, int spatial, float *mean_delta);
void fast_variance_delta_gpu(float *x, float *delta, float *mean, float *variance, int batch, int filters, int spatial, float *variance_delta);

void fast_variance_gpu(float *x, float *mean, int batch, int filters, int spatial, float *variance);
void fast_mean_gpu(float *x, int batch, int filters, int spatial, float *mean);
void shortcut_gpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float *out);
/* NOTE: scale_bias_gpu was declared twice here; the duplicate prototype was removed. */
void scale_bias_gpu(float *output, float *biases, int batch, int n, int size);
void backward_scale_gpu(float *x_norm, float *delta, int batch, int n, int size, float *scale_updates);
void add_bias_gpu(float *output, float *biases, int batch, int n, int size);
void backward_bias_gpu(float *bias_updates, float *delta, int batch, int n, int size);

/* Loss functions (GPU). */
void smooth_l1_gpu(int n, float *pred, float *truth, float *delta, float *error);
void l2_gpu(int n, float *pred, float *truth, float *delta, float *error);
void l1_gpu(int n, float *pred, float *truth, float *delta, float *error);
void weighted_delta_gpu(float *a, float *b, float *s, float *da, float *db, float *ds, int num, float *dc);
void weighted_sum_gpu(float *a, float *b, float *s, int num, float *c);
void mult_add_into_gpu(int num, float *a, float *b, float *c);

void reorg_ongpu(float *x, int w, int h, int c, int batch, int stride, int forward, float *out);

void softmax_gpu(float *input, int n, int batch, int batch_offset, int groups, int group_offset, int stride, float temp, float *output);
void adam_gpu(int n, float *x, float *m, float *v, float B1, float B2, float rate, float eps, int t);

void flatten_ongpu(float *x, int spatial, int layers, int batch, int forward, float *out);

#endif
#endif
| @@ -0,0 +1,345 @@ | ||
| #include "box.h" | ||
| #include <stdio.h> | ||
| #include <math.h> | ||
| #include <stdlib.h> | ||
|
|
||
| box float_to_box(float *f, int stride) | ||
| { | ||
| box b; | ||
| b.x = f[0]; | ||
| b.y = f[1*stride]; | ||
| b.w = f[2*stride]; | ||
| b.h = f[3*stride]; | ||
| return b; | ||
| } | ||
|
|
||
/*
** Partial derivatives of the 1-D overlap of boxes a and b with respect to
** a's center (dx, dy) and size (dw, dh).  Boxes are center-format
** (x, y, w, h).  The non-overlapping cases override the accumulated values.
*/
dbox derivative(box a, box b)
{
    dbox d;
    // --- horizontal axis ---
    d.dx = 0;
    d.dw = 0;
    float l1 = a.x - a.w/2;    // left edges
    float l2 = b.x - b.w/2;
    if (l1 > l2){
        // a's left edge limits the overlap: moving a right shrinks it
        d.dx -= 1;
        d.dw += .5;
    }
    float r1 = a.x + a.w/2;    // right edges
    float r2 = b.x + b.w/2;
    if(r1 < r2){
        // a's right edge limits the overlap: moving a right grows it
        d.dx += 1;
        d.dw += .5;
    }
    if (l1 > r2) {
        // a entirely to the right of b: no overlap
        d.dx = -1;
        d.dw = 0;
    }
    if (r1 < l2){
        // a entirely to the left of b: no overlap
        d.dx = 1;
        d.dw = 0;
    }

    // --- vertical axis (same structure with top/bottom edges) ---
    d.dy = 0;
    d.dh = 0;
    float t1 = a.y - a.h/2;
    float t2 = b.y - b.h/2;
    if (t1 > t2){
        d.dy -= 1;
        d.dh += .5;
    }
    float b1 = a.y + a.h/2;
    float b2 = b.y + b.h/2;
    if(b1 < b2){
        d.dy += 1;
        d.dh += .5;
    }
    if (t1 > b2) {
        d.dy = -1;
        d.dh = 0;
    }
    if (b1 < t2){
        d.dy = 1;
        d.dh = 0;
    }
    return d;
}
|
|
||
/*
** 1-D overlap length of two centered intervals (center x, width w).
** Negative when the intervals do not intersect.
*/
float overlap(float x1, float w1, float x2, float w2)
{
    float lo1 = x1 - w1/2;
    float lo2 = x2 - w2/2;
    float hi1 = x1 + w1/2;
    float hi2 = x2 + w2/2;
    float left  = (lo1 > lo2) ? lo1 : lo2;
    float right = (hi1 < hi2) ? hi1 : hi2;
    return right - left;
}
|
|
||
| float box_intersection(box a, box b) | ||
| { | ||
| float w = overlap(a.x, a.w, b.x, b.w); | ||
| float h = overlap(a.y, a.h, b.y, b.h); | ||
| if(w < 0 || h < 0) return 0; | ||
| float area = w*h; | ||
| return area; | ||
| } | ||
|
|
||
| float box_union(box a, box b) | ||
| { | ||
| float i = box_intersection(a, b); | ||
| float u = a.w*a.h + b.w*b.h - i; | ||
| return u; | ||
| } | ||
|
|
||
| float box_iou(box a, box b) | ||
| { | ||
| return box_intersection(a, b)/box_union(a, b); | ||
| } | ||
|
|
||
| float box_rmse(box a, box b) | ||
| { | ||
| return sqrt(pow(a.x-b.x, 2) + | ||
| pow(a.y-b.y, 2) + | ||
| pow(a.w-b.w, 2) + | ||
| pow(a.h-b.h, 2)); | ||
| } | ||
|
|
||
/*
** Derivative of the intersection area w.r.t. box a's parameters,
** via the product rule on (overlap_x * overlap_y).
*/
dbox dintersect(box a, box b)
{
    float w = overlap(a.x, a.w, b.x, b.w);
    float h = overlap(a.y, a.h, b.y, b.h);
    dbox dover = derivative(a, b);
    dbox di;

    // d(w*h)/dp = (dw_overlap/dp)*h for x-params, (dh_overlap/dp)*w for y-params
    di.dw = dover.dw*h;
    di.dx = dover.dx*h;
    di.dh = dover.dh*w;
    di.dy = dover.dy*w;

    return di;
}
|
|
||
/*
** Derivative of the union area w.r.t. box a's parameters:
** union = a.w*a.h + b.w*b.h - intersection, so
** d(union)/dw = a.h - d(inter)/dw, d(union)/dx = -d(inter)/dx, etc.
*/
dbox dunion(box a, box b)
{
    dbox du;

    dbox di = dintersect(a, b);
    du.dw = a.h - di.dw;
    du.dh = a.w - di.dh;
    du.dx = -di.dx;
    du.dy = -di.dy;

    return du;
}
|
|
||
|
|
||
/*
** Debug harness: compares the analytic union derivative (dunion) against
** finite differences computed by perturbing each of a's parameters by 1e-4.
** Prints both; intended for manual inspection, not automated testing.
*/
void test_dunion()
{
    box a = {0, 0, 1, 1};
    // a with each parameter perturbed by +.0001
    box dxa= {0+.0001, 0, 1, 1};
    box dya= {0, 0+.0001, 1, 1};
    box dwa= {0, 0, 1+.0001, 1};
    box dha= {0, 0, 1, 1+.0001};

    box b = {.5, .5, .2, .2};
    dbox di = dunion(a,b);
    printf("Union: %f %f %f %f\n", di.dx, di.dy, di.dw, di.dh);
    float inter = box_union(a, b);
    float xinter = box_union(dxa, b);
    float yinter = box_union(dya, b);
    float winter = box_union(dwa, b);
    float hinter = box_union(dha, b);
    // forward finite differences
    xinter = (xinter - inter)/(.0001);
    yinter = (yinter - inter)/(.0001);
    winter = (winter - inter)/(.0001);
    hinter = (hinter - inter)/(.0001);
    printf("Union Manual %f %f %f %f\n", xinter, yinter, winter, hinter);
}
/*
** Debug harness: compares the analytic intersection derivative (dintersect)
** against finite differences with a 1e-4 perturbation on each of a's
** parameters.  Prints both for manual inspection.
*/
void test_dintersect()
{
    box a = {0, 0, 1, 1};
    // a with each parameter perturbed by +.0001
    box dxa= {0+.0001, 0, 1, 1};
    box dya= {0, 0+.0001, 1, 1};
    box dwa= {0, 0, 1+.0001, 1};
    box dha= {0, 0, 1, 1+.0001};

    box b = {.5, .5, .2, .2};
    dbox di = dintersect(a,b);
    printf("Inter: %f %f %f %f\n", di.dx, di.dy, di.dw, di.dh);
    float inter = box_intersection(a, b);
    float xinter = box_intersection(dxa, b);
    float yinter = box_intersection(dya, b);
    float winter = box_intersection(dwa, b);
    float hinter = box_intersection(dha, b);
    // forward finite differences
    xinter = (xinter - inter)/(.0001);
    yinter = (yinter - inter)/(.0001);
    winter = (winter - inter)/(.0001);
    hinter = (hinter - inter)/(.0001);
    printf("Inter Manual %f %f %f %f\n", xinter, yinter, winter, hinter);
}
|
|
||
/*
** Debug harness: checks the analytic gradient of the (1-IoU)^2 objective
** (diou) against finite differences with a 1e-5 perturbation.
** Prints both for manual inspection.
*/
void test_box()
{
    test_dintersect();
    test_dunion();
    box a = {0, 0, 1, 1};
    // a with each parameter perturbed by +.00001
    box dxa= {0+.00001, 0, 1, 1};
    box dya= {0, 0+.00001, 1, 1};
    box dwa= {0, 0, 1+.00001, 1};
    box dha= {0, 0, 1, 1+.00001};

    box b = {.5, 0, .2, .2};

    float iou = box_iou(a,b);
    iou = (1-iou)*(1-iou);   // squared (1-IoU) objective
    printf("%f\n", iou);
    dbox d = diou(a, b);
    printf("%f %f %f %f\n", d.dx, d.dy, d.dw, d.dh);

    float xiou = box_iou(dxa, b);
    float yiou = box_iou(dya, b);
    float wiou = box_iou(dwa, b);
    float hiou = box_iou(dha, b);
    // forward finite differences of the squared objective
    xiou = ((1-xiou)*(1-xiou) - iou)/(.00001);
    yiou = ((1-yiou)*(1-yiou) - iou)/(.00001);
    wiou = ((1-wiou)*(1-wiou) - iou)/(.00001);
    hiou = ((1-hiou)*(1-hiou) - iou)/(.00001);
    printf("manual %f %f %f %f\n", xiou, yiou, wiou, hiou);
}
|
|
||
/*
** Gradient used to pull predicted box a toward box b.
** NOTE(review): the condition `i <= 0 || 1` is always true, so this
** function ALWAYS returns the simple coordinate-difference gradient and the
** analytic (1-IoU)^2 gradient below is unreachable dead code.  This looks
** deliberate (a hard-coded override) — confirm before "fixing" it, since
** enabling the lower branch would change training behavior.
*/
dbox diou(box a, box b)
{
    float u = box_union(a,b);
    float i = box_intersection(a,b);
    dbox di = dintersect(a,b);
    dbox du = dunion(a,b);
    dbox dd = {0,0,0,0};

    if(i <= 0 || 1) {
        // plain difference gradient: move a directly toward b
        dd.dx = b.x - a.x;
        dd.dy = b.y - a.y;
        dd.dw = b.w - a.w;
        dd.dh = b.h - a.h;
        return dd;
    }

    // unreachable: analytic gradient of (1 - i/u)^2 via the quotient rule
    dd.dx = 2*pow((1-(i/u)),1)*(di.dx*u - du.dx*i)/(u*u);
    dd.dy = 2*pow((1-(i/u)),1)*(di.dy*u - du.dy*i)/(u*u);
    dd.dw = 2*pow((1-(i/u)),1)*(di.dw*u - du.dw*i)/(u*u);
    dd.dh = 2*pow((1-(i/u)),1)*(di.dh*u - du.dh*i)/(u*u);
    return dd;
}
|
|
||
/*
** Sort key for non-max suppression: a detection is identified by its row
** `index` into the shared `probs` table, and compared on probs[index][class].
*/
typedef struct{
    int index;      // row into probs (and boxes)
    int class;      // column of probs to sort by
    float **probs;  // shared probability table (not owned)
} sortable_bbox;
|
|
||
/*
** qsort comparator: sorts sortable_bbox entries by probs[index][class] in
** DESCENDING order.  Note it reads b.class for both sides — callers always
** set the same class on every element before sorting, so this is equivalent
** to using a.class.
*/
int nms_comparator(const void *pa, const void *pb)
{
    sortable_bbox a = *(sortable_bbox *)pa;
    sortable_bbox b = *(sortable_bbox *)pb;
    float diff = a.probs[a.index][b.class] - b.probs[b.index][b.class];
    if(diff < 0) return 1;
    else if(diff > 0) return -1;
    return 0;
}
|
|
||
/*
** Non-max suppression driven by an objectness score.
** probs must have classes+1 columns per box; column `classes` holds the
** objectness value, which is what the boxes are sorted by (s[i].class is
** deliberately set to `classes`).  Boxes overlapping a higher-objectness box
** by more than `thresh` IoU have ALL their columns zeroed.
** NOTE(review): calloc return is not checked — matches the style of the
** rest of this file.
*/
void do_nms_obj(box *boxes, float **probs, int total, int classes, float thresh)
{
    int i, j, k;
    sortable_bbox *s = calloc(total, sizeof(sortable_bbox));

    for(i = 0; i < total; ++i){
        s[i].index = i;
        s[i].class = classes;   // sort key = objectness column
        s[i].probs = probs;
    }

    qsort(s, total, sizeof(sortable_bbox), nms_comparator);
    for(i = 0; i < total; ++i){
        if(probs[s[i].index][classes] == 0) continue;   // already suppressed
        box a = boxes[s[i].index];
        for(j = i+1; j < total; ++j){
            box b = boxes[s[j].index];
            if (box_iou(a, b) > thresh){
                // suppress every class score plus objectness for box j
                for(k = 0; k < classes+1; ++k){
                    probs[s[j].index][k] = 0;
                }
            }
        }
    }
    free(s);
}
|
|
||
|
|
||
/*
** Per-class non-max suppression: for each class independently, sorts boxes
** by that class's probability and zeroes the probability of any box whose
** IoU with a higher-scoring box exceeds `thresh`.
*/
void do_nms_sort(box *boxes, float **probs, int total, int classes, float thresh)
{
    int i, j, k;
    sortable_bbox *s = calloc(total, sizeof(sortable_bbox));

    for(i = 0; i < total; ++i){
        s[i].index = i;
        s[i].class = 0;
        s[i].probs = probs;
    }

    for(k = 0; k < classes; ++k){
        // re-key every entry to class k, then sort descending by that score
        for(i = 0; i < total; ++i){
            s[i].class = k;
        }
        qsort(s, total, sizeof(sortable_bbox), nms_comparator);
        for(i = 0; i < total; ++i){
            if(probs[s[i].index][k] == 0) continue;   // already suppressed
            box a = boxes[s[i].index];
            for(j = i+1; j < total; ++j){
                box b = boxes[s[j].index];
                if (box_iou(a, b) > thresh){
                    probs[s[j].index][k] = 0;
                }
            }
        }
    }
    free(s);
}
|
|
||
/*
** Unsorted non-max suppression: for every overlapping pair (IoU > thresh),
** keep the higher per-class probability and zero the lower one.
** Boxes with no positive class probability are skipped as anchors.
*/
void do_nms(box *boxes, float **probs, int total, int classes, float thresh)
{
    int i, j, k;
    for(i = 0; i < total; ++i){
        int any = 0;
        for(k = 0; k < classes; ++k) any = any || (probs[i][k] > 0);
        if(!any) {
            continue;
        }
        for(j = i+1; j < total; ++j){
            if (box_iou(boxes[i], boxes[j]) > thresh){
                // per class, suppress whichever of the pair scores lower
                for(k = 0; k < classes; ++k){
                    if (probs[i][k] < probs[j][k]) probs[i][k] = 0;
                    else probs[j][k] = 0;
                }
            }
        }
    }
}
|
|
||
| box encode_box(box b, box anchor) | ||
| { | ||
| box encode; | ||
| encode.x = (b.x - anchor.x) / anchor.w; | ||
| encode.y = (b.y - anchor.y) / anchor.h; | ||
| encode.w = log2(b.w / anchor.w); | ||
| encode.h = log2(b.h / anchor.h); | ||
| return encode; | ||
| } | ||
|
|
||
| box decode_box(box b, box anchor) | ||
| { | ||
| box decode; | ||
| decode.x = b.x * anchor.w + anchor.x; | ||
| decode.y = b.y * anchor.h + anchor.y; | ||
| decode.w = pow(2., b.w) * anchor.w; | ||
| decode.h = pow(2., b.h) * anchor.h; | ||
| return decode; | ||
| } |
| @@ -0,0 +1,22 @@ | ||
#ifndef BOX_H
#define BOX_H

/* Axis-aligned box in center format: (x, y) center, (w, h) size. */
typedef struct{
    float x, y, w, h;
} box;

/* Gradient of some box-valued objective w.r.t. (x, y, w, h). */
typedef struct{
    float dx, dy, dw, dh;
} dbox;

/* Read a box from 4 strided floats. */
box float_to_box(float *f, int stride);
/* Intersection-over-union and coordinate-space RMSE between two boxes. */
float box_iou(box a, box b);
float box_rmse(box a, box b);
/* Gradient pulling box a toward box b. */
dbox diou(box a, box b);
/* Non-max suppression variants (see box.c for the differing strategies). */
void do_nms(box *boxes, float **probs, int total, int classes, float thresh);
void do_nms_sort(box *boxes, float **probs, int total, int classes, float thresh);
void do_nms_obj(box *boxes, float **probs, int total, int classes, float thresh);
/* Anchor-relative encoding/decoding of boxes. */
box decode_box(box b, box anchor);
box encode_box(box b, box anchor);

#endif
| @@ -0,0 +1,364 @@ | ||
| #include "network.h" | ||
| #include "utils.h" | ||
| #include "parser.h" | ||
|
|
||
/*
** Rewrite the captcha label matrix in place.  Label columns are treated in
** pairs (j, j+1) — presumably (present, absent) indicators per character;
** TODO confirm against the label file format.
** mask != 0: rows where column j is 0 get both columns set to SECRET_NUM
** (appears to mark the pair as "ignore"); otherwise if column j+1 is set,
** column j is cleared.
** mask == 0: column j+1 is set to the logical complement of column j.
*/
void fix_data_captcha(data d, int mask)
{
    matrix labels = d.y;
    int i, j;
    for(i = 0; i < d.y.rows; ++i){
        for(j = 0; j < d.y.cols; j += 2){
            if (mask){
                if(!labels.vals[i][j]){
                    labels.vals[i][j] = SECRET_NUM;
                    labels.vals[i][j+1] = SECRET_NUM;
                }else if(labels.vals[i][j+1]){
                    labels.vals[i][j] = 0;
                }
            } else{
                if (labels.vals[i][j]) {
                    labels.vals[i][j+1] = 0;
                } else {
                    labels.vals[i][j+1] = 1;
                }
            }
        }
    }
}
|
|
||
/*
** Train the captcha classifier.  Loads data asynchronously on a worker
** thread while the previous batch trains, and checkpoints weights every
** 100 iterations.  Runs forever (no stopping condition).
** Paths to the data lists and the backup directory are hard-coded.
*/
void train_captcha(char *cfgfile, char *weightfile)
{
    srand(time(0));
    float avg_loss = -1;    // exponential moving average of the loss; -1 = uninitialized
    char *base = basecfg(cfgfile);
    printf("%s\n", base);
    network net = parse_network_cfg(cfgfile);
    if(weightfile){
        load_weights(&net, weightfile);
    }
    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
    int imgs = 1024;                 // images per training batch
    int i = *net.seen/imgs;          // resume the iteration counter from net.seen
    int solved = 1;                  // choose the solved vs raw image list
    list *plist;
    char **labels = get_labels("/data/captcha/reimgs.labels.list");
    if (solved){
        plist = get_paths("/data/captcha/reimgs.solved.list");
    }else{
        plist = get_paths("/data/captcha/reimgs.raw.list");
    }
    char **paths = (char **)list_to_array(plist);
    printf("%d\n", plist->size);
    clock_t time;
    pthread_t load_thread;
    data train;
    data buffer;   // filled by the loader thread

    load_args args = {0};
    args.w = net.w;
    args.h = net.h;
    args.paths = paths;
    args.classes = 26;
    args.n = imgs;
    args.m = plist->size;
    args.labels = labels;
    args.d = &buffer;
    args.type = CLASSIFICATION_DATA;

    // Prime the pipeline: start loading the first batch.
    load_thread = load_data_in_thread(args);
    while(1){
        ++i;
        time=clock();
        pthread_join(load_thread, 0);   // wait for the batch loaded in the background
        train = buffer;
        fix_data_captcha(train, solved);

        /*
        image im = float_to_image(256, 256, 3, train.X.vals[114]);
        show_image(im, "training");
        cvWaitKey(0);
        */

        // Kick off loading of the next batch while this one trains.
        load_thread = load_data_in_thread(args);
        printf("Loaded: %lf seconds\n", sec(clock()-time));
        time=clock();
        float loss = train_network(net, train);
        if(avg_loss == -1) avg_loss = loss;
        avg_loss = avg_loss*.9 + loss*.1;   // smooth the reported loss
        printf("%d: %f, %f avg, %lf seconds, %d images\n", i, loss, avg_loss, sec(clock()-time), *net.seen);
        free_data(train);
        if(i%100==0){
            char buff[256];
            sprintf(buff, "/home/pjreddie/imagenet_backup/%s_%d.weights",base, i);
            save_weights(net, buff);
        }
    }
}
|
|
||
| void test_captcha(char *cfgfile, char *weightfile, char *filename) | ||
| { | ||
| network net = parse_network_cfg(cfgfile); | ||
| if(weightfile){ | ||
| load_weights(&net, weightfile); | ||
| } | ||
| set_batch_network(&net, 1); | ||
| srand(2222222); | ||
| int i = 0; | ||
| char **names = get_labels("/data/captcha/reimgs.labels.list"); | ||
| char buff[256]; | ||
| char *input = buff; | ||
| int indexes[26]; | ||
| while(1){ | ||
| if(filename){ | ||
| strncpy(input, filename, 256); | ||
| }else{ | ||
| //printf("Enter Image Path: "); | ||
| //fflush(stdout); | ||
| input = fgets(input, 256, stdin); | ||
| if(!input) return; | ||
| strtok(input, "\n"); | ||
| } | ||
| image im = load_image_color(input, net.w, net.h); | ||
| float *X = im.data; | ||
| float *predictions = network_predict(net, X); | ||
| top_predictions(net, 26, indexes); | ||
| //printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time)); | ||
| for(i = 0; i < 26; ++i){ | ||
| int index = indexes[i]; | ||
| if(i != 0) printf(", "); | ||
| printf("%s %f", names[index], predictions[index]); | ||
| } | ||
| printf("\n"); | ||
| fflush(stdout); | ||
| free_image(im); | ||
| if (filename) break; | ||
| } | ||
| } | ||
|
|
||
/*
** Validation pass: for every image in the hard-coded list, prints the
** index of the matching label (found by substring match on the path)
** followed by all raw network outputs, as one CSV row.
** NOTE(review): only the first 13 labels are searched for the truth match,
** while other functions in this file use 26 classes — confirm this is the
** intended subset.  The `filename` parameter only causes an early break
** after the first image when non-NULL.
*/
void valid_captcha(char *cfgfile, char *weightfile, char *filename)
{
    char **labels = get_labels("/data/captcha/reimgs.labels.list");
    network net = parse_network_cfg(cfgfile);
    if(weightfile){
        load_weights(&net, weightfile);
    }
    list *plist = get_paths("/data/captcha/reimgs.fg.list");
    char **paths = (char **)list_to_array(plist);
    int N = plist->size;
    int outputs = net.outputs;

    set_batch_network(&net, 1);   // evaluate one image at a time
    srand(2222222);
    int i, j;
    for(i = 0; i < N; ++i){
        if (i%100 == 0) fprintf(stderr, "%d\n", i);   // progress on stderr
        image im = load_image_color(paths[i], net.w, net.h);
        float *X = im.data;
        float *predictions = network_predict(net, X);
        // Recover ground truth by matching a label substring in the path.
        int truth = -1;
        for(j = 0; j < 13; ++j){
            if (strstr(paths[i], labels[j])) truth = j;
        }
        if (truth == -1){
            fprintf(stderr, "bad: %s\n", paths[i]);
            return;
        }
        printf("%d, ", truth);
        for(j = 0; j < outputs; ++j){
            if (j != 0) printf(", ");
            printf("%f", predictions[j]);
        }
        printf("\n");
        fflush(stdout);
        free_image(im);
        if (filename) break;
    }
}
|
|
||
| /* | ||
| void train_captcha(char *cfgfile, char *weightfile) | ||
| { | ||
| float avg_loss = -1; | ||
| srand(time(0)); | ||
| char *base = basecfg(cfgfile); | ||
| printf("%s\n", base); | ||
| network net = parse_network_cfg(cfgfile); | ||
| if(weightfile){ | ||
| load_weights(&net, weightfile); | ||
| } | ||
| printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay); | ||
| int imgs = 1024; | ||
| int i = net.seen/imgs; | ||
| list *plist = get_paths("/data/captcha/train.auto5"); | ||
| char **paths = (char **)list_to_array(plist); | ||
| printf("%d\n", plist->size); | ||
| clock_t time; | ||
| while(1){ | ||
| ++i; | ||
| time=clock(); | ||
| data train = load_data_captcha(paths, imgs, plist->size, 10, 200, 60); | ||
| translate_data_rows(train, -128); | ||
| scale_data_rows(train, 1./128); | ||
| printf("Loaded: %lf seconds\n", sec(clock()-time)); | ||
| time=clock(); | ||
| float loss = train_network(net, train); | ||
| net.seen += imgs; | ||
| if(avg_loss == -1) avg_loss = loss; | ||
| avg_loss = avg_loss*.9 + loss*.1; | ||
| printf("%d: %f, %f avg, %lf seconds, %d images\n", i, loss, avg_loss, sec(clock()-time), net.seen); | ||
| free_data(train); | ||
| if(i%10==0){ | ||
| char buff[256]; | ||
| sprintf(buff, "/home/pjreddie/imagenet_backup/%s_%d.weights",base, i); | ||
| save_weights(net, buff); | ||
| } | ||
| } | ||
| } | ||
| void decode_captcha(char *cfgfile, char *weightfile) | ||
| { | ||
| setbuf(stdout, NULL); | ||
| srand(time(0)); | ||
| network net = parse_network_cfg(cfgfile); | ||
| set_batch_network(&net, 1); | ||
| if(weightfile){ | ||
| load_weights(&net, weightfile); | ||
| } | ||
| char filename[256]; | ||
| while(1){ | ||
| printf("Enter filename: "); | ||
| fgets(filename, 256, stdin); | ||
| strtok(filename, "\n"); | ||
| image im = load_image_color(filename, 300, 57); | ||
| scale_image(im, 1./255.); | ||
| float *X = im.data; | ||
| float *predictions = network_predict(net, X); | ||
| image out = float_to_image(300, 57, 1, predictions); | ||
| show_image(out, "decoded"); | ||
| #ifdef OPENCV | ||
| cvWaitKey(0); | ||
| #endif | ||
| free_image(im); | ||
| } | ||
| } | ||
| void encode_captcha(char *cfgfile, char *weightfile) | ||
| { | ||
| float avg_loss = -1; | ||
| srand(time(0)); | ||
| char *base = basecfg(cfgfile); | ||
| printf("%s\n", base); | ||
| network net = parse_network_cfg(cfgfile); | ||
| if(weightfile){ | ||
| load_weights(&net, weightfile); | ||
| } | ||
| printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay); | ||
| int imgs = 1024; | ||
| int i = net.seen/imgs; | ||
| list *plist = get_paths("/data/captcha/encode.list"); | ||
| char **paths = (char **)list_to_array(plist); | ||
| printf("%d\n", plist->size); | ||
| clock_t time; | ||
| while(1){ | ||
| ++i; | ||
| time=clock(); | ||
| data train = load_data_captcha_encode(paths, imgs, plist->size, 300, 57); | ||
| scale_data_rows(train, 1./255); | ||
| printf("Loaded: %lf seconds\n", sec(clock()-time)); | ||
| time=clock(); | ||
| float loss = train_network(net, train); | ||
| net.seen += imgs; | ||
| if(avg_loss == -1) avg_loss = loss; | ||
| avg_loss = avg_loss*.9 + loss*.1; | ||
| printf("%d: %f, %f avg, %lf seconds, %d images\n", i, loss, avg_loss, sec(clock()-time), net.seen); | ||
| free_matrix(train.X); | ||
| if(i%100==0){ | ||
| char buff[256]; | ||
| sprintf(buff, "/home/pjreddie/imagenet_backup/%s_%d.weights",base, i); | ||
| save_weights(net, buff); | ||
| } | ||
| } | ||
| } | ||
| void validate_captcha(char *cfgfile, char *weightfile) | ||
| { | ||
| srand(time(0)); | ||
| char *base = basecfg(cfgfile); | ||
| printf("%s\n", base); | ||
| network net = parse_network_cfg(cfgfile); | ||
| if(weightfile){ | ||
| load_weights(&net, weightfile); | ||
| } | ||
| int numchars = 37; | ||
| list *plist = get_paths("/data/captcha/solved.hard"); | ||
| char **paths = (char **)list_to_array(plist); | ||
| int imgs = plist->size; | ||
| data valid = load_data_captcha(paths, imgs, 0, 10, 200, 60); | ||
| translate_data_rows(valid, -128); | ||
| scale_data_rows(valid, 1./128); | ||
| matrix pred = network_predict_data(net, valid); | ||
| int i, k; | ||
| int correct = 0; | ||
| int total = 0; | ||
| int accuracy = 0; | ||
| for(i = 0; i < imgs; ++i){ | ||
| int allcorrect = 1; | ||
| for(k = 0; k < 10; ++k){ | ||
| char truth = int_to_alphanum(max_index(valid.y.vals[i]+k*numchars, numchars)); | ||
| char prediction = int_to_alphanum(max_index(pred.vals[i]+k*numchars, numchars)); | ||
| if (truth != prediction) allcorrect=0; | ||
| if (truth != '.' && truth == prediction) ++correct; | ||
| if (truth != '.') ++total; | ||
| } | ||
| accuracy += allcorrect; | ||
| } | ||
| printf("Word Accuracy: %f, Char Accuracy %f\n", (float)accuracy/imgs, (float)correct/total); | ||
| free_data(valid); | ||
| } | ||
| void test_captcha(char *cfgfile, char *weightfile) | ||
| { | ||
| setbuf(stdout, NULL); | ||
| srand(time(0)); | ||
| //char *base = basecfg(cfgfile); | ||
| //printf("%s\n", base); | ||
| network net = parse_network_cfg(cfgfile); | ||
| set_batch_network(&net, 1); | ||
| if(weightfile){ | ||
| load_weights(&net, weightfile); | ||
| } | ||
| char filename[256]; | ||
| while(1){ | ||
| //printf("Enter filename: "); | ||
| fgets(filename, 256, stdin); | ||
| strtok(filename, "\n"); | ||
| image im = load_image_color(filename, 200, 60); | ||
| translate_image(im, -128); | ||
| scale_image(im, 1/128.); | ||
| float *X = im.data; | ||
| float *predictions = network_predict(net, X); | ||
| print_letters(predictions, 10); | ||
| free_image(im); | ||
| } | ||
| } | ||
| */ | ||
/* Command-line front end for the captcha tools.
 * argv: [0]=program, [1]="captcha", [2]=mode, [3]=cfg,
 *       [4]=weights (optional), [5]=file (optional). */
void run_captcha(int argc, char **argv)
{
    if(argc < 4){
        fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]);
        return;
    }

    char *mode = argv[2];
    char *cfg = argv[3];
    char *weights = (argc > 4) ? argv[4] : 0;
    char *filename = (argc > 5) ? argv[5] : 0;
    if(0==strcmp(mode, "train")){
        train_captcha(cfg, weights);
    } else if(0==strcmp(mode, "test")){
        test_captcha(cfg, weights, filename);
    } else if(0==strcmp(mode, "valid")){
        valid_captcha(cfg, weights, filename);
    }
    // older entry points (encode/decode/validate) live in the commented block above
}
|
|
| @@ -0,0 +1,273 @@ | ||
| #include "network.h" | ||
| #include "utils.h" | ||
| #include "parser.h" | ||
| #include "option_list.h" | ||
| #include "blas.h" | ||
|
|
||
/* Train a classifier on CIFAR-10.
 * cfgfile:    network description parsed by parse_network_cfg().
 * weightfile: optional pretrained weights; may be NULL.
 * Runs until net.max_batches (forever if 0), checkpointing into
 * backup_directory every epoch and every 100 batches. */
void train_cifar(char *cfgfile, char *weightfile)
{
    srand(time(0));
    float avg_loss = -1;            // EMA of the loss; -1 means "not yet initialized"
    char *base = basecfg(cfgfile);  // cfg basename, used in checkpoint filenames
    printf("%s\n", base);
    network net = parse_network_cfg(cfgfile);
    if(weightfile){
        load_weights(&net, weightfile);
    }
    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);

    char *backup_directory = "/home/pjreddie/backup/";
    int classes = 10;
    int N = 50000;                  // CIFAR-10 training-set size

    char **labels = get_labels("data/cifar/labels.txt");
    int epoch = (*net.seen)/N;      // resume the epoch counter from images already seen
    data train = load_all_cifar10();    // entire training set kept resident
    while(get_current_batch(net) < net.max_batches || net.max_batches == 0){
        clock_t time=clock();

        float loss = train_network_sgd(net, train, 1);   // one SGD step
        if(avg_loss == -1) avg_loss = loss;
        avg_loss = avg_loss*.95 + loss*.05;              // exponential moving average
        printf("%d, %.3f: %f, %f avg, %f rate, %lf seconds, %d images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen);
        if(*net.seen/N > epoch){
            // crossed an epoch boundary: write a numbered checkpoint
            epoch = *net.seen/N;
            char buff[256];
            sprintf(buff, "%s/%s_%d.weights",backup_directory,base, epoch);
            save_weights(net, buff);
        }
        if(get_current_batch(net)%100 == 0){
            // rolling backup, overwritten every 100 batches
            char buff[256];
            sprintf(buff, "%s/%s.backup",backup_directory,base);
            save_weights(net, buff);
        }
    }
    char buff[256];
    sprintf(buff, "%s/%s.weights", backup_directory, base);
    save_weights(net, buff);

    free_network(net);
    free_ptrs((void**)labels, classes);   // labels are loaded but otherwise unused here
    free(base);
    free_data(train);
}
|
|
||
| void train_cifar_distill(char *cfgfile, char *weightfile) | ||
| { | ||
| srand(time(0)); | ||
| float avg_loss = -1; | ||
| char *base = basecfg(cfgfile); | ||
| printf("%s\n", base); | ||
| network net = parse_network_cfg(cfgfile); | ||
| if(weightfile){ | ||
| load_weights(&net, weightfile); | ||
| } | ||
| printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay); | ||
|
|
||
| char *backup_directory = "/home/pjreddie/backup/"; | ||
| int classes = 10; | ||
| int N = 50000; | ||
|
|
||
| char **labels = get_labels("data/cifar/labels.txt"); | ||
| int epoch = (*net.seen)/N; | ||
|
|
||
| data train = load_all_cifar10(); | ||
| matrix soft = csv_to_matrix("results/ensemble.csv"); | ||
|
|
||
| float weight = .9; | ||
| scale_matrix(soft, weight); | ||
| scale_matrix(train.y, 1. - weight); | ||
| matrix_add_matrix(soft, train.y); | ||
|
|
||
| while(get_current_batch(net) < net.max_batches || net.max_batches == 0){ | ||
| clock_t time=clock(); | ||
|
|
||
| float loss = train_network_sgd(net, train, 1); | ||
| if(avg_loss == -1) avg_loss = loss; | ||
| avg_loss = avg_loss*.95 + loss*.05; | ||
| printf("%d, %.3f: %f, %f avg, %f rate, %lf seconds, %d images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen); | ||
| if(*net.seen/N > epoch){ | ||
| epoch = *net.seen/N; | ||
| char buff[256]; | ||
| sprintf(buff, "%s/%s_%d.weights",backup_directory,base, epoch); | ||
| save_weights(net, buff); | ||
| } | ||
| if(get_current_batch(net)%100 == 0){ | ||
| char buff[256]; | ||
| sprintf(buff, "%s/%s.backup",backup_directory,base); | ||
| save_weights(net, buff); | ||
| } | ||
| } | ||
| char buff[256]; | ||
| sprintf(buff, "%s/%s.weights", backup_directory, base); | ||
| save_weights(net, buff); | ||
|
|
||
| free_network(net); | ||
| free_ptrs((void**)labels, classes); | ||
| free(base); | ||
| free_data(train); | ||
| } | ||
|
|
||
/* Evaluate on the CIFAR-10 test batch with horizontal-flip test-time
 * augmentation: each image is predicted as-is and mirrored, the two
 * probability vectors are summed, and the argmax of the sum is the
 * prediction. Prints running top-1 accuracy after every image. */
void test_cifar_multi(char *filename, char *weightfile)
{
    network net = parse_network_cfg(filename);
    if(weightfile){
        load_weights(&net, weightfile);
    }
    set_batch_network(&net, 1);     // predict one image at a time
    srand(time(0));

    float avg_acc = 0;              // count of correct predictions so far
    data test = load_cifar10_data("data/cifar/cifar-10-batches-bin/test_batch.bin");

    int i;
    for(i = 0; i < test.X.rows; ++i){
        // NOTE(review): float_to_image appears to wrap the row without copying,
        // so the free_image() below frees test.X.vals[i] itself — confirm.
        // That would explain why this function never calls free_data(test).
        image im = float_to_image(32, 32, 3, test.X.vals[i]);

        float pred[10] = {0};       // class scores summed over both views

        float *p = network_predict(net, im.data);
        axpy_cpu(10, 1, p, 1, pred, 1);     // pred += p
        flip_image(im);                     // mirror the image in place
        p = network_predict(net, im.data);
        axpy_cpu(10, 1, p, 1, pred, 1);

        int index = max_index(pred, 10);            // predicted class
        int class = max_index(test.y.vals[i], 10);  // ground-truth class
        if(index == class) avg_acc += 1;
        free_image(im);
        printf("%4d: %.2f%%\n", i, 100.*avg_acc/(i+1));
    }
}
|
|
||
| void test_cifar(char *filename, char *weightfile) | ||
| { | ||
| network net = parse_network_cfg(filename); | ||
| if(weightfile){ | ||
| load_weights(&net, weightfile); | ||
| } | ||
| srand(time(0)); | ||
|
|
||
| clock_t time; | ||
| float avg_acc = 0; | ||
| float avg_top5 = 0; | ||
| data test = load_cifar10_data("data/cifar/cifar-10-batches-bin/test_batch.bin"); | ||
|
|
||
| time=clock(); | ||
|
|
||
| float *acc = network_accuracies(net, test, 2); | ||
| avg_acc += acc[0]; | ||
| avg_top5 += acc[1]; | ||
| printf("top1: %f, %lf seconds, %d images\n", avg_acc, sec(clock()-time), test.X.rows); | ||
| free_data(test); | ||
| } | ||
|
|
||
| void extract_cifar() | ||
| { | ||
| char *labels[] = {"airplane","automobile","bird","cat","deer","dog","frog","horse","ship","truck"}; | ||
| int i; | ||
| data train = load_all_cifar10(); | ||
| data test = load_cifar10_data("data/cifar/cifar-10-batches-bin/test_batch.bin"); | ||
| for(i = 0; i < train.X.rows; ++i){ | ||
| image im = float_to_image(32, 32, 3, train.X.vals[i]); | ||
| int class = max_index(train.y.vals[i], 10); | ||
| char buff[256]; | ||
| sprintf(buff, "data/cifar/train/%d_%s",i,labels[class]); | ||
| save_image_png(im, buff); | ||
| } | ||
| for(i = 0; i < test.X.rows; ++i){ | ||
| image im = float_to_image(32, 32, 3, test.X.vals[i]); | ||
| int class = max_index(test.y.vals[i], 10); | ||
| char buff[256]; | ||
| sprintf(buff, "data/cifar/test/%d_%s",i,labels[class]); | ||
| save_image_png(im, buff); | ||
| } | ||
| } | ||
|
|
||
| void test_cifar_csv(char *filename, char *weightfile) | ||
| { | ||
| network net = parse_network_cfg(filename); | ||
| if(weightfile){ | ||
| load_weights(&net, weightfile); | ||
| } | ||
| srand(time(0)); | ||
|
|
||
| data test = load_cifar10_data("data/cifar/cifar-10-batches-bin/test_batch.bin"); | ||
|
|
||
| matrix pred = network_predict_data(net, test); | ||
|
|
||
| int i; | ||
| for(i = 0; i < test.X.rows; ++i){ | ||
| image im = float_to_image(32, 32, 3, test.X.vals[i]); | ||
| flip_image(im); | ||
| } | ||
| matrix pred2 = network_predict_data(net, test); | ||
| scale_matrix(pred, .5); | ||
| scale_matrix(pred2, .5); | ||
| matrix_add_matrix(pred2, pred); | ||
|
|
||
| matrix_to_csv(pred); | ||
| fprintf(stderr, "Accuracy: %f\n", matrix_topk_accuracy(test.y, pred, 1)); | ||
| free_data(test); | ||
| } | ||
|
|
||
| void test_cifar_csvtrain(char *filename, char *weightfile) | ||
| { | ||
| network net = parse_network_cfg(filename); | ||
| if(weightfile){ | ||
| load_weights(&net, weightfile); | ||
| } | ||
| srand(time(0)); | ||
|
|
||
| data test = load_all_cifar10(); | ||
|
|
||
| matrix pred = network_predict_data(net, test); | ||
|
|
||
| int i; | ||
| for(i = 0; i < test.X.rows; ++i){ | ||
| image im = float_to_image(32, 32, 3, test.X.vals[i]); | ||
| flip_image(im); | ||
| } | ||
| matrix pred2 = network_predict_data(net, test); | ||
| scale_matrix(pred, .5); | ||
| scale_matrix(pred2, .5); | ||
| matrix_add_matrix(pred2, pred); | ||
|
|
||
| matrix_to_csv(pred); | ||
| fprintf(stderr, "Accuracy: %f\n", matrix_topk_accuracy(test.y, pred, 1)); | ||
| free_data(test); | ||
| } | ||
|
|
||
/* Score a saved prediction CSV (results/combined.csv) against the CIFAR-10
 * test labels; prints the matrix dimensions and top-1 accuracy to stderr. */
void eval_cifar_csv()
{
    data test = load_cifar10_data("data/cifar/cifar-10-batches-bin/test_batch.bin");

    matrix pred = csv_to_matrix("results/combined.csv");
    fprintf(stderr, "%d %d\n", pred.rows, pred.cols);   // sanity-check dimensions

    fprintf(stderr, "Accuracy: %f\n", matrix_topk_accuracy(test.y, pred, 1));
    free_data(test);
    free_matrix(pred);
}
|
|
||
|
|
||
/* Command-line dispatcher for the CIFAR tools.
 * argv: [0]=program, [1]="cifar", [2]=mode, [3]=cfg, [4]=weights (optional). */
void run_cifar(int argc, char **argv)
{
    if(argc < 4){
        fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]);
        return;
    }

    char *mode = argv[2];
    char *cfg = argv[3];
    char *weights = (argc > 4) ? argv[4] : 0;
    if(0==strcmp(mode, "train")){
        train_cifar(cfg, weights);
    } else if(0==strcmp(mode, "extract")){
        extract_cifar();
    } else if(0==strcmp(mode, "distill")){
        train_cifar_distill(cfg, weights);
    } else if(0==strcmp(mode, "test")){
        test_cifar(cfg, weights);
    } else if(0==strcmp(mode, "multi")){
        test_cifar_multi(cfg, weights);
    } else if(0==strcmp(mode, "csv")){
        test_cifar_csv(cfg, weights);
    } else if(0==strcmp(mode, "csvtrain")){
        test_cifar_csvtrain(cfg, weights);
    } else if(0==strcmp(mode, "eval")){
        eval_cifar_csv();
    }
}
|
|
||
|
|
| @@ -0,0 +1,2 @@ | ||
|
|
||
| list *read_data_cfg(char *filename); |
| @@ -0,0 +1,384 @@ | ||
| #include <stdio.h> | ||
|
|
||
| #include "network.h" | ||
| #include "detection_layer.h" | ||
| #include "cost_layer.h" | ||
| #include "utils.h" | ||
| #include "parser.h" | ||
| #include "box.h" | ||
| #include "demo.h" | ||
|
|
||
| char *coco_classes[] = {"person","bicycle","car","motorcycle","airplane","bus","train","truck","boat","traffic light","fire hydrant","stop sign","parking meter","bench","bird","cat","dog","horse","sheep","cow","elephant","bear","zebra","giraffe","backpack","umbrella","handbag","tie","suitcase","frisbee","skis","snowboard","sports ball","kite","baseball bat","baseball glove","skateboard","surfboard","tennis racket","bottle","wine glass","cup","fork","knife","spoon","bowl","banana","apple","sandwich","orange","broccoli","carrot","hot dog","pizza","donut","cake","chair","couch","potted plant","bed","dining table","toilet","tv","laptop","mouse","remote","keyboard","cell phone","microwave","oven","toaster","sink","refrigerator","book","clock","vase","scissors","teddy bear","hair drier","toothbrush"}; | ||
|
|
||
| int coco_ids[] = {1,2,3,4,5,6,7,8,9,10,11,13,14,15,16,17,18,19,20,21,22,23,24,25,27,28,31,32,33,34,35,36,37,38,39,40,41,42,43,44,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,67,70,72,73,74,75,76,77,78,79,80,81,82,84,85,86,87,88,89,90}; | ||
|
|
||
/* Train the COCO detector.
 * cfgfile:    network description; the last layer supplies side/classes/jitter.
 * weightfile: optional pretrained weights; may be NULL.
 * A background thread keeps one batch loading ahead of training; checkpoints
 * are written periodically until net.max_batches is reached. */
void train_coco(char *cfgfile, char *weightfile)
{
    //char *train_images = "/home/pjreddie/data/voc/test/train.txt";
    //char *train_images = "/home/pjreddie/data/coco/train.txt";
    char *train_images = "data/coco.trainval.txt";
    //char *train_images = "data/bags.train.list";
    char *backup_directory = "/home/pjreddie/backup/";
    srand(time(0));
    char *base = basecfg(cfgfile);     // cfg basename, used in checkpoint names
    printf("%s\n", base);
    float avg_loss = -1;               // EMA of the training loss
    network net = parse_network_cfg(cfgfile);
    if(weightfile){
        load_weights(&net, weightfile);
    }
    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
    int imgs = net.batch*net.subdivisions;  // images consumed per iteration
    int i = *net.seen/imgs;                 // resume the iteration counter
    data train, buffer;                     // buffer receives the async-loaded batch

    layer l = net.layers[net.n - 1];        // detection layer defines target layout

    int side = l.side;
    int classes = l.classes;
    float jitter = l.jitter;

    list *plist = get_paths(train_images);
    //int N = plist->size;
    char **paths = (char **)list_to_array(plist);

    // loader configuration shared by every load_data_in_thread() call
    load_args args = {0};
    args.w = net.w;
    args.h = net.h;
    args.paths = paths;
    args.n = imgs;
    args.m = plist->size;
    args.classes = classes;
    args.jitter = jitter;
    args.num_boxes = side;
    args.d = &buffer;
    args.type = REGION_DATA;

    // augmentation parameters come from the net config
    args.angle = net.angle;
    args.exposure = net.exposure;
    args.saturation = net.saturation;
    args.hue = net.hue;

    pthread_t load_thread = load_data_in_thread(args);
    clock_t time;
    //while(i*imgs < N*120){
    while(get_current_batch(net) < net.max_batches){
        i += 1;
        time=clock();
        pthread_join(load_thread, 0);       // wait for the batch loaded in the background
        train = buffer;
        load_thread = load_data_in_thread(args);   // immediately start loading the next batch

        printf("Loaded: %lf seconds\n", sec(clock()-time));

        /*
           image im = float_to_image(net.w, net.h, 3, train.X.vals[113]);
           image copy = copy_image(im);
           draw_coco(copy, train.y.vals[113], 7, "truth");
           cvWaitKey(0);
           free_image(copy);
         */

        time=clock();
        float loss = train_network(net, train);
        if (avg_loss < 0) avg_loss = loss;
        avg_loss = avg_loss*.9 + loss*.1;   // exponential moving average

        printf("%d: %f, %f avg, %f rate, %lf seconds, %d images\n", i, loss, avg_loss, get_current_rate(net), sec(clock()-time), i*imgs);
        if(i%1000==0 || (i < 1000 && i%100 == 0)){
            // numbered checkpoint: every 100 iterations early on, every 1000 after
            char buff[256];
            sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i);
            save_weights(net, buff);
        }
        if(i%100==0){
            // rolling backup, overwritten every 100 iterations
            char buff[256];
            sprintf(buff, "%s/%s.backup", backup_directory, base);
            save_weights(net, buff);
        }
        free_data(train);
    }
    char buff[256];
    sprintf(buff, "%s/%s_final.weights", backup_directory, base);
    save_weights(net, buff);
}
|
|
||
| void print_cocos(FILE *fp, int image_id, box *boxes, float **probs, int num_boxes, int classes, int w, int h) | ||
| { | ||
| int i, j; | ||
| for(i = 0; i < num_boxes; ++i){ | ||
| float xmin = boxes[i].x - boxes[i].w/2.; | ||
| float xmax = boxes[i].x + boxes[i].w/2.; | ||
| float ymin = boxes[i].y - boxes[i].h/2.; | ||
| float ymax = boxes[i].y + boxes[i].h/2.; | ||
|
|
||
| if (xmin < 0) xmin = 0; | ||
| if (ymin < 0) ymin = 0; | ||
| if (xmax > w) xmax = w; | ||
| if (ymax > h) ymax = h; | ||
|
|
||
| float bx = xmin; | ||
| float by = ymin; | ||
| float bw = xmax - xmin; | ||
| float bh = ymax - ymin; | ||
|
|
||
| for(j = 0; j < classes; ++j){ | ||
| if (probs[i][j]) fprintf(fp, "{\"image_id\":%d, \"category_id\":%d, \"bbox\":[%f, %f, %f, %f], \"score\":%f},\n", image_id, coco_ids[j], bx, by, bw, bh, probs[i][j]); | ||
| } | ||
| } | ||
| } | ||
|
|
||
/* Extract the numeric COCO image id from a filename such as
 * ".../COCO_val2014_000000000164.jpg": the digits after the LAST '_'.
 * Returns 0 when the name contains no '_'.
 * Fix: previously dereferenced strrchr()'s NULL return for such names. */
int get_coco_image_id(char *filename)
{
    char *p = strrchr(filename, '_');
    if(!p) return 0;    // no underscore: avoid reading past a NULL pointer
    return atoi(p+1);
}
|
|
||
| void validate_coco(char *cfgfile, char *weightfile) | ||
| { | ||
| network net = parse_network_cfg(cfgfile); | ||
| if(weightfile){ | ||
| load_weights(&net, weightfile); | ||
| } | ||
| set_batch_network(&net, 1); | ||
| fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay); | ||
| srand(time(0)); | ||
|
|
||
| char *base = "results/"; | ||
| list *plist = get_paths("data/coco_val_5k.list"); | ||
| //list *plist = get_paths("/home/pjreddie/data/people-art/test.txt"); | ||
| //list *plist = get_paths("/home/pjreddie/data/voc/test/2007_test.txt"); | ||
| char **paths = (char **)list_to_array(plist); | ||
|
|
||
| layer l = net.layers[net.n-1]; | ||
| int classes = l.classes; | ||
| int side = l.side; | ||
|
|
||
| int j; | ||
| char buff[1024]; | ||
| snprintf(buff, 1024, "%s/coco_results.json", base); | ||
| FILE *fp = fopen(buff, "w"); | ||
| fprintf(fp, "[\n"); | ||
|
|
||
| box *boxes = calloc(side*side*l.n, sizeof(box)); | ||
| float **probs = calloc(side*side*l.n, sizeof(float *)); | ||
| for(j = 0; j < side*side*l.n; ++j) probs[j] = calloc(classes, sizeof(float *)); | ||
|
|
||
| int m = plist->size; | ||
| int i=0; | ||
| int t; | ||
|
|
||
| float thresh = .01; | ||
| int nms = 1; | ||
| float iou_thresh = .5; | ||
|
|
||
| int nthreads = 8; | ||
| image *val = calloc(nthreads, sizeof(image)); | ||
| image *val_resized = calloc(nthreads, sizeof(image)); | ||
| image *buf = calloc(nthreads, sizeof(image)); | ||
| image *buf_resized = calloc(nthreads, sizeof(image)); | ||
| pthread_t *thr = calloc(nthreads, sizeof(pthread_t)); | ||
|
|
||
| load_args args = {0}; | ||
| args.w = net.w; | ||
| args.h = net.h; | ||
| args.type = IMAGE_DATA; | ||
|
|
||
| for(t = 0; t < nthreads; ++t){ | ||
| args.path = paths[i+t]; | ||
| args.im = &buf[t]; | ||
| args.resized = &buf_resized[t]; | ||
| thr[t] = load_data_in_thread(args); | ||
| } | ||
| time_t start = time(0); | ||
| for(i = nthreads; i < m+nthreads; i += nthreads){ | ||
| fprintf(stderr, "%d\n", i); | ||
| for(t = 0; t < nthreads && i+t-nthreads < m; ++t){ | ||
| pthread_join(thr[t], 0); | ||
| val[t] = buf[t]; | ||
| val_resized[t] = buf_resized[t]; | ||
| } | ||
| for(t = 0; t < nthreads && i+t < m; ++t){ | ||
| args.path = paths[i+t]; | ||
| args.im = &buf[t]; | ||
| args.resized = &buf_resized[t]; | ||
| thr[t] = load_data_in_thread(args); | ||
| } | ||
| for(t = 0; t < nthreads && i+t-nthreads < m; ++t){ | ||
| char *path = paths[i+t-nthreads]; | ||
| int image_id = get_coco_image_id(path); | ||
| float *X = val_resized[t].data; | ||
| network_predict(net, X); | ||
| int w = val[t].w; | ||
| int h = val[t].h; | ||
| get_detection_boxes(l, w, h, thresh, probs, boxes, 0); | ||
| if (nms) do_nms_sort(boxes, probs, side*side*l.n, classes, iou_thresh); | ||
| print_cocos(fp, image_id, boxes, probs, side*side*l.n, classes, w, h); | ||
| free_image(val[t]); | ||
| free_image(val_resized[t]); | ||
| } | ||
| } | ||
| fseek(fp, -2, SEEK_CUR); | ||
| fprintf(fp, "\n]\n"); | ||
| fclose(fp); | ||
|
|
||
| fprintf(stderr, "Total Detection Time: %f Seconds\n", (double)(time(0) - start)); | ||
| } | ||
|
|
||
| void validate_coco_recall(char *cfgfile, char *weightfile) | ||
| { | ||
| network net = parse_network_cfg(cfgfile); | ||
| if(weightfile){ | ||
| load_weights(&net, weightfile); | ||
| } | ||
| set_batch_network(&net, 1); | ||
| fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay); | ||
| srand(time(0)); | ||
|
|
||
| char *base = "results/comp4_det_test_"; | ||
| list *plist = get_paths("/home/pjreddie/data/voc/test/2007_test.txt"); | ||
| char **paths = (char **)list_to_array(plist); | ||
|
|
||
| layer l = net.layers[net.n-1]; | ||
| int classes = l.classes; | ||
| int side = l.side; | ||
|
|
||
| int j, k; | ||
| FILE **fps = calloc(classes, sizeof(FILE *)); | ||
| for(j = 0; j < classes; ++j){ | ||
| char buff[1024]; | ||
| snprintf(buff, 1024, "%s%s.txt", base, coco_classes[j]); | ||
| fps[j] = fopen(buff, "w"); | ||
| } | ||
| box *boxes = calloc(side*side*l.n, sizeof(box)); | ||
| float **probs = calloc(side*side*l.n, sizeof(float *)); | ||
| for(j = 0; j < side*side*l.n; ++j) probs[j] = calloc(classes, sizeof(float *)); | ||
|
|
||
| int m = plist->size; | ||
| int i=0; | ||
|
|
||
| float thresh = .001; | ||
| int nms = 0; | ||
| float iou_thresh = .5; | ||
| float nms_thresh = .5; | ||
|
|
||
| int total = 0; | ||
| int correct = 0; | ||
| int proposals = 0; | ||
| float avg_iou = 0; | ||
|
|
||
| for(i = 0; i < m; ++i){ | ||
| char *path = paths[i]; | ||
| image orig = load_image_color(path, 0, 0); | ||
| image sized = resize_image(orig, net.w, net.h); | ||
| char *id = basecfg(path); | ||
| network_predict(net, sized.data); | ||
| get_detection_boxes(l, 1, 1, thresh, probs, boxes, 1); | ||
| if (nms) do_nms(boxes, probs, side*side*l.n, 1, nms_thresh); | ||
|
|
||
| char labelpath[4096]; | ||
| find_replace(path, "images", "labels", labelpath); | ||
| find_replace(labelpath, "JPEGImages", "labels", labelpath); | ||
| find_replace(labelpath, ".jpg", ".txt", labelpath); | ||
| find_replace(labelpath, ".JPEG", ".txt", labelpath); | ||
|
|
||
| int num_labels = 0; | ||
| box_label *truth = read_boxes(labelpath, &num_labels); | ||
| for(k = 0; k < side*side*l.n; ++k){ | ||
| if(probs[k][0] > thresh){ | ||
| ++proposals; | ||
| } | ||
| } | ||
| for (j = 0; j < num_labels; ++j) { | ||
| ++total; | ||
| box t = {truth[j].x, truth[j].y, truth[j].w, truth[j].h}; | ||
| float best_iou = 0; | ||
| for(k = 0; k < side*side*l.n; ++k){ | ||
| float iou = box_iou(boxes[k], t); | ||
| if(probs[k][0] > thresh && iou > best_iou){ | ||
| best_iou = iou; | ||
| } | ||
| } | ||
| avg_iou += best_iou; | ||
| if(best_iou > iou_thresh){ | ||
| ++correct; | ||
| } | ||
| } | ||
|
|
||
| fprintf(stderr, "%5d %5d %5d\tRPs/Img: %.2f\tIOU: %.2f%%\tRecall:%.2f%%\n", i, correct, total, (float)proposals/(i+1), avg_iou*100/total, 100.*correct/total); | ||
| free(id); | ||
| free_image(orig); | ||
| free_image(sized); | ||
| } | ||
| } | ||
|
|
||
| void test_coco(char *cfgfile, char *weightfile, char *filename, float thresh) | ||
| { | ||
| image **alphabet = load_alphabet(); | ||
| network net = parse_network_cfg(cfgfile); | ||
| if(weightfile){ | ||
| load_weights(&net, weightfile); | ||
| } | ||
| detection_layer l = net.layers[net.n-1]; | ||
| set_batch_network(&net, 1); | ||
| srand(2222222); | ||
| float nms = .4; | ||
| clock_t time; | ||
| char buff[256]; | ||
| char *input = buff; | ||
| int j; | ||
| box *boxes = calloc(l.side*l.side*l.n, sizeof(box)); | ||
| float **probs = calloc(l.side*l.side*l.n, sizeof(float *)); | ||
| for(j = 0; j < l.side*l.side*l.n; ++j) probs[j] = calloc(l.classes, sizeof(float *)); | ||
| while(1){ | ||
| if(filename){ | ||
| strncpy(input, filename, 256); | ||
| } else { | ||
| printf("Enter Image Path: "); | ||
| fflush(stdout); | ||
| input = fgets(input, 256, stdin); | ||
| if(!input) return; | ||
| strtok(input, "\n"); | ||
| } | ||
| image im = load_image_color(input,0,0); | ||
| image sized = resize_image(im, net.w, net.h); | ||
| float *X = sized.data; | ||
| time=clock(); | ||
| network_predict(net, X); | ||
| printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time)); | ||
| get_detection_boxes(l, 1, 1, thresh, probs, boxes, 0); | ||
| if (nms) do_nms_sort(boxes, probs, l.side*l.side*l.n, l.classes, nms); | ||
| draw_detections(im, l.side*l.side*l.n, thresh, boxes, probs, coco_classes, alphabet, 80); | ||
| save_image(im, "prediction"); | ||
| show_image(im, "predictions"); | ||
| free_image(im); | ||
| free_image(sized); | ||
| #ifdef OPENCV | ||
| cvWaitKey(0); | ||
| cvDestroyAllWindows(); | ||
| #endif | ||
| if (filename) break; | ||
| } | ||
| } | ||
|
|
||
| void run_coco(int argc, char **argv) | ||
| { | ||
| char *prefix = find_char_arg(argc, argv, "-prefix", 0); | ||
| float thresh = find_float_arg(argc, argv, "-thresh", .2); | ||
| int cam_index = find_int_arg(argc, argv, "-c", 0); | ||
| int frame_skip = find_int_arg(argc, argv, "-s", 0); | ||
|
|
||
| if(argc < 4){ | ||
| fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]); | ||
| return; | ||
| } | ||
|
|
||
| char *cfg = argv[3]; | ||
| char *weights = (argc > 4) ? argv[4] : 0; | ||
| char *filename = (argc > 5) ? argv[5]: 0; | ||
| if(0==strcmp(argv[2], "test")) test_coco(cfg, weights, filename, thresh); | ||
| else if(0==strcmp(argv[2], "train")) train_coco(cfg, weights); | ||
| else if(0==strcmp(argv[2], "valid")) validate_coco(cfg, weights); | ||
| else if(0==strcmp(argv[2], "recall")) validate_coco_recall(cfg, weights); | ||
| else if(0==strcmp(argv[2], "demo")) demo(cfg, weights, thresh, cam_index, filename, coco_classes, 80, frame_skip, prefix, .5); | ||
| } |
| @@ -0,0 +1,39 @@ | ||
| #include <stdio.h> | ||
| #include <math.h> | ||
/* Accumulate val into pixel (row, col) of the given channel of im, where
 * (row, col) are in padded coordinates; writes that land outside the
 * unpadded height x width image are silently dropped. */
void col2im_add_pixel(float *im, int height, int width, int channels,
        int row, int col, int channel, int pad, float val)
{
    int r = row - pad;   // shift back to unpadded coordinates
    int c = col - pad;
    if (r >= 0 && r < height && c >= 0 && c < width) {
        im[c + width*(r + height*channel)] += val;
    }
}
/* CPU col2im: scatter-add a column buffer back into an image — the inverse
 * of im2col. data_col is laid out as (channels*ksize*ksize) rows of
 * (height_col*width_col) entries; data_im (channels x height x width) is
 * accumulated into, with out-of-bounds (padding) contributions dropped by
 * col2im_add_pixel. */
void col2im_cpu(float* data_col,
         int channels,  int height,  int width,
         int ksize,  int stride, int pad, float* data_im)
{
    int out_h = (height + 2*pad - ksize) / stride + 1;
    int out_w = (width + 2*pad - ksize) / stride + 1;
    int col_channels = channels * ksize * ksize;

    int c, h, w;
    for (c = 0; c < col_channels; ++c) {
        // decompose the column row into (image channel, kernel row, kernel col)
        int kernel_col = c % ksize;
        int kernel_row = (c / ksize) % ksize;
        int channel    = c / ksize / ksize;
        for (h = 0; h < out_h; ++h) {
            for (w = 0; w < out_w; ++w) {
                int row = kernel_row + h * stride;   // padded image coordinates
                int col = kernel_col + w * stride;
                int idx = (c * out_h + h) * out_w + w;
                col2im_add_pixel(data_im, height, width, channels,
                        row, col, channel, pad, data_col[idx]);
            }
        }
    }
}
|
|
| @@ -0,0 +1,13 @@ | ||
| #ifndef COL2IM_H | ||
| #define COL2IM_H | ||
|
|
||
| void col2im_cpu(float* data_col, | ||
| int channels, int height, int width, | ||
| int ksize, int stride, int pad, float* data_im); | ||
|
|
||
| #ifdef GPU | ||
| void col2im_ongpu(float *data_col, | ||
| int channels, int height, int width, | ||
| int ksize, int stride, int pad, float *data_im); | ||
| #endif | ||
| #endif |
| @@ -0,0 +1,58 @@ | ||
| #include "cuda_runtime.h" | ||
| #include "curand.h" | ||
| #include "cublas_v2.h" | ||
|
|
||
| extern "C" { | ||
| #include "col2im.h" | ||
| #include "cuda.h" | ||
| } | ||
|
|
||
| // src: https://github.com/BVLC/caffe/blob/master/src/caffe/util/im2col.cu | ||
| // You may also want to read: https://github.com/BVLC/caffe/blob/master/LICENSE | ||
|
|
||
/* col2im kernel (from Caffe): n = channels*height*width destination pixels.
 * Each thread handles one image pixel and GATHERS the sum of every column
 * entry whose ksize x ksize window covers that pixel, so no atomics are
 * needed; the sum is accumulated into (not overwriting) data_im. */
__global__ void col2im_gpu_kernel(const int n, const float* data_col,
        const int height, const int width, const int ksize,
        const int pad,
        const int stride,
        const int height_col, const int width_col,
        float *data_im) {
    // grid-stride loop over destination pixels
    int index = blockIdx.x*blockDim.x+threadIdx.x;
    for(; index < n; index += blockDim.x*gridDim.x){
        float val = 0;
        // pixel coordinates in the PADDED image, plus its channel
        int w = index % width + pad;
        int h = (index / width) % height + pad;
        int c = index / (width * height);
        // range of column (output) positions whose window covers (h, w)
        int w_col_start = (w < ksize) ? 0 : (w - ksize) / stride + 1;
        int w_col_end = min(w / stride + 1, width_col);
        int h_col_start = (h < ksize) ? 0 : (h - ksize) / stride + 1;
        int h_col_end = min(h / stride + 1, height_col);
        // equivalent implementation: walk data_col with precomputed linear
        // coefficients instead of recomputing the kernel offset per element
        int offset =
            (c * ksize * ksize + h * ksize + w) * height_col * width_col;
        int coeff_h_col = (1 - stride * ksize * height_col) * width_col;
        int coeff_w_col = (1 - stride * height_col * width_col);
        for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
            for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
                val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
            }
        }
        data_im[index] += val;   // accumulate into the existing image contents
    }
}
|
|
||
/* Host launcher for GPU col2im: scatter-add the column buffer data_col back
 * into the device image data_im (channels x height x width). Launches one
 * thread per destination image pixel (num_kernels = channels*height*width);
 * the kernel gathers, so no atomics are needed. Runs on the default stream.
 * Fix: the launch previously had no error check; launch-configuration
 * failures were silent until some later CUDA call. */
void col2im_ongpu(float *data_col,
        int channels, int height, int width,
        int ksize, int stride, int pad, float *data_im){
    int height_col = (height + 2 * pad - ksize) / stride + 1;
    int width_col = (width + 2 * pad - ksize) / stride + 1;
    int num_kernels = channels * height * width;
    col2im_gpu_kernel<<<(num_kernels+BLOCK-1)/BLOCK,
        BLOCK>>>(
                num_kernels, data_col, height, width, ksize, pad,
                stride, height_col,
                width_col, data_im);
    // fix: surface launch errors immediately (check_error comes from "cuda.h")
    check_error(cudaPeekAtLastError());
}
|
|