diff --git a/src/solver.rs b/src/solver.rs index cbd8b967..7a90acba 100644 --- a/src/solver.rs +++ b/src/solver.rs @@ -176,7 +176,7 @@ impl<'a, S: ISolver> Solver<'a, S>{ // if (this->param_.display() && this->iter_ % this->param_.display() == 0) { // LOG(INFO) << "Iteration " << this->iter_ << ", lr = " << rate; // } - self.worker.apply_update(&self.param, &mut self.net); + self.worker.apply_update(&self.param, &mut self.net, self.iter); // Increment the internal iter counter -- its value should always indicate // the number of times the weights have been updated. @@ -205,7 +205,7 @@ impl<'a, S: ISolver> Solver<'a, S>{ /// Implementation of a specific Solver. pub trait ISolver { /// TODO: what does this do? - fn apply_update(&self, param: &SolverConfig, network: &mut Network); + fn apply_update(&self, param: &SolverConfig, network: &mut Network, iter: usize); } #[derive(Debug)] @@ -214,24 +214,40 @@ pub struct SolverConfig { /// Name of the solver. pub name: String, /// The `NetworkConfig` that is used to initialize the training network. - train_net: NetworkConfig, + pub train_net: NetworkConfig, /// Display the loss averaged over the last average_loss iterations. /// /// Default: 1 - average_loss: usize, + pub average_loss: usize, /// The number of iterations between two testing phases. /// /// Default: None - test_interval: Option, + pub test_interval: Option, /// If true, run an initial test pass before the first iteration, /// ensuring memory availability and printing the starting value of the loss. /// /// Default: true - test_initialization: bool, + pub test_initialization: bool, /// Accumulate gradients over minibatch_size instances. /// /// Default: 1 - minibatch_size: usize, + pub minibatch_size: usize, + /// The learning rate policy to be used. + /// + /// Default: Fixed + pub lr_policy: LRPolicy, + /// The base learning rate. + /// + /// Default: 0.01 + pub base_lr: f32, + /// gamma as used in the calculation of most learning rate policies. + /// + /// Default: 0.1 + pub gamma: f32, + /// The stepsize used in Step and Sigmoid learning policies. + /// + /// Default: 10 + pub stepsize: usize, } impl Default for SolverConfig { @@ -244,6 +260,12 @@ impl Default for SolverConfig { test_interval: None, test_initialization: true, minibatch_size: 1, + + lr_policy: LRPolicy::Fixed, + base_lr: 0.01f32, + gamma: 0.1f32, + + stepsize: 10, } } } @@ -253,4 +275,108 @@ impl SolverConfig { pub fn test_interval(&self) -> usize { self.test_interval.unwrap_or(0) } + + /// Return the current learning rate. The currently implemented learning rate + /// policies are as follows: + /// - fixed: always return base_lr. + /// - step: return base_lr * gamma ^ (floor(iter / step)) + /// - exp: return base_lr * gamma ^ iter + /// - inv: return base_lr * (1 + gamma * iter) ^ (- power) + /// - multistep: similar to step but it allows non uniform steps defined by + /// stepvalue + /// - poly: the effective learning rate follows a polynomial decay, to be + /// zero by the max_iter. return base_lr (1 - iter/max_iter) ^ (power) + /// - sigmoid: the effective learning rate follows a sigmod decay + /// return base_lr ( 1/(1 + exp(-gamma * (iter - stepsize)))) + /// + /// where base_lr, max_iter, gamma, step, stepvalue and power are defined + /// in the solver config, and iter is the current iteration. + pub fn get_learning_rate(&self, iter: usize) -> f32 { + match self.lr_policy() { + LRPolicy::Fixed => { + self.base_lr() + } + LRPolicy::Step => { + let current_step = iter / self.stepsize(); + self.base_lr() * self.gamma().powf(current_step as f32) + } + LRPolicy::Multistep => { + // if (this->current_step_ < this->param_.stepvalue_size() && + // this->iter_ >= this->param_.stepvalue(this->current_step_)) { + // this->current_step_++; + // LOG(INFO) << "MultiStep Status: Iteration " << + // this->iter_ << ", step = " << this->current_step_; + // } + // rate = this->param_.base_lr() * + // pow(this->param_.gamma(), this->current_step_); + unimplemented!(); + } + LRPolicy::Exp => { + self.base_lr() * self.gamma().powf(iter as f32) + } + LRPolicy::Inv => { + // rate = this->param_.base_lr() * + // pow(Dtype(1) + this->param_.gamma() * this->iter_, + // - this->param_.power()); + unimplemented!(); + } + LRPolicy::Poly => { + // rate = this->param_.base_lr() * pow(Dtype(1.) - + // (Dtype(this->iter_) / Dtype(this->param_.max_iter())), + // this->param_.power()); + unimplemented!(); + } + LRPolicy::Sigmoid => { + // rate = this->param_.base_lr() * (Dtype(1.) / + // (Dtype(1.) + exp(-this->param_.gamma() * (Dtype(this->iter_) - + // Dtype(this->param_.stepsize()))))); + unimplemented!(); + } + } + } + + /// Return learning rate policy. + fn lr_policy(&self) -> LRPolicy { + self.lr_policy + } + + /// Return the base learning rate. + fn base_lr(&self) -> f32 { + self.base_lr + } + + /// Return the gamma for learning rate calculations. + fn gamma(&self) -> f32 { + self.gamma + } + + /// Return the stepsize for learning rate calculations. + fn stepsize(&self) -> usize { + self.stepsize + } +} + + +#[derive(Debug, Copy, Clone)] +/// Learning Rate Policy for a Solver +pub enum LRPolicy { + /// always return base_lr + Fixed, + /// learning rate decays every `step` iterations. + /// return base_lr * gamma ^ (floor(iter / step)) + Step, + /// similar to step but it allows non uniform steps defined by + /// stepvalue + Multistep, + /// return base_lr * gamma ^ iter + Exp, + /// return base_lr * (1 + gamma * iter) ^ (- power) + Inv, + /// the effective learning rate follows a polynomial decay, to be + /// zero by the max_iter. + /// return base_lr (1 - iter/max_iter) ^ (power) + Poly, + /// the effective learning rate follows a sigmod decay + /// return base_lr ( 1/(1 + exp(-gamma * (iter - stepsize)))) + Sigmoid, } diff --git a/src/solvers/mod.rs.bk b/src/solvers/mod.rs.bk deleted file mode 100644 index f4274596..00000000 --- a/src/solvers/mod.rs.bk +++ /dev/null @@ -1,4 +0,0 @@ -pub use self::sgd::SGD; - -/// Stochastic Gradient Descent -pub mod sgd; diff --git a/src/solvers/sgd.rs b/src/solvers/sgd.rs index 3e110f3d..747b186b 100644 --- a/src/solvers/sgd.rs +++ b/src/solvers/sgd.rs @@ -6,60 +6,6 @@ use network::Network; pub struct SGD; impl SGD { - /// Return the current learning rate. The currently implemented learning rate - /// policies are as follows: - /// - fixed: always return base_lr. - /// - step: return base_lr * gamma ^ (floor(iter / step)) - /// - exp: return base_lr * gamma ^ iter - /// - inv: return base_lr * (1 + gamma * iter) ^ (- power) - /// - multistep: similar to step but it allows non uniform steps defined by - /// stepvalue - /// - poly: the effective learning rate follows a polynomial decay, to be - /// zero by the max_iter. return base_lr (1 - iter/max_iter) ^ (power) - /// - sigmoid: the effective learning rate follows a sigmod decay - /// return base_lr ( 1/(1 + exp(-gamma * (iter - stepsize)))) - /// - /// where base_lr, max_iter, gamma, step, stepvalue and power are defined - /// in the solver parameter protocol buffer, and iter is the current iteration. - fn get_learning_rate(&self) -> f32 { - // Dtype rate; - // const string& lr_policy = this->param_.lr_policy(); - // if (lr_policy == "fixed") { - // rate = this->param_.base_lr(); - // } else if (lr_policy == "step") { - // this->current_step_ = this->iter_ / this->param_.stepsize(); - // rate = this->param_.base_lr() * - // pow(this->param_.gamma(), this->current_step_); - // } else if (lr_policy == "exp") { - // rate = this->param_.base_lr() * pow(this->param_.gamma(), this->iter_); - // } else if (lr_policy == "inv") { - // rate = this->param_.base_lr() * - // pow(Dtype(1) + this->param_.gamma() * this->iter_, - // - this->param_.power()); - // } else if (lr_policy == "multistep") { - // if (this->current_step_ < this->param_.stepvalue_size() && - // this->iter_ >= this->param_.stepvalue(this->current_step_)) { - // this->current_step_++; - // LOG(INFO) << "MultiStep Status: Iteration " << - // this->iter_ << ", step = " << this->current_step_; - // } - // rate = this->param_.base_lr() * - // pow(this->param_.gamma(), this->current_step_); - // } else if (lr_policy == "poly") { - // rate = this->param_.base_lr() * pow(Dtype(1.) - - // (Dtype(this->iter_) / Dtype(this->param_.max_iter())), - // this->param_.power()); - // } else if (lr_policy == "sigmoid") { - // rate = this->param_.base_lr() * (Dtype(1.) / - // (Dtype(1.) + exp(-this->param_.gamma() * (Dtype(this->iter_) - - // Dtype(this->param_.stepsize()))))); - // } else { - // LOG(FATAL) << "Unknown learning rate policy: " << lr_policy; - // } - // return rate; - unimplemented!(); - } - fn clip_gradients(&self) { // const Dtype clip_gradients = this->param_.clip_gradients(); // if (clip_gradients < 0) { return; } @@ -82,9 +28,9 @@ impl SGD { } impl ISolver for SGD { - fn apply_update(&self, param: &SolverConfig, net: &mut Network) { + fn apply_update(&self, param: &SolverConfig, net: &mut Network, iter: usize) { // CHECK(Caffe::root_solver()); // Caffe - let rate = self.get_learning_rate(); + let rate = param.get_learning_rate(iter); self.clip_gradients(); for (param_id, param) in net.learnable_params().iter().enumerate() { diff --git a/src/solvers/sgd.rs.bk b/src/solvers/sgd.rs.bk deleted file mode 100644 index e06ed0ad..00000000 --- a/src/solvers/sgd.rs.bk +++ /dev/null @@ -1,100 +0,0 @@ -use solver::*; -use network::Network; - -#[derive(Debug, Copy, Clone)] -/// Stochastic Gradient Descent Solver -pub struct SGD; - -impl SGD { - /// Return the current learning rate. The currently implemented learning rate - /// policies are as follows: - /// - fixed: always return base_lr. - /// - step: return base_lr * gamma ^ (floor(iter / step)) - /// - exp: return base_lr * gamma ^ iter - /// - inv: return base_lr * (1 + gamma * iter) ^ (- power) - /// - multistep: similar to step but it allows non uniform steps defined by - /// stepvalue - /// - poly: the effective learning rate follows a polynomial decay, to be - /// zero by the max_iter. return base_lr (1 - iter/max_iter) ^ (power) - /// - sigmoid: the effective learning rate follows a sigmod decay - /// return base_lr ( 1/(1 + exp(-gamma * (iter - stepsize)))) - /// - /// where base_lr, max_iter, gamma, step, stepvalue and power are defined - /// in the solver parameter protocol buffer, and iter is the current iteration. - fn get_learning_rate(&self) -> f32 { - // Dtype rate; - // const string& lr_policy = this->param_.lr_policy(); - // if (lr_policy == "fixed") { - // rate = this->param_.base_lr(); - // } else if (lr_policy == "step") { - // this->current_step_ = this->iter_ / this->param_.stepsize(); - // rate = this->param_.base_lr() * - // pow(this->param_.gamma(), this->current_step_); - // } else if (lr_policy == "exp") { - // rate = this->param_.base_lr() * pow(this->param_.gamma(), this->iter_); - // } else if (lr_policy == "inv") { - // rate = this->param_.base_lr() * - // pow(Dtype(1) + this->param_.gamma() * this->iter_, - // - this->param_.power()); - // } else if (lr_policy == "multistep") { - // if (this->current_step_ < this->param_.stepvalue_size() && - // this->iter_ >= this->param_.stepvalue(this->current_step_)) { - // this->current_step_++; - // LOG(INFO) << "MultiStep Status: Iteration " << - // this->iter_ << ", step = " << this->current_step_; - // } - // rate = this->param_.base_lr() * - // pow(this->param_.gamma(), this->current_step_); - // } else if (lr_policy == "poly") { - // rate = this->param_.base_lr() * pow(Dtype(1.) - - // (Dtype(this->iter_) / Dtype(this->param_.max_iter())), - // this->param_.power()); - // } else if (lr_policy == "sigmoid") { - // rate = this->param_.base_lr() * (Dtype(1.) / - // (Dtype(1.) + exp(-this->param_.gamma() * (Dtype(this->iter_) - - // Dtype(this->param_.stepsize()))))); - // } else { - // LOG(FATAL) << "Unknown learning rate policy: " << lr_policy; - // } - // return rate; - unimplemented!(); - } - - fn clip_gradients(&self) { - // const Dtype clip_gradients = this->param_.clip_gradients(); - // if (clip_gradients < 0) { return; } - // const vector*>& net_params = this->net_->learnable_params(); - // Dtype sumsq_diff = 0; - // for (int i = 0; i < net_params.size(); ++i) { - // sumsq_diff += net_params[i]->sumsq_diff(); - // } - // const Dtype l2norm_diff = std::sqrt(sumsq_diff); - // if (l2norm_diff > clip_gradients) { - // Dtype scale_factor = clip_gradients / l2norm_diff; - // LOG(INFO) << "Gradient clipping: scaling down gradients (L2 norm " - // << l2norm_diff << " > " << clip_gradients << ") " - // << "by scale factor " << scale_factor; - // for (int i = 0; i < net_params.size(); ++i) { - // net_params[i]->scale_diff(scale_factor); - // } - // } - } -} - -impl ISolver for SGD { - fn apply_update(&self, param: &SolverConfig, net: &mut Network) { - // CHECK(Caffe::root_solver()); // Caffe - let rate = self.get_learning_rate(); - - self.clip_gradients(); - for (param_id, param) in net.learnable_params().iter().enumerate() { - // Normalize(param_id); - // Regularize(param_id); - // ComputeUpdateValue(param_id, rate); - unimplemented!(); - } - net.update_params(); - - unimplemented!(); - } -} diff --git a/tests/solver_specs.rs b/tests/solver_specs.rs new file mode 100644 index 00000000..470caad4 --- /dev/null +++ b/tests/solver_specs.rs @@ -0,0 +1,39 @@ +extern crate leaf; + +#[cfg(test)] +mod network_spec { + use leaf::solver::*; + + #[test] + // fixed: always return base_lr. + fn lr_fixed() { + let cfg = SolverConfig{ lr_policy: LRPolicy::Fixed, base_lr: 5f32, gamma: 0.5f32, ..SolverConfig::default()}; + assert!(cfg.get_learning_rate(0) == 5f32); + assert!(cfg.get_learning_rate(100) == 5f32); + assert!(cfg.get_learning_rate(1000) == 5f32); + } + + #[test] + // step: return base_lr * gamma ^ (floor(iter / step)) + fn lr_step() { + let cfg = SolverConfig{ lr_policy: LRPolicy::Step, base_lr: 5f32, gamma: 0.5f32, stepsize: 10, ..SolverConfig::default()}; + assert!(cfg.get_learning_rate(0) == 5f32); + assert!(cfg.get_learning_rate(10) == 2.5f32); + assert!(cfg.get_learning_rate(20) == 1.25f32); + } + + #[test] + // exp: return base_lr * gamma ^ iter + fn lr_exp() { + let cfg = SolverConfig{ lr_policy: LRPolicy::Exp, base_lr: 5f32, gamma: 0.5f32, ..SolverConfig::default()}; + assert!(cfg.get_learning_rate(0) == 5f32); + assert!(cfg.get_learning_rate(1) == 2.5f32); + assert!(cfg.get_learning_rate(2) == 1.25f32); + assert!(cfg.get_learning_rate(3) == 0.625f32); + + let cfg2 = SolverConfig{ lr_policy: LRPolicy::Exp, base_lr: 5f32, gamma: 0.25f32, ..SolverConfig::default()}; + assert!(cfg2.get_learning_rate(0) == 5f32); + assert!(cfg2.get_learning_rate(1) == 1.25f32); + assert!(cfg2.get_learning_rate(2) == 0.3125f32); + } +}