diff --git a/.gitignore b/.gitignore
index b8bd026..91ec9f5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,3 +26,8 @@
 *.exe
 *.out
 *.app
+
+.cproject
+.project
+include/thrift
+src/thrift
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..57392e9
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,6 @@
+CMAKE_MINIMUM_REQUIRED(VERSION 2.8)  # Builds the GoogLeNet example against ArrayFire.
+PROJECT(afml CXX)
+FIND_PACKAGE(ArrayFire REQUIRED)  # stop at configure time when ArrayFire is missing
+INCLUDE_DIRECTORIES(${ArrayFire_INCLUDE_DIRS} ${CMAKE_SOURCE_DIR}/include)  # for "afml/..." headers
+ADD_EXECUTABLE(googlenet examples/googlenet/googlenet.cpp)
+TARGET_LINK_LIBRARIES(googlenet ${ArrayFire_LIBRARIES})
\ No newline at end of file
diff --git a/examples/googlenet/googlenet.cpp b/examples/googlenet/googlenet.cpp
new file mode 100644
index 0000000..78a8349
--- /dev/null
+++ b/examples/googlenet/googlenet.cpp
@@ -0,0 +1,132 @@
+#include "afml/afml.hpp"
+
+using namespace afml;
+
+// https://github.com/soumith/imagenet-multiGPU.torch/blob/master/models/googlenet_cudnn.lua
+// Note that Lua index starts from 1.
+// TODO: make_shared is very frequently used. Is MS a good alias macro for make_shared?
+// #define MS make_shared
+// Builds one Inception module: up to four parallel branches whose outputs are
+// joined by a Concat along dimension 1.
+//   inputSize  number of input planes fed to every branch
+//   config     one brace-enclosed list per branch: {1x1 outputs (0 = skip)},
+//              {3x3 reduce, 3x3 outputs}, {double 3x3 reduce, outputs},
+//              {PoolingType, pool projection outputs (0 = none)}
+NodePtr inception(const int inputSize,
+    const std::initializer_list<std::initializer_list<int> >& config) {
+  // Nested list taken by const reference so braced temporaries bind to it;
+  // initializer_list has no operator[], hence the begin()[i] indexing.
+  const int conv1Out = config.begin()[0].begin()[0];
+  const int* conv3Cfg = config.begin()[1].begin();
+  const int* conv3xxCfg = config.begin()[2].begin();
+  const int* poolCfg = config.begin()[3].begin();
+  shared_ptr<Concat> concat = make_shared<Concat>(1);
+  if (conv1Out != 0) {
+    shared_ptr<Sequential> conv1 = make_shared<Sequential>();
+    conv1->add(make_shared<Convolution>(inputSize, conv1Out, 1, 1, 1, 1))
+        ->add(make_shared<ReLU>());
+    concat->add(conv1);
+  }
+  shared_ptr<Sequential> conv3 = make_shared<Sequential>();
+  conv3->add(make_shared<Convolution>(inputSize, conv3Cfg[0], 1, 1, 1, 1))
+      ->add(make_shared<ReLU>());
+  conv3->add(make_shared<Convolution>(conv3Cfg[0], conv3Cfg[1], 3, 3, 1, 1, 1, 1))
+      ->add(make_shared<ReLU>());
+  concat->add(conv3);
+  shared_ptr<Sequential> conv3xx = make_shared<Sequential>();
+  conv3xx->add(make_shared<Convolution>(inputSize, conv3xxCfg[0], 1, 1, 1, 1))
+      ->add(make_shared<ReLU>());
+  conv3xx->add(make_shared<Convolution>(conv3xxCfg[0], conv3xxCfg[1], 3, 3, 1, 1, 1, 1))
+      ->add(make_shared<ReLU>());
+  conv3xx->add(make_shared<Convolution>(conv3xxCfg[1], conv3xxCfg[1], 3, 3, 1, 1, 1, 1))
+      ->add(make_shared<ReLU>());
+  concat->add(conv3xx);
+  shared_ptr<Sequential> pool = make_shared<Sequential>();
+  pool->add(make_shared<Padding>(3, 3, 1, 1));
+  if (poolCfg[0] == PoolingType::MAX) {
+    pool->add(make_shared<MaxPooling>(3, 3, 1, 1));
+  } else if (poolCfg[0] == PoolingType::AVERAGE) {
+    pool->add(make_shared<AveragePooling>(3, 3, 1, 1));
+  } else {
+    printf("Unknown pooling");
+    exit(1);
+  }
+  if (poolCfg[1] != 0) {
+    // The projection and its ReLU both go into the pooling branch.
+    pool->add(make_shared<Convolution>(inputSize, poolCfg[1], 1, 1, 1, 1))
+        ->add(make_shared<ReLU>());
+  }
+  concat->add(pool);
+  return concat;
+}
+
+// Assembles the GoogLeNet graph: a shared feature extractor feeding a Concat
+// of the main classifier and an auxiliary classifier.
+//   numGPU  when > 0, one model clone per GPU id in [0, numGPU) is put in a DataParallel
+// Returns the root node of the network.
+NodePtr createModel(int numGPU) {
+  // inception() never reads the second slot of the first pair; this fills it.
+  const int unused = -1;
+  const int kAvg = PoolingType::AVERAGE;
+  const int kMax = PoolingType::MAX;
+
+  shared_ptr<Sequential> features = make_shared<Sequential>();
+  features->add(make_shared<Convolution>(3, 64, 7, 7, 2, 2, 3, 3))
+      ->add(make_shared<ReLU>());
+  features->add(make_shared<MaxPooling>(3, 3, 2, 2));
+  features->add(make_shared<Convolution>(64, 64, 1, 1))
+      ->add(make_shared<ReLU>());
+  features->add(make_shared<Convolution>(64, 192, 3, 3, 1, 1, 1, 1))
+      ->add(make_shared<ReLU>());
+  features->add(make_shared<MaxPooling>(3, 3, 2, 2));
+
+  features->add(inception(192, { { 64, unused }, { 64, 64 }, { 64, 96 }, { kAvg, 32 } }));
+  features->add(inception(256, { { 64, unused }, { 64, 96 }, { 64, 96 }, { kAvg, 64 } }));
+  features->add(inception(320, { { 0, unused }, { 128, 160 }, { 64, 96 }, { kMax, 0 } }));
+  features->add(make_shared<Convolution>(576, 576, 2, 2, 2, 2));
+  features->add(inception(576, { { 224, unused }, { 64, 96 }, { 96, 128 }, { kAvg, 128 } }));
+  features->add(inception(576, { { 192, unused }, { 96, 128 }, { 96, 128 }, { kAvg, 128 } }));
+  features->add(inception(576, { { 160, unused }, { 128, 160 }, { 128, 160 }, { kAvg, 96 } }));
+  features->add(inception(576, { { 96, unused }, { 128, 192 }, { 160, 192 }, { kAvg, 96 } }));
+
+  shared_ptr<Sequential> mainBranch = make_shared<Sequential>();
+  mainBranch->add(inception(576, { { 0, unused }, { 128, 192 }, { 192, 256 }, { kMax, 0 } }));
+  mainBranch->add(make_shared<Convolution>(1024, 1024, 2, 2, 2, 2));
+  mainBranch->add(inception(1024, { { 352, unused }, { 192, 320 }, { 160, 224 }, { kAvg, 128 } }));
+  mainBranch->add(inception(1024, { { 352, unused }, { 192, 320 }, { 192, 224 }, { kMax, 128 } }));
+  mainBranch->add(make_shared<AveragePooling>(7, 7, 1, 1));
+  mainBranch->add(make_shared<View>(1024)->withNumInputDims(3));
+  mainBranch->add(make_shared<Linear>(1024, 1000));
+  mainBranch->add(make_shared<LogSoftmax>());
+
+  shared_ptr<Sequential> auxClassifier = make_shared<Sequential>();
+  auxClassifier->add(make_shared<AveragePooling>(5, 5, 3, 3));
+  auxClassifier->add(make_shared<Convolution>(576, 128, 1, 1, 1, 1));
+  auxClassifier->add(make_shared<View>(128 * 4 * 4)->withNumInputDims(3));
+  auxClassifier->add(make_shared<Linear>(128 * 4 * 4, 768));
+  auxClassifier->add(make_shared<ReLU>());
+  auxClassifier->add(make_shared<Linear>(768, 1000));
+  auxClassifier->add(make_shared<LogSoftmax>());
+
+  shared_ptr<Concat> splitter = make_shared<Concat>(1);
+  splitter->add(mainBranch)->add(auxClassifier);
+  // add() returns a Container handle, so the Sequential is created first and
+  // filled afterwards instead of being initialised from the call chain.
+  shared_ptr<Sequential> model = make_shared<Sequential>();
+  model->add(features)->add(splitter);
+
+  if (numGPU > 0) {
+    shared_ptr<DataParallel> dp(new DataParallel(1));
+    for (int i = 0; i < numGPU; ++i) {
+      dp->add(i, model->clone());  // one copy of the whole model per GPU id
+    }
+    return dp;
+  }
+  return model;
+}
+// Usage: <program> [numGPU]; numGPU defaults to 0 when no argument is given.
+int main(int argc, char *argv[]) {
+  const int numGPU = (argc > 1) ? atoi(argv[1]) : 0;  // argv[1] may be absent
+  NodePtr model = createModel(numGPU);
+  printf("%s", model->toString());  // model text is data, never the format string
+}
diff --git a/include/afml/.gitignore b/include/afml/.gitignore
new file mode 100644
index 0000000..67a9013
--- /dev/null
+++ b/include/afml/.gitignore
@@ -0,0 +1 @@
+/thrift/
diff --git a/include/afml/afml.hpp b/include/afml/afml.hpp
new file mode 100644
index 0000000..1ad1987
--- /dev/null
+++ b/include/afml/afml.hpp
@@ -0,0 +1,17 @@
+#ifndef AFML_AFML_HPP_
+#define AFML_AFML_HPP_
+
+// Umbrella header: pulls in the whole afml public interface.
+//
+// The includes sit at file scope on purpose. Each header opens namespace afml
+// itself, so wrapping them in another namespace block would nest everything
+// (and any system header they include) inside afml::afml.
+// node.hpp precedes container.hpp because Container derives from Node.
+#include "afml/common.hpp"
+#include "afml/node.hpp"
+#include "afml/container.hpp"
+#include "afml/io.hpp"
+#include "afml/nodes.hpp"
+#include "afml/parallel.hpp"
+
+#endif /* AFML_AFML_HPP_ */
diff --git a/include/afml/common.hpp b/include/afml/common.hpp
new file mode 100644
index 0000000..c2287c3
--- /dev/null
+++ b/include/afml/common.hpp
@@ -0,0 +1,53 @@
+#ifndef AFML_COMMON_HPP_
+#define AFML_COMMON_HPP_
+
+#include <arrayfire.h>
+
+#if __cplusplus < 201100L
+#include <boost/make_shared.hpp>
+#include <boost/shared_ptr.hpp>
+#include <boost/unordered_map.hpp>
+#else
+#include <initializer_list>
+#include <memory>
+#include <unordered_map>
+#include <cstddef>
+#endif
+
+#include "afml/thrift/afml_types.h"
+
+namespace afml {
+
+// NOTE(review): header-level using-directives leak into every includer.
+using namespace af;
+using namespace std;
+
+// nullptr is a keyword, so it cannot be named in a using-declaration, and
+// namespace boost is only declared on the branch that includes Boost.
+#if __cplusplus < 201100L
+using namespace boost;
+using boost::make_shared;
+using boost::shared_ptr;
+using boost::unordered_map;
+template <class K, class V> struct map : unordered_map<K, V> {};  // no alias templates before C++11
+#else
+using std::initializer_list;
+using std::make_shared;
+using std::shared_ptr; // Can CUDA with this?
+using std::unordered_map;
+template <class K, class V> using map = unordered_map<K, V>;
+#endif
+
+// Just to be consistent in camel case style; qualified because std::array may be visible too.
+typedef af::array Array;
+typedef vector<Array> ArrayVec;
+typedef shared_ptr<Array> ArrayPtr;
+typedef vector<ArrayPtr> ArrayPtrVec;
+
+class Node;
+typedef shared_ptr<Node> NodePtr;
+typedef vector<NodePtr> NodePtrVec;
+
+}  // namespace afml
+
+#endif /* AFML_COMMON_HPP_ */
diff --git a/include/afml/container.hpp b/include/afml/container.hpp
new file mode 100644
index 0000000..eb2c9ff
--- /dev/null
+++ b/include/afml/container.hpp
@@ -0,0 +1,62 @@
+#ifndef AFML_CONTAINER_HPP_
+#define AFML_CONTAINER_HPP_
+
+#include "afml/afml.hpp"
+
+namespace afml {
+
+// From Torch7
+// https://github.com/torch/nn/blob/master/Container.lua
+// Contains multiple nodes for easier management or building complex networks
+class Container : public Node {
+ public:
+  // NOTE(review): Node declares only Node(const NodeConfig&); confirm how the base is built.
+  Container();
+  // Return this shared_ptr to chain calls add()->add()->add()
+  virtual shared_ptr<Container> add(const NodePtr node);
+
+  // Returns the contained module at position index.
+  virtual NodePtr get(const size_t index) const;
+
+  // Returns the number of contained modules.
+  size_t size() const;
+  // Overrides of the Node interface; only declared in this header.
+  virtual void forward();
+  virtual void backward();
+  virtual void toString() const;
+};
+
+// Container counterpart of Torch7's https://github.com/torch/nn/blob/master/Concat.lua
+class Concat : public Container {
+ public:
+  // explicit: a bare size_t must not convert silently into a Concat.
+  explicit Concat(const size_t concatDim);
+  virtual ~Concat();
+};
+
+// Sequentially connected nodes: https://github.com/torch/nn/blob/master/Sequential.lua
+class Sequential : public Container {
+ public:  // the destructor and add() have to be reachable by callers
+  virtual ~Sequential();
+  // Overrides Container::add, so the return type has to match it exactly.
+  virtual shared_ptr<Container> add(const NodePtr node);
+  virtual void insert(const NodePtr node, const size_t index);
+  virtual void remove (const size_t index);
+  virtual void toString() const;
+};
+
+// https://github.com/torch/nn/blob/master/Parallel.lua
+// To run multiple copies of a part of a model on different GPUs.
+class Parallel : public Container {
+ public:
+  Parallel(const size_t inputDim, const size_t outputDim);
+  virtual ~Parallel();
+  virtual void forward();  // overrides Container::forward
+  virtual void backward();  // overrides Container::backward
+  virtual void toString() const;  // overrides Container::toString
+};
+
+
+}  // namespace afml
+
+#endif /* AFML_CONTAINER_HPP_ */
diff --git a/include/afml/io.hpp b/include/afml/io.hpp
new file mode 100644
index 0000000..3e7b5a7
--- /dev/null
+++ b/include/afml/io.hpp
@@ -0,0 +1,20 @@
+#ifndef AFML_IO_HPP_
+#define AFML_IO_HPP_
+
+#include "afml/common.hpp"
+
+namespace afml {
+// Converts Array objects to Data objects and back (declarations only here).
+class SerDe {
+ public:
+  // NOTE(review): confirm where Data comes from and what the returned references refer to.
+  // Defined in thrift/afml.thrift
+  Data& serialize(const Array& arr) const;
+  Array& deserialize(const Data& data);
+};
+
+
+}  // namespace afml
+
+
+#endif /* AFML_IO_HPP_ */
diff --git a/include/afml/node.hpp b/include/afml/node.hpp
new file mode 100644
index 0000000..e588e0f
--- /dev/null
+++ b/include/afml/node.hpp
@@ -0,0 +1,252 @@
+#ifndef AFML_NODE_HPP_
+#define AFML_NODE_HPP_
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "afml/common.hpp"
+
+namespace afml {
+
+// Base class of every vertex in the network graph. A node keeps its successors
+// ("next") and predecessors ("prev") both by name and in insertion order.
+// https://github.com/BVLC/caffe/blob/master/include/caffe/layer.hpp
+// https://github.com/torch/nn/blob/master/Module.lua
+class Node {
+ public:
+  explicit Node(const NodeConfig& nodeConfig);
+  virtual ~Node();
+  // getAllNodes and traverse are from
+  // https://github.com/zxie/nn/blob/master/nets/graph.py
+  static NodePtrVec getAllNodes(const NodePtrVec& startNodes);
+
+  // Calls fn(node) for nodes of startNodes, each one only after all of its
+  // predecessors were visited. A successor that is missing from startNodes
+  // never reaches a zero count and is therefore not visited.
+  template<class Function>
+  static void traverse(const NodePtrVec& startNodes, Function fn) {
+    NodePtrVec readyNodes;
+    map<string, int> deps;  // node name -> predecessors not yet visited
+    for (size_t i = 0; i < startNodes.size(); ++i) {
+      if (startNodes[i]->numPrev() == 0) {
+        readyNodes.push_back(startNodes[i]);
+      }
+      deps[startNodes[i]->name()] = startNodes[i]->numPrev();
+    }
+    while (!readyNodes.empty()) {
+      NodePtrVec nextReadyNodes;
+      for (size_t i = 0; i < readyNodes.size(); ++i) {
+        fn(readyNodes[i]);
+        const vector<string> names = readyNodes[i]->nextNames();
+        for (size_t j = 0; j < names.size(); ++j) {
+          if (--deps[names[j]] == 0) {
+            nextReadyNodes.push_back(readyNodes[i]->next(names[j]));
+          }
+        }
+        deps.erase(readyNodes[i]->name());
+      }
+      readyNodes = nextReadyNodes;
+    }
+  }
+
+  // Input and output are more general than the top and bottom of Caffe
+  virtual void forward() = 0;
+  // Since the network is DAG, propagate_back is more general than
+  // propagate_down of Caffe
+  virtual void backward() = 0;
+
+  // Stores the sum of gradient() over all next nodes in gradientWrtOutput_.
+  // Does nothing when there is no next node.
+  void computeGradientWrtOutput() {
+    if (nextNodes_.empty()) {
+      return;
+    }
+    // Accumulate in a new Array object rather than in the first successor's.
+    gradientWrtOutput_ = make_shared<Array>(*nextNodes_[0]->gradient());
+    for (size_t i = 1; i < nextNodes_.size(); ++i) {
+      *gradientWrtOutput_ += *nextNodes_[i]->gradient();
+    }
+  }
+
+  // Multiplies gradientWrtInput_ in place by gradientWrtOutput_.
+  void composeGradient() {
+    *gradientWrtInput_ *= *gradientWrtOutput_;
+  }
+
+  // Must be called after every addNext call, i.e. once the nodes of the
+  // network are connected with each other.
+  // Calls initNode, which subclasses implement.
+  void init() {
+    self();
+    checkNumNextPrevNodes();
+    initNode();
+  }
+
+  string name() const { return name_; }
+
+  // Must make sure there's no copy to return Array
+  ArrayPtr output() const { return output_; }
+
+  // Same pointer as gradientWrtInput().
+  ArrayPtr gradient() const { return gradientWrtInput(); }
+
+  ArrayPtr gradientWrtInput() const { return gradientWrtInput_; }
+
+  ArrayPtr gradientWrtOutput() const { return gradientWrtOutput_; }
+
+  // Looks the parameter up by name and CHECKs that it exists.
+  ArrayPtr param(const string& name) const {
+    map<string, ArrayPtr>::const_iterator it = params_.find(name);
+    CHECK(it != params_.end());
+    return it->second;
+  }
+
+  // Same lookup for the gradient with respect to a named parameter.
+  ArrayPtr gradientWrtParams(const string& name) const {
+    map<string, ArrayPtr>::const_iterator it = gradientWrtParams_.find(name);
+    CHECK(it != gradientWrtParams_.end());
+    return it->second;
+  }
+
+  void add(NodePtr node) { addNext(node); }
+
+  // The model is DAG(Directed Acyclic Graph)
+  // Links this node and node in both directions unless already linked.
+  void addNext(NodePtr node) {
+    if (!hasNext(node) && !node->hasPrev(self())) {
+      next_[node->name()] = node;
+      nextNodes_.push_back(node);
+      node->addPrev(self());
+    }
+  }
+
+  // Records node as a predecessor only; node's own next links are not touched.
+  void addPrev(NodePtr node) {
+    prev_[node->name()] = node;
+    prevNodes_.push_back(node);
+  }
+
+  // Shortcut to add multiple nodes
+  void addNext(vector<NodePtr>& nodes) {
+    for (size_t i = 0; i < nodes.size(); ++i) {
+      addNext(nodes[i]);
+    }
+  }
+
+  void addPrev(vector<NodePtr>& nodes) {
+    for (size_t i = 0; i < nodes.size(); ++i) {
+      addPrev(nodes[i]);
+    }
+  }
+
+  // next_ and prev_ are keyed by node name, so the lookups go through name().
+  bool hasNext(NodePtr node) const { return next_.find(node->name()) != next_.end(); }
+  bool hasPrev(NodePtr node) const { return prev_.find(node->name()) != prev_.end(); }
+
+  // Read-only views: a const member function cannot hand out mutable members.
+  const map<string, NodePtr>& next() const { return next_; }
+  const map<string, NodePtr>& prev() const { return prev_; }
+  const vector<NodePtr>& nextNodes() const { return nextNodes_; }
+  const vector<NodePtr>& prevNodes() const { return prevNodes_; }
+
+  // Names of all next nodes, in the iteration order of next_.
+  vector<string> nextNames() const {
+    vector<string> names;
+    for (map<string, NodePtr>::const_iterator iter = next_.begin();
+        iter != next_.end(); ++iter) {
+      names.push_back(iter->first);
+    }
+    return names;
+  }
+
+  // Names of all previous nodes, in the iteration order of prev_.
+  vector<string> prevNames() const {
+    vector<string> names;
+    for (map<string, NodePtr>::const_iterator iter = prev_.begin();
+        iter != prev_.end(); ++iter) {
+      names.push_back(iter->first);
+    }
+    return names;
+  }
+
+  // Returns the next node called name, or an empty pointer when there is none.
+  NodePtr next(const string& name) const {
+    map<string, NodePtr>::const_iterator it = next_.find(name);
+    return it != next_.end() ? it->second : NodePtr();
+  }
+
+  // Returns the previous node called name, or an empty pointer.
+  NodePtr prev(const string& name) const {
+    map<string, NodePtr>::const_iterator it = prev_.find(name);
+    return it != prev_.end() ? it->second : NodePtr();
+  }
+
+  size_t numNext() const { return next_.size(); }
+  size_t numPrev() const { return prev_.size(); }
+
+  // From Torch7 module API
+  // https://github.com/torch/nn/blob/master/doc/module.md
+  void training() {
+    train_ = true;
+  }
+
+  void evaluating() {
+    train_ = false;
+  }
+
+  void shareParams(const NodePtr node);
+  void shareParams(const NodePtr node, const string& name);
+  void shareParams(const NodePtr node, const string& name1, const string& name2);
+  void shareParams(const NodePtr node, const vector<string>& names);
+  NodePtr clone();
+  NodePtr clone(const string& name);
+  NodePtr clone(const string& name1, const string& name2);
+  NodePtr clone(const vector<string>& names);
+
+  // Use CPU or GPU
+  void cpu();
+  void gpu();
+
+  virtual void toString() const;
+
+ protected:
+  virtual void checkNumNextPrevNodes() = 0;
+  virtual void initNode() = 0;
+
+  // Non-owning handle to this node, created on first use. Its deleter is a
+  // no-op, so the handle can never delete the node a second time.
+  NodePtr self() {
+    if (!thisNode_) {
+      thisNode_.reset(this, &Node::noDelete);
+    }
+    return thisNode_;
+  }
+  static void noDelete(Node*) {}
+
+  string name_;
+  NodeConfig config_;  // held by value so it cannot dangle
+  // NOTE(review): no in-class initialiser; the constructor has to set train_.
+  bool train_;
+  NodePtr thisNode_;
+  ArrayPtr output_;
+  // wrt = with regard to
+  ArrayPtr gradientWrtInput_;
+  ArrayPtr gradientWrtOutput_;
+  map<string, ArrayPtr> params_;
+  map<string, ArrayPtr> gradientWrtParams_;
+  map<string, NodePtr> next_;
+  vector<NodePtr> nextNodes_;
+  map<string, NodePtr> prev_;
+  vector<NodePtr> prevNodes_;
+  size_t expectedNumNextNodes_;
+  size_t expectedNumPrevNodes_;
+  size_t expectedMinNumNextNodes_;
+  size_t expectedMinNumPrevNodes_;
+  size_t expectedMaxNumNextNodes_;
+  size_t expectedMaxNumPrevNodes_;
+};
+
+}  // namespace afml
+
+#endif /* AFML_NODE_HPP_ */
diff --git a/include/afml/nodes.hpp b/include/afml/nodes.hpp
new file mode 100644
index 0000000..18a16a8
--- /dev/null
+++ b/include/afml/nodes.hpp
@@ -0,0 +1,10 @@
+#ifndef AFML_NODES_HPP_
+#define AFML_NODES_HPP_
+
+#include "afml/nodes/common_nodes.hpp"
+#include "afml/nodes/convolution_nodes.hpp"
+#include "afml/nodes/data_nodes.hpp"
+#include "afml/nodes/elementwise_nodes.hpp"
+#include "afml/nodes/loss_nodes.hpp"
+
+#endif /* AFML_NODES_HPP_ */
diff --git a/include/afml/nodes/common_nodes.hpp b/include/afml/nodes/common_nodes.hpp
new file mode 100644
index 0000000..5a6b2aa
--- /dev/null
+++ b/include/afml/nodes/common_nodes.hpp
@@ -0,0 +1,63 @@
+#ifndef AFML_COMMON_NODES_HPP_
+#define AFML_COMMON_NODES_HPP_
+
+#include "afml/common.hpp"
+
+namespace afml {
+// Pooling variants; a plain (unscoped) enum, so enumerators compare with int.
+enum PoolingType {
+  AVERAGE,
+  MAX,
+  STOCHASTIC
+};
+
+// Fully connected layer: output = W x input, with W stored as parameter "W".
+class Linear : public Node {
+ public:
+  virtual ~Linear();
+  // Parameters are keyed by string. matmul is the matrix product, whereas
+  // operator* on Array multiplies element by element.
+  virtual void forward() {
+    output_ = make_shared<Array>(matmul(*params_["W"], *prevNodes_[0]->output()));
+  }
+  virtual void backward() {
+    computeGradientWrtOutput();
+    gradientWrtInput_ = make_shared<Array>(matmul(params_["W"]->T(), *gradientWrtOutput_));
+    gradientWrtParams_["W"] = make_shared<Array>(  // gradient x transposed input
+        matmul(*gradientWrtOutput_, prevNodes_[0]->output()->T()));
+  }
+ protected:
+  virtual void checkNumNextPrevNodes() {
+    CHECK(nextNodes_.size() >= expectedMinNumNextNodes_);
+    CHECK(prevNodes_.size() == expectedNumPrevNodes_);
+  }
+  virtual void initNode() { output_ = make_shared<Array>(prevNodes_[0]->output()->dims()); }
+};
+
+// Softmax along dimension 0 of the previous node's output.
+class Softmax : public Node {
+ public:
+  virtual ~Softmax();
+  virtual void forward() {
+    const Array& input = *prevNodes_[0]->output();
+    const int axis = 0;
+    const unsigned rows = static_cast<unsigned>(input.dims(axis));
+    // The maximum is subtracted before exp; tile() repeats each reduced row
+    // so that both operands of the arithmetic have the same dimensions.
+    Array probs = exp(input - tile(max(input, axis), rows));
+    output_ = make_shared<Array>(probs / tile(sum(probs, axis), rows));
+  }
+  // Diagonal of the softmax Jacobian, s * (1 - s); cross terms are left out.
+  virtual void backward() { gradientWrtInput_ = make_shared<Array>(*output_ - *output_ * *output_); }
+ protected:
+  virtual void checkNumNextPrevNodes() {
+    CHECK(nextNodes_.size() >= expectedMinNumNextNodes_);
+    CHECK(prevNodes_.size() == expectedNumPrevNodes_);
+  }
+  virtual void initNode() { output_ = make_shared<Array>(prevNodes_[0]->output()->dims()); }
+};
+
+}  // namespace afml
+
+
+#endif /* AFML_COMMON_NODES_HPP_ */
diff --git a/include/afml/nodes/convolution_nodes.hpp b/include/afml/nodes/convolution_nodes.hpp
new file mode 100644
index 0000000..3a38f9b
--- /dev/null
+++ b/include/afml/nodes/convolution_nodes.hpp
@@ -0,0 +1,42 @@
+#ifndef AFML_CONVOLUTION_NODES_HPP_
+#define AFML_CONVOLUTION_NODES_HPP_
+
+#include "afml/common.hpp"
+
+namespace afml {
+
+// TODO: Which is better, Caffe or Torch7's convolution API?
+
+// https://github.com/BVLC/caffe/blob/master/include/caffe/vision_layers.hpp
+class BaseConvolution : public Node {
+  // Placeholder: no members are declared yet.
+};
+
+class Convolution : public BaseConvolution {
+  // Placeholder: no members are declared yet.
+};
+
+class DeconvolutionLayer : public BaseConvolution {
+  // Placeholder: no members are declared yet.
+};
+
+
+// What's the difference between SpatialConvolution, SpatialConvolutionMM,
+// SpatialConvolutionMap,  SpatialFullConvolution,  SpatialFullConvolutionMap,
+// TemporalConvolution, and VolumetricConvolution of torch/nn?
+// Can they be simplified and unified?
+// https://github.com/torch/nn/
+
+// Convolution node declared with the argument list of Torch7's
+// https://github.com/torch/nn/blob/master/SpatialConvolution.lua
+class SpatialConvolution : public Node {
+ public:
+  SpatialConvolution(const string& name, const size_t numInputPlane, const size_t numOutputPlane,
+      const size_t kernelWidth, const size_t kernelHeight, const size_t strideWidth,
+      const size_t strideHeight, const size_t padWidth, const size_t padHeight);
+};
+
+}  // namespace afml
+
+#endif /* AFML_CONVOLUTION_NODES_HPP_ */
+
diff --git a/include/afml/nodes/data_nodes.hpp b/include/afml/nodes/data_nodes.hpp
new file mode 100644
index 0000000..801c135
--- /dev/null
+++ b/include/afml/nodes/data_nodes.hpp
@@ -0,0 +1,26 @@
+#ifndef AFML_DATA_NODES_HPP_
+#define AFML_DATA_NODES_HPP_
+
+#include "afml/common.hpp"
+
+namespace afml {
+
+// Data node: all four Node hooks are implemented as no-ops for now.
+class Data : public Node {
+ public:  // the destructor and forward()/backward() must be reachable by callers
+  virtual ~Data();
+  virtual void forward() {
+  }
+  virtual void backward() {
+  }
+ protected:
+  virtual void checkNumNextPrevNodes() {
+  }
+  virtual void initNode() {
+  }
+};
+
+}  // namespace afml
+
+
+#endif /* AFML_DATA_NODES_HPP_ */
diff --git a/include/afml/nodes/elementwise_nodes.hpp b/include/afml/nodes/elementwise_nodes.hpp
new file mode 100644
index 0000000..0c33423
--- /dev/null
+++ b/include/afml/nodes/elementwise_nodes.hpp
@@ -0,0 +1,33 @@
+#ifndef AFML_ELEMENTWISE_NODES_HPP_
+#define AFML_ELEMENTWISE_NODES_HPP_
+
+#include "afml/common.hpp"
+
+namespace afml {
+// Rectified linear unit: keeps positive inputs and zeroes the others.
+class ReLU : public Node {
+ public:
+  virtual ~ReLU();
+  virtual void forward() {
+    const Array& input = *prevNodes_[0]->output();
+    output_ = make_shared<Array>(input * (input > 0));
+  }
+  virtual void backward() {
+    // Local derivative: 1 where the output is positive, 0 elsewhere, stored in
+    // the output's element type so composeGradient() can scale it in place.
+    gradientWrtInput_ = make_shared<Array>((*output_ > 0).as(output_->type()));
+    computeGradientWrtOutput();
+    composeGradient();
+  }
+ protected:
+  virtual void checkNumNextPrevNodes() {
+    CHECK(nextNodes_.size() >= expectedMinNumNextNodes_);
+    CHECK(prevNodes_.size() == expectedNumPrevNodes_);
+  }
+  virtual void initNode() { output_ = make_shared<Array>(prevNodes_[0]->output()->dims()); }
+};
+
+}  // namespace afml
+
+
+#endif /* AFML_ELEMENTWISE_NODES_HPP_ */
diff --git a/include/afml/nodes/loss_nodes.hpp b/include/afml/nodes/loss_nodes.hpp
new file mode 100644
index 0000000..b38c73c
--- /dev/null
+++ b/include/afml/nodes/loss_nodes.hpp
@@ -0,0 +1,20 @@
+#ifndef AFML_LOSS_NODES_HPP_
+#define AFML_LOSS_NODES_HPP_
+
+#include "afml/common.hpp"
+
+namespace afml {
+
+class Accuracy : public Node {
+  // Placeholder: no members are declared yet.
+};
+
+// This is probably the most commonly used loss for classification
+class NegativeLogLikelihood : public Node {
+  // Placeholder: no members are declared yet.
+};
+
+}  // namespace afml
+
+
+#endif /* AFML_LOSS_NODES_HPP_ */
diff --git a/include/afml/parallel.hpp b/include/afml/parallel.hpp
new file mode 100644
index 0000000..3dc2e65
--- /dev/null
+++ b/include/afml/parallel.hpp
@@ -0,0 +1,46 @@
+#ifndef AFML_PARALLEL_HPP_
+#define AFML_PARALLEL_HPP_
+
+#include "afml/common.hpp"
+#include "afml/container.hpp"
+
+namespace afml {
+// Container whose nodes are registered together with the GPU id they run on.
+// https://github.com/facebook/fbcunn/blob/master/fbcunn/AbstractParallel.lua
+class AbstractParallel : public Container {
+ public:
+  explicit AbstractParallel(const size_t dim);  // no implicit size_t conversion
+  virtual size_t nextGPU() const;
+  using Container::add;  // keeps add(node) visible next to the overload below
+  // Add the node to run on gpuID
+  virtual void add(const size_t gpuID, const NodePtr node);
+  virtual NodePtr get(const size_t index) const;
+  void asyncCopy(const ArrayPtr source, const ArrayPtr dest);
+ protected:
+  void distributeGradientWrtOutput();
+};
+// https://github.com/facebook/fbcunn/blob/master/fbcunn/DataParallel.lua
+class DataParallel : public AbstractParallel {
+ public:
+  explicit DataParallel(const size_t dim);  // base constructors are not inherited; mirrors ModelParallel
+ protected:
+  void distributeInput(const ArrayPtr input);
+  void gatherGradients();
+  void combineGradients(const size_t row, const ArrayPtrVec& gradients);
+};
+
+// https://github.com/facebook/fbcunn/blob/master/fbcunn/ModelParallel.lua
+class ModelParallel : public AbstractParallel {
+ public:
+  explicit ModelParallel(const size_t dim);  // no implicit size_t conversion
+  virtual size_t nextGPU() const;
+  using Container::add;  // keeps add(node) visible next to the overload below
+  virtual void add(const size_t gpuID, const NodePtr node);
+  virtual NodePtr get(const size_t index) const;
+  void distributeInput(const ArrayPtr input);
+};
+
+}  // namespace afml
+
+
+#endif /* AFML_PARALLEL_HPP_ */
diff --git a/src/afml/.gitignore b/src/afml/.gitignore
new file mode 100644
index 0000000..67a9013
--- /dev/null
+++ b/src/afml/.gitignore
@@ -0,0 +1 @@
+/thrift/
diff --git a/thrift/afml.thrift b/thrift/afml.thrift
new file mode 100644
index 0000000..1e2a362
--- /dev/null
+++ b/thrift/afml.thrift
@@ -0,0 +1,22 @@
+namespace cpp afml
+namespace csharp afml
+namespace go afml
+namespace html afml
+namespace java afml
+namespace js afml
+namespace json afml
+namespace lua afml
+namespace perl afml
+namespace php afml
+namespace py afml
+namespace rb afml
+
+// In Torch7, tensors are backed by storages
+struct Storage {
+  1: list<i32> dims,  // NOTE(review): presumably the extent of each dimension - confirm
+  2: string data  // NOTE(review): element type and byte layout are not specified here
+}
+// Configuration record taken by the afml::Node constructor (see node.hpp).
+struct NodeConfig {
+  1: string name,
+}
\ No newline at end of file
diff --git a/thrift/gen_thrift.sh b/thrift/gen_thrift.sh
new file mode 100755
index 0000000..64c6550
--- /dev/null
+++ b/thrift/gen_thrift.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+# Generates the C++ Thrift sources for afml and moves the headers to
+# ../include/afml/thrift and the .cpp files to ../src/afml/thrift.
+set -euo pipefail
+cd "$(dirname "$0")"  # every path below is relative to this script's directory
+
+NAME=afml
+
+thrift -r --gen cpp "${NAME}.thrift"
+# Rewrite the generated includes to the installed location, e.g.
+# #include "afml/thrift/afml_constants.h"
+sed -i "s|#include \"${NAME}_constants|#include \"${NAME}/thrift/${NAME}_constants|g" gen-cpp/*
+sed -i "s|#include \"${NAME}_types|#include \"${NAME}/thrift/${NAME}_types|g" gen-cpp/*
+
+mkdir -p "../include/${NAME}/thrift" "../src/${NAME}/thrift"
+mv gen-cpp/*.h "../include/${NAME}/thrift"
+mv gen-cpp/*.cpp "../src/${NAME}/thrift"
+rm -rf gen-cpp
+
+# thrift -r --gen java afml.thrift
+# thrift -r --gen js afml.thrift
+# thrift -r --gen json afml.thrift
+# thrift -r --gen lua afml.thrift
+# thrift -r --gen py afml.thrift