Merge pull request #26 from adeeconometrics/dev-etparser

Update Documentation
adeeconometrics · Jun 19, 2024 · c6e9bd6 · c6e9bd6
2 parents 456447f + c695106
commit c6e9bd6
Show file tree

Hide file tree

Showing 4 changed files with 72 additions and 73 deletions.
diff --git a/benchmark/linalg_libs/gflops.py b/benchmark/linalg_libs/gflops.py
@@ -37,7 +37,7 @@ def measure_expr(A: Array,
 if __name__ == "__main__":
     table = []
     M, N, K = 256, 256, 256
-    nsec: int = 230162833 * 1e-9  # 49758236*1e-9  # 38_283_092_347
+    nsec: int = 813708 * 1e-9  # 49758236*1e-9  # 38_283_092_347
 
     floating_ops: int = 4*M*N*K + 5*M*N  # 2 * M*N*K
     lm_gflops: int = get_gflops(floating_ops, nsec)

diff --git a/bluprint/outline.md b/bluprint/outline.md
diff --git a/docs_sphinx/index.rst b/docs_sphinx/index.rst
@@ -11,10 +11,76 @@ Welcome to LazyMat's documentation!
    :caption: Contents:
 
 
+.. image:: ../img/LazyMatLogo.png
+   :align: center
+
+
+
+Motivation, Goals, and Disclaimers
+----------------------------------
+
+This is a personal project that explores ideas of deferred execution and
+investigates how it can improve performance for vectorized expressions. I wrote this
+project to educate myself on the subtle techniques of expression templates and locality of reference.
+
+The project aims to support vector and matrix operations. There is no long-term initiative
+to maintain and continuously develop this code. You are welcome to submit an issue or suggest improvements.
+The robustness of this codebase is limited to what its test coverage exercises.
+It is not recommended for use in a production environment.
+
+
+Performance
+-----------
+
+Matmul benchmark on an Apple M2 (8-core), macOS 14.5, Release mode, compiled for ``arm64``.
+
+.. code-block:: console
+
+   (venv) ➜  benchmark git:(dev-etparser) ✗ python linalg_libs/gflops.py
+   lib           gflop/s         secs  relative throughput    Size
+   ----------  ---------  -----------  ---------------------  --------------
+   lazy_mat    2356.53    0.000911291  ---                    1024x1024x1024
+   numpy         67.4947  0.0318171    34.9143x               1024x1024x1024
+   pytorch       86.5684  0.0248068    27.2216x               1024x1024x1024
+   tensorflow    94.7282  0.02267      24.8767x               1024x1024x1024
+
+   (venv) ➜  benchmark git:(dev-etparser) ✗ python linalg_libs/gflops.py
+   lib           gflop/s       secs  relative throughput    Size
+   ----------  ---------  ---------  ---------------------  --------------
+   lazy_mat    7470.82    0.0022996  ---                    2048x2048x2048
+   numpy         96.7968  0.177484   77.1804x               2048x2048x2048
+   pytorch      172.91    0.0993571  43.2063x               2048x2048x2048
+   tensorflow   156.748   0.109602   47.6613x               2048x2048x2048
+
+   (venv) ➜  benchmark git:(dev-etparser) ✗ python linalg_libs/gflops.py
+   lib           gflop/s      secs  relative throughput    Size
+   ----------  ---------  --------  ---------------------  --------------
+   lazy_mat     6614.64   0.020778  ---                    4096x4096x4096
+   numpy         146.621  0.937376  45.1139x               4096x4096x4096
+   pytorch       277.298  0.495636  23.8539x               4096x4096x4096
+   tensorflow    158.817  0.865391  41.6494x               4096x4096x4096
+
+**Build for Debug**:
+
+.. code-block:: bash
+
+   $ mkdir build && cd build && cmake -DCMAKE_BUILD_TYPE=Debug ../ && make
+
+**Build for Release**:
+
+.. code-block:: bash
+
+   $ mkdir build && cd build && cmake -DCMAKE_BUILD_TYPE=Release ../ && make
+
+**Running tests**:
+
+.. code-block:: bash
+
+   $ ./bin/unittest
+
 
 Indices and tables
 ==================
 
 * :ref:`genindex`
-* :ref:`modindex`
 * :ref:`search`
diff --git a/src/main.cpp b/src/main.cpp
@@ -6,16 +6,14 @@
 
 using namespace lm;
 
-auto main(int argc, char **argv) -> int {
+auto main() -> int {
 
   std::mt19937 rng_a(67);
   std::mt19937 rng_b(65);
 
-  // get the matrix dimensions from command line
-
-  constexpr int M = 4096 * 2;
-  constexpr int N = 4096 * 2;
-  constexpr int K = 4096 * 2;
+  constexpr int M = 255;
+  constexpr int N = 255;
+  constexpr int K = 255;
 
   Matrix<float, M, N> A{make_vmatrix<float, M, N>(std::ref(rng_a))};
   Matrix<float, N, K> B{make_vmatrix<float, N, K>(std::ref(rng_b))};