From 131b42d02806579cd66935510274ee436834d805 Mon Sep 17 00:00:00 2001 From: ayman Date: Wed, 9 Jun 2021 18:32:11 -0400 Subject: [PATCH] First commit --- CHANGELOG.md | 12 +++ LICENSE.txt | 202 +++++++++++++++++++++++++++++++++++++++ README.md | 129 +++++++++++++++++++++++++ distances.nimble | 14 +++ src/distances/seq.nim | 159 ++++++++++++++++++++++++++++++ src/distances/tensor.nim | 159 ++++++++++++++++++++++++++++++ src/distances/vector.nim | 159 ++++++++++++++++++++++++++++++ tests/config.nims | 1 + tests/test_seq.nim | 91 ++++++++++++++++++ tests/test_tensor.nim | 91 ++++++++++++++++++ tests/test_vector.nim | 90 +++++++++++++++++ 11 files changed, 1107 insertions(+) create mode 100644 CHANGELOG.md create mode 100644 LICENSE.txt create mode 100644 README.md create mode 100644 distances.nimble create mode 100644 src/distances/seq.nim create mode 100644 src/distances/tensor.nim create mode 100644 src/distances/vector.nim create mode 100644 tests/config.nims create mode 100644 tests/test_seq.nim create mode 100644 tests/test_tensor.nim create mode 100644 tests/test_vector.nim diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..8a19316 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,12 @@ +# Changelog +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [Unreleased] + + +## [0.1.0] - 2021-06-09 +### Added +- Created distances library diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 0000000..692c6f0 --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2021 Ayman Albaz + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md new file mode 100644 index 0000000..041aa8a --- /dev/null +++ b/README.md @@ -0,0 +1,129 @@ +# Distances + +Distances is a high performance Nim library for calculating distances. + +This library is designed to allow users to calculate common distance metrics across all the popular sequence based libraries in Nim. + +## Supported Libraries +Current supported sequence based libraries include: +1. [sequtils](https://github.com/nim-lang/Nim/blob/devel/lib/pure/collections/sequtils.nim) +2. [arraymancer](https://github.com/mratsim/Arraymancer) +3. [neo](https://github.com/andreaferretti/neo) + + +## Supported Distance Metrics +Current supported distance metrics include: + +| Distance | Command | +|-------------------|---------------------------------| +| Hamming | hamming_distance(x1, x2) | +| Euclidean | euclidean_distance(x1, x2) | +| Squared Euclidean | sqeuclidean_distance(x1, x2) | +| City Block | cityblock_distance(x1, x2) | +| Total Variation | totalvariation_distance(x1, x2) | +| Jaccard | jaccard_distance(x1, x2) | +| Cosine | cosine_distance(x1, x2) | +| KL Divergence | kldivergence_distance(x1, x2) | + +## Examples + +### Calculating Cosine Distance - sequtils +Note: All computations are done row-wise. +```Nim +import sequtils +import distances/seq + +let + num_rows = 100 + num_cols = 100 + input_seq_int = newSeq[int](num_cols) + input_seq_seq_int = newSeqWith(num_rows, newSeq[int](num_cols)) + +# 1D distance +echo cosine_distance(input_seq_int, input_seq_int) + +# 2D distance (Pairwise) +echo pairwise(input_seq_seq_int, cosine_distance) +``` + + +### Calculating Cosine Distance - arraymancer +Note: Only 2D Tensors are supported. All computations are done column wise. +```Nim +import arraymancer +import distances/tensor + +let + num_rows = 100 + num_cols = 100 + input_tensor_1d_int = zeros[int](1, num_cols) + input_tensor_2d_int = zeros[int](num_rows, num_cols) + +# 1D distance +echo cosine_distance(input_tensor_1d_int, input_tensor_1d_int) + +# 2D distance (Pairwise) +echo pairwise(input_tensor_2d_int, cosine_distance) +``` + + +### Calculating Cosine Distance - neo +Note: All computations are done column wise. +Warning: Neo matrices seem to run 1-2 order of magnitudes slower than sequtils and arraymancer. Please contact me or submit a PR if you know why. +```Nim +import neo +import distances/vector + +let + num_rows = 100 + num_cols = 100 + input_vector_int = makeVector(num_cols, proc(i: int): int = 0) + input_matrix_int = makeMatrix(num_rows, num_cols, proc(i, j: int): int = 0) + +# 1D distance +echo cosine_distance(input_vector_int, input_vector_int) + +# 2D distance (Pairwise) +echo pairwise(input_matrix_int, cosine_distance) +``` + +### Normalization +All distance metrics support the optional `normalize` (defaults to `false`) parameter. This normalizes distance outputs (between -1 and 1). Note, while all distance metrics have this parameter only, it will do nothing for jaccard, cosine, and KL divergence distances. + +E.g. +```Nim +discard cosine_distance(input_vector_int, input_vector_int, normalize=true) +discard pairwise(input_matrix_int, cosine_distance, normalize=true) +``` + +### Symmetry +The `pairwise` procs always compute the lower left triangle of the 2D sequence to save time. To get a full matrix, use the `symmetrize(X, how: string = "l=>u")` proc. + +E.g. +```Nim +discard symmetrize(X, "l=>u") # Copy lower left triangle to upper right triangle +discard symmetrize(X, "u=>l") # Copy upper right triangle to lower left triangle +``` + +## Performance + +To get optimal performance, here are the recommended compiler flags: +`nim --cc:gcc --passC:"-fopenmp -ffast-math" --passL:"-fopenmp -ffast-math" --d:release -t:-mavx2 -t:-mfma c -r myScript.nim`. +- openmp -> Multiprocessing for the `pairwise` procs. Number of threads is equal to ENV variable `OMP_NUM_THREADS`. +- ffast-math -> ~2x float multiplication speedups +- d:release -> ~100x pairwise speedup +- d:danger -> ~120x pairwise speedup +- t:-mavx2 and -t:-mfma -> ~0.20x pairwise speedup + + +## TODO +- Neo matrices seem to run 1-2 orders of magnitudes slower than sequtils and arraymancer. The reason is unknown to me. +- Add more distance metrics +- Add support for distance metrics with more than 2 arguments + +Performance, feature, and documentation PR's are always welcome. + + +## Contact +I can be reached at aymanalbaz98@gmail.com + diff --git a/distances.nimble b/distances.nimble new file mode 100644 index 0000000..c661f97 --- /dev/null +++ b/distances.nimble @@ -0,0 +1,14 @@ +# Package + +version = "0.1.0" +author = "ayman albaz" +description = "Distances is a high performance Nim library for calculating distances." +license = "Apache License 2.0" +srcDir = "src" + + +# Dependencies + +requires "nim >= 1.4.6" +requires "arraymancer >= 0.6.2" +requires "neo >= 0.3.1" \ No newline at end of file diff --git a/src/distances/seq.nim b/src/distances/seq.nim new file mode 100644 index 0000000..87121b6 --- /dev/null +++ b/src/distances/seq.nim @@ -0,0 +1,159 @@ +type + Submodule* = object + name*: string + +proc initSubmodule*(): Submodule = + Submodule(name: "Anonymous") + + +import math +import sequtils + +{.nanChecks: on, infChecks: on.} +proc hamming_distance*[T: SomeNumber](x1, x2: seq[T], normalize: bool = false): float = + let num_cols = len(x1) + var total: int32 = 0 + for k in 0..num_cols-1: + total += (x1[k] != x2[k]).int32 + if normalize: + return total.float / num_cols.float + else: + return total.float + + +proc euclidean_distance*[T: SomeNumber](x1, x2: seq[T], normalize: bool = false): float = + let num_cols = len(x1) + var total: typeof(x1[0]) = 0 + var total_intermediate: typeof(x1[0]) = 0 + for k in 0..num_cols-1: + total_intermediate = (x1[k] - x2[k]) + total += total_intermediate * total_intermediate + if normalize: + return sqrt(total.float) / num_cols.float + else: + return sqrt(total.float) + + +proc sqeuclidean_distance*[T: SomeNumber](x1, x2: seq[T], normalize: bool = false): float = + let num_cols = len(x1) + var total: typeof(x1[0]) = 0 + var total_intermediate: typeof(x1[0]) = 0 + for k in 0..num_cols-1: + total_intermediate = (x1[k] - x2[k]) + total += total_intermediate * total_intermediate + if normalize: + return total.float / num_cols.float + else: + return total.float + + +proc cityblock_distance*[T: SomeNumber](x1, x2: seq[T], normalize: bool = false): float = + let num_cols = len(x1) + var total: typeof(x1[0]) = 0 + for k in 0..num_cols-1: + total += abs(x1[k] - x2[k]) + if normalize: + return total.float / num_cols.float + else: + return total.float + + +proc totalvariation_distance*[T: SomeNumber](x1, x2: seq[T], normalize: bool = false): float = + let num_cols = len(x1) + var total: typeof(x1[0]) = 0 + for k in 0..num_cols-1: + total += abs(x1[k] - x2[k]) + if normalize: + return total.float / 2.0.float / num_cols.float + else: + return total.float / 2.0.float + + +proc jaccard_distance*[T: SomeNumber](x1, x2: seq[T], normalize: bool = false): float = + let num_cols = len(x1) + var + total_min: typeof(x1[0]) = 0 + total_max: typeof(x1[0]) = 0 + for k in 0..num_cols-1: + total_min += min(x1[k], x2[k]) + total_max += max(x1[k], x2[k]) + if total_max == 0: + return 0.0.float + else: + return 1.0.float - (total_min / total_max).float + + +proc cosine_distance*[T: SomeNumber](x1, x2: seq[T], normalize: bool = false): float = + let num_cols = len(x1) + var + total_x12: typeof(x1[0]) = 0 + total_x11: typeof(x1[0]) = 0 + total_x22: typeof(x1[0]) = 0 + for k in 0..num_cols-1: + total_x12 += x1[k] * x2[k] + total_x11 += x1[k] * x1[k] + total_x22 += x2[k] * x2[k] + if total_x11 == 0 or total_x22 == 0: + return 0.0.float + else: + return 1.0.float - total_x12.float / (sqrt(total_x11.float) * sqrt(total_x22.float)) + + +proc kldivergence_distance*[T: SomeNumber](x1, x2: seq[T], normalize: bool = false): float = + let num_cols = len(x1) + var total: float + for k in 0..num_cols-1: + if x1[k] == 0 and x2[k] == 0: + continue + else: + total += x1[k].float * ln(x1[k] / x2[k]).float + return total.float + + +proc pairwise*[T: SomeNumber](X: seq[seq[T]], distance: (proc(x1, x2: seq[T], normalize: bool): float), normalize: bool = false): seq[seq[float]] = + # Computes the pairwise distance of a 2D matrix across the selected distance + let num_rows = len(X) + var dist = newSeqWith(num_rows, newSeq[float](num_rows)) + for i in 0||(num_rows-1): + for j in 0..i: + dist[i][j] = distance(X[i], X[j], normalize) + return dist + + +proc symmetrize*(X: seq[seq[SomeNumber]], how: string = "l=>u"): seq[seq[SomeNumber]] = + # Mutates X symmetrical by copying upper triangle to lower triangle ("u=>l") or lower triangle to upper triangle ("l=>r") + let num_rows = len(X) + let num_cols = len(X[0]) + var X_copy = X + if num_rows != num_cols: + raise newException(ValueError, "Tensor must be symmetrical") + if how != "l=>u" and how != "u=>l": + raise newException(ValueError, "'how' must be one of 'l=>u' or 'u=>l'") + if how == "l=>u": + for i in 0..(num_cols-1): + for j in 0..i: + X_copy[j][i] = X[i][j] + else: + for i in 0..(num_cols-1): + for j in 0..i: + X_copy[i][j] = X[j][i] + return X_copy + + +proc symmetrize*(X: var seq[seq[SomeNumber]], how: string = "l=>u"): seq[seq[SomeNumber]] = + # Mutates X symmetrical by copying upper triangle to lower triangle ("u=>l") or lower triangle to upper triangle ("l=>r") + let num_rows = len(X) + let num_cols = len(X[0]) + if num_rows != num_cols: + raise newException(ValueError, "Tensor must be symmetrical") + if how != "l=>u" and how != "u=>l": + raise newException(ValueError, "'how' must be one of 'l=>u' or 'u=>l'") + if how == "l=>u": + for i in 0..(num_cols-1): + for j in 0..i: + X[j][i] = X[i][j] + else: + for i in 0..(num_cols-1): + for j in 0..i: + X[i][j] = X[j][i] + return X diff --git a/src/distances/tensor.nim b/src/distances/tensor.nim new file mode 100644 index 0000000..4ea1a32 --- /dev/null +++ b/src/distances/tensor.nim @@ -0,0 +1,159 @@ +type + Submodule* = object + name*: string + +proc initSubmodule*(): Submodule = + Submodule(name: "Anonymous") + + +import Arraymancer +import math + +{.nanChecks: on, infChecks: on.} +proc hamming_distance*[T: SomeNumber](x1, x2: Tensor[T], normalize: bool = false): float = + let num_rows = x1.shape[0] + var total: int32 = 0 + for k in 0..num_rows-1: + total += (x1[k, 0] != x2[k, 0]).int32 + if normalize: + return total.float / num_rows.float + else: + return total.float + + +proc euclidean_distance*[T: SomeNumber](x1, x2: Tensor[T], normalize: bool = false): float = + let num_rows = x1.shape[0] + var total: typeof(x1[0]) = 0 + var total_intermediate: typeof(x1[0]) = 0 + for k in 0..num_rows-1: + total_intermediate = (x1[k, 0] - x2[k, 0]) + total += total_intermediate * total_intermediate + if normalize: + return sqrt(total.float) / num_rows.float + else: + return sqrt(total.float) + + +proc sqeuclidean_distance*[T: SomeNumber](x1, x2: Tensor[T], normalize: bool = false): float = + let num_rows = x1.shape[0] + var total: typeof(x1[0]) = 0 + var total_intermediate: typeof(x1[0]) = 0 + for k in 0..num_rows-1: + total_intermediate = (x1[k, 0] - x2[k, 0]) + total += total_intermediate * total_intermediate + if normalize: + return total.float / num_rows.float + else: + return total.float + + +proc cityblock_distance*[T: SomeNumber](x1, x2: Tensor[T], normalize: bool = false): float = + let num_rows = x1.shape[0] + var total: typeof(x1[0]) = 0 + for k in 0..num_rows-1: + total += abs(x1[k, 0] - x2[k, 0]) + if normalize: + return total.float / num_rows.float + else: + return total.float + + +proc totalvariation_distance*[T: SomeNumber](x1, x2: Tensor[T], normalize: bool = false): float = + let num_rows = x1.shape[0] + var total: typeof(x1[0]) = 0 + for k in 0..num_rows-1: + total += abs(x1[k, 0] - x2[k, 0]) + if normalize: + return total.float / 2.0.float / num_rows.float + else: + return total.float / 2.0.float + + +proc jaccard_distance*[T: SomeNumber](x1, x2: Tensor[T], normalize: bool = false): float = + let num_rows = x1.shape[0] + var + total_min: typeof(x1[0]) = 0 + total_max: typeof(x1[0]) = 0 + for k in 0..num_rows-1: + total_min += min(x1[k, 0], x2[k, 0]) + total_max += max(x1[k, 0], x2[k, 0]) + if total_max == 0: + return 0.0.float + else: + return 1.0.float - (total_min / total_max).float + + +proc cosine_distance*[T: SomeNumber](x1, x2: Tensor[T], normalize: bool = false): float = + let num_rows = x1.shape[0] + var + total_x12: typeof(x1[0]) = 0 + total_x11: typeof(x1[0]) = 0 + total_x22: typeof(x1[0]) = 0 + for k in 0..num_rows-1: + total_x12 += x1[k, 0] * x2[k, 0] + total_x11 += x1[k, 0] * x1[k, 0] + total_x22 += x2[k, 0] * x2[k, 0] + if total_x11 == 0 or total_x22 == 0: + return 0.0.float + else: + return 1.0.float - total_x12.float / (sqrt(total_x11.float) * sqrt(total_x22.float)) + + +proc kldivergence_distance*[T: SomeNumber](x1, x2: Tensor[T], normalize: bool = false): float = + let num_rows = x1.shape[0] + var total: float + for k in 0..num_rows-1: + if x1[k, 0] == 0 and x2[k, 0] == 0: + continue + else: + total += x1[k, 0].float * ln(x1[k, 0] / x2[k, 0]).float + return total.float + + +proc pairwise*[T: SomeNumber](X: Tensor[T], distance: (proc(x1, x2: Tensor[T], normalize: bool): float), normalize: bool = false): Tensor[float] = + # Computes the pairwise distance of a 2D matrix across the selected distance + let num_cols = X.shape[1] + var dist = zeros[float](num_cols, num_cols) + for i in 0||(num_cols-1): + for j in 0..i: + dist[i, j] = distance(X[0.._, i], X[0.._, j], normalize) + return dist + + +proc symmetrize*(X: Tensor[SomeNumber], how: string = "l=>u"): Tensor[SomeNumber] = + # Mutates X symmetrical by copying upper triangle to lower triangle ("u=>l") or lower triangle to upper triangle ("l=>r") + let num_rows = X.shape[0] + let num_cols = X.shape[1] + var X_copy = X + if num_rows != num_cols: + raise newException(ValueError, "Matrix must be symmetrical") + if how != "l=>u" and how != "u=>l": + raise newException(ValueError, "'how' must be one of 'l=>u' or 'u=>l'") + if how == "l=>u": + for i in 0..(num_cols-1): + for j in 0..i: + X_copy[j, i] = X[i, j] + else: + for i in 0..(num_cols-1): + for j in 0..i: + X_copy[i, j] = X[j, i] + return X_copy + + +proc symmetrize*(X: var Tensor[SomeNumber], how: string = "l=>u"): Tensor[SomeNumber] = + # Mutates X symmetrical by copying upper triangle to lower triangle ("u=>l") or lower triangle to upper triangle ("l=>r") + let num_rows = X.shape[0] + let num_cols = X.shape[1] + if num_rows != num_cols: + raise newException(ValueError, "Matrix must be symmetrical") + if how != "l=>u" and how != "u=>l": + raise newException(ValueError, "'how' must be one of 'l=>u' or 'u=>l'") + if how == "l=>u": + for i in 0..(num_cols-1): + for j in 0..i: + X[j, i] = X[i, j] + else: + for i in 0..(num_cols-1): + for j in 0..i: + X[i, j] = X[j, i] + return X \ No newline at end of file diff --git a/src/distances/vector.nim b/src/distances/vector.nim new file mode 100644 index 0000000..03a54ec --- /dev/null +++ b/src/distances/vector.nim @@ -0,0 +1,159 @@ +type + Submodule* = object + name*: string + +proc initSubmodule*(): Submodule = + Submodule(name: "Anonymous") + + +import math +import neo + +{.nanChecks: on, infChecks: on.} +proc hamming_distance*[T: SomeNumber](x1, x2: Vector[T], normalize: bool = false): float = + let num_rows = x1.len + var total: int32 = 0 + for k in 0..num_rows-1: + total += (x1[k] != x2[k]).int32 + if normalize: + return total.float / num_rows.float + else: + return total.float + + +proc euclidean_distance*[T: SomeNumber](x1, x2: Vector[T], normalize: bool = false): float = + let num_rows = x1.len + var total: typeof(x1[0]) = 0 + var total_intermediate: typeof(x1[0]) = 0 + for k in 0..num_rows-1: + total_intermediate = (x1[k] - x2[k]) + total += total_intermediate * total_intermediate + if normalize: + return sqrt(total.float) / num_rows.float + else: + return sqrt(total.float) + + +proc sqeuclidean_distance*[T: SomeNumber](x1, x2: Vector[T], normalize: bool = false): float = + let num_rows = x1.len + var total: typeof(x1[0]) = 0 + var total_intermediate: typeof(x1[0]) = 0 + for k in 0..num_rows-1: + total_intermediate = (x1[k] - x2[k]) + total += total_intermediate * total_intermediate + if normalize: + return total.float / num_rows.float + else: + return total.float + + +proc cityblock_distance*[T: SomeNumber](x1, x2: Vector[T], normalize: bool = false): float = + let num_rows = x1.len + var total: typeof(x1[0]) = 0 + for k in 0..num_rows-1: + total += abs(x1[k] - x2[k]) + if normalize: + return total.float / num_rows.float + else: + return total.float + + +proc totalvariation_distance*[T: SomeNumber](x1, x2: Vector[T], normalize: bool = false): float = + let num_rows = x1.len + var total: typeof(x1[0]) = 0 + for k in 0..num_rows-1: + total += abs(x1[k] - x2[k]) + if normalize: + return total.float / 2.0.float / num_rows.float + else: + return total.float / 2.0.float + + +proc jaccard_distance*[T: SomeNumber](x1, x2: Vector[T], normalize: bool = false): float = + let num_rows = x1.len + var + total_min: typeof(x1[0]) = 0 + total_max: typeof(x1[0]) = 0 + for k in 0..num_rows-1: + total_min += min(x1[k], x2[k]) + total_max += max(x1[k], x2[k]) + if total_max == 0: + return 0.0.float + else: + return 1.0.float - (total_min / total_max).float + + +proc cosine_distance*[T: SomeNumber](x1, x2: Vector[T], normalize: bool = false): float = + let num_rows = x1.len + var + total_x12: typeof(x1[0]) = 0 + total_x11: typeof(x1[0]) = 0 + total_x22: typeof(x1[0]) = 0 + for k in 0..num_rows-1: + total_x12 += x1[k] * x2[k] + total_x11 += x1[k] * x1[k] + total_x22 += x2[k] * x2[k] + if total_x11 == 0 or total_x22 == 0: + return 0.0.float + else: + return 1.0.float - total_x12.float / (sqrt(total_x11.float) * sqrt(total_x22.float)) + + +proc kldivergence_distance*[T: SomeNumber](x1, x2: Vector[T], normalize: bool = false): float = + let num_rows = x1.len + var total: float + for k in 0..num_rows-1: + if x1[k] == 0 and x2[k] == 0: + continue + else: + total += x1[k].float * ln(x1[k] / x2[k]).float + return total.float + + +proc pairwise*[T](X: Matrix[T], distance: (proc(x1, x2: Vector[T], normalize: bool): float), normalize: bool = false): Matrix[float] = + # Computes the pairwise distance of a 2D matrix (col-major-wise) across the selected distance + let num_cols = X.N + var dist = makeMatrix(num_cols, num_cols, proc(i, j: int): float = 0.float) + for i in 0..(num_cols-1): + for j in 0..i: + dist[i, j] = distance(X.column(i), X.column(j), normalize) + return dist + + +proc symmetrize*(X: Matrix[SomeNumber], how: string = "l=>u"): Matrix[SomeNumber] = + # Mutates X symmetrical by copying upper triangle to lower triangle ("u=>l") or lower triangle to upper triangle ("l=>r") + let num_rows = X.M + let num_cols = X.N + var X_copy = X + if num_rows != num_cols: + raise newException(ValueError, "Matrix must be symmetrical") + if how != "l=>u" and how != "u=>l": + raise newException(ValueError, "'how' must be one of 'l=>u' or 'u=>l'") + if how == "l=>u": + for i in 0..(num_cols-1): + for j in 0..i: + X_copy[j, i] = X[i, j] + else: + for i in 0..(num_cols-1): + for j in 0..i: + X_copy[i, j] = X[j, i] + return X_copy + + +proc symmetrize*(X: var Matrix[SomeNumber], how: string = "l=>u"): Matrix[SomeNumber] = + # Mutates X symmetrical by copying upper triangle to lower triangle ("u=>l") or lower triangle to upper triangle ("l=>r") + let num_rows = X.M + let num_cols = X.N + if num_rows != num_cols: + raise newException(ValueError, "Matrix must be symmetrical") + if how != "l=>u" and how != "u=>l": + raise newException(ValueError, "'how' must be one of 'l=>u' or 'u=>l'") + if how == "l=>u": + for i in 0..(num_cols-1): + for j in 0..i: + X[j, i] = X[i, j] + else: + for i in 0..(num_cols-1): + for j in 0..i: + X[i, j] = X[j, i] + return X diff --git a/tests/config.nims b/tests/config.nims new file mode 100644 index 0000000..3bb69f8 --- /dev/null +++ b/tests/config.nims @@ -0,0 +1 @@ +switch("path", "$projectDir/../src") \ No newline at end of file diff --git a/tests/test_seq.nim b/tests/test_seq.nim new file mode 100644 index 0000000..ce4bac6 --- /dev/null +++ b/tests/test_seq.nim @@ -0,0 +1,91 @@ +import sequtils +import times +import unittest + +import distances/seq + + +# Globals +let + num_rows = 10 + num_cols = 10 + input_seq_seq_int = newSeqWith(num_rows, newSeq[int](num_cols)) + input_seq_seq_float = newSeqWith(num_rows, newSeq[float](num_cols)) + output_seq_float = newSeqWith(num_rows, newSeq[float](num_rows)) + normalize = true + +suite "Seq": + + setup: + let t0 = getTime() + + teardown: + echo "\n RUNTIME: ", getTime() - t0 + + test "pairwise int hamming": + check pairwise(input_seq_seq_int, hamming_distance, normalize) == output_seq_float + + test "pairwise int euclidean": + check pairwise(input_seq_seq_int, euclidean_distance, normalize) == output_seq_float + + test "pairwise int sqeuclidean": + check pairwise(input_seq_seq_int, sqeuclidean_distance, normalize) == output_seq_float + + test "pairwise int cityblock": + check pairwise(input_seq_seq_int, cityblock_distance, normalize) == output_seq_float + + test "pairwise int totalvariation": + check pairwise(input_seq_seq_int, totalvariation_distance, normalize) == output_seq_float + + test "pairwise int jaccard": + check pairwise(input_seq_seq_int, jaccard_distance, normalize) == output_seq_float + + test "pairwise int cosine": + check pairwise(input_seq_seq_int, cosine_distance, normalize) == output_seq_float + + test "pairwise int kldivergence": + check pairwise(input_seq_seq_int, kldivergence_distance, normalize) == output_seq_float + + test "pairwise float hamming": + check pairwise(input_seq_seq_float, hamming_distance, normalize) == output_seq_float + + test "pairwise float euclidean": + check pairwise(input_seq_seq_float, euclidean_distance, normalize) == output_seq_float + + test "pairwise float sqeuclidean": + check pairwise(input_seq_seq_float, sqeuclidean_distance, normalize) == output_seq_float + + test "pairwise float cityblock": + check pairwise(input_seq_seq_float, cityblock_distance, normalize) == output_seq_float + + test "pairwise float totalvariation": + check pairwise(input_seq_seq_float, totalvariation_distance, normalize) == output_seq_float + + test "pairwise float jaccard": + check pairwise(input_seq_seq_float, jaccard_distance, normalize) == output_seq_float + + test "pairwise float cosine": + check pairwise(input_seq_seq_float, cosine_distance, normalize) == output_seq_float + + test "pairwise float kldivergence": + check pairwise(input_seq_seq_float, kldivergence_distance, normalize) == output_seq_float + + test "symmetrize l=>u": + let input = @[@[1, 2, 3], @[11, 22, 33], @[111, 222, 333]] + let output = @[@[1, 11, 111], @[11, 22, 222], @[111, 222, 333]] + check symmetrize(input) == output + + test "symmetrize seq sesq u=>l": + let input = @[@[1, 2, 3], @[11, 22, 33], @[111, 222, 333]] + let output = @[@[1, 2, 3], @[2, 22, 33], @[3, 33, 333]] + check symmetrize(input, "u=>l") == output + + test "symmetrize mut l=>u": + var input = @[@[1, 2, 3], @[11, 22, 33], @[111, 222, 333]] + var output = @[@[1, 11, 111], @[11, 22, 222], @[111, 222, 333]] + check symmetrize(input) == output + + test "symmetrize mut u=>l": + var input = @[@[1, 2, 3], @[11, 22, 33], @[111, 222, 333]] + var output = @[@[1, 2, 3], @[2, 22, 33], @[3, 33, 333]] + check symmetrize(input, "u=>l") == output diff --git a/tests/test_tensor.nim b/tests/test_tensor.nim new file mode 100644 index 0000000..6ee8ee9 --- /dev/null +++ b/tests/test_tensor.nim @@ -0,0 +1,91 @@ +import arraymancer +import times +import unittest + +import distances/tensor + +# Globals +let + num_rows = 10 + num_cols = 10 + input_tensor_int = zeros[int](num_rows, num_cols) + input_tensor_float = zeros[float](num_rows, num_cols) + output_tensor_float = zeros[float](num_cols, num_cols) + normalize = true + +suite "Tensor": + + setup: + let t0 = getTime() + + teardown: + echo "\n RUNTIME: ", getTime() - t0 + + test "pairwise int hamming": + check pairwise(input_tensor_int, hamming_distance, normalize) == output_tensor_float + + test "pairwise int euclidean": + check pairwise(input_tensor_int, euclidean_distance, normalize) == output_tensor_float + + test "pairwise int sqeuclidean": + check pairwise(input_tensor_int, sqeuclidean_distance, normalize) == output_tensor_float + + test "pairwise int cityblock": + check pairwise(input_tensor_int, cityblock_distance, normalize) == output_tensor_float + + test "pairwise int totalvariation": + check pairwise(input_tensor_int, totalvariation_distance, normalize) == output_tensor_float + + test "pairwise int jaccard": + check pairwise(input_tensor_int, jaccard_distance, normalize) == output_tensor_float + + test "pairwise int cosine": + check pairwise(input_tensor_int, cosine_distance, normalize) == output_tensor_float + + test "pairwise int kldivergence": + check pairwise(input_tensor_int, kldivergence_distance, normalize) == output_tensor_float + + test "pairwise float hamming": + check pairwise(input_tensor_float, hamming_distance, normalize) == output_tensor_float + + test "pairwise float euclidean": + check pairwise(input_tensor_float, euclidean_distance, normalize) == output_tensor_float + + test "pairwise float sqeuclidean": + check pairwise(input_tensor_float, sqeuclidean_distance, normalize) == output_tensor_float + + test "pairwise float cityblock": + check pairwise(input_tensor_float, cityblock_distance, normalize) == output_tensor_float + + test "pairwise float totalvariation": + check pairwise(input_tensor_float, totalvariation_distance, normalize) == output_tensor_float + + test "pairwise float jaccard": + check pairwise(input_tensor_float, jaccard_distance, normalize) == output_tensor_float + + test "pairwise float cosine": + check pairwise(input_tensor_float, cosine_distance, normalize) == output_tensor_float + + test "pairwise float kldivergence": + check pairwise(input_tensor_float, kldivergence_distance, normalize) == output_tensor_float + + test "symmetrize tensor l=>u": + let input = @[@[1, 2, 3], @[11, 22, 33], @[111, 222, 333]].toTensor() + let output = @[@[1, 11, 111], @[11, 22, 222], @[111, 222, 333]].toTensor() + check symmetrize(input) == output + + test "symmetrize tensor u=>l": + let input = @[@[1, 2, 3], @[11, 22, 33], @[111, 222, 333]].toTensor() + let output = @[@[1, 2, 3], @[2, 22, 33], @[3, 33, 333]].toTensor() + check symmetrize(input, "u=>l") == output + + test "symmetrize mut tensor l=>u": + var input = @[@[1, 2, 3], @[11, 22, 33], @[111, 222, 333]].toTensor() + var output = @[@[1, 11, 111], @[11, 22, 222], @[111, 222, 333]].toTensor() + check symmetrize(input) == output + + test "symmetrize mut tensor u=>l": + var input = @[@[1, 2, 3], @[11, 22, 33], @[111, 222, 333]].toTensor() + var output = @[@[1, 2, 3], @[2, 22, 33], @[3, 33, 333]].toTensor() + check symmetrize(input, "u=>l") == output + diff --git a/tests/test_vector.nim b/tests/test_vector.nim new file mode 100644 index 0000000..57bf57c --- /dev/null +++ b/tests/test_vector.nim @@ -0,0 +1,90 @@ +import neo +import times +import unittest + +import distances/vector + +# Globals +let + num_rows = 10 + num_cols = 10 + input_matrix_int = makeMatrix(num_rows, num_cols, proc(i, j: int): int = 0.int) + input_matrix_float = makeMatrix(num_rows, num_cols, proc(i, j: int): float = 0.float) + output_matrix_float = makeMatrix(num_rows, num_rows, proc(i, j: int): float = 0.float) + normalize = true + +suite "Vector": + + setup: + let t0 = getTime() + + teardown: + echo "\n RUNTIME: ", getTime() - t0 + + test "pairwise int hamming": + check pairwise(input_matrix_int, hamming_distance, normalize) == output_matrix_float + + test "pairwise int euclidean": + check pairwise(input_matrix_int, euclidean_distance, normalize) == output_matrix_float + + test "pairwise int sqeuclidean": + check pairwise(input_matrix_int, sqeuclidean_distance, normalize) == output_matrix_float + + test "pairwise int cityblock": + check pairwise(input_matrix_int, cityblock_distance, normalize) == output_matrix_float + + test "pairwise int totalvariation": + check pairwise(input_matrix_int, totalvariation_distance, normalize) == output_matrix_float + + test "pairwise int jaccard": + check pairwise(input_matrix_int, jaccard_distance, normalize) == output_matrix_float + + test "pairwise int cosine": + check pairwise(input_matrix_int, cosine_distance, normalize) == output_matrix_float + + test "pairwise int kldivergence": + check pairwise(input_matrix_int, kldivergence_distance, normalize) == output_matrix_float + + test "pairwise float hamming": + check pairwise(input_matrix_float, hamming_distance, normalize) == output_matrix_float + + test "pairwise float euclidean": + check pairwise(input_matrix_float, euclidean_distance, normalize) == output_matrix_float + + test "pairwise float sqeuclidean": + check pairwise(input_matrix_float, sqeuclidean_distance, normalize) == output_matrix_float + + test "pairwise float cityblock": + check pairwise(input_matrix_float, cityblock_distance, normalize) == output_matrix_float + + test "pairwise float totalvariation": + check pairwise(input_matrix_float, totalvariation_distance, normalize) == output_matrix_float + + test "pairwise float jaccard": + check pairwise(input_matrix_float, jaccard_distance, normalize) == output_matrix_float + + test "pairwise float cosine": + check pairwise(input_matrix_float, cosine_distance, normalize) == output_matrix_float + + test "pairwise float kldivergence": + check pairwise(input_matrix_float, kldivergence_distance, normalize) == output_matrix_float + + test "symmetrize l=>u": + let input = @[@[1, 2, 3], @[11, 22, 33], @[111, 222, 333]].matrix() + let output = @[@[1, 11, 111], @[11, 22, 222], @[111, 222, 333]].matrix() + check symmetrize(input) == output + + test "symmetrize u=>l": + let input = @[@[1, 2, 3], @[11, 22, 33], @[111, 222, 333]].matrix() + let output = @[@[1, 2, 3], @[2, 22, 33], @[3, 33, 333]].matrix() + check symmetrize(input, "u=>l") == output + + test "symmetrize mut l=>u": + var input = @[@[1, 2, 3], @[11, 22, 33], @[111, 222, 333]].matrix() + var output = @[@[1, 11, 111], @[11, 22, 222], @[111, 222, 333]].matrix() + check symmetrize(input) == output + + test "symmetrize mut u=>l": + var input = @[@[1, 2, 3], @[11, 22, 33], @[111, 222, 333]].matrix() + var output = @[@[1, 2, 3], @[2, 22, 33], @[3, 33, 333]].matrix() + check symmetrize(input, "u=>l") == output