Permalink
Browse files

added blas

  • Loading branch information...
aaalgo committed Aug 27, 2015
1 parent 59e9a1c commit a503915d2930b9d83da4a3ef74487505795d2fdc
Showing with 295 additions and 17 deletions.
  1. +6 −3 Makefile
  2. +38 −0 kgraph-data.h
  3. +102 −0 metric.cpp
  4. +8 −3 python/Makefile
  5. +141 −11 python/pykgraph.cpp
@@ -15,11 +15,11 @@ NABO_LIBS+=-lnabo
# Object files linked into both libkgraph.so and libkgraph.a.
COMMON=kgraph.o metric.o
# Headers that every object file depends on (see the %.o pattern rule).
HEADERS=kgraph.h kgraph-data.h
# Programs listed as prerequisites of the default target.
PROGS=index search prune split fvec2lshkit
# Extra programs, pulled in by the second `all` line below.
# NOTE(review): this assignment appears twice in this view; the repeat is redundant.
EXTRA_PROGS=test
EXTRA_PROGS=test
# Program groups for the optional FLANN / nabo tools.
FLANN_PROGS=flann_index flann_search
NABO_PROGS=nabo_search
# Default target.
# NOTE(review): `all` is declared on two consecutive lines; make merges the
# prerequisite lists of both. The first line looks like a superseded version
# of the second - confirm and drop it if so.
all: libkgraph.so $(PROGS) python
all: libkgraph.a libkgraph.so $(PROGS) python $(EXTRA_PROGS)
echo $(BUILD_INFO)
# Build the FLANN-based tools listed in FLANN_PROGS.
# (The prerequisite previously named the undefined variable FANN_PROGS, which
# expands to nothing, so `make flann` had no prerequisites to build.)
flann: $(FLANN_PROGS)
@@ -29,7 +29,7 @@ deps-ubuntu:
# Release packaging variables.
# NOTE(review): the rule that consumes these is outside this view; the
# descriptions below are inferred from the variable names - confirm.
# Presumably the name of the release bundle.
RELEASE=kgraph-release
# Source files listed for the release.
RELEASE_SRC=Makefile LICENSE kgraph.h kgraph-data.h index.cpp prune.cpp search.cpp flann_index.cpp flann_search.cpp split.cpp fvec2lshkit.cpp
# Binaries listed for the release.
# NOTE(review): assigned twice in this view; with plain `=` the later
# assignment (the one that adds libkgraph.a) is the one that takes effect.
RELEASE_BIN=libkgraph.so $(PROGS) #$(FLANN_PROGS)
RELEASE_BIN=libkgraph.a libkgraph.so $(PROGS) #$(FLANN_PROGS)
# Build the Python bindings by running make inside the python/ subdirectory.
# $(MAKE) is used instead of a literal `make` so that the sub-make is the same
# make binary and inherits -j/-n/-k and command-line variable overrides.
# NOTE(review): `python` is also the name of a directory; unless this target is
# declared .PHONY elsewhere in the file, make may treat it as up to date.
python:
	$(MAKE) -C python
@@ -63,6 +63,9 @@ $(NABO_PROGS): %: %.cpp $(HEADERS) $(COMMON)
# Shared library: link all prerequisites ($^, i.e. the COMMON objects) into
# the target ($@) together with $(LDLIBS).
libkgraph.so: $(COMMON)
$(CXX) -shared -o $@ $^ $(LDLIBS)
# Static library: archive the COMMON objects.
# ar flags: r = insert/replace members, v = verbose, s = write the symbol index.
libkgraph.a: $(COMMON)
ar rvs $@ $^
# Pattern rule: compile X.cpp into X.o. Every object also depends on
# $(HEADERS), so touching either header rebuilds all objects.
# $* is the stem matched by %, so $*.cpp names the source file.
%.o: %.cpp $(HEADERS)
$(CXX) $(CXXFLAGS) -c $*.cpp
@@ -28,9 +28,16 @@ namespace kgraph {
extern float float_l2sqr_avx (float const *t1, float const *t2, unsigned dim);
/// L2 square distance with SSE2 instructions.
extern float float_l2sqr_sse2 (float const *t1, float const *t2, unsigned dim);
/// Sum of squares (squared L2 norm) of a single vector with SSE2 instructions.
extern float float_l2sqr_sse2 (float const *, unsigned dim);
/// Dot product of two vectors with SSE2 instructions.
extern float float_dot_sse2 (float const *, float const *, unsigned dim);
/// L2 square distance for uint8_t with SSE2 instructions (for SIFT).
extern float uint8_l2sqr_sse2 (uint8_t const *t1, uint8_t const *t2, unsigned dim);
/// L2 square distance computed with a plain scalar loop.
extern float float_l2sqr (float const *, float const *, unsigned dim);
/// Sum of squares of a single vector computed with a plain scalar loop.
extern float float_l2sqr (float const *, unsigned dim);
/// Dot product of two vectors computed with a plain scalar loop.
extern float float_dot (float const *, float const *, unsigned dim);
using std::vector;
using std::runtime_error;
@@ -48,6 +55,26 @@ namespace kgraph {
}
return r;
}
/// Generic dot product of two dim-element arrays.
/// Each element is converted to float before multiplying; products are
/// accumulated in index order.
template <typename T>
static float dot (T const *t1, T const *t2, unsigned dim) {
    float acc = 0;
    T const *lhs = t1;
    T const *rhs = t2;
    for (unsigned left = dim; left != 0; --left) {
        acc += float(*lhs++) * float(*rhs++);
    }
    return acc;
}
/// Generic sum of squares of a dim-element array.
/// Each element is converted to float, squared, and accumulated in index order.
template <typename T>
static float norm2 (T const *t1, unsigned dim) {
    float acc = 0;
    for (T const *it = t1, *end = t1 + dim; it != end; ++it) {
        float const x = float(*it);
        float const squared = x * x;
        acc += squared;
    }
    return acc;
}
};
/// L2 distance.
struct l2 {
@@ -202,6 +229,9 @@ namespace kgraph {
/// Read-only access to row i: the address `data + stride * i`, viewed as
/// a pointer to DATA_TYPE.
DATA_TYPE const *operator [] (unsigned i) const {
    auto const offset = stride * i;
    return reinterpret_cast<DATA_TYPE const *>(data + offset);
}
/// Mutable access to row i: same address as the const overload
/// (`data + stride * i`), with constness removed from the result.
DATA_TYPE *operator [] (unsigned i) {
    auto const offset = stride * i;
    DATA_TYPE const *row = reinterpret_cast<DATA_TYPE const *>(data + offset);
    return const_cast<DATA_TYPE *>(row);
}
};
/// Oracle for matrix data.
@@ -292,6 +322,14 @@ namespace kgraph { namespace metric {
return float_l2sqr_sse2(t1, t2, dim);
}
/// Specialization of l2sqr::dot for float: forwards to the SSE2
/// implementation float_dot_sse2 instead of the generic scalar loop.
template <>
inline float l2sqr::dot<float> (float const *t1, float const *t2, unsigned dim) {
return float_dot_sse2(t1, t2, dim);
}
/// Specialization of l2sqr::norm2 for float: forwards to the single-vector
/// overload of float_l2sqr_sse2 instead of the generic scalar loop.
template <>
inline float l2sqr::norm2<float> (float const *t1, unsigned dim) {
return float_l2sqr_sse2(t1, dim);
}
/// Specialization of l2sqr::apply for uint8_t: forwards to uint8_l2sqr_sse2.
template <>
inline float l2sqr::apply<uint8_t> (uint8_t const *t1, uint8_t const *t2, unsigned dim) {
return uint8_l2sqr_sse2(t1, t2, dim);
}
@@ -1,6 +1,34 @@
#include "kgraph.h"
#include "kgraph-data.h"
namespace kgraph {
/// Squared Euclidean distance between two dim-element float vectors,
/// computed with a plain scalar loop (differences squared and summed in
/// index order).
float float_l2sqr (float const *t1, float const *t2, unsigned dim) {
    float total = 0;
    unsigned k = 0;
    while (k < dim) {
        float const diff = t1[k] - t2[k];
        total += diff * diff;
        ++k;
    }
    return total;
}
/// Sum of squares of a single dim-element float vector, computed with a
/// plain scalar loop in index order.
float float_l2sqr (float const *t1, unsigned dim) {
    float total = 0;
    for (float const *p = t1, *stop = t1 + dim; p != stop; ++p) {
        total += (*p) * (*p);
    }
    return total;
}
/// Dot product of two dim-element float vectors, computed with a plain
/// scalar loop (products summed in index order).
float float_dot (float const *t1, float const *t2, unsigned dim) {
    float total = 0;
    for (unsigned remaining = dim; remaining > 0; --remaining, ++t1, ++t2) {
        total += (*t1) * (*t2);
    }
    return total;
}
}
#ifdef __GNUC__
#ifdef __AVX__
#include <immintrin.h>
@@ -87,6 +115,80 @@ float float_l2sqr_sse2 (float const *t1, float const *t2, unsigned dim) {
ret = unpack[0] + unpack[1] + unpack[2] + unpack[3];
return ret;//sqrt(ret);
}
/// Accumulate one group of 4 floats: dest += load(addr1) * load(addr2).
/// Uses _mm_load_ps, the aligned-load intrinsic, so both addresses must be
/// 16-byte aligned. tmp1 and tmp2 are scratch registers.
#define SSE_DOT(addr1, addr2, dest, tmp1, tmp2) \
tmp1 = _mm_load_ps(addr1);\
tmp2 = _mm_load_ps(addr2);\
tmp1 = _mm_mul_ps(tmp1, tmp2); \
dest = _mm_add_ps(dest, tmp1);
/// Dot product of t1 and t2 using SSE intrinsics.
///
/// dim is rounded up to a multiple of 4; the groups left over after the last
/// full 16-float chunk are accumulated first (switch), then the full chunks
/// are accumulated from the start of the vectors (loop), and finally the four
/// partial sums are added together.
/// NOTE(review): every load is an aligned load, so t1 and t2 must be 16-byte
/// aligned.
/// NOTE(review): when dim is not a multiple of 4, up to 3 floats past dim are
/// read and multiplied in - presumably callers pad each vector to a multiple
/// of 4 floats (with zeros); confirm against the data layout.
float float_dot_sse2 (float const *t1, float const *t2, unsigned dim) {
// Four running partial sums, one per SSE lane.
__m128 sum;
__m128 l0, l1, l2, l3;
__m128 r0, r1, r2, r3;
// dim rounded up to the next multiple of 4.
unsigned D = (dim + 3) & ~3U;
// Floats left over after the full 16-float chunks: 0, 4, 8 or 12.
unsigned DR = D % 16;
// Floats covered by full 16-float chunks.
unsigned DD = D - DR;
const float *l = t1;
const float *r = t2;
// Start of the leftover groups, just past the full chunks.
const float *e_l = l + DD;
const float *e_r = r + DD;
float unpack[4] __attribute__ ((aligned (16))) = {0, 0, 0, 0};
float ret = 0.0;
// Zero the accumulator by loading the all-zero buffer.
sum = _mm_load_ps(unpack);
// Leftover groups. The cases fall through on purpose: 12 leftover floats
// means groups at +8, +4 and +0 are all processed, and so on.
switch (DR) {
case 12:
SSE_DOT(e_l+8, e_r+8, sum, l2, r2);
case 8:
SSE_DOT(e_l+4, e_r+4, sum, l1, r1);
case 4:
SSE_DOT(e_l, e_r, sum, l0, r0);
}
// Full chunks: 16 floats (four groups of 4) per iteration.
for (unsigned i = 0; i < DD; i += 16, l += 16, r += 16) {
SSE_DOT(l, r, sum, l0, r0);
SSE_DOT(l + 4, r + 4, sum, l1, r1);
SSE_DOT(l + 8, r + 8, sum, l2, r2);
SSE_DOT(l + 12, r + 12, sum, l3, r3);
}
// Horizontal reduction: spill the four lanes and add them.
_mm_storeu_ps(unpack, sum);
ret = unpack[0] + unpack[1] + unpack[2] + unpack[3];
return ret;//sqrt(ret);
}
/// Accumulate the squares of one group of 4 floats: dest += load(addr1)^2.
/// Uses _mm_load_ps, the aligned-load intrinsic, so addr1 must be 16-byte
/// aligned. tmp1 is a scratch register.
#define SSE_L2SQR_1(addr1, dest, tmp1) \
tmp1 = _mm_load_ps(addr1);\
tmp1 = _mm_mul_ps(tmp1, tmp1); \
dest = _mm_add_ps(dest, tmp1);
/// Sum of squares of a single vector using SSE intrinsics.
///
/// dim is rounded up to a multiple of 4; the groups left over after the last
/// full 16-float chunk are accumulated first (switch), then the full chunks
/// are accumulated from the start of the vector (loop), and finally the four
/// partial sums are added together. The square root is not taken.
/// NOTE(review): every load is an aligned load, so t1 must be 16-byte aligned.
/// NOTE(review): when dim is not a multiple of 4, up to 3 floats past dim are
/// read and squared in - presumably callers pad each vector to a multiple of
/// 4 floats (with zeros); confirm against the data layout.
float float_l2sqr_sse2 (float const *t1, unsigned dim) {
// Four running partial sums, one per SSE lane.
__m128 sum;
__m128 l0, l1, l2, l3;
// dim rounded up to the next multiple of 4.
unsigned D = (dim + 3) & ~3U;
// Floats left over after the full 16-float chunks: 0, 4, 8 or 12.
unsigned DR = D % 16;
// Floats covered by full 16-float chunks.
unsigned DD = D - DR;
const float *l = t1;
// Start of the leftover groups, just past the full chunks.
const float *e_l = l + DD;
float unpack[4] __attribute__ ((aligned (16))) = {0, 0, 0, 0};
float ret = 0.0;
// Zero the accumulator by loading the all-zero buffer.
sum = _mm_load_ps(unpack);
// Leftover groups. The cases fall through on purpose: 12 leftover floats
// means groups at +8, +4 and +0 are all processed, and so on.
switch (DR) {
case 12:
SSE_L2SQR_1(e_l+8, sum, l2);
case 8:
SSE_L2SQR_1(e_l+4, sum, l1);
case 4:
SSE_L2SQR_1(e_l, sum, l0);
}
// Full chunks: 16 floats (four groups of 4) per iteration.
for (unsigned i = 0; i < DD; i += 16, l += 16) {
SSE_L2SQR_1(l, sum, l0);
SSE_L2SQR_1(l + 4, sum, l1);
SSE_L2SQR_1(l + 8, sum, l2);
SSE_L2SQR_1(l + 12, sum, l3);
}
// Horizontal reduction: spill the four lanes and add them.
_mm_storeu_ps(unpack, sum);
ret = unpack[0] + unpack[1] + unpack[2] + unpack[3];
return ret;//sqrt(ret);
}
}
/*
template <typename T>
@@ -1,9 +1,14 @@
# CC is pointed at the C++ compiler.
CC = g++
# Python version; used to derive the header directory and the -lpython name.
PYTHON_VERSION = 2.7
PYTHON_INCLUDE = /usr/include/python$(PYTHON_VERSION)
# NOTE(review): CXXFLAGS, LDFLAGS and LDLIBS are each assigned twice below.
# With plain `=` the later assignment replaces the earlier one, so the
# effective values are the second of each pair (OpenMP enabled, static
# ../libkgraph.a linked instead of -lkgraph).
CXXFLAGS = -g -fPIC -std=c++11 -I$(PYTHON_INCLUDE) -I..
LDFLAGS = -L../bin -L..
LDLIBS = -lkgraph -lboost_python -lpython$(PYTHON_VERSION)
CXXFLAGS = -fopenmp -g -fPIC -std=c++11 -I$(PYTHON_INCLUDE) -I..
LDFLAGS = -L../bin
LDLIBS = ../libkgraph.a -lboost_python -lboost_timer -lboost_chrono -lpython$(PYTHON_VERSION) -lgomp
# Optional BLAS support: invoking make with BLAS=true defines USE_BLAS=1 for
# the compiler and appends the static libblas.a at the hard-coded path below.
ifeq (${BLAS}, true)
CXXFLAGS += -DUSE_BLAS=1
LDLIBS += /usr/lib/openblas-base/libblas.a
endif
# NOTE(review): presumably the install destination for the built module -
# the rule that uses it is outside this view; confirm.
PYTHON_DIR = /usr/local/lib/python$(PYTHON_VERSION)/dist-packages/
Oops, something went wrong.

0 comments on commit a503915

Please sign in to comment.