In [1]:
!pip install faiss-cpu --no-cache

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m151.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0


In [2]:
import faiss

## Code from Faiss Getting Started guide.

https://github.com/facebookresearch/faiss/wiki/Getting-started

### Create 100K random vectors for the database, and 10K random vectors to query.

In [3]:
import numpy as np
d = 64                           # dimension
nb = 100000                      # database size
nq = 10000                       # nb of queries
np.random.seed(1234)             # make reproducible
xb = np.random.random((nb, d)).astype('float32')
xb[:, 0] += np.arange(nb) / 1000.
xq = np.random.random((nq, d)).astype('float32')
xq[:, 0] += np.arange(nq) / 1000.

### Build database with the 100K vectors.

In [None]:
index = faiss.IndexFlatL2(d)   # build the index

#### Is index ready for searching?

In [13]:
print(index.is_trained)

True


#### Add the 100K vectors.

In [12]:
index.add(xb)                  # add vectors to the index
print(index.ntotal)

300000


### Sanity check: Search for the 4 nearest neighbors of the first 5 loaded vectors.

In [10]:
k = 4                          # we want to see 4 nearest neighbors
D, I = index.search(xb[:5], k) # sanity check


#### Indexes of the 4 nearest neighbors of the first 5 loaded vectors

In [8]:
print(I)

[[  0 393 363  78]
 [  1 555 277 364]
 [  2 304 101  13]
 [  3 173  18 182]
 [  4 288 370 531]]


#### Distances to the 4 nearest neighbors of the first 5 loaded vectors

In [9]:
print(D)

[[0.        7.1751738 7.20763   7.2511625]
 [0.        6.3235645 6.684581  6.799946 ]
 [0.        5.7964087 6.391736  7.2815123]
 [0.        7.2779055 7.5279875 7.662846 ]
 [0.        6.7638035 7.2951202 7.3688145]]


### Search for the 4 nearest neighbors for each of the 10K queries.

In [16]:
D, I = index.search(xq, k)     # actual search


#### Check neighbors of first/last queries

In [17]:
print(I[:5])                   # neighbors of the 5 first queries
print(I[-5:])                  # neighbors of the 5 last queries

[[   381 100381 200381    207]
 [   526 100526 200526    911]
 [   838 100838 200838    527]
 [   196 100196 200196    184]
 [   526 100526 200526    377]]
[[  9900 109900 209900  10500]
 [211055  11055 111055  10895]
 [111353 211353  11353 211103]
 [ 10571 110571 210571  10664]
 [109628   9628 209628   9554]]


#### Check distances of first/last queries

In [18]:
print(D[:5])
print(D[-5:])

[[6.815506  6.815506  6.815506  6.8894653]
 [6.6041145 6.6041145 6.6041145 6.679699 ]
 [6.4703827 6.4703865 6.4703865 6.8578644]
 [5.573681  5.573681  5.573681  6.4075394]
 [5.409401  5.409401  5.409401  6.232216 ]]
[[6.53154   6.53154   6.53154   6.9786987]
 [4.3352356 4.335266  4.335266  5.2369385]
 [6.0727234 6.0727234 6.072754  6.576721 ]
 [6.6374817 6.6374817 6.6374817 6.6487427]
 [6.218338  6.2183533 6.2183533 6.4525146]]
