# FAISS (Facebook AI Similarity Search) Method

In [1]:
pip install faiss-cpu

Collecting faiss-cpu
[?25l  Downloading https://files.pythonhosted.org/packages/48/0c/efd43c4feac172867409f38f07949c36602355ec7196749d10f905d09228/faiss_cpu-1.7.0-cp37-cp37m-manylinux2014_x86_64.whl (8.1MB)
[K     |████████████████████████████████| 8.2MB 8.0MB/s 
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.7.0


In [2]:
from zipfile import ZipFile

file_name = "abc.zip"

# Unpack the dataset archive into the current working directory.
# The handle is named `archive` (not `zip`) so the built-in `zip` function
# is not rebound for the rest of the notebook.
with ZipFile(file_name, 'r') as archive:
    archive.extractall()
    print('done')

done


Importing Libraries


In [3]:
import numpy as np
import faiss 
import time
import pandas as pd
from numpy.linalg import norm
import pickle
from tqdm import tqdm, tqdm_notebook
import os
import time
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input

Define a function that extracts normalized image features from an image file using a ResNet-50 model.

In [4]:
# ResNet-50 with ImageNet weights and without the classification head
# (include_top=False), configured for 180x180 RGB inputs.
model = ResNet50(
    weights='imagenet',
    include_top=False,
    input_shape=(180, 180, 3),
)
def extract_features(img_path, model, input_shape=(180, 180, 3)):
    """Return an L2-normalised, flattened feature vector for one image.

    Parameters
    ----------
    img_path : str
        Path of the image file to load.
    model : object
        Model exposing ``predict``; it receives a preprocessed batch that
        contains a single image.
    input_shape : tuple, optional
        ``(height, width, channels)`` the image is resized to. Only the first
        two entries are used. Defaults to the previously hard-coded
        ``(180, 180, 3)``.

    Returns
    -------
    numpy.ndarray
        1-D feature vector scaled to unit L2 norm. If the model output is all
        zeros, the zero vector is returned unchanged instead of dividing by
        zero (which would produce NaNs).
    """
    img = image.load_img(img_path, target_size=(input_shape[0], input_shape[1]))
    img_array = image.img_to_array(img)
    # The model expects a batch dimension, so wrap the single image.
    batch = np.expand_dims(img_array, axis=0)
    preprocessed_batch = preprocess_input(batch)
    features = model.predict(preprocessed_batch)
    flattened_features = features.flatten()
    feature_norm = norm(flattened_features)
    if feature_norm == 0:
        # Avoid 0/0 -> NaN, which would poison every distance computed later.
        return flattened_features
    return flattened_features / feature_norm

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5


Function to recursively get all the image files under a root directory.

In [5]:
extensions = ['.jpg', '.JPG', '.jpeg', '.JPEG', '.png', '.PNG']


def get_file_list(root_dir):
    """Recursively collect the image files found under ``root_dir``.

    A file is kept when its extension (the part after the last dot), compared
    case-insensitively, is one of ``extensions``. Matching on the real
    extension rather than on a substring means names such as ``photo.jpg.txt``
    are no longer picked up by mistake.

    Parameters
    ----------
    root_dir : str
        Directory to walk. A missing directory yields an empty list.

    Returns
    -------
    list of str
        Paths (``root`` joined with the file name) in ``os.walk`` order.
    """
    allowed = {ext.lower() for ext in extensions}
    file_list = []
    for root, _directories, filenames in os.walk(root_dir):
        for filename in filenames:
            suffix = os.path.splitext(filename)[1].lower()
            if suffix in allowed:
                file_list.append(os.path.join(root, filename))
    return file_list

Run the extraction over the entire dataset.

In [6]:
# Collect every image path under the dataset root, then order the paths
# lexicographically so the position of each file is stable across runs.
root_dir = '/content/abc'
files = get_file_list(root_dir)
files.sort()

In [None]:
pip install tqdm



In [None]:
#import tqdm.console as tqdm
import tqdm.gui as tqdm
import tqdm.notebook as tqdm

In [None]:
from tqdm.notebook import tqdm_notebook

In [7]:
# `tqdm.tqdm_notebook` is deprecated (see the warning in the recorded output);
# use `tqdm.notebook.tqdm` under a local alias so this cell does not depend on
# what the name `tqdm` is currently bound to.
from tqdm.notebook import tqdm as notebook_progress

# One feature vector per image, in the same order as `files`.
feature_list = [
    extract_features(image_path, model)
    for image_path in notebook_progress(files, desc='Extracting features')
]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=110.0), HTML(value='')))




In [8]:
feature_list

[array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 array([0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.00218143], dtype=float32),
 array([0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.00759384], dtype=float32),
 array([0.0000000e+00, 0.0000000e+00, 0.0000000e+00, ..., 2.0877398e-05,
        0.0000000e+00, 0.0000000e+00], dtype=float32),
 array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 array([0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.00065994], dtype=float32),
 array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 array([0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.00623004], dtype=float32),
 array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 array([0.        , 0.        , 0.        , ..., 0.   

In [9]:
# Stack the per-image vectors into one (n_images, n_features) matrix.
# FAISS expects C-contiguous float32 input, so enforce that here explicitly
# instead of relying on the dtype the model happens to return.
feature_list = np.ascontiguousarray(feature_list, dtype='float32')

In [10]:
feature_list.shape

(110, 73728)

In [11]:
# Database vectors to be indexed: the extracted image features.
db_vec = feature_list
# Take the number of vectors and their dimensionality from the data itself
# rather than hard-coding 73728, so the cell keeps working if the input size
# or the backbone changes.
n, dimension = db_vec.shape
# Seeds NumPy's global RNG; left over from the random-vector version of this
# cell (np.random.random((n, dimension)).astype('float32')).
np.random.seed(1)

In [12]:
db_vec.shape

(110, 73728)

In [13]:
# Build an inverted-file (IVF) index with an L2 metric on top of a flat L2
# quantiser.
nlist = 1  # number of clusters
quantiser = faiss.IndexFlatL2(dimension)
index = faiss.IndexIVFFlat(
    quantiser,
    dimension,
    nlist,
    faiss.METRIC_L2,
)

In [14]:
# Train the index on the database vectors, then add them. The prints show the
# index state before and after each step.
print(index.is_trained)   # False before training
index.train(db_vec)  # train on the database vectors
print(index.ntotal)   # 0: nothing has been added yet
index.add(db_vec)   # add the vectors and update the index
print(index.is_trained)  # True after training
print(index.ntotal)   # number of vectors added (110 in the recorded run)

False
0
True
110


In [15]:
# Query the index with every database vector and keep the k closest matches.
nprobe = 1  # number of clusters (inverted lists) visited per query
index.nprobe = nprobe  # previously assigned but never applied to the index
k = 10  # number of nearest neighbours returned per query
# Seeds NumPy's global RNG; left over from the random-query version of this
# cell (np.random.random((n_query, dimension)).astype('float32')).
np.random.seed(0)
query_vectors = feature_list
n_query = query_vectors.shape[0]  # number of queries, taken from the data
# distances[i, j] / indices[i, j]: distance to, and database position of,
# the j-th nearest neighbour of query i.
distances, indices = index.search(query_vectors, k)

In [16]:
distances

array([[0.       , 0.       , 1.0203644, ..., 1.2095646, 1.220204 ,
        1.2303387],
       [0.       , 0.       , 1.0203644, ..., 1.2095646, 1.220204 ,
        1.2303387],
       [0.       , 1.0432832, 1.168778 , ..., 1.2695479, 1.2880524,
        1.291001 ],
       ...,
       [0.       , 0.8455598, 1.0093704, ..., 1.1652576, 1.2458723,
        1.2502122],
       [0.       , 1.1026717, 1.1026717, ..., 1.2809144, 1.2917243,
        1.3118236],
       [0.       , 1.2989217, 1.3176483, ..., 1.3456513, 1.3556067,
        1.3596555]], dtype=float32)

In [17]:
indices

array([[  0,   1,  69, ...,  36, 101, 106],
       [  0,   1,  69, ...,  36, 101, 106],
       [  2,  41,  38, ...,  60,  64,  76],
       ...,
       [107,  60,  75, ...,  95,  76,  19],
       [108,   1,   0, ...,  36,  61,  45],
       [109,   9,  88, ...,  27,  85,  95]])

In [18]:
faiss.write_index(index,"vectors.index")  # save the index to disk
# To load it back later: diskindex = faiss.read_index("vectors.index")

In [19]:
# Reload the index that was just written to "vectors.index".
diskindex = faiss.read_index("vectors.index")

In [20]:
# Persist the feature matrix and the matching list of image paths.
# `with` closes each file deterministically, and the second dump now writes
# the actual `files` list rather than the literal string 'filenames'.
with open('features-cdiscount-resnet.pickle', 'wb') as features_file:
    pickle.dump(feature_list, features_file)
with open('filenames-cdiscount.pickle', 'wb') as filenames_file:
    pickle.dump(files, filenames_file)

In [22]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

In [23]:
files[1]

'/content/abc/test_1.jpg'

In [24]:
files

['/content/abc/test_0.jpg',
 '/content/abc/test_1.jpg',
 '/content/abc/test_10.jpg',
 '/content/abc/test_100.jpg',
 '/content/abc/test_101.jpg',
 '/content/abc/test_102.jpg',
 '/content/abc/test_103.jpg',
 '/content/abc/test_104.jpg',
 '/content/abc/test_105.jpg',
 '/content/abc/test_106.jpg',
 '/content/abc/test_107.jpg',
 '/content/abc/test_108.jpg',
 '/content/abc/test_109.jpg',
 '/content/abc/test_11.jpg',
 '/content/abc/test_12.jpg',
 '/content/abc/test_13.jpg',
 '/content/abc/test_14.jpg',
 '/content/abc/test_15.jpg',
 '/content/abc/test_16.jpg',
 '/content/abc/test_17.jpg',
 '/content/abc/test_18.jpg',
 '/content/abc/test_19.jpg',
 '/content/abc/test_2.jpg',
 '/content/abc/test_20.jpg',
 '/content/abc/test_21.jpg',
 '/content/abc/test_22.jpg',
 '/content/abc/test_23.jpg',
 '/content/abc/test_24.jpg',
 '/content/abc/test_25.jpg',
 '/content/abc/test_26.jpg',
 '/content/abc/test_27.jpg',
 '/content/abc/test_28.jpg',
 '/content/abc/test_29.jpg',
 '/content/abc/test_3.jpg',
 '/conte

In [25]:
# Keep only the file name of each path. os.path.basename replaces the fixed
# split('/')[3] position, which is only correct for files sitting directly
# under a two-level root such as /content/abc and breaks for nested folders.
filenamenew = [os.path.basename(path) for path in files]

In [26]:
filenamenew

['test_0.jpg',
 'test_1.jpg',
 'test_10.jpg',
 'test_100.jpg',
 'test_101.jpg',
 'test_102.jpg',
 'test_103.jpg',
 'test_104.jpg',
 'test_105.jpg',
 'test_106.jpg',
 'test_107.jpg',
 'test_108.jpg',
 'test_109.jpg',
 'test_11.jpg',
 'test_12.jpg',
 'test_13.jpg',
 'test_14.jpg',
 'test_15.jpg',
 'test_16.jpg',
 'test_17.jpg',
 'test_18.jpg',
 'test_19.jpg',
 'test_2.jpg',
 'test_20.jpg',
 'test_21.jpg',
 'test_22.jpg',
 'test_23.jpg',
 'test_24.jpg',
 'test_25.jpg',
 'test_26.jpg',
 'test_27.jpg',
 'test_28.jpg',
 'test_29.jpg',
 'test_3.jpg',
 'test_30.jpg',
 'test_31.jpg',
 'test_32.jpg',
 'test_33.jpg',
 'test_34.jpg',
 'test_35.jpg',
 'test_36.jpg',
 'test_37.jpg',
 'test_38.jpg',
 'test_39.jpg',
 'test_4.jpg',
 'test_40.jpg',
 'test_41.jpg',
 'test_42.jpg',
 'test_43.jpg',
 'test_44.jpg',
 'test_45.jpg',
 'test_46.jpg',
 'test_47.jpg',
 'test_48.jpg',
 'test_49.jpg',
 'test_5.jpg',
 'test_50.jpg',
 'test_51.jpg',
 'test_52.jpg',
 'test_53.jpg',
 'test_54.jpg',
 'test_55.jpg',
 'te

In [27]:
# Positional ids 0 .. len(filenamenew) - 1, one per image file name.
# NOTE(review): this rebinds `index`, the name the FAISS index was assigned to
# earlier in the notebook.
index = list(range(len(filenamenew)))

In [28]:
index

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109]

In [29]:
# Single-column frame of image file names; the default integer row labels are
# the positions of the names in `filenamenew`.
df = pd.DataFrame({'images': filenamenew})

In [30]:
df

Unnamed: 0,images
0,test_0.jpg
1,test_1.jpg
2,test_10.jpg
3,test_100.jpg
4,test_101.jpg
...,...
105,test_95.jpg
106,test_96.jpg
107,test_97.jpg
108,test_98.jpg


In [31]:
# Tabular view of the search result: one row per query, one column per
# neighbour rank, each cell holding a database position.
indi_df = pd.DataFrame(data=indices)
indi_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0,1,69,68,108,55,103,36,101,106
1,0,1,69,68,108,55,103,36,101,106
2,2,41,38,100,107,28,84,60,64,76
3,3,18,24,23,68,42,29,99,27,20
4,4,103,82,27,69,1,0,94,29,85
...,...,...,...,...,...,...,...,...,...,...
105,105,59,14,15,109,49,85,34,31,103
106,106,69,61,55,68,103,101,25,85,9
107,107,60,75,50,98,100,38,95,76,19
108,108,1,0,69,68,55,25,36,61,45


In [None]:
# # for i in indices_df.items:
#   if i = df['']

In [32]:
df

Unnamed: 0,images
0,test_0.jpg
1,test_1.jpg
2,test_10.jpg
3,test_100.jpg
4,test_101.jpg
...,...
105,test_95.jpg
106,test_96.jpg
107,test_97.jpg
108,test_98.jpg


In [33]:
# Mapping {row label -> image file name} taken from the 'images' column.
dfnew = df['images'].to_dict()

In [34]:
dfnew

{0: 'test_0.jpg',
 1: 'test_1.jpg',
 2: 'test_10.jpg',
 3: 'test_100.jpg',
 4: 'test_101.jpg',
 5: 'test_102.jpg',
 6: 'test_103.jpg',
 7: 'test_104.jpg',
 8: 'test_105.jpg',
 9: 'test_106.jpg',
 10: 'test_107.jpg',
 11: 'test_108.jpg',
 12: 'test_109.jpg',
 13: 'test_11.jpg',
 14: 'test_12.jpg',
 15: 'test_13.jpg',
 16: 'test_14.jpg',
 17: 'test_15.jpg',
 18: 'test_16.jpg',
 19: 'test_17.jpg',
 20: 'test_18.jpg',
 21: 'test_19.jpg',
 22: 'test_2.jpg',
 23: 'test_20.jpg',
 24: 'test_21.jpg',
 25: 'test_22.jpg',
 26: 'test_23.jpg',
 27: 'test_24.jpg',
 28: 'test_25.jpg',
 29: 'test_26.jpg',
 30: 'test_27.jpg',
 31: 'test_28.jpg',
 32: 'test_29.jpg',
 33: 'test_3.jpg',
 34: 'test_30.jpg',
 35: 'test_31.jpg',
 36: 'test_32.jpg',
 37: 'test_33.jpg',
 38: 'test_34.jpg',
 39: 'test_35.jpg',
 40: 'test_36.jpg',
 41: 'test_37.jpg',
 42: 'test_38.jpg',
 43: 'test_39.jpg',
 44: 'test_4.jpg',
 45: 'test_40.jpg',
 46: 'test_41.jpg',
 47: 'test_42.jpg',
 48: 'test_43.jpg',
 49: 'test_44.jpg',
 50: 

In [35]:
indi_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0,1,69,68,108,55,103,36,101,106
1,0,1,69,68,108,55,103,36,101,106
2,2,41,38,100,107,28,84,60,64,76
3,3,18,24,23,68,42,29,99,27,20
4,4,103,82,27,69,1,0,94,29,85
...,...,...,...,...,...,...,...,...,...,...
105,105,59,14,15,109,49,85,34,31,103
106,106,69,61,55,68,103,101,25,85,9
107,107,60,75,50,98,100,38,95,76,19
108,108,1,0,69,68,55,25,36,61,45


In [36]:
# Swap every neighbour position for its image file name using the
# position -> name mapping in `dfnew`.
indi_df = indi_df.replace(to_replace=dfnew)
indi_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,test_0.jpg,test_1.jpg,test_62.jpg,test_61.jpg,test_98.jpg,test_5.jpg,test_93.jpg,test_32.jpg,test_91.jpg,test_96.jpg
1,test_0.jpg,test_1.jpg,test_62.jpg,test_61.jpg,test_98.jpg,test_5.jpg,test_93.jpg,test_32.jpg,test_91.jpg,test_96.jpg
2,test_10.jpg,test_37.jpg,test_34.jpg,test_90.jpg,test_97.jpg,test_25.jpg,test_76.jpg,test_54.jpg,test_58.jpg,test_69.jpg
3,test_100.jpg,test_16.jpg,test_21.jpg,test_20.jpg,test_61.jpg,test_38.jpg,test_26.jpg,test_9.jpg,test_24.jpg,test_18.jpg
4,test_101.jpg,test_93.jpg,test_74.jpg,test_24.jpg,test_62.jpg,test_1.jpg,test_0.jpg,test_85.jpg,test_26.jpg,test_77.jpg
...,...,...,...,...,...,...,...,...,...,...
105,test_95.jpg,test_53.jpg,test_12.jpg,test_13.jpg,test_99.jpg,test_44.jpg,test_77.jpg,test_30.jpg,test_28.jpg,test_93.jpg
106,test_96.jpg,test_62.jpg,test_55.jpg,test_5.jpg,test_61.jpg,test_93.jpg,test_91.jpg,test_22.jpg,test_77.jpg,test_106.jpg
107,test_97.jpg,test_54.jpg,test_68.jpg,test_45.jpg,test_89.jpg,test_90.jpg,test_34.jpg,test_86.jpg,test_69.jpg,test_17.jpg
108,test_98.jpg,test_1.jpg,test_0.jpg,test_62.jpg,test_61.jpg,test_5.jpg,test_22.jpg,test_32.jpg,test_55.jpg,test_40.jpg


In [37]:

# NOTE(review): this import rebinds `files`, which earlier cells use for the
# list of image paths; re-running those cells after this one will see the
# Colab module instead of the list.
from google.colab import files
# Write the neighbour table (image names per query) to a CSV file.
indi_df.to_csv('faiss.csv')


In [38]:
# Import the Colab helper under its own alias so this cell does not depend on
# what `files` is bound to (the same name is used for the image-path list).
from google.colab import files as colab_files

colab_files.download('faiss.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>