In [1]:
import os
## The base dir of the project.
## Set the IBRS_PROJECT_DIR environment variable to point at another checkout;
## when it is unset (or empty) the original location below is used.
PROJECT_DIR = os.environ.get("IBRS_PROJECT_DIR") or (
    "/home/projects/IBRS/Image-Based-Recommendation-System-with-ResNet-and-Elasticsearch"
)

## The "etc" dir contains all the files not tracked in GitHub
ETC_DIR = os.path.join(PROJECT_DIR, "etc")

## The .zip of the dataset is provided in the repository, but it's unzipped in "etc"
DATASET_ZIP_FILE = os.path.join(PROJECT_DIR, "dataset.zip")
DATASET_DIR = os.path.join(ETC_DIR, "dataset")

## Files and directories inside the unzipped dataset
DATASET_METADATA_TXT_FILE = os.path.join(DATASET_DIR, "metadata.txt")
DATASET_METADATA_JSON_FILE = os.path.join(DATASET_DIR, "metadata.json")
DATASET_METADATA_DIR = os.path.join(DATASET_DIR, "metadata")

In [2]:
# Extract the dataset archive into the etc dir, unless the dataset dir is already there
import zipfile

dataset_already_extracted = os.path.exists(DATASET_DIR)

if dataset_already_extracted:
    print('[INFO] Dataset already unzipped')
else:
    # The archive is extracted into ETC_DIR (the parent of DATASET_DIR)
    with zipfile.ZipFile(DATASET_ZIP_FILE, 'r') as archive:
        archive.extractall(path=ETC_DIR)

[INFO] Dataset already unzipped


In [14]:
import ast
import json

# Load the item metadata into `metadata` (a list with one entry per item).
# The first run converts metadata.txt into metadata.json; later runs reuse the JSON file.
metadata = []

if os.path.exists(DATASET_METADATA_JSON_FILE):
    print('[INFO] The JSON file already exists. Loading from disk')
    with open(DATASET_METADATA_JSON_FILE, 'r', encoding='utf-8') as file:
        metadata = json.load(file)
else:
    # Each non-empty line of the text file is parsed as one Python literal
    # (presumably a dict per item, matching the JSON example shown below).
    with open(DATASET_METADATA_TXT_FILE, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()

            # Skip blank lines instead of failing on them
            if not line:
                continue

            # literal_eval only accepts Python literals, so a line in the data
            # file cannot execute arbitrary code the way eval() would allow
            item = ast.literal_eval(line)
            metadata.append(item)

    # Cache the parsed records so the next run takes the fast JSON path
    with open(DATASET_METADATA_JSON_FILE, 'w', encoding='utf-8') as file:
        json.dump(metadata, file)

[INFO] The JSON file already exists. Loading from disk


The metadata file contains a list of metadata, as shown in the following example

```json
[
  {
    "ID": 7541,
    "title": "Christina Gavioli",
    "slug": "christina-gavioli-3",
    "category": [
      "Fashion Women",
      "Women Blouse and Dress"
    ],
    "imPath": "images/Fashion Women/Women Blouse and Dress/CHRISTINA_GAVIOLI.jpg"
  }
]
```

## Create the Elasticsearch Mapping to index the Data

In this tutorial, I'm using Elasticsearch as the data store and the vector database. When enabled, Elasticsearch can perform vector or similarity search using KNN.

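Note: the `index.knn` setting below belongs to OpenSearch's k-NN plugin, where it must be added when creating the index. Elasticsearch 8+ has no such setting: declaring an indexed `dense_vector` field (shown further down) is enough, which is why the mapping created later in this notebook has no `settings` block. For reference, the OpenSearch form is: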
```json
"settings": {
    "index": {
        "knn": true
    }
}
```
Then, define the property that will hold the embedding vector
```json
"image_features": {
    "type": "dense_vector",
    "dims": 2048,
    "index": true,
    "similarity": "cosine"
}
```
* `dims`: The dimension should be the actual dimension of your dense vector.
* `index`: When ***true*** (the default value), ES can perform vector search. It then creates an HNSW (Hierarchical Navigable Small World) index that is used to perform vector search. When it is set to ***false***, ES just stores vectors but does not perform vector search using KNN. Useful only for storing embeddings for external processing. 
* `similarity`: The default value is ***cosine***. It defines the similarity metric that ES will use for vector search. Other metrics: *dot_product*, *l2_norm*

Vector store is available from version 8+. Here, I'm using version 9.1.8.

Please read https://www.elastic.co/docs/deploy-manage/production-guidance/optimize-performance/approximate-knn-search for the points to consider when running approximate kNN (indexed vector) search with Elasticsearch.

In [10]:
# Index mapping: the item metadata fields plus the dense vector holding the image features
def _text_with_keyword():
    """Return a fresh `text` field definition carrying a `keyword` sub-field."""
    return {"type": "text", "fields": {"keyword": {"type": "keyword"}}}


mapping = {
    "mappings": {
        "properties": dict(
            ID={"type": "integer"},
            # title and slug each get their own (non-shared) field definition
            title=_text_with_keyword(),
            slug=_text_with_keyword(),
            category={"type": "keyword"},
            imPath={"type": "keyword"},
            # indexed 2048-dimensional vector compared with cosine similarity
            image_features={
                "type": "dense_vector",
                "dims": 2048,
                "index": True,
                "similarity": "cosine",
            },
        )
    }
}


In [11]:
# Connect to the local Elasticsearch node and create the index with the mapping
# defined above, unless an index with that name already exists
from elasticsearch import Elasticsearch

index_name = "items"
es = Elasticsearch("http://localhost:9200")

index_already_exists = es.indices.exists(index=index_name)

if not index_already_exists:
    es.indices.create(index=index_name, body=mapping)
    print("[INFO] Mapping create.")

[INFO] Mapping create.


In [16]:
# Display the first metadata record to check the fields each item carries before indexing
metadata[0]

{'ID': 7541,
 'title': 'Christina Gavioli',
 'slug': 'christina-gavioli-3',
 'category': ['Fashion Women', 'Women Blouse and Dress'],
 'imPath': 'images/Fashion Women/Women Blouse and Dress/CHRISTINA_GAVIOLI.jpg'}

In [17]:
# Send every metadata record to Elasticsearch in a single bulk request

from elasticsearch.helpers import bulk


def _to_bulk_action(doc):
    """Wrap one metadata record as a bulk action, using the record's "ID" as document id."""
    return dict(_index=index_name, _id=doc["ID"], _source=doc)


actions = [_to_bulk_action(doc) for doc in metadata]

bulk(es, actions)
print("[INFO] Document indexed successfully")

Initial documents indexed (without embeddings)


In [18]:
# Sanity check: query only the index created by this notebook (a bare es.search()
# searches every index on the cluster) and cap the number of returned hits so the
# cell output stays readable. hits.total in the response still reports the match count.
es.search(index=index_name, size=3)

ObjectApiResponse({'took': 132, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 1655, 'relation': 'eq'}, 'max_score': 1.0, 'hits': [{'_index': 'items', '_id': '7541', '_score': 1.0, '_source': {'ID': 7541, 'title': 'Christina Gavioli', 'slug': 'christina-gavioli-3', 'category': ['Fashion Women', 'Women Blouse and Dress'], 'imPath': 'images/Fashion Women/Women Blouse and Dress/CHRISTINA_GAVIOLI.jpg'}}, {'_index': 'items', '_id': '7540', '_score': 1.0, '_source': {'ID': 7540, 'title': 'Sexy Woman', 'slug': 'sexy-woman-3', 'category': ['Fashion Women', 'Women Blouse and Dress'], 'imPath': 'images/Fashion Women/Women Blouse and Dress/SEXY_WOMAN_MULTICOLORE.jpg'}}, {'_index': 'items', '_id': '7539', '_score': 1.0, '_source': {'ID': 7539, 'title': 'Sexy Woman', 'slug': 'sexy-woman-2', 'category': ['Fashion Women', 'Women Blouse and Dress'], 'imPath': 'images/Fashion Women/Women Blouse and Dress/SEXY_WOMAN_JAUNE.jpg'}}, {'_i