In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
from itertools import islice
from tqdm import tqdm
from pprint import pprint, pformat
from IPython.display import Image, display, Markdown, Code, HTML
import random
import matplotlib.pyplot as plt
import numpy as np
import json

from amazonutils import *

## Connect to Elasticsearch

In [None]:
# Request the root endpoint of the server at localhost:9200 as a quick
# connectivity check before creating the Python client.
!curl localhost:9200

In [None]:
# List the plugins reported by the _cat/plugins endpoint
# (presumably to confirm the elastiknn plugin is installed -- verify in the output).
!curl localhost:9200/_cat/plugins

In [None]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

# Client for the local node; every later cell talks to Elasticsearch through `es`.
es = Elasticsearch(hosts=["http://localhost:9200"])

# Ask for cluster health, waiting for at least "yellow" status, with a
# request timeout of 1 on the client call. The response is the cell output.
es.cluster.health(request_timeout=1, wait_for_status="yellow")

## Explore the Data

In [None]:
# Input files: product metadata and the matching image feature vectors.
fname_products = "meta_Clothing_Shoes_and_Jewelry.json.gz"
fname_vectors = "image_features_Clothing_Shoes_and_Jewelry.b"

# Skip the first 20 products and use the next one as a sample record.
p = next(islice(iter_products(fname_products), 20, None))
pprint(p)
display(Image(p['imUrl'], width=128, height=128))

# Take the first pair yielded by iter_vectors; show the vector's length and
# its first 20 values.
_, vec = next(iter_vectors(fname_vectors))
print(len(vec), vec[:20])

## Index the Products

In [None]:
index = "amazon-products-demo"

# Mapping for the index used by the exact nearest-neighbor queries.
# Only the fields listed under "properties" are mapped ("dynamic" is False).
mapping = {
  "dynamic": False,
  "properties": {
    "asin": { "type": "keyword" },
    "title": { "type": "text" },
    "description": { "type": "text" },
    "imUrl": { "type": "text" },
    "price": { "type": "float" },

    # Image feature vector: an elastiknn dense float vector with 4096 dimensions.
    "imVec": {
      "type": "elastiknn_dense_float_vector",
      "elastiknn": {
        "dims": 4096
      }
    }
  }
}

# Recreate the index from scratch so this cell can be re-run.
# All arguments are passed by keyword: this matches the es.search / es.get
# calls in the rest of the notebook and does not rely on the positional
# parameter order of the indices API, which is not stable across
# elasticsearch-py releases.
if es.indices.exists(index=index):
  es.indices.delete(index=index)
es.indices.create(index=index)
es.indices.put_mapping(body=mapping, index=index)

# Display the mapping as stored by the server.
es.indices.get_mapping(index=index)

In [None]:
def index_actions(index, n=600):
  """Yield up to `n` bulk "index" actions pairing product metadata with image vectors.

  The first `n` (asin, vector) pairs from `fname_vectors` are loaded into a dict,
  then `fname_products` is scanned and one action is produced for every product
  whose asin has a loaded vector.

  Args:
    index: Index name written to each action's "_index".
    n: Number of vectors to load; also the maximum number of actions yielded.

  Yields:
    dict: One action per matched product, with the asin as "_id", the
      title/description/price/imUrl fields (None when absent from the product)
      and the vector stored under "imVec" as {"values": vec}.
  """
  asin2vec = { asin: vec for asin, vec in islice(iter_vectors(fname_vectors), n) }

  # Lazily keep only the products that have a loaded vector.
  matching = (p for p in iter_products(fname_products) if p['asin'] in asin2vec)

  # islice stops after `n` matches without reading further products, and ends
  # cleanly if the product file runs out first. Calling next() by hand here
  # would raise StopIteration inside the generator, which Python 3.7+ turns
  # into a RuntimeError.
  for p in islice(matching, n):
    yield {
      "_op_type": "index",
      "_index": index,
      "_id": p["asin"],
      "asin": p["asin"],
      "title": p.get("title", None),
      "description": p.get("description", None),
      "price": p.get("price", None),
      "imUrl": p.get("imUrl", None),

      "imVec": { "values": asin2vec[p["asin"]] }
    }

bulk(es, tqdm(index_actions(index)), chunk_size=100, max_retries=2)
es.indices.refresh(index)
!curl localhost:9200/{index}/_count

In [None]:
# Fetch a single hit from the index, pretty-printed, to inspect a stored document.
!curl localhost:9200/{index}/_search?size=1\&pretty

## Standard Keyword Search

In [None]:
# Match every document and apply a random_score function, so the five hits
# shown are a sample of the index rather than a relevance ranking.
match_everything = { "match_all": {} }
body = {
  "query": {
    "function_score": { "query": match_everything, "random_score": {} }
  }
}

res = es.search(index=index, body=body, size=5)
display_hits_horizontal(res)

In [None]:
# Keyword search over title and description; "title^2" weights the title field
# twice as much as the description.
keyword_query = {
  "multi_match": { "query": "men's watch", "fields": ["title^2", "description"] }
}
body = { "query": keyword_query }

res = es.search(index=index, body=body, size=5)
display_hits_horizontal(res)

## Exact Nearest Neighbors Query


In [None]:
product_id = "B000FQCOOO"

# Fetch the stored document for this product and reuse its "imVec" field as
# the query vector.
fetch_res = es.get(index=index, id=product_id)
query_vec = fetch_res['_source']['imVec']

# Exact nearest-neighbor search on "imVec" using angular similarity.
exact_nn_query = {
  "vec": query_vec,
  "field": "imVec",
  "similarity": "angular",
  "model": "exact"
}
body = { "query": { "elastiknn_nearest_neighbors": exact_nn_query } }

res = es.search(index=index, body=body, size=5)
display_hits_horizontal(res)

In [None]:
# Same exact search, but the query vector is given as a reference
# (index / id / field) to an already-indexed document instead of the values
# themselves. NOTE(review): presumably the plugin looks the vector up
# server-side -- confirm against the elastiknn docs.
indexed_vec_ref = { "index": index, "id": product_id, "field": "imVec" }

body = {
  "query": {
    "elastiknn_nearest_neighbors": {
      "vec": indexed_vec_ref,
      "field": "imVec",
      "model": "exact",
      "similarity": "angular"
    }
  }
}

res = es.search(index=index, body=body, size=5)
display_hits_horizontal(res)

## Index the Products (For Approximate Queries)

In [None]:
index = "amazon-products-demo"

# Mapping for the index used by the approximate nearest-neighbor queries.
# Same fields as the exact-search mapping, but "imVec" is configured with the
# "lsh" model and angular similarity.
mapping = {
  "dynamic": False,
  "properties": {
    "asin": { "type": "keyword" },
    "title": { "type": "text" },
    "description": { "type": "text" },
    "imUrl": { "type": "text" },
    "price": { "type": "float" },

    "imVec": {
      "type": "elastiknn_dense_float_vector",
      "elastiknn": {
        "dims": 4096,
        "similarity": "angular",
        "model": "lsh",
        # NOTE(review): "L" and "k" are LSH tuning parameters; their exact
        # meaning should be confirmed against the elastiknn documentation.
        "L": 60,
        "k": 3
      }
    }

  }
}

# Recreate the index from scratch (this drops the documents indexed earlier
# under the same name). All arguments are passed by keyword: this matches the
# es.search / es.get calls in the rest of the notebook and does not rely on
# the positional parameter order of the indices API, which is not stable
# across elasticsearch-py releases.
if es.indices.exists(index=index):
  es.indices.delete(index=index)
es.indices.create(index=index)
es.indices.put_mapping(body=mapping, index=index)

# Display the mapping as stored by the server.
es.indices.get_mapping(index=index)

In [None]:
bulk(es, tqdm(index_actions(index)), chunk_size=100, max_retries=2)
es.indices.refresh(index)
!curl localhost:9200/{index}/_count

In [None]:
# Fetch a single hit from the re-created index, pretty-printed, to inspect a stored document.
!curl localhost:9200/{index}/_search?size=1\&pretty

## Approximate Nearest Neighbors Query

In [None]:
# Approximate nearest-neighbor search: same query vector and similarity as the
# exact search, but using the "lsh" model with 50 candidates.
lsh_nn_query = {
  "vec": query_vec,
  "field": "imVec",
  "similarity": "angular",
  "model": "lsh",
  "candidates": 50
}
body = { "query": { "elastiknn_nearest_neighbors": lsh_nn_query } }

res = es.search(index=index, body=body, size=5)
display_hits_horizontal(res)

## Combine Standard and Nearest Neighbors Queries

In [None]:
# Plain keyword search for "leather" over title (weighted x2) and description,
# shown for comparison with the combined query in the next cell.
leather_query = {
  "multi_match": { "query": "leather", "fields": ["title^2", "description"] }
}
body = { "query": leather_query }

res = es.search(index=index, body=body, size=5)
display_hits_horizontal(res)

In [None]:
# Combine both kinds of search: the multi_match query selects products that
# mention "leather", and the nearest-neighbor function provides the score.
# "boost_mode": "replace" makes the function's score replace the keyword score.
leather_query = {
  "multi_match": {
    "query": "leather",
    "fields": ["title^2", "description"]
  }
}
similarity_function = {
  "elastiknn_nearest_neighbors": {
    "field": "imVec",
    "similarity": "angular",
    "model": "exact",
    "vec": query_vec
  }
}

body = {
  "query": {
    "function_score": {
      "query": leather_query,
      "boost_mode": "replace",
      "functions": [similarity_function]
    }
  }
}

res = es.search(index=index, body=body, size=5)
display_hits_horizontal(res)