In [1]:
%%time
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('sentence-transformers/all-distilroberta-v1')

CPU times: user 3 s, sys: 933 ms, total: 3.94 s
Wall time: 9 s


In [2]:
import random
import numpy as np
import pandas as pd
import time
import redis
from redis.commands.search.field import VectorField
from redis.commands.search.field import TextField
from redis.commands.search.field import TagField
from redis.commands.search.query import Query
from redis.commands.search.result import Result


In [3]:
import json
from tqdm.contrib.concurrent import process_map
from multiprocessing import cpu_count
import redis


## Check connection to MemoryMind  and cleanup data if needed (Only sample dont run FLUSHALL in production) 

In [4]:
client_dev = redis.Redis(host = 'memoryd-rag-0001-001.memoryd-rag.ghlaqp.memorydb.us-east-1.amazonaws.com', port=6379, 
                     decode_responses=True, ssl=True, ssl_cert_reqs="none"
                      )
client_dev.ping()

#flush all data
#client_devo.flushall()

True

 ##  Copy the data set locally.
Before we can run any queries, we need to download the Amazon Product Question and Answer data from : https://registry.opendata.aws/amazon-pqa/
#### Let's start by having a look at all the files in the dataset. Uncomment the below line to list all options 

In [5]:
#!aws s3 ls --no-sign-request s3://amazon-pqa/

There are a lot of files here, so for the purposes of this demo, we focus on just the headset data. Let's download the amazon_pqa_headsets.json data locally.

In [6]:
#!aws s3 cp --no-sign-request s3://amazon-pqa/amazon_pqa_headsets.json ./amazon-pqa/amazon_pqa_headsets.json

### Prepare Headset PQA data

In [7]:
import json
import pandas as pd

def load_pqa(file_name,number_rows=1000):
    qa_list = []
    df = pd.DataFrame(columns=('question', 'answer'))
    with open(file_name) as f:
        i=0
        for line in f:
            data = json.loads(line)
            df.loc[i] = [data['question_text'],data['answers'][0]['answer_text']]
            i+=1
            if(i == number_rows):
                break
    return df


qa_list = load_pqa('amazon-pqa/amazon_pqa_headsets.json',number_rows=1000)

In [8]:
qa_list

Unnamed: 0,question,answer
0,does this work with cisco ip phone 7942,Use the Plantronics compatibility guide to see...
1,Is this compatible with the cisco ip phone 797...,Don’t know. Call Plantronics
2,"If i have a polycom vvx, what adapter cable wi...","Hi Gabrielle, what is the model of VVX?"
3,Does this headset work with a samsung galaxy s...,no wont work
4,"Will this work for a polycom vvx phone? also, ...","Yes, but you need a cable in between the heads..."
...,...,...
995,Does it come with the cable for console,Yes! The Game One comes with both a PC cable a...
996,How do these feel while wearing glasses for an...,"I don't wear glasses personally, but the perso..."
997,How good is the microphone quality?,Its actully really good. when i play with ny ...
998,is their an attachment that I can use to conne...,it didn't come with one but someone may make a...


In [9]:
NUMBER_PRODUCTS=1000
product_metadata = qa_list.head(NUMBER_PRODUCTS).to_dict(orient='index')


### Create vector embeddings for questions 

In [10]:
%%time
item_keywords =  [product_metadata[i]['question']  for i in product_metadata.keys()]
item_keywords_vectors = [model.encode(sentence) for sentence in item_keywords]

CPU times: user 53.2 s, sys: 21.4 ms, total: 53.3 s
Wall time: 53.5 s


In [11]:
from redis.client import Redis

In [12]:
ITEM_KEYWORD_EMBEDDING_FIELD='question_vector'

## Function to load the question, answer data with vector embeddings

In [13]:
%%time
def load_vectors(client: Redis, qa_list, vector_dict, vector_field_name):
    for index in product_metadata.keys():    
        # Hash key
        key = 'product:' + str(index)
        
        # Hash values
        item_metadata = product_metadata[index]
        item_keywords_vector = vector_dict[index].astype(np.float32).tobytes()
        item_metadata[vector_field_name] = item_keywords_vector
        
        # HSET
        client.hset(key, mapping=item_metadata)


CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 6.91 µs


### The below function is to create the Vector index. 
 However redis-py does not allow index creation because of unsupported fields in preview. so run this with MONITOR if you want a sample command for FT.CREATE

In [15]:
%%time
def create_hnsw_index (create_hnsw_index,vector_field_name,number_of_vectors, vector_dimensions=768, distance_metric='L2',M=40,EF=200):
    client_dev.ft(INDEX_NAME).create_index([
        VectorField("question_vector", "HNSW", {"TYPE": "FLOAT32", "DIM": vector_dimensions, "DISTANCE_METRIC": distance_metric, "INITIAL_CAP": number_of_vectors, "M": M, }),
        TextField("question"),        
        TextField("answer"),
          
    ])    


CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 7.63 µs


In [16]:
INDEX_NAME='indx:pqa_vss'
NUMBER_PRODUCTS=1000

In [18]:
%%time
create_hnsw_index(client_dev,INDEX_NAME,NUMBER_PRODUCTS)

CPU times: user 1.81 ms, sys: 35 µs, total: 1.84 ms
Wall time: 7 ms


In [19]:
## Run the below code to actually load the hashes in 

In [20]:
INDEX_NAME='indx:pqa_vss'

In [21]:
%%time

ITEM_KEYWORD_EMBEDDING_FIELD='question_vector'
TEXT_EMBEDDING_DIMENSION=768
NUMBER_PRODUCTS=1000

print ('Loading and Indexing + ' +  str(NUMBER_PRODUCTS) + ' products')

#flush all data
#client_devo.flushall()

load_vectors(client_dev,product_metadata,item_keywords_vectors,ITEM_KEYWORD_EMBEDDING_FIELD)


Loading and Indexing + 1000 products
CPU times: user 90.7 ms, sys: 16.2 ms, total: 107 ms
Wall time: 6.14 s



##  Example to create index manually using below command with redis-cli 

``` bash
FT.CREATE idx:pqa_vss SCHEMA question_vector VECTOR HNSW 10 TYPE FLOAT32 DIM 768 DISTANCE_METRIC COSINE INITIAL_CAP 1000 M 40 question TEXT answer TEXT

```
 

In [22]:
#create_index_command = "FT.CREATE idx:pqa_vss SCHEMA question_vector VECTOR HNSW 10 TYPE FLOAT32 DIM 768 DISTANCE_METRIC COSINE INITIAL_CAP 1000 M 40 question TEXT answer TEXT"

# Execute the command
#try:
#    response = client_devo.execute_command(create_index_command)
#    print("Index created successfully:", response)
#except Exception as e:
#    print("An error occurred while creating the index:", e)


### Run FT.INFO to inspect the index 

In [23]:
info = client_dev.ft(INDEX_NAME).info()
num_docs = info['num_docs']
space_usage = info['space_usage']
num_indexed_vectors = info['num_indexed_vectors']
vector_space_usage = (info['vector_space_usage'])

print(f"{num_docs} documents ({space_usage} space used vectors indexed {num_indexed_vectors} vector space usage in {vector_space_usage}")

1000 documents (7357016 space used vectors indexed 1000 vector space usage in 7051484


### Performing Semantic search.. broken down into multiple cells to capture execution time effectively can all be run from 1 cell as well 

In [24]:
%%time
topK=5
user_query='Does this work with xbox'
#vectorize the query
query_vector = model.encode(product_query).astype(np.float32).tobytes()


CPU times: user 58.2 ms, sys: 0 ns, total: 58.2 ms
Wall time: 56.8 ms


In [26]:
%%time
#prepare the query
q = Query(f'*=>[KNN {topK} @{ITEM_KEYWORD_EMBEDDING_FIELD} $vec_param AS vector_score]').paging(0,topK).return_fields('question','answer')
params_dict = {"vec_param": query_vector}

#Execute the query
results = client_dev.ft(INDEX_NAME).search(q, query_params = params_dict)


CPU times: user 1.99 ms, sys: 0 ns, total: 1.99 ms
Wall time: 2.05 ms


In [27]:
%%time
#Print similar products and questions found
for product in results.docs:
    print ('***************Product  found ************')
    print ('hash key = '  + product.id)
    print ( 'question = '  + product.question)
    print ('answer = ' + product.answer)

***************Product  found ************
hash key = product:197
question = Does this work with xbox one?
answer = sorry, Im not an xbox user.
***************Product  found ************
hash key = product:721
question = Does this work with the xbox one?
answer = Yeah of course , but you must have an adapter to use this beautiful headset
***************Product  found ************
hash key = product:579
question = does this work on xbox one?
answer = I'm sorry, but not!
***************Product  found ************
hash key = product:873
question = Does this work with ps4
answer = Yes it does
***************Product  found ************
hash key = product:538
question = will this work with xbox one
answer = Diane, you would also need to purchase an Xbox One Headset Adapter made by Microsoft to use this with an Xbox One. With the adapter it willwork perfectly with the Xbox One.
CPU times: user 118 µs, sys: 2 µs, total: 120 µs
Wall time: 112 µs
