In [None]:
import os, gzip, json

def locale_filter(data, locale='en_US'):
    """Return the ``'value'`` of every entry in ``data`` tagged with ``locale``.

    ``data`` is an iterable of dict-like entries. An entry is kept when its
    ``'language_tag'`` equals ``locale``; input order is preserved.
    """
    matches = []
    for entry in data:
        if entry.get('language_tag') != locale:
            continue
        matches.append(entry['value'])
    return matches
def file_iterator(file_path):
    """Lazily yield one parsed JSON object per line of a gzip-compressed UTF-8 text file."""
    with gzip.open(file_path, mode='rt', encoding='utf-8') as handle:
        yield from map(json.loads, handle)
# Fields a product must provide (in the requested locale) to be kept.
must_have_keys = {'item_id', 'brand', 'item_name', 'item_keywords', 'bullet_point'}
# Fields extracted from each product: the required ones plus the optional description.
# The optional field name is wrapped in a set literal on purpose: set('product_description')
# would produce a set of single characters instead of the field name.
key_filter = must_have_keys | {'product_description'}

def get_product_item(product, key_filter=key_filter, locale='en_US'):
    """Flatten one raw product record into a ``{field: text}`` dict.

    Args:
        product: raw product dict; every field other than ``item_id`` is
            passed to ``locale_filter`` (missing fields are treated as ``[]``).
        key_filter: names of the fields to extract.
        locale: language tag whose values are kept.

    Returns:
        A dict with ``item_id`` copied as-is and, for every other requested
        field that has at least one value in ``locale``, those values joined
        with single spaces and stripped. Fields without a value in ``locale``
        are omitted.

    Raises:
        KeyError: if ``item_id`` is requested but absent from ``product``.
    """
    item = {}
    for key in key_filter:
        if key == 'item_id':
            item[key] = product[key]
        else:
            values = locale_filter(product.get(key, []), locale)
            if values:
                item[key] = ' '.join(values).strip()
    return item

def json_gz2product(file_path):
    """Read a gzip JSON-lines listing file and return its usable products.

    A product is kept when its flattened item contains at least every key in
    ``must_have_keys``; the optional ``product_description`` may be present
    or absent.
    """
    data = []
    for product in file_iterator(file_path):
        item = get_product_item(product)
        # Subset test rather than equality: an item that also carries the
        # optional description must not be rejected.
        if must_have_keys.issubset(item):
            data.append(item)
    return data

# Load every listing shard into one flat list of product dicts.
# Shards are visited in sorted order because os.listdir() returns entries in
# arbitrary order, which would make the later `data[:total_samples]` slice
# differ between runs and machines.
data = []
metadata_dir = "abo-listings/listings/metadata"
all_files = sorted(os.listdir(metadata_dir))
for file in all_files:
    if not file.endswith('.gz'):
        # json_gz2product() reads gzip files only; skip stray entries
        # (e.g. editor or OS metadata files) instead of crashing on them.
        continue
    file_path = os.path.join(metadata_dir, file)
    products = json_gz2product(file_path)
    data.extend(products)

# One text per product, in the same order as `data`: name, bullet points and
# (when present) description joined with spaces.
sentences = [
    ' '.join([
        product.get('item_name', ''),
        product.get('bullet_point', ''),
        product.get('product_description', ''),
    ]).strip()
    for product in data
]
# Number of products indexed into the search backends.
total_samples = 19900


In [None]:
# setup Elasticsearch for full-text search
# NOTE: no text preprocessing is applied
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

# Synchronous Elasticsearch client used for full-text search.
# Endpoint and credentials are read from the environment when set, so real
# secrets do not have to be stored in the notebook; the fallbacks are the
# original local-development values.
es_client = Elasticsearch(
    os.getenv("ELASTICSEARCH_URL", "http://localhost:9200"),
    basic_auth=(
        os.getenv("ELASTIC_USERNAME", "elastic"),
        os.getenv("ELASTIC_PASSWORD", "password"),
    ),
    verify_certs=False,
    ssl_show_warn=False
)

# Name of the Elasticsearch index holding the product documents.
index = 'product'

# All six product fields are mapped as `text`.
mapping = {
    "mappings": {
        "properties": {
            "item_id": {"type": "text"},
            "brand": {"type": "text"},
            "item_name": {"type": "text"},
            "item_keywords": {"type": "text"},
            "bullet_point": {"type": "text"},
            "product_description": {"type": "text"},
        }
    }
}

# Creating an index that already exists raises an error, so create it only
# when it is missing; this keeps the cell re-runnable.
if not es_client.indices.exists(index=index):
    es_client.indices.create(index=index, body=mapping)

# Build one bulk-index action per product for the first `total_samples` products.
documents = []
data_sample = data[:total_samples]
# Iterate over the slice itself rather than range(total_samples), so a corpus
# smaller than total_samples does not raise IndexError. The loop variable has
# its own name so the module-level `data` list is not overwritten; later cells
# still read it.
for i, product in enumerate(data_sample):
    documents.append({
        '_index': index,
        '_id': i+1,
        '_source': {
            'item_id': product.get('item_id', ''),
            'brand': product.get('brand', ''),
            'item_name': product.get('item_name', ''),
            'item_keywords': product.get('item_keywords', ''),
            'bullet_point': product.get('bullet_point', ''),
            'product_description': product.get('product_description', ''),
        }
    })

# bulk() returns a pair; only the first element (reported below) is used here.
success, bulk_errors = bulk(es_client, documents)
print(f"Successfully indexed {success} documents")


In [None]:
from sentence_transformers import SentenceTransformer
# Sentence-embedding model; the notebook uses it to encode both the product
# texts (`sentences`) and the search queries.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')


In [None]:
from pinecone import Pinecone, ServerlessSpec

# Pinecone client; the API key is read from the PINECONE_API_KEY environment variable.
pinecone_api_key = os.getenv("PINECONE_API_KEY")
pc = Pinecone(api_key=pinecone_api_key)

# Name of the vector index; it is created only when it does not exist yet.
index_name = "product"

if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        metric="cosine",
        dimension=384,
        spec=ServerlessSpec(cloud='aws', region='us-east-1'),
    )

# Handle used for upserts and queries against the index.
pc_client = pc.Index(index_name)

# Only the first `total_samples` products are uploaded below, so embed just
# those texts instead of the whole corpus.
# NOTE: `data` must be the list of product dicts that `sentences` was built
# from, in the same order, so that ids and embeddings line up.
embeddings = embedding_model.encode(sentences[:total_samples])

vectors = [
    {
        "id": d['item_id'],
        # send a plain list of floats rather than a numpy row
        "values": e.tolist(),
        "metadata": {'item_name': d['item_name']},
    }
    for d, e in zip(data, embeddings)
]


# Number of vectors sent per upsert request.
window_size = 100
total_samples = 19900
vector_samples = vectors[:total_samples]

# Index vectors to Pinecone in consecutive, non-overlapping batches.
# Stepping by `window_size` sends every vector exactly once; advancing the
# window by one position would re-send each vector in up to `window_size`
# requests. Bounding the range by len(vector_samples) also avoids empty
# batches when fewer than `total_samples` vectors exist.
for i in range(0, len(vector_samples), window_size):
    window = vector_samples[i: i + window_size]
    pc_client.upsert(
        vectors=window,
        namespace="ns1"
    )

In [None]:
import time
from collections import defaultdict

def text_search(query):
    """Run ``query`` as an Elasticsearch ``query_string`` search on ``index``.

    Returns the ``_source`` document of every returned hit, in hit order.
    """
    request_body = {"query": {"query_string": {"query": query}}}
    response = es_client.search(index=index, body=request_body)

    sources = []
    for hit in response['hits']['hits']:
        sources.append(hit['_source'])
    return sources

def vector_search(query, topk=3):
    """Embed ``query`` and return its ``topk`` Pinecone matches from namespace ``ns1``.

    Matches carry metadata but not vector values and are returned sorted by
    ``score``, highest first.
    """
    response = pc_client.query(
        namespace="ns1",
        vector=embedding_model.encode(query).tolist(),
        top_k=topk,
        include_values=False,
        include_metadata=True,
    )

    matches = list(response['matches'])
    matches.sort(key=lambda match: match['score'], reverse=True)
    return matches

def reciprocal_rank_fusion(results, K=60):
    """Fuse several ranked result lists with Reciprocal Rank Fusion.

    Each document contributes ``1 / (rank + K)`` for every list it appears in
    (ranks start at 1), and the contributions of the same document are summed.
    Documents are matched by their ``'item_id'`` or ``'id'`` entry when they
    have one, otherwise by the document value itself.

    Args:
        results: iterable of ranked lists, best hit first.
        K: smoothing constant added to every rank.

    Returns:
        A list of ``(doc, score)`` tuples sorted by fused score, highest
        first. For a document found in several lists, the first occurrence is
        the one returned. Ties keep first-seen order.
    """
    def _doc_key(doc):
        # Vector hits expose 'id', text hits expose 'item_id'; both carry the product id.
        for field in ('item_id', 'id'):
            try:
                return ('id', doc[field])
            except (LookupError, TypeError, AttributeError):
                continue
        try:
            hash(doc)
        except TypeError:
            return ('repr', repr(doc))
        return ('obj', doc)

    fused = {}  # key -> [first-seen doc, accumulated score]
    for ranked_docs in results:
        for rank, doc in enumerate(ranked_docs, 1):
            entry = fused.setdefault(_doc_key(doc), [doc, 0.0])
            entry[1] += 1.0 / (rank + K)

    scored_docs = [(doc, score) for doc, score in fused.values()]
    # sorted() is stable, so equal scores stay in first-seen order.
    return sorted(scored_docs, key=lambda pair: pair[1], reverse=True)

def hybrid_search(query, topk=3):
    """Run the vector search and the text search for ``query`` and fuse both rankings.

    ``topk`` limits the vector search only. Returns whatever
    ``reciprocal_rank_fusion`` returns for ``[vector hits, text hits]``.
    """
    vector_hits = vector_search(query, topk)
    text_hits = text_search(query)
    return reciprocal_rank_fusion([vector_hits, text_hits])

# Time one end-to-end hybrid query and show the fused ranking.
start_time = time.time()
results = hybrid_search("Find me a kitchen table")
elapsed = time.time() - start_time
print(f"Done in {elapsed} seconds")
print(results)

# async version

In [None]:
import asyncio
from elasticsearch import AsyncElasticsearch  # Need to use async elasticsearch client

async def async_text_search(query):
    """Awaitable full-text search: return the ``_source`` of every hit for ``query``.

    ``es_client`` is the synchronous Elasticsearch client, so the result of
    ``search()`` cannot be awaited. The blocking call runs in a worker thread
    instead, which keeps the event loop free for other searches.
    """
    q = {
        "query": {
            "query_string": {
                "query": query,
            }
        }
    }
    result = await asyncio.to_thread(es_client.search, index=index, body=q)
    hits = result['hits']
    return [hit['_source'] for hit in hits['hits']]

async def async_vector_search(query, topk=3):
    """Awaitable vector search: return the ``topk`` Pinecone matches, best score first.

    ``pc_client`` is the synchronous Pinecone index handle, so the result of
    ``query()`` cannot be awaited. Encoding the query and calling Pinecone are
    both blocking, so they run together in a worker thread.
    """
    def _blocking_search():
        query_embedding = embedding_model.encode(query).tolist()
        return pc_client.query(
            namespace="ns1",
            vector=query_embedding,
            top_k=topk,
            include_values=False,
            include_metadata=True
        )

    results = await asyncio.to_thread(_blocking_search)
    return sorted(results['matches'], key=lambda x: x['score'], reverse=True)

async def hybrid_search(query, topk=3):
    """Coroutine variant of the hybrid search.

    Awaits the vector search and the text search together and fuses their
    rankings with ``reciprocal_rank_fusion``. ``topk`` limits the vector
    search only.

    NOTE: this reuses the name of the synchronous ``hybrid_search`` defined in
    an earlier cell and replaces it once this cell has run.
    """
    # Start both searches together and wait until both have finished.
    gathered = await asyncio.gather(
        async_vector_search(query, topk),
        async_text_search(query),
    )
    vector_results, text_results = gathered
    return reciprocal_rank_fusion([vector_results, text_results])

In [None]:
from FlagEmbedding import FlagLLMReranker, FlagReranker
# BAAI/bge-reranker-v2-m3 is one of the standard (non-LLM) BGE rerankers and is
# loaded with FlagReranker. FlagLLMReranker is intended for the LLM-based
# checkpoints such as bge-reranker-v2-gemma.
reranker = FlagReranker('BAAI/bge-reranker-v2-m3', use_fp16=True)

def get_product_str(product):
    """Render a product as the text passage given to the reranker.

    Args:
        product: dict-like with ``item_name`` and, optionally,
            ``bullet_point`` and ``product_description``. Each of the two
            optional fields may be a single string or a list of strings.

    Returns:
        A ``"Product Name: ..."`` line followed by one line per non-empty
        optional field, joined with newlines. Returns ``''`` when the product
        has no name.
    """
    def _as_text(value):
        # Fields are stored as one string per field in this notebook; joining
        # a string with ',' would put a comma between every character.
        if isinstance(value, str):
            return value
        return ','.join(value)

    # .get() so a product without the optional description does not raise KeyError.
    name = product.get('item_name')
    bullet_points = product.get('bullet_point')
    descriptions = product.get('product_description')
    text = []
    if name:
        text.append("Product Name: %s" % name)
        if bullet_points:
            text.append("- bullet points: %s" % _as_text(bullet_points))
        if descriptions:
            text.append("- description: %s" % _as_text(descriptions))
    return '\n'.join(text)

def llm_reranker(query, docs):
    """Score every document against ``query`` with the reranker and sort by score.

    Args:
        query: the user query.
        docs: product documents accepted by ``get_product_str``.

    Returns:
        A list of ``(doc, score)`` tuples, highest score first; ``[]`` when
        there are no documents.
    """
    docs = list(docs)
    if not docs:
        # Nothing to score; do not call the model with an empty batch.
        return []

    pairs = [(query, get_product_str(doc)) for doc in docs]
    scores = reranker.compute_score(pairs)
    # Defensive: if the reranker hands back a bare number for a single pair,
    # wrap it so it can be zipped with the documents.
    if not hasattr(scores, '__iter__'):
        scores = [scores]

    scored_docs = zip(docs, scores)
    sorted_docs = sorted(scored_docs, key=lambda x: x[1], reverse=True)
    return sorted_docs

def fetch_doc(results):
    """Fetch the full Elasticsearch documents for all hits of both searches.

    Args:
        results: two-element sequence ``[vector hits, text hits]``. Vector
            hits expose the product id as ``'id'``, text hits as ``'item_id'``.

    Returns:
        The ``_source`` of every document whose ``item_id`` matches one of the
        collected ids; ``[]`` when neither search produced a hit.
    """
    # Collect ids from both rankings, dropping duplicates but keeping order.
    ids = []
    seen = set()
    for doc_id in [hit['id'] for hit in results[0]] + [hit['item_id'] for hit in results[1]]:
        if doc_id not in seen:
            seen.add(doc_id)
            ids.append(doc_id)

    if not ids:
        # An empty id list would produce an empty query string; skip the request.
        return []

    q = {
        # Ask for as many hits as there are ids; the server default page size
        # (10) could otherwise silently drop some of the requested documents.
        "size": len(ids),
        "query": {
            "query_string": {
                "query": " OR ".join(ids),
                "default_field": "item_id",
            }
        }
    }
    result = es_client.search(index=index, body=q)
    hits = result['hits']
    return [hit['_source'] for hit in hits['hits']]

def hybrid_search2(query, topk=3):
    """Hybrid search with reranking.

    Runs the vector search (limited to ``topk``) and the text search, fetches
    the matching documents with ``fetch_doc`` and returns them ordered by
    ``llm_reranker``.
    """
    vector_hits = vector_search(query, topk)
    text_hits = text_search(query)
    candidates = fetch_doc([vector_hits, text_hits])
    return llm_reranker(query, candidates)

# Time one end-to-end reranked hybrid query and show the ranking.
start_time = time.time()
results = hybrid_search2("Find me a kitchen table")
elapsed = time.time() - start_time
print(f"Done in {elapsed} seconds")
print(results)

# async version

In [None]:

async def hybrid_search2(query, topk=3):
    """Coroutine variant of the reranked hybrid search.

    Awaits the vector and text searches together, then fetches the matching
    documents and reranks them. The fetch and rerank steps are ordinary
    (non-awaited) calls.

    NOTE: this reuses the name of the synchronous ``hybrid_search2`` defined
    in an earlier cell and replaces it once this cell has run.
    """
    gathered = await asyncio.gather(
        async_vector_search(query, topk),
        async_text_search(query),
    )
    vector_results, text_results = gathered
    candidates = fetch_doc([vector_results, text_results])
    return llm_reranker(query, candidates)

# hybrid_search2 is a coroutine function here: calling it only creates a
# coroutine object, so it must be run to completion before its result can be
# timed or printed. The coroutine is run on its own event loop in a worker
# thread, which works both in a plain script and inside a notebook kernel
# whose main thread already runs an event loop.
# In a notebook cell, `results = await hybrid_search2(...)` is the shorter equivalent.
from concurrent.futures import ThreadPoolExecutor

start_time = time.time()
with ThreadPoolExecutor(max_workers=1) as pool:
    results = pool.submit(asyncio.run, hybrid_search2("Find me a kitchen table")).result()
print(f"Done in {time.time() - start_time} seconds")
print(results)