# In [None]:
import pandas as pd
from elasticsearch import Elasticsearch

# ----------------------------
# 1. Connect to Elasticsearch
# ----------------------------
import os  # local import: only this setup section reads the environment

# Connection settings are taken from the environment when present so that
# real credentials and machine-specific paths do not have to be edited into
# the source. The literals are kept only as backward-compatible fallbacks.
# NOTE(review): the fallback username/password are still hard-coded here;
# rotate them and drop the defaults once ES_USERNAME / ES_PASSWORD are set.
es = Elasticsearch(
    os.environ.get(
        "ES_URL",
        "https://3322940df94141d98ca5135e9d80ba54.us-central1.gcp.cloud.es.io:443",
    ),
    basic_auth=(
        os.environ.get("ES_USERNAME", "test"),
        os.environ.get("ES_PASSWORD", "test123456"),
    ),
)
index_name = "test-reviews"

# ----------------------------
# 2. Load CSV
# ----------------------------
# REVIEWS_CSV overrides the default path, which only exists on the original
# author's machine.
csv_file = os.environ.get(
    "REVIEWS_CSV",
    "/Users/aasawarisahasrabuddhe/PyCharmMiscProject/pubmedqa_downloads/Reviews_10k.csv",
)
df = pd.read_csv(csv_file)
print(f"✅ Loaded CSV with {len(df)} rows")

# ----------------------------
# 3. Create index with mapping (Text + embedding_vector)
# ----------------------------
# Index schema: the raw review text plus a kNN-searchable vector field.
# "dims" has to equal the length of the vectors later stored in this field.
mapping = {
    "mappings": {
        "properties": {
            "Text": {"type": "text"},
            "embedding_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine",
            },
        }
    }
}

# Creation is skipped when the index is already there, so an existing index
# keeps whatever mapping it was created with.
if es.indices.exists(index=index_name):
    print(f"✅ Index '{index_name}' already exists")
else:
    es.indices.create(index=index_name, body=mapping)
    print(f"✅ Created index '{index_name}' with mapping")

from elasticsearch import helpers
from tqdm import tqdm

# ----------------------------
# 4. Prepare documents for bulk
# ----------------------------
# One bulk action per usable review. The DataFrame index label is used as the
# document id, so every row maps to a fixed, predictable document.
actions = []

# Only the "Text" column is needed, so iterate that Series directly instead of
# building a full row object per record with iterrows().
for idx, raw_text in df["Text"].items():
    # A missing cell is read as NaN, and str(NaN) is the non-empty string
    # "nan" -- without this check it would be indexed as a real review.
    if pd.isna(raw_text):
        continue
    text = str(raw_text).strip()
    if not text:
        continue
    actions.append(
        {
            "_index": index_name,
            "_id": idx,
            "_source": {
                "Text": text,
            },
        }
    )

# ----------------------------
# 5. Bulk index documents
# ----------------------------
# NOTE: with its default raise_on_error=True, helpers.bulk raises on the first
# failing chunk, so `failed` is expected to be empty whenever this print runs.
success, failed = helpers.bulk(es, actions)
print(f"\n✅ Indexed {success} documents, failed: {len(failed)}")

from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# ----------------------------
# 1. Load embedding model
# ----------------------------
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
print("✅ Model loaded")

# ----------------------------
# 6. Fetch all documents
# ----------------------------
# Make recently indexed documents searchable first; a search issued right
# after a bulk request can otherwise miss documents that are not refreshed yet.
es.indices.refresh(index=index_name)

# helpers.scan pages through the entire index. A single search capped at
# "size": 100 only ever returned the first 100 hits, leaving every other
# document without an embedding.
docs = list(
    helpers.scan(
        es,
        index=index_name,
        query={"query": {"match_all": {}}, "_source": ["Text"]},
    )
)
print(f"ℹ Found {len(docs)} documents to process")

# ----------------------------
# 7. Generate embeddings and update documents
# ----------------------------
EMBED_BATCH_SIZE = 64  # texts encoded and written per round trip

# Keep only documents that actually have text to embed.
pending = []
for hit in docs:
    text = (hit["_source"].get("Text") or "").strip()
    if text:
        pending.append((hit["_id"], text))

updated_count = 0

for start in tqdm(range(0, len(pending), EMBED_BATCH_SIZE), desc="Updating embeddings"):
    batch = pending[start:start + EMBED_BATCH_SIZE]

    # Encode the whole batch in one call instead of one model call per text.
    vectors = model.encode([text for _, text in batch], show_progress_bar=False)

    # Partial-document updates sent through the bulk API: one request per
    # batch instead of one update (plus a forced refresh) per document.
    update_actions = [
        {
            "_op_type": "update",
            "_index": index_name,
            "_id": doc_id,
            "doc": {"embedding_vector": vector.tolist()},
        }
        for (doc_id, _), vector in zip(batch, vectors)
    ]
    ok_count, _ = helpers.bulk(es, update_actions)
    updated_count += ok_count

# A single refresh at the end makes all embeddings visible to the searches
# that follow, without paying for a refresh on every document.
es.indices.refresh(index=index_name)

print(f"\n✅ Updated embeddings for {updated_count} documents")

# In [None]:
import time


# ----------------------------
# Helper: pretty print results
# ----------------------------
def pretty_print(results):
    """Print a numbered one-line summary for every hit in a search response.

    Each line shows the hit's ``_score`` with four decimals followed by the
    first 150 characters of its ``_source["Text"]`` and a trailing ``...``.

    Args:
        results: Search response mapping; hits are read from
            ``results["hits"]["hits"]``.
    """
    for rank, hit in enumerate(results["hits"]["hits"], 1):
        # "_score" may be present but null (Elasticsearch does this for
        # searches sorted on a field); None cannot be formatted with ":.4f",
        # so fall back to 0.
        score = hit.get("_score") or 0
        # Tolerate hits without "_source" or with a null "Text" instead of
        # raising in the middle of the listing.
        text = ((hit.get("_source") or {}).get("Text") or "")[:150]
        print(f"{rank}. Score: {score:.4f} | Text: {text}...\n")

# ----------------------------
# 1. Full-text search
# ----------------------------
def full_text_search(query, size=5):
    """Run a ``match`` query on the ``Text`` field and print the timed results.

    Args:
        query: Text to match against the ``Text`` field.
        size: Maximum number of hits to request.

    Returns:
        The search response returned by ``es.search``.
    """
    request = {
        "size": size,
        "query": {"match": {"Text": query}},
        "_source": ["Text"],
    }

    # Only the round trip to Elasticsearch is timed, not the printing below.
    t0 = time.time()
    results = es.search(index=index_name, body=request)
    elapsed = time.time() - t0

    total_matches = results['hits']['total']['value']
    print(f"\n--- Full-Text Search ({total_matches} matches) ---")
    pretty_print(results)
    print(f"Execution time: {elapsed:.4f} sec")
    return results


# ----------------------------
# 2. Vector search
# ----------------------------
def vector_search(query, size=5, num_candidates=100):
    """Run an approximate kNN search on ``embedding_vector`` and print results.

    The query string is embedded with the module-level ``model`` and the
    resulting vector is sent as the ``knn`` section of the search request.

    Args:
        query: Free-text query to embed.
        size: Number of hits to return; also used as the kNN ``k``.
        num_candidates: Nearest-neighbour candidates to consider per shard.
            Raised to ``size`` when smaller, because Elasticsearch rejects
            kNN requests whose ``num_candidates`` is below ``k``.

    Returns:
        The search response returned by ``es.search``.
    """
    query_vector = model.encode(query).tolist()
    body = {
        "size": size,
        "knn": {
            "field": "embedding_vector",
            "query_vector": query_vector,
            "k": size,
            # A fixed 100 made every call with size > 100 fail validation.
            "num_candidates": max(num_candidates, size),
        },
        "_source": ["Text"],
    }

    # Only the round trip to Elasticsearch is timed, not the query embedding.
    start = time.time()
    results = es.search(index=index_name, body=body)
    duration = time.time() - start

    print("\n--- Vector Search ---")
    pretty_print(results)
    print(f"Execution time: {duration:.4f} sec")
    return results


# ----------------------------
# 3. Hybrid search
# ----------------------------
def hybrid_search(query, size=5, text_weight=0.5, vector_weight=0.5):
    """Combine a text match score with vector similarity and print results.

    The final score of a hit is::

        text_weight * match_score + vector_weight * (cosine_similarity + 1.0)

    The text score is not normalised, so it can dominate the bounded cosine
    term; tune the two weights with that in mind.

    Only documents that match the text query *and* have an
    ``embedding_vector`` are scored. ``cosineSimilarity`` raises for documents
    without a vector value, which would otherwise fail the whole request.

    Args:
        query: Free-text query, used both for the match and for the embedding.
        size: Number of hits to return.
        text_weight: Multiplier applied to the text match score.
        vector_weight: Multiplier applied to the shifted cosine similarity.

    Returns:
        The search response returned by ``es.search``.
    """
    query_vector = model.encode(query).tolist()
    body = {
        "size": size,
        "query": {
            "function_score": {
                "query": {
                    "bool": {
                        # text_weight used to be accepted but ignored; the
                        # boost applies it to the text relevance score.
                        "must": {
                            "match": {
                                "Text": {"query": query, "boost": text_weight}
                            }
                        },
                        # Skip documents the script below cannot score.
                        "filter": {"exists": {"field": "embedding_vector"}},
                    }
                },
                "functions": [
                    {
                        "script_score": {
                            "script": {
                                # +1.0 keeps the score non-negative, which
                                # Elasticsearch requires of script scores.
                                "source": "cosineSimilarity(params.query_vector, 'embedding_vector') + 1.0",
                                "params": {"query_vector": query_vector},
                            }
                        },
                        "weight": vector_weight,
                    }
                ],
                "score_mode": "sum",
                "boost_mode": "sum",
            }
        },
        "_source": ["Text"],
    }

    # Only the round trip to Elasticsearch is timed, not the query embedding.
    start = time.time()
    results = es.search(index=index_name, body=body)
    duration = time.time() - start

    print("\n--- Hybrid Search ---")
    pretty_print(results)
    print(f"Execution time: {duration:.4f} sec")
    return results


# ----------------------------
# 4. Run all searches
# ----------------------------
# Send the same query through all three strategies so that their hits, scores
# and execution times can be compared in the printed output.
query_text = "good for puppies"

full_text_search(query_text)  # match query on the Text field
vector_search(query_text)  # kNN search on embedding_vector
hybrid_search(query_text)  # text match score plus weighted cosine similarity