<a href="https://colab.research.google.com/github/YogiGoodman/ES/blob/main/ES_Fast_API.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ✅ Step 1: Install Dependencies
!pip install elasticsearch sentence-transformers faker tqdm

# ✅ Step 2: Import Libraries
import random
import string
import datetime
from faker import Faker
from tqdm import tqdm
from elasticsearch import Elasticsearch, helpers
from sentence_transformers import SentenceTransformer
import numpy as np


In [None]:
# ✅ Step 3: Initialize Faker and Model
fake = Faker()
model = SentenceTransformer("all-MiniLM-L6-v2")

In [3]:
# Step 4: Generate Synthetic Data (50 Records)
def generate_record(record_id):
    return {
        "record_id": record_id,
        "sensor_name": fake.domain_word(),
        "location": fake.city(),
        "industry": random.choice(["Energy", "Weather"]),
        "status": random.choice(["Active", "Inactive"]),
        "temperature": round(random.uniform(-20, 50), 2),
        "pressure": round(random.uniform(950, 1050), 2),
        "humidity": round(random.uniform(0, 100), 2),
        "wind_speed": round(random.uniform(0, 100), 2),
        "is_critical": random.choice([True, False]),
        "timestamp": fake.date_time_this_decade().isoformat(),
        "notes": fake.text(max_nb_chars=200)
    }

data = [generate_record(i) for i in range(50)]

In [4]:
# Step 5: Add combined_text and embedding fields
def get_combined_text(doc):
    fields = [
        "sensor_name", "location", "industry", "status",
        "temperature", "pressure", "humidity", "wind_speed",
        "is_critical", "timestamp", "notes"
    ]
    return " ".join([str(doc[f]) for f in fields])

for doc in tqdm(data):
    doc["combined_text"] = get_combined_text(doc)
    doc["embedding"] = model.encode(doc["combined_text"]).tolist()

100%|██████████| 50/50 [00:03<00:00, 16.26it/s]


In [5]:
es = Elasticsearch(
    "https://my-elasticsearch-project-e8e56c.es.eu-west-1.aws.elastic.cloud:443",
    api_key="VEhQQ0NKY0JKdnBza29EUXFDZnA6Z0F3cWlpN290WkZfTWgwQzVGc1Z5Zw=="
)

In [6]:
INDEX_NAME = "timeseries_metadata_v2"

# Step 7: Define Index Mapping
mapping = {
    "mappings": {
        "properties": {
            "record_id": {"type": "long"},
            "sensor_name": {"type": "keyword"},
            "location": {"type": "keyword"},
            "industry": {"type": "keyword"},
            "status": {"type": "keyword"},
            "temperature": {"type": "float"},
            "pressure": {"type": "float"},
            "humidity": {"type": "float"},
            "wind_speed": {"type": "float"},
            "is_critical": {"type": "boolean"},
            "timestamp": {"type": "date"},
            "notes": {"type": "text"},
            "combined_text": {"type": "text"},
            "embedding": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            }
        }
    }
}

# Step 8: Create Index
if es.indices.exists(index=INDEX_NAME):
    es.indices.delete(index=INDEX_NAME)

es.indices.create(index=INDEX_NAME, body=mapping)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'timeseries_metadata_v2'})

In [7]:
# Step 9: Bulk Index Documents
actions = [
    {
        "_index": INDEX_NAME,
        "_id": doc["record_id"],
        "_source": doc
    }
    for doc in data
]
helpers.bulk(es, actions)

(50, [])

In [74]:
# Step 10: Define Search Functions
INCLUDED_FIELDS = [
    "record_id", "sensor_name", "location", "industry", "status",
    "temperature", "pressure", "humidity", "wind_speed",
    "is_critical", "timestamp", "notes"
]

MIN_SIMILARITY_SCORE = 0.5  # configurable

def filter_fields_with_score(results, min_score=MIN_SIMILARITY_SCORE):
    filtered = [
        {**{k: hit["_source"].get(k) for k in INCLUDED_FIELDS}, "score": hit["_score"]}
        for hit in results["hits"]["hits"]
        if hit.get("_score", 0) >= min_score
    ]
    if not filtered:
        return {"warning": f"No relevant matches found (score >= {min_score})."}
    return filtered

def get_all():
    return es.search(index=INDEX_NAME, query={"match_all":{}},size=50,source=INCLUDED_FIELDS)

def keyword_search(field, value):
    return es.search(index=INDEX_NAME, query={"term": {field: {"value": value}}},source=INCLUDED_FIELDS)

def full_text_search(field, query):
    return es.search(index=INDEX_NAME, query={"match": {field: query}},source=INCLUDED_FIELDS)

def fuzzy_search(field, query):
    return es.search(index=INDEX_NAME, query={"match": {field: {"query": query, "fuzziness": "AUTO"}}},source=INCLUDED_FIELDS)

def semantic_search(query_text, top_k=5, num_candidates=100, min_score=0.65):
    vector = model.encode(query_text).tolist()
    res = es.search(
        index=INDEX_NAME,
        knn={
            "field": "embedding",
            "query_vector": vector,
            "k": top_k,
            "num_candidates": num_candidates
        },
        source=INCLUDED_FIELDS
    )
    return filter_fields_with_score(res, min_score)

def hybrid_search(query_text, top_k=5, num_candidates=10000, min_score=0.65):
    vector = model.encode(query_text).tolist()
    res = es.search(
        index=INDEX_NAME,
        knn={
            "field": "embedding",
            "query_vector": vector,
            "k": top_k,
            "num_candidates": num_candidates
        },
        query={
            "bool": {
                "should": [
                    {"match": {"combined_text": query_text}}
                ]
            }
        },
        _source=INCLUDED_FIELDS
    )
    return filter_fields_with_score(res, min_score)

def hybrid_search_v1(query_text, top_k=5, num_candidates=10000, min_score=0.65):
    print(top_k)
    vector = model.encode(query_text).tolist()
    res = es.search(
        index=INDEX_NAME,
        knn={
            "field": "embedding",
            "query_vector": vector,
            "k": top_k,
            "num_candidates": num_candidates
        },
        query={
          "bool": {
              "should": [
                  {"match": {"combined_text": {"query": query_text, "boost": 1}}},
                  {"match": {"industry": {"query": query_text, "boost": 2}}},
                  {"match": {"sensor_name": {"query": query_text, "boost": 1.5}}},
              ]
          }
        },
        _source=INCLUDED_FIELDS
    )
    return filter_fields_with_score(res, min_score)



In [29]:
def print_results(response):
    for hit in response["hits"]["hits"]:
        source = hit["_source"]
        filtered = {k: source[k] for k in [
            "record_id", "sensor_name", "location", "industry", "status",
            "temperature", "pressure", "humidity", "wind_speed",
            "is_critical", "timestamp", "notes"
        ]}
        print(f"\n📄 Score: {hit.get('_score', 'N/A')}")
        for key, val in filtered.items():
            print(f"{key}: {val}")

In [None]:
print("\n🔍 Keyword Search:")
print_results(keyword_search("industry", "Energy"))

print("\n🔍 Full Text Search:")
print_results(full_text_search("notes", "solar panel"))

print("\n🔍 Fuzzy Search:")
print_results(fuzzy_search("sensor_name", "senosr"))

print("\n🧠 Semantic Search:")
print_results(semantic_search("pressure is dropping in windy area"))

print("\n🧠🔍 Hybrid Search:")
print_results(hybrid_search("windy condition with high temperature"))

In [None]:
!pip install fastapi uvicorn pyngrok nest-asyncio

In [72]:
from fastapi import FastAPI, Query
app = FastAPI(title="Hybrid Semantic Search API")

@app.get("/")
def read_root():
    return {"message": "Hello from FastAPI in Colab!"}

@app.get("/search/keyword")
def search_keyword(field: str, value: str):
    return keyword_search(field, value)

@app.get("/search/all")
def search_keyword():
    return get_all()

@app.get("/search/fulltext")
def search_full_text(field: str, text: str):
    return full_text_search(field, text)

@app.get("/search/fuzzy")
def search_fuzzy(field: str, text: str):
    return fuzzy_search(field, text)

@app.get("/search/semantic")
def search_semantic(
    text: str,
    top_k: int = Query(5, ge=1),
    num_candidates: int = Query(50, ge=10),
    min_score: float = Query(0.8, ge=0.0, le=1.0)
):
    return semantic_search(text, top_k, num_candidates, min_score)

@app.get("/search/v1/hybrid")
def search_hybrid(
    text: str,
    top_k: int = Query(5, ge=1),
    num_candidates: int = Query(50, ge=10),
    min_score: float = Query(0.8, ge=0.0, le=1.0)
):
    return hybrid_search_v1(text, top_k, num_candidates, min_score)

In [17]:
import nest_asyncio
from pyngrok import ngrok
import uvicorn

In [19]:
ngrok.set_auth_token("2xv9glaTuMZthP6cERPEuPkyzka_7fo1E7SchoeftUFynn8Sx")

In [75]:
# Step: Apply nest_asyncio to allow nested event loops
nest_asyncio.apply()

# Step: Create a tunnel
public_url = ngrok.connect(8000)
print("🚀 Public FastAPI URL:", public_url)
print("📘 Swagger Docs:", f"{public_url}/docs")
print("📙 ReDoc Docs:", f"{public_url}/redoc")

# Step: Run the FastAPI app
uvicorn.run(app, port=8000)

🚀 Public FastAPI URL: NgrokTunnel: "https://4c36-34-48-74-152.ngrok-free.app" -> "http://localhost:8000"
📘 Swagger Docs: NgrokTunnel: "https://4c36-34-48-74-152.ngrok-free.app" -> "http://localhost:8000"/docs
📙 ReDoc Docs: NgrokTunnel: "https://4c36-34-48-74-152.ngrok-free.app" -> "http://localhost:8000"/redoc


INFO:     Started server process [1743]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit)


INFO:     2a01:4b00:ad04:b500:374:4a75:eaa3:9bbe:0 - "GET / HTTP/1.1" 200 OK
INFO:     2a01:4b00:ad04:b500:374:4a75:eaa3:9bbe:0 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO:     2a01:4b00:ad04:b500:374:4a75:eaa3:9bbe:0 - "GET /docs HTTP/1.1" 200 OK
INFO:     2a01:4b00:ad04:b500:374:4a75:eaa3:9bbe:0 - "GET /openapi.json HTTP/1.1" 200 OK
INFO:     2a01:4b00:ad04:b500:374:4a75:eaa3:9bbe:0 - "GET /search/all HTTP/1.1" 200 OK


INFO:     Shutting down
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.
INFO:     Finished server process [1743]
