In [1]:
from dotenv import load_dotenv
import os
load_dotenv()

True

In [2]:
# Configuration read from the environment (load_dotenv() is called in the first cell).
# os.getenv returns None for an unset variable, so a missing entry only surfaces
# later, at the point where the value is first used.
project_id = os.getenv("PROJECT_ID")  # used by genai.Client, storage.Client and aiplatform.init below
location = os.getenv("LOCATION")  # used by genai.Client and aiplatform.init below
bigquery_table_id = os.getenv("BIGQUERY_TABLE_ID")  # interpolated into the BigQuery FROM clause below

In [3]:
# Vertex AI Vector Search identifiers, read from the environment (None when unset).
index_endpoint_id = os.getenv("INDEX_ENDPOINT")  # passed to aiplatform.MatchingEngineIndexEndpoint below
deployed_index_id = os.getenv("DEPLOYED_INDEX_ID")  # passed as deployed_index_id to find_neighbors below

In [5]:
import pandas as pd
import re
import json
from google.cloud import storage
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from datetime import datetime, timedelta

In [6]:
from google.genai.types import EmbedContentConfig
from google import genai

In [7]:
from google.cloud import aiplatform

In [8]:
import streamlit as st
from sklearn.metrics.pairwise import cosine_similarity
from google.genai import types

### Initial Data Extraction from BigQuery

In [19]:
from google.cloud import bigquery
# Pull up to 200 rows from the table named by BIGQUERY_TABLE_ID, skipping rows
# whose title is 'test_record'.
# NOTE(review): the query has LIMIT but no ORDER BY, so which 200 rows come back
# is presumably not guaranteed to be stable between runs — confirm, and add an
# ORDER BY if the cached sample must be reproducible.
# NOTE(review): bigquery.Client() is created without an explicit project, unlike
# storage.Client(project=project_id) later in the notebook — confirm the default
# project is the intended one.
query = f"""
SELECT
  id, title, status, country, date, type, description
FROM
  `{bigquery_table_id}`
WHERE
  title != 'test_record'
LIMIT 200
"""
bigq_client = bigquery.Client()
# Run the query and materialize the result as a pandas DataFrame.
disaster_df = bigq_client.query(query).to_dataframe()

In [20]:
disaster_df

Unnamed: 0,id,title,status,country,date,type,description
0,51830,Jordan: Cold Wave - Nov 2023,past,Jordan,2024-02-16T09:45:58+00:00,Cold Wave,"Starting in the evening of Sunday, 19/11/2023,..."
1,51871,Mongolia: Dzud - Dec 2023,past,Mongolia,2025-02-03T04:50:55+00:00,Cold Wave,Mongolia is currently facing a severe Dzud. Th...
2,51879,Lebanon: Cold Wave - Jan 2024,past,Lebanon,2024-04-18T08:05:00+00:00,Cold Wave,Lebanon and the eastern basin of the Mediterra...
3,51930,Afghanistan: Cold Wave - Mar 2024,past,Afghanistan,2024-06-18T21:04:41+00:00,Cold Wave,"On 3 March 2024, the Afghan Red Crescent Socie..."
4,52302,Georgia: Heavy Snowfall and Cold Wave - Feb 2025,past,Georgia,2025-09-17T12:28:39+00:00,Cold Wave,"Since 21 February 2025, Western Georgia has be..."
...,...,...,...,...,...,...,...
195,52306,Tropical Cyclone Jude - Mar 2025,ongoing,Madagascar,2025-04-29T07:38:53+00:00,Tropical Cyclone,According to the Mozambique National Institute...
196,51880,Chile: Wild Fires - Jan 2024,past,Chile,2024-04-25T18:17:09+00:00,Wild Fire,Wildfires have been burning across parts of ce...
197,52030,Belize: Wild Fires - May 2024,past,Belize,2024-07-30T19:31:03+00:00,Wild Fire,"On May 16, the Toledo District Emergency Opera..."
198,52095,Bolivia: Wild Fires - Jul 2024,past,Bolivia (Plurinational State of),2025-02-21T15:59:56+00:00,Wild Fire,"On 24 July 2024, the Bolivia Ministry of Defen..."


In [11]:
disaster_df.to_csv("disasters_cache_200.csv", index=False)

In [43]:
main_df = pd.read_csv("disasters_cache_200.csv")
main_df.head()

Unnamed: 0,id,title,status,country,date,type,description
0,51830,Jordan: Cold Wave - Nov 2023,past,Jordan,2024-02-16T09:45:58+00:00,Cold Wave,"Starting in the evening of Sunday, 19/11/2023,..."
1,51871,Mongolia: Dzud - Dec 2023,past,Mongolia,2025-02-03T04:50:55+00:00,Cold Wave,Mongolia is currently facing a severe Dzud. Th...
2,51879,Lebanon: Cold Wave - Jan 2024,past,Lebanon,2024-04-18T08:05:00+00:00,Cold Wave,Lebanon and the eastern basin of the Mediterra...
3,51930,Afghanistan: Cold Wave - Mar 2024,past,Afghanistan,2024-06-18T21:04:41+00:00,Cold Wave,"On 3 March 2024, the Afghan Red Crescent Socie..."
4,52302,Georgia: Heavy Snowfall and Cold Wave - Feb 2025,past,Georgia,2025-09-17T12:28:39+00:00,Cold Wave,"Since 21 February 2025, Western Georgia has be..."


In [22]:
main_df['status'].unique()

array(['past', 'ongoing', 'alert'], dtype=object)

In [14]:
main_df.description.str.len().describe()

count      200.000000
mean      3505.965000
std       4158.855606
min        367.000000
25%       1246.500000
50%       2176.500000
75%       4110.750000
max      39860.000000
Name: description, dtype: float64

In [44]:
import re

def clip_text(text, max_chars=3000):
    """Collapse whitespace and clip ``text`` to at most ``max_chars`` characters.

    Runs of whitespace are replaced by a single space and the ends are
    stripped. If the normalized text is longer than ``max_chars``, whole
    sentences (split after ``.``, ``!`` or ``?`` followed by spaces) are kept
    from the start for as long as they fit.

    When not even the first sentence fits (for example a very long sentence or
    text without sentence punctuation), the text is cut at the last word
    boundary inside the limit instead of returning an empty string.

    Args:
        text: Value to clip. Anything that is not a ``str`` yields ``""``.
        max_chars: Maximum length of the returned string.

    Returns:
        The normalized, possibly shortened text.
    """
    if not isinstance(text, str):
        return ""
    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    if len(text) <= max_chars:
        return text
    if max_chars <= 0:
        return ""

    kept = []
    used = 0  # length of the kept sentences, counting one joining space after each
    for sentence in re.split(r'(?<=[.!?]) +', text):
        if used + len(sentence) > max_chars:
            break
        kept.append(sentence)
        used += len(sentence) + 1

    if kept:
        return " ".join(kept)

    # No complete sentence fits: fall back to a cut at a word boundary so the
    # caller still gets usable text rather than an empty string.
    head = text[:max_chars]
    if text[max_chars] != " " and " " in head:
        head = head[:head.rfind(" ")]
    return head.rstrip()


In [45]:
main_df["description_clipped"] = main_df["description"].apply(clip_text)

In [25]:
main_df["description_clipped"].str.len().describe()

count     200.000000
mean     2061.210000
std       841.051583
min       367.000000
25%      1246.500000
50%      2174.000000
75%      2888.000000
max      3000.000000
Name: description_clipped, dtype: float64

In [46]:
main_df.head()

Unnamed: 0,id,title,status,country,date,type,description,description_clipped
0,51830,Jordan: Cold Wave - Nov 2023,past,Jordan,2024-02-16T09:45:58+00:00,Cold Wave,"Starting in the evening of Sunday, 19/11/2023,...","Starting in the evening of Sunday, 19/11/2023,..."
1,51871,Mongolia: Dzud - Dec 2023,past,Mongolia,2025-02-03T04:50:55+00:00,Cold Wave,Mongolia is currently facing a severe Dzud. Th...,Mongolia is currently facing a severe Dzud. Th...
2,51879,Lebanon: Cold Wave - Jan 2024,past,Lebanon,2024-04-18T08:05:00+00:00,Cold Wave,Lebanon and the eastern basin of the Mediterra...,Lebanon and the eastern basin of the Mediterra...
3,51930,Afghanistan: Cold Wave - Mar 2024,past,Afghanistan,2024-06-18T21:04:41+00:00,Cold Wave,"On 3 March 2024, the Afghan Red Crescent Socie...","On 3 March 2024, the Afghan Red Crescent Socie..."
4,52302,Georgia: Heavy Snowfall and Cold Wave - Feb 2025,past,Georgia,2025-09-17T12:28:39+00:00,Cold Wave,"Since 21 February 2025, Western Georgia has be...","Since 21 February 2025, Western Georgia has be..."


### Generate embeddings

In [47]:
# One Vertex-backed client and one document-side config are shared by every call.
client = genai.Client(vertexai=True, project=project_id, location=location)
document_config = EmbedContentConfig(task_type="RETRIEVAL_DOCUMENT")

embeddings = []
for title, description, clipped in zip(
    main_df["title"], main_df["description"], main_df["description_clipped"]
):
    # Embed "title + clipped description"; fall back to the title alone when
    # the row has no description.
    if pd.notnull(description):
        document_text = f"{title} {clipped}"
    else:
        document_text = title

    embedding_response = client.models.embed_content(
        model="text-embedding-005",
        contents=document_text,
        config=document_config,
    )
    embeddings.append(embedding_response.embeddings[0].values)

# Store one vector per row, in row order.
main_df["embedding"] = embeddings

print("Embedding length:", len(main_df["embedding"][0]))

Embedding length: 768


In [48]:
main_df.head()

Unnamed: 0,id,title,status,country,date,type,description,description_clipped,embedding
0,51830,Jordan: Cold Wave - Nov 2023,past,Jordan,2024-02-16T09:45:58+00:00,Cold Wave,"Starting in the evening of Sunday, 19/11/2023,...","Starting in the evening of Sunday, 19/11/2023,...","[-0.01603551208972931, -0.004946137312799692, ..."
1,51871,Mongolia: Dzud - Dec 2023,past,Mongolia,2025-02-03T04:50:55+00:00,Cold Wave,Mongolia is currently facing a severe Dzud. Th...,Mongolia is currently facing a severe Dzud. Th...,"[-0.011093736626207829, -0.0063713970594108105..."
2,51879,Lebanon: Cold Wave - Jan 2024,past,Lebanon,2024-04-18T08:05:00+00:00,Cold Wave,Lebanon and the eastern basin of the Mediterra...,Lebanon and the eastern basin of the Mediterra...,"[-0.018243620172142982, 0.008612164296209812, ..."
3,51930,Afghanistan: Cold Wave - Mar 2024,past,Afghanistan,2024-06-18T21:04:41+00:00,Cold Wave,"On 3 March 2024, the Afghan Red Crescent Socie...","On 3 March 2024, the Afghan Red Crescent Socie...","[-0.055603839457035065, 0.003309867810457945, ..."
4,52302,Georgia: Heavy Snowfall and Cold Wave - Feb 2025,past,Georgia,2025-09-17T12:28:39+00:00,Cold Wave,"Since 21 February 2025, Western Georgia has be...","Since 21 February 2025, Western Georgia has be...","[-0.06296245753765106, 0.02222575433552265, -0..."


In [50]:
import json

# Columns copied verbatim into each record's "embedding_metadata" object.
metadata_columns = ["title", "description", "country", "status", "type", "date"]

# The file holds one JSON object per line (JSON Lines) under a .json extension.
with open("disaster_title_desc_vectors.json", "w", encoding="utf-8") as f:
    for _, row in main_df.iterrows():
        record = {
            "id": row["id"],
            "embedding": row["embedding"],
            "embedding_metadata": {
                # A missing value (NaN) would be written by json.dumps as the
                # bare token NaN, which is not valid JSON; emit null instead.
                column: (row[column] if pd.notna(row[column]) else None)
                for column in metadata_columns
            },
        }
        # allow_nan=False makes any remaining non-finite float (e.g. inside the
        # embedding) raise ValueError instead of producing an invalid line.
        f.write(json.dumps(record, allow_nan=False) + "\n")

### Upload vectors to Cloud Storage and reload them for debugging

In [30]:
from google.cloud import storage

# Upload the locally written vectors file to a Cloud Storage bucket.
storage_client = storage.Client(project=project_id)

# NOTE(review): bucket and object names are hard-coded here, while project and
# location come from the environment — confirm this is intentional.
bucket_name = "disasters-vectors"  
destination_blob_name = "disasters200_vectors.json"
source_file_name = "disaster_title_desc_vectors.json"

bucket = storage_client.bucket(bucket_name)
blob = bucket.blob(destination_blob_name)
# Sends the local file's contents to gs://<bucket_name>/<destination_blob_name>.
blob.upload_from_filename(source_file_name)

print(f"Uploaded {source_file_name} to gs://{bucket_name}/{destination_blob_name}")


Uploaded disaster_title_desc_vectors.json to gs://disasters-vectors/disasters200_vectors.json


In [9]:
embeds_df = pd.read_json('disaster_title_desc_vectors.json',lines=True)
embeds_df

Unnamed: 0,id,embedding,embedding_metadata
0,51830,"[-0.016035512089729, -0.004946137312799, -0.03...","{'title': 'Jordan: Cold Wave - Nov 2023', 'des..."
1,51871,"[-0.011093736626207001, -0.00637139705941, -0....","{'title': 'Mongolia: Dzud - Dec 2023', 'descri..."
2,51879,"[-0.018243620172142, 0.008612164296209, -0.050...","{'title': 'Lebanon: Cold Wave - Jan 2024', 'de..."
3,51930,"[-0.055603839457035, 0.0033098678104570003, -0...","{'title': 'Afghanistan: Cold Wave - Mar 2024',..."
4,52302,"[-0.062962457537651, 0.022225754335522003, -0....",{'title': 'Georgia: Heavy Snowfall and Cold Wa...
...,...,...,...
195,52306,"[0.013958492316305, 0.009083811193704001, -0.0...","{'title': 'Tropical Cyclone Jude - Mar 2025', ..."
196,51880,"[-0.06289511173963501, -0.020150000229477, -0....","{'title': 'Chile: Wild Fires - Jan 2024', 'des..."
197,52030,"[-0.052157003432512006, -0.037723157554864, -0...","{'title': 'Belize: Wild Fires - May 2024', 'de..."
198,52095,"[-0.040381085127592004, -0.026769297197461003,...","{'title': 'Bolivia: Wild Fires - Jul 2024', 'd..."


### Manual retrieval & dashboard

In [10]:
client = genai.Client(vertexai=True, project=project_id, location=location)

def rewrite_query_with_gemini(user_query):
    """Ask Gemini to rewrite ``user_query`` for semantic retrieval.

    The system prompt instructs the model to expand regions into countries,
    keep explicit disaster types, locations and dates, and return only the
    rewritten query text.

    Args:
        user_query: Free-text query as typed by the user.

    Returns:
        The rewritten query with surrounding whitespace removed. If the model
        response contains no text, ``user_query`` is returned unchanged so
        callers can still embed something meaningful.
    """
    # Adjacent string literals are concatenated as-is, so every fragment that
    # is followed by another one must end with a space.
    system_prompt = (
        "You are a disaster data retrieval assistant. "
        "Given a user query, rewrite it to make it more effective for semantic search over a disaster database. "
        "Expand region names into countries if necessary. If country name is specifically mentioned, retain it as is. "
        "Preserve the disaster type & location explicitly. "
        "Handle time-sensitive expressions like 'recent' or 'this month' -> limit to 30 days; if specific date/month/year "
        "is mentioned, retain it as is. "
        "Make the rewritten text concise, natural, and contextually complete. "
        "Do NOT invent data or add numeric details. "
        "Output only the rewritten query text."
    )

    # Uses the module-level `client`. With temperature=0.7 the same input can
    # be rewritten differently from one call to the next.
    response = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=user_query,
        config=types.GenerateContentConfig(
            system_instruction=system_prompt,
            temperature=0.7,
            max_output_tokens=10000
        )
    )

    # `response.text` is None when the response carries no text; treat that
    # (and a whitespace-only answer) as "no rewrite available".
    rewritten_query = (response.text or "").strip()
    return rewritten_query or user_query

In [35]:
def retrieve_similar_disasters_with_gemini(
    df,
    user_query: str,
    top_k: int = 5,
    embedding_model: str = "text-embedding-005",
):
    """Retrieve the top-K rows of ``df`` most similar to ``user_query``.

    The query is first rewritten by ``rewrite_query_with_gemini``, then
    embedded with ``embedding_model`` and compared against the vectors in
    ``df["embedding"]`` using cosine similarity.

    Args:
        df: DataFrame with ``id``, ``embedding`` and ``embedding_metadata``
            columns. It is not modified.
        user_query: Free-text query from the user.
        top_k: Number of rows to return.
        embedding_model: Embedding model name passed to ``embed_content``.

    Returns:
        A new DataFrame with columns ``id``, ``embedding_metadata`` and
        ``similarity_score``, sorted by descending score, with a fresh index.
    """
    # --- Step 1: Rewrite query using Gemini ---
    rewritten_query = rewrite_query_with_gemini(user_query)
    print(f"Gemini Rewritten Query:\n{rewritten_query}\n")

    # --- Step 2: Generate embedding for rewritten query ---
    response = client.models.embed_content(
        model=embedding_model,
        contents=rewritten_query,
        config=EmbedContentConfig(task_type="RETRIEVAL_QUERY")
    )
    query_embedding = response.embeddings[0].values

    # --- Step 3: Compute cosine similarity ---
    matrix = np.vstack(df["embedding"].values)
    sim_scores = cosine_similarity([query_embedding], matrix)[0]
    # assign() returns a new frame, so the caller's DataFrame does not gain a
    # `similarity_score` column as a side effect of searching.
    scored = df.assign(similarity_score=sim_scores)

    # --- Step 4: Sort and return top K ---
    top_matches = (
        scored.sort_values("similarity_score", ascending=False)
        .head(top_k)
        .reset_index(drop=True)
    )
    return top_matches[["id", "embedding_metadata", "similarity_score"]]

In [36]:
user_query = "Disasters in North America"
# rewrite_query = rewrite_query_with_gemini(user_query)
# print(rewrite_query)
results_df = retrieve_similar_disasters_with_gemini(
    df=embeds_df,
    user_query=user_query,
    top_k=5,
    # project_id=project_id,
    # location=location
)

Gemini Rewritten Query:
Disasters in Canada, Mexico, United States



In [37]:
results_df

Unnamed: 0,id,embedding_metadata,similarity_score
0,52406,"{'title': 'Guatemala: Earthquake - Jul 2025', ...",0.581444
1,51816,"{'title': 'Colombia: Floods - Oct 2023', 'desc...",0.572952
2,51819,{'title': 'Dominican Republic: Floods and Land...,0.572744
3,52399,{'title': 'Venezuela: Floods and Landslides - ...,0.56828
4,51951,"{'title': 'Uruguay: Floods - Mar 2024', 'descr...",0.567555


### Actual Deployment using Vertex Vector Search

In [61]:
def retrieve_similar_disasters_vertex(
    user_query: str,
    top_k: int=5,
    embedding_model: str = "text-embedding-005",
):
    """Retrieve top-K similar results using Vertex AI Vector Search.

    The query is rewritten by ``rewrite_query_with_gemini``, embedded with
    ``embedding_model`` and sent to the deployed index. Metadata for each
    returned neighbour is looked up by id in the module-level ``embeds_df``.

    Args:
        user_query: Free-text query from the user.
        top_k: Number of neighbours to request and return.
        embedding_model: Embedding model name passed to ``embed_content``.

    Returns:
        DataFrame with columns ``id``, ``embedding_metadata`` and
        ``similarity_score``, sorted by descending score. It is empty (but
        keeps those columns) when the index returns no usable neighbours.
    """
    result_columns = ["id", "embedding_metadata", "similarity_score"]

    # ------------------------------
    # Step 1: Rewrite the query and embed it
    # ------------------------------
    rewritten_query = rewrite_query_with_gemini(user_query)
    print(f"Gemini Rewritten Query:\n{rewritten_query}\n")

    response = client.models.embed_content(
        model=embedding_model,
        contents=rewritten_query,
        config=EmbedContentConfig(task_type="RETRIEVAL_QUERY"),
    )

    query_embedding = response.embeddings[0].values

    # ------------------------------
    # Step 2: Call Vector Search endpoint
    # ------------------------------
    aiplatform.init(project=project_id, location=location)
    index_endpoint = aiplatform.MatchingEngineIndexEndpoint(index_endpoint_id)

    response = index_endpoint.find_neighbors(
        deployed_index_id=deployed_index_id,
        queries=[query_embedding],
        num_neighbors=top_k,
        return_full_datapoint=True
    )
    # One query was sent, so only the first result list is relevant.
    neighbors = response[0] if response else []

    # ------------------------------
    # Step 3: Parse and format results
    # ------------------------------
    results = []
    for n in neighbors:
        neighbor_id = int(n.id)
        metadata = embeds_df.loc[embeds_df["id"] == neighbor_id, "embedding_metadata"]
        if metadata.empty:
            # The deployed index and the local vectors file can drift apart;
            # skip ids we have no metadata for instead of raising IndexError.
            print(f"Warning: neighbor id {neighbor_id} not found in embeds_df; skipping")
            continue
        results.append({
            "id": neighbor_id,
            "embedding_metadata": metadata.iloc[0],
            # NOTE(review): n.distance is reported as-is. Sorting it descending
            # assumes larger means closer for this index — TODO confirm against
            # the index's configured distance measure.
            "similarity_score": n.distance,
        })

    # Explicit columns keep the frame well-formed (and sortable) even when
    # no neighbours were returned.
    results = pd.DataFrame(results, columns=result_columns)
    results = results.sort_values("similarity_score", ascending=False).head(top_k).reset_index(drop=True)
    return results

In [62]:
user_query_vertex = "Disasters in North America"
results_df_vertex = retrieve_similar_disasters_vertex(
    user_query=user_query_vertex,
    top_k=5
)

Gemini Rewritten Query:
Disasters in North America, including Canada, Mexico, and the United States.



In [63]:
results_df_vertex

Unnamed: 0,id,embedding_metadata,similarity_score
0,51816,"{'title': 'Colombia: Floods - Oct 2023', 'desc...",0.56644
1,52406,"{'title': 'Guatemala: Earthquake - Jul 2025', ...",0.557235
2,51894,{'title': 'Colombia: Drought and Wild Fires - ...,0.553027
3,52399,{'title': 'Venezuela: Floods and Landslides - ...,0.552277
4,51849,{'title': 'Argentina: Severe Local Storm - Dec...,0.548494


In [None]:
!gcloud ai index-endpoints list --project={project_id} --region={location}

In [None]:
!gcloud ai index-endpoints describe {index_endpoint_id} --project={project_id} --region={location}

### Refinement for Dashboard setup

In [38]:
# Flatten the manual-retrieval results: expand each embedding_metadata dict into
# its own columns next to id/similarity_score, then parse `date` as datetime.
refine_retrieved_df = pd.concat(
    [
        results_df.drop(['embedding_metadata'], axis=1),
        results_df['embedding_metadata'].apply(pd.Series),
    ],
    axis=1,
).assign(date=lambda frame: pd.to_datetime(frame["date"]))

In [39]:
refine_retrieved_df

Unnamed: 0,id,similarity_score,title,description,country,status,type,date
0,52406,0.581444,Guatemala: Earthquake - Jul 2025,"Since 8 July, a series of earthquakes has seve...",Guatemala,ongoing,Earthquake,2025-07-30 00:49:38+00:00
1,51816,0.572952,Colombia: Floods - Oct 2023,"Since 30 October, there have been heavy and pr...",Colombia,past,Flood,2024-04-25 18:07:17+00:00
2,51819,0.572744,Dominican Republic: Floods and Landslides - No...,"Since 17 November, a tropical depression has b...",Dominican Republic,past,Flash Flood,2024-01-11 06:01:47+00:00
3,52399,0.56828,Venezuela: Floods and Landslides - Jun 2025,The passage of Tropical Wave number 9 through ...,Venezuela (Bolivarian Republic of),ongoing,Flood,2025-07-10 08:26:26+00:00
4,51951,0.567555,Uruguay: Floods - Mar 2024,"Between March 16 and 22, Uruguay experienced a...",Uruguay,past,Flood,2024-06-10 17:10:04+00:00


In [64]:
# Flatten the Vector Search results: expand each embedding_metadata dict into
# its own columns next to id/similarity_score, then parse `date` as datetime.
refine_retrieved_df_vertex = pd.concat(
    [
        results_df_vertex.drop(['embedding_metadata'], axis=1),
        results_df_vertex['embedding_metadata'].apply(pd.Series),
    ],
    axis=1,
).assign(date=lambda frame: pd.to_datetime(frame["date"]))

In [65]:
results_df_vertex

Unnamed: 0,id,embedding_metadata,similarity_score
0,51816,"{'title': 'Colombia: Floods - Oct 2023', 'desc...",0.56644
1,52406,"{'title': 'Guatemala: Earthquake - Jul 2025', ...",0.557235
2,51894,{'title': 'Colombia: Drought and Wild Fires - ...,0.553027
3,52399,{'title': 'Venezuela: Floods and Landslides - ...,0.552277
4,51849,{'title': 'Argentina: Severe Local Storm - Dec...,0.548494


### Streamlit via Ngrok

In [62]:
!pip install ngrok

Collecting ngrok
  Downloading ngrok-1.4.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Downloading ngrok-1.4.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m43.4 MB/s[0m  [33m0:00:00[0m
[?25hInstalling collected packages: ngrok
Successfully installed ngrok-1.4.0


In [66]:
from pyngrok import ngrok
ngrok_api_key = os.getenv("NGROK_API_KEY")
ngrok.set_auth_token(ngrok_api_key)

In [67]:
print(ngrok.get_tunnels())

[]


In [None]:
import subprocess
import time

# Single source of truth for the port used by both Streamlit and the tunnel.
STREAMLIT_PORT = 8518

# Run the Streamlit app in the background on STREAMLIT_PORT.
command = ["streamlit", "run", "app.py", "--server.port", str(STREAMLIT_PORT), "--server.headless", "true"]
proc = subprocess.Popen(command)

# ngrok.kill() runs before connecting, presumably to clear tunnels left over
# from an earlier run of this cell.
ngrok.kill()
# Fixed pause to give the Streamlit app time to start.
time.sleep(5)

# Create an ngrok tunnel exposing STREAMLIT_PORT.
public_url = ngrok.connect(STREAMLIT_PORT)
print(f"Streamlit app available at: {public_url}")

In [74]:
ngrok.kill()