🔹 Step 1: Import Libraries

In [6]:
from google.cloud import bigquery
import os

# Debug: Check for shadowing files/folders
print("Current working directory:", os.getcwd())
print("Files in current directory:", os.listdir())

import torch
import numpy as np
from sentence_transformers import SentenceTransformer
from pymongo import MongoClient
from tqdm import tqdm
import pandas as pd


Current working directory: d:\Hustle\Projects\AIA Google\healthtrack\HealthTrack-AI\notebooks
Files in current directory: ['embed_mimic_biobert.ipynb']


  from .autonotebook import tqdm as notebook_tqdm


In [1]:
# gcloud login  
!gcloud auth application-default login

Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=764086051850-6qr4p6gpi6hn506pt8ejuq83di341hur.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8085%2F&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fsqlservice.login&state=QkR7fablvY6OPTc1whE8mqEmvVmiWU&access_type=offline&code_challenge=xYGC_zgt5ELil1Om2DSawwh7zvZiJIygPuMtTW7GCWQ&code_challenge_method=S256


Credentials saved to file: [C:\Users\ekagr\AppData\Roaming\gcloud\application_default_credentials.json]

These credentials will be used by any library that requests Application Default Credentials (ADC).

Quota project "healthtrack-hack" was added to ADC which can be used by Google client libraries for billing and quota. Note that some services may still bill the project owning the resource.


🔹 Step 2: Set Credentials & Connect to BigQuery

In [23]:
cred = r"C:\Users\ekagr\AppData\Roaming\gcloud\application_default_credentials.json"
# Set this if you haven't already authenticated locally
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = cred

bq_client = bigquery.Client()
TABLE_ID = "healthtrack-hack.healthtrack_data.case_embeddings"



🔹 Step 3: Load Data from BigQuery

In [8]:

query = f"SELECT * FROM `{TABLE_ID}` WHERE note IS NOT NULL LIMIT 10000"
df = bq_client.query(query).to_dataframe()
df = df.dropna(subset=["note"])  # Safety check
df.head()




Unnamed: 0,note_id,subject_id,hadm_id,age,sex,admittime,dischtime,charttime,note,temp,bp,hr,spo2,rr,icd_tags
0,14371636-DS-14,14371636,22341970,18,M,2165-08-15 15:28:00,2165-08-16 15:30:00,2165-08-16 00:00:00+00:00,\nName: ___ Unit No: ___...,,,,,,"[{'code': '49392', 'label': 'Asthma, unspecifi..."
1,11044044-DS-3,11044044,22240600,18,F,2159-10-16 04:58:00,2159-10-18 18:03:00,2159-10-18 00:00:00+00:00,\nName: ___ Unit No: ___...,,,,,,"[{'code': '09954', 'label': 'Other venereal di..."
2,15414917-DS-15,15414917,22802282,18,M,2169-11-17 11:15:00,2169-11-18 15:11:00,2169-11-18 00:00:00+00:00,\nName: ___ Unit No: _...,,,,,,"[{'code': '72210', 'label': 'Displacement of l..."
3,16286332-DS-12,16286332,22423640,18,F,2126-10-30 11:59:00,2126-11-02 14:45:00,2126-11-02 00:00:00+00:00,\nName: ___ Unit No: ___\n...,,,,,,"[{'code': 'F329', 'label': 'Major depressive d..."
4,19076973-DS-9,19076973,25794363,18,F,2170-09-12 13:39:00,2170-09-16 19:00:00,2170-09-16 00:00:00+00:00,\nName: ___ Unit No: __...,10.0,,,22.0,,"[{'code': 'O42912', 'label': 'Preterm prematur..."


🔹 Step 4: Load BioBERT Model for Embedding

In [9]:
model = SentenceTransformer("pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb")  # HuggingFace


🔹 Step 5: Generate Embeddings

In [10]:

def embed_text(text):
    return model.encode(text, show_progress_bar=False).tolist()

tqdm.pandas(desc="Embedding")
df["embedding"] = df["note"].progress_apply(embed_text)


Embedding: 100%|██████████| 10000/10000 [39:21<00:00,  4.24it/s] 


🔹 Step 6: Connect to MongoDB Atlas

In [11]:
from dotenv import load_dotenv
import os
from pymongo import MongoClient

load_dotenv()

MONGODBENV = os.getenv("MONGODBPW")
uri = "mongodb+srv://naki:<db_password>@cluster0.hrnm8fq.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0"
MONGO_URI = uri.replace("<db_password>", MONGODBENV)  # Replace with your actual password
client = MongoClient(MONGO_URI)

# Send a ping to confirm a successful connection
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)
collection = client.healthtrack.case_embeddings  # or whatever DB/collection you want

Pinged your deployment. You successfully connected to MongoDB!


In [45]:
df['note'].head()


0     \nName:  ___                   Unit No:   ___...
1     \nName:  ___                   Unit No:   ___...
2     \nName:  ___                     Unit No:   _...
3     \nName:  ___                 Unit No:   ___\n...
4     \nName:  ___                    Unit No:   __...
Name: note, dtype: object

🔹 Step 7: Format and Insert into MongoDB

In [43]:
import pandas as pd
import tqdm
import json

def to_record(row):
    tags = row.get("icd_tags", [])
    # Ensure tags is a list of dicts
    if tags is None or (isinstance(tags, float) and pd.isna(tags)):
        tags = []
    elif isinstance(tags, str):
        import ast
        try:
            tags = ast.literal_eval(tags)
        except Exception:
            tags = []
    elif not isinstance(tags, list):
        tags = list(tags) if hasattr(tags, '__iter__') and not isinstance(tags, dict) else [tags]
    icd_codes = [t.get("code") for t in tags if isinstance(t, dict) and t.get("code")]
    icd_labels = [t.get("label") for t in tags if isinstance(t, dict) and t.get("label")]

    return {
        "subject_id": int(row["subject_id"]) if pd.notnull(row["subject_id"]) else None,
        "hadm_id": int(row["hadm_id"]) if pd.notnull(row["hadm_id"]) else None,
        "age": int(row["age"]) if pd.notnull(row["age"]) else None,
        "sex": row["sex"],
        "note": row["note"],
        "embedding": row["embedding"],
        "vitals": {
            "temp": row.get("temp"),
            "bp": row.get("bp"),
            "hr": row.get("hr"),
            "spo2": row.get("spo2"),
            "rr": row.get("rr"),
        },
        "icd": icd_codes,
        "icd_label": icd_labels
    }

records = [to_record(row) for _, row in tqdm.tqdm(df.iterrows(), total=len(df))]

# with open("case_embeddings_dataset.json", "w", encoding="utf-8") as f:
#     json.dump(records, f, ensure_ascii=False, indent=2)
# print(f"✅ Saved {len(records)} records to case_embeddings_dataset.json")

# Insert records into MongoDB
if records:
    try:
        collection.insert_many(records)
        print(f"✅ Successfully inserted {len(records)} records into MongoDB.")
    except Exception as e:
        print(f"❌ Error inserting records into MongoDB: {e}")


100%|██████████| 10000/10000 [00:01<00:00, 8217.48it/s]



✅ Successfully inserted 10000 records into MongoDB.


✅ 🔍 Vector Indexing in MongoDB Atlas (Once)

In [None]:
{
  "createIndexes": "case_embeddings",
  "indexes": [
    {
      "name": "embedding_vector_index",
      "key": {
        "embedding": "cosmosSearch"
      },
      "type": "vectorSearch",
      "options": {
        "dimensions": 768,
        "similarity": "cosine"
      }
    }
  ]
}
