In [1]:
import xarray as xr
# Open the profile file 1902673_prof.nc with xarray and bind the result to `ds`.
# NOTE(review): the path is an absolute location on one Windows machine; it
# has to be edited before the notebook can run anywhere else.
ds = xr.open_dataset(r"C:\Users\Ankit\ARGO_prototype\ARGO_datasets\1902673_prof.nc")

In [2]:
# Display the dataset's data variables (name, dimensions, dtype, size).
ds.data_vars

Data variables:
    DATA_TYPE                     object 8B ...
    FORMAT_VERSION                object 8B ...
    HANDBOOK_VERSION              object 8B ...
    REFERENCE_DATE_TIME           object 8B ...
    DATE_CREATION                 object 8B ...
    DATE_UPDATE                   object 8B ...
    PLATFORM_NUMBER               (N_PROF) object 552B ...
    PROJECT_NAME                  (N_PROF) object 552B ...
    PI_NAME                       (N_PROF) object 552B ...
    STATION_PARAMETERS            (N_PROF, N_PARAM) object 2kB ...
    CYCLE_NUMBER                  (N_PROF) float64 552B ...
    DIRECTION                     (N_PROF) object 552B ...
    DATA_CENTRE                   (N_PROF) object 552B ...
    DC_REFERENCE                  (N_PROF) object 552B ...
    DATA_STATE_INDICATOR          (N_PROF) object 552B ...
    DATA_MODE                     (N_PROF) object 552B ...
    PLATFORM_TYPE                 (N_PROF) object 552B ...
    FLOAT_SERIAL_NO               (N_P

In [3]:
# Variables selected from the full dataset. Only JULD, LATITUDE, LONGITUDE and
# the three *_ADJUSTED fields survive the later column selection; the raw
# PRES/TEMP/PSAL fields are selected here but dropped there.
vars_to_keep = [
    "JULD", "LATITUDE", "LONGITUDE",
    "PRES", "PRES_ADJUSTED",
    "TEMP", "TEMP_ADJUSTED",
    "PSAL", "PSAL_ADJUSTED",
]

In [4]:
# Restrict the dataset to the variables named in vars_to_keep.
ds1 = ds[vars_to_keep]

In [5]:
# Convert the selected variables to a pandas DataFrame; reset_index() moves
# the index produced by to_dataframe() into ordinary columns.
df = ds1.to_dataframe().reset_index()

In [6]:
# Keep the time/position columns and the adjusted measurements only; every
# other column of the frame (including raw PRES/TEMP/PSAL) is discarded.
df = df[
    [
        "JULD",
        "LATITUDE",
        "LONGITUDE",
        "PRES_ADJUSTED",
        "TEMP_ADJUSTED",
        "PSAL_ADJUSTED",
    ]
]

In [7]:
# Map the dataset's variable names onto the column names used by the CSV,
# the SQLite table and the query helpers further down.
column_aliases = dict(
    JULD="time",
    LATITUDE="lat",
    LONGITUDE="lon",
    PRES_ADJUSTED="depth_dbar",
    TEMP_ADJUSTED="temperature_C",
    PSAL_ADJUSTED="salinity_PSU",
)
df = df.rename(columns=column_aliases)

In [8]:
# Drop every row with a missing value in any remaining column, then renumber
# the rows from 0 (drop=True discards the old index instead of keeping it).
df = df.dropna().reset_index(drop=True)

In [9]:
# Write the cleaned table to CSV without the row index.
# NOTE(review): absolute, machine-specific output path.
df.to_csv(r"C:\Users\Ankit\ARGO_prototype\ARGO_datasets\argo_1902673_clean1.csv", index=False)

In [10]:
# Preview the first three rows of the cleaned table.
df.head(3)

Unnamed: 0,time,lat,lon,depth_dbar,temperature_C,salinity_PSU
0,2023-10-26 14:17:25,13.6,68.366667,0.1,29.847,36.616001
1,2023-10-26 14:17:25,13.6,68.366667,0.8,29.847,36.618
2,2023-10-26 14:17:25,13.6,68.366667,2.0,29.853001,36.615002


In [11]:
import pandas as pd
import sqlite3
from sentence_transformers import SentenceTransformer
import numpy as np




In [12]:
# Reload the cleaned CSV written earlier in the notebook; this rebinds `df`
# to the frame read back from disk.
# NOTE(review): absolute, machine-specific path.
csv_file = r"C:\Users\Ankit\ARGO_prototype\ARGO_datasets\argo_1902673_clean1.csv"
df = pd.read_csv(csv_file)

In [13]:
# Open a SQLite connection to argo_data.db; the relative path is resolved
# against the current working directory.
# NOTE(review): no cell in this part of the notebook closes `conn`.
db_file = "argo_data.db"
conn = sqlite3.connect(db_file)

In [14]:
# Load the frame into the `argo_profiles` table. if_exists="replace" drops any
# existing table of that name first; index=False keeps the DataFrame index out
# of the table.
table_name = "argo_profiles"
df.to_sql(table_name, conn, if_exists="replace", index=False)

5817

In [15]:
# Sanity-check the table: fetch at most five rows with depth_dbar below 200.
# The SQL is assembled line by line; table_name is the constant set when the
# table was written.
query = (
    "\n"
    "SELECT time, lat, lon, depth_dbar, temperature_C, salinity_PSU\n"
    f"FROM {table_name}\n"
    "WHERE depth_dbar < 200\n"
    "LIMIT 5;\n"
)
sample = pd.read_sql(query, conn)
print(sample)

                  time   lat        lon  depth_dbar  temperature_C  \
0  2023-10-26 14:17:25  13.6  68.366667         0.1         29.847   
1  2023-10-26 14:17:25  13.6  68.366667         0.8         29.847   
2  2023-10-26 14:17:25  13.6  68.366667         2.0         29.853   
3  2023-10-26 14:17:25  13.6  68.366667         2.8         29.855   
4  2023-10-26 14:17:25  13.6  68.366667         3.7         29.854   

   salinity_PSU  
0        36.616  
1        36.618  
2        36.615  
3        36.614  
4        36.616  


In [16]:
def build_embeddings(df, text_cols=("time", "lat", "lon")):
    """Encode one text string per row of ``df`` with a sentence-transformer model.

    Each row is rendered as the string form of the ``text_cols`` values joined
    by single spaces; the list of row strings is then passed to the
    ``all-MiniLM-L6-v2`` model.

    Args:
        df: DataFrame containing every column named in ``text_cols``.
        text_cols: Column names to concatenate, in order. Any iterable of
            names (list, tuple, ...) is accepted.

    Returns:
        The result of ``model.encode(texts, convert_to_numpy=True)`` for the
        row strings, in row order.
    """
    # The default is a tuple so no mutable list is shared between calls;
    # pandas needs a list to select several columns, hence the conversion.
    columns = list(text_cols)
    # NOTE: the model is constructed on every call.
    model = SentenceTransformer("all-MiniLM-L6-v2")
    # Join row by row instead of DataFrame.agg(..., axis=1): this is a plain
    # Python loop over tuples, and it yields an empty list for an empty frame
    # rather than depending on how agg treats empty input.
    as_text = df[columns].astype(str)
    texts = [" ".join(row) for row in as_text.itertuples(index=False, name=None)]
    return model.encode(texts, convert_to_numpy=True)

In [17]:
import re
from transformers import pipeline
import tensorflow_hub as hub

In [18]:
# Load the Universal Sentence Encoder (version 4) module from TensorFlow Hub.
# `nlp` is the callable that extract_entities uses to embed the raw query.
nlp = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")













In [19]:
def classify_intent(query: str) -> str:
    """Map a free-text query to one of three intent labels by keyword.

    Matching is case-insensitive and substring based. Rules are tried in
    order, so a query containing "compare" is always "compare_parameters",
    even if it also mentions "nearest" or "closest".

    Args:
        query: The user's question.

    Returns:
        "compare_parameters", "find_nearest_floats", or the fallback
        "fetch_profile" when no keyword matches.
    """
    text = query.lower()
    keyword_intents = (
        (("compare",), "compare_parameters"),
        (("nearest", "closest"), "find_nearest_floats"),
    )
    for keywords, intent in keyword_intents:
        if any(word in text for word in keywords):
            return intent
    return "fetch_profile"

In [20]:
def extract_entities(query: str) -> tuple:
    """Extract the variable, time, region and depth mentioned in a text query.

    Matching is keyword/regex based and case-insensitive.

    Args:
        query: The user's question.

    Returns:
        A 2-tuple ``(entities, embedding)``:

        * ``entities`` is a dict with the keys ``"variable"``, ``"time"``,
          ``"lat_range"``, ``"lon_range"`` and ``"depth_range"``; a key is
          ``None`` when nothing was recognised for it.
            - ``variable``: ``"salinity_PSU"`` or ``"temperature_C"``.
            - ``time``: ``"YYYY"`` or ``"YYYY-MM"`` (years 2000-2099 only).
            - ``lat_range`` / ``lon_range``: ``(min, max)`` tuples.
            - ``depth_range``: ``(depth - 5, depth + 5)`` around the first
              depth mentioned; ints for whole-number depths, floats otherwise.
        * ``embedding`` is whatever ``nlp([query])`` returns.
    """
    embedding = nlp([query])

    entities = {"variable": None, "time": None, "lat_range": None, "lon_range": None, "depth_range": None}

    q = query.lower()
    if "salinity" in q:
        entities["variable"] = "salinity_PSU"
    elif re.search(r"\btemp", q):
        # Anchored at a word start so "temp", "temps" and "temperature" match
        # but words that merely contain "temp" (e.g. "attempt") do not.
        entities["variable"] = "temperature_C"

    # A depth is a number followed by a complete unit word. The trailing \b
    # stops "2023 march" or "5 minutes" from being read as metres, and the
    # lookbehind stops a match from starting in the middle of a number.
    depth_pattern = r"(?<![\d.])(\d+(?:\.\d+)?)\s*(?:m|meters?|metres?|dbar)\b"
    depth_match = re.search(depth_pattern, q)
    if depth_match:
        depth_text = depth_match.group(1)
        depth = float(depth_text) if "." in depth_text else int(depth_text)
        entities["depth_range"] = (depth - 5, depth + 5)

    # Blank out depth mentions before looking for a year, so that a depth
    # such as "2000 m" is not mistaken for the year 2000.
    q_without_depth = re.sub(depth_pattern, " ", q)
    time_match = re.search(r"(?<!\d)(20\d{2})(?:[-/ ]?(0[1-9]|1[0-2]))?", q_without_depth)
    if time_match:
        year, month = time_match.groups()
        entities["time"] = f"{year}-{month}" if month else year

    # Apply the broad basin box first and the equatorial band second, so that
    # "equator ... indian ocean" keeps the narrower latitude band.
    if "indian ocean" in q:
        entities["lon_range"] = (20, 120)
        entities["lat_range"] = (-40, 25)
    if "equator" in q:
        entities["lat_range"] = (-5, 5)

    return entities, embedding