In [None]:
import kagglehub

# Download the latest version of the model through kagglehub and echo the
# value that model_download() hands back.
model_handle = "google/bert/tensorFlow1/uncased-l-12-h-768-a-12"
path = kagglehub.model_download(model_handle)

print("Path to model files:", path)

In [4]:
import xarray as xr
# Open the NetCDF profile file as an xarray dataset.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
argo_nc_path = r"C:\Users\Ankit\ARGO_datasets\1902673_prof.nc"
ds = xr.open_dataset(argo_nc_path)

In [5]:
# Display the data variables held by the opened dataset.
ds.data_vars

Data variables:
    DATA_TYPE                     object 8B ...
    FORMAT_VERSION                object 8B ...
    HANDBOOK_VERSION              object 8B ...
    REFERENCE_DATE_TIME           object 8B ...
    DATE_CREATION                 object 8B ...
    DATE_UPDATE                   object 8B ...
    PLATFORM_NUMBER               (N_PROF) object 552B ...
    PROJECT_NAME                  (N_PROF) object 552B ...
    PI_NAME                       (N_PROF) object 552B ...
    STATION_PARAMETERS            (N_PROF, N_PARAM) object 2kB ...
    CYCLE_NUMBER                  (N_PROF) float64 552B ...
    DIRECTION                     (N_PROF) object 552B ...
    DATA_CENTRE                   (N_PROF) object 552B ...
    DC_REFERENCE                  (N_PROF) object 552B ...
    DATA_STATE_INDICATOR          (N_PROF) object 552B ...
    DATA_MODE                     (N_PROF) object 552B ...
    PLATFORM_TYPE                 (N_PROF) object 552B ...
    FLOAT_SERIAL_NO               (N_P

In [24]:
# Dataset variables carried forward into the tabular export.
# PRES, TEMP and PSAL are listed in both their plain and *_ADJUSTED forms.
vars_to_keep = [
    "JULD", "LATITUDE", "LONGITUDE",
    "PRES", "PRES_ADJUSTED",
    "TEMP", "TEMP_ADJUSTED",
    "PSAL", "PSAL_ADJUSTED",
]

In [7]:
# Subset the dataset to just the variables named in vars_to_keep.
ds1 = ds[vars_to_keep]

In [8]:
# Flatten the selected variables into a table, then turn the index produced
# by to_dataframe() into ordinary columns.
df = ds1.to_dataframe()
df = df.reset_index()

In [9]:
# Keep the time/position columns plus the *_ADJUSTED measurements only.
selected_columns = [
    "JULD",
    "LATITUDE",
    "LONGITUDE",
    "PRES_ADJUSTED",
    "TEMP_ADJUSTED",
    "PSAL_ADJUSTED",
]
df = df[selected_columns]

In [10]:
# Translate the original variable names into the column names that the CSV
# export and the SQL query rely on.
clean_column_names = {
    # time and position
    "JULD": "time", "LATITUDE": "lat", "LONGITUDE": "lon",
    # adjusted measurements
    "PRES_ADJUSTED": "depth_dbar",
    "TEMP_ADJUSTED": "temperature_C",
    "PSAL_ADJUSTED": "salinity_PSU",
}
df = df.rename(columns=clean_column_names)

In [11]:
# Discard every row that has a missing value, then renumber the rows
# without keeping the old index as a column.
df = df.dropna()
df = df.reset_index(drop=True)

In [12]:
# Persist the cleaned table as CSV, leaving the row index out of the file.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
clean_csv_path = r"C:\Users\Ankit\ARGO_datasets\argo_1902673_clean.csv"
df.to_csv(clean_csv_path, index=False)

In [13]:
# Preview the first three rows of the cleaned table.
df.head(3)

Unnamed: 0,time,lat,lon,depth_dbar,temperature_C,salinity_PSU
0,2023-10-26 14:17:25,13.6,68.366667,0.1,29.847,36.616001
1,2023-10-26 14:17:25,13.6,68.366667,0.8,29.847,36.618
2,2023-10-26 14:17:25,13.6,68.366667,2.0,29.853001,36.615002


In [16]:
import pandas as pd
import sqlite3
from sentence_transformers import SentenceTransformer
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm





In [17]:
# Reload the cleaned CSV (same path the export cell wrote to); this rebinds
# `df` to the contents read back from disk.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
csv_file = r"C:\Users\Ankit\ARGO_datasets\argo_1902673_clean.csv"
df = pd.read_csv(csv_file)

In [19]:
# Connect to the SQLite database file used by the cells below (relative path,
# so it resolves against the kernel's working directory).
# NOTE(review): no matching conn.close() is visible in this part of the notebook.
db_file = "argo_data.db"
conn = sqlite3.connect(db_file)

In [20]:
# Write the DataFrame into the table, replacing any existing table of that
# name and leaving the DataFrame index out. The value of the to_sql() call
# is what the cell displays.
table_name = "argo_profiles"
df.to_sql(table_name, conn, if_exists="replace", index=False)

5817

In [22]:
# Upper bound (exclusive) on depth_dbar for the sample rows shown below.
MAX_DEPTH_DBAR = 200

# The threshold is supplied as a bound parameter ("?") instead of being
# written into the SQL text, so it can be tuned in one place. The table name
# cannot be bound as a parameter, so it is still interpolated from the
# notebook's own `table_name` constant.
query = f"""
SELECT time, lat, lon, depth_dbar, temperature_C, salinity_PSU
FROM {table_name}
WHERE depth_dbar < ?
LIMIT 5;
"""
sample = pd.read_sql(query, conn, params=(MAX_DEPTH_DBAR,))
print(sample)

                  time   lat        lon  depth_dbar  temperature_C  \
0  2023-10-26 14:17:25  13.6  68.366667         0.1         29.847   
1  2023-10-26 14:17:25  13.6  68.366667         0.8         29.847   
2  2023-10-26 14:17:25  13.6  68.366667         2.0         29.853   
3  2023-10-26 14:17:25  13.6  68.366667         2.8         29.855   
4  2023-10-26 14:17:25  13.6  68.366667         3.7         29.854   

   salinity_PSU  
0        36.616  
1        36.618  
2        36.615  
3        36.614  
4        36.616  


In [23]:
def build_embeddings(df, text_cols=None, model=None, model_name="all-MiniLM-L6-v2"):
    """Encode one text string per DataFrame row into an embedding.

    For every row, the values of ``text_cols`` are converted to ``str`` and
    joined with single spaces; the resulting list of strings is passed to the
    model's ``encode`` method.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the columns to embed.
    text_cols : str or sequence of str, optional
        Column(s) to concatenate, in order. Defaults to
        ``["time", "lat", "lon"]``. A single column name is accepted as a
        plain string.
    model : object, optional
        Already-loaded model exposing ``encode(texts, convert_to_numpy=True)``.
        When omitted, ``SentenceTransformer(model_name)`` is instantiated,
        which happens again on every call; pass a model to reuse one.
    model_name : str, optional
        Name handed to ``SentenceTransformer`` when ``model`` is not given.

    Returns
    -------
    The value returned by ``model.encode(texts, convert_to_numpy=True)``.

    Raises
    ------
    KeyError
        Propagated from pandas when a requested column is absent from ``df``.
    """
    # A None sentinel avoids sharing one mutable default list between calls.
    if text_cols is None:
        cols = ["time", "lat", "lon"]
    elif isinstance(text_cols, str):
        # list("time") would split the name into characters.
        cols = [text_cols]
    else:
        # A tuple passed straight to df[...] would be read as a single key.
        cols = list(text_cols)

    if model is None:
        model = SentenceTransformer(model_name)

    as_text = df[cols].astype(str)
    # Plain tuples per row avoid the per-row Series that agg(axis=1) builds.
    texts = [" ".join(values) for values in as_text.itertuples(index=False, name=None)]
    return model.encode(texts, convert_to_numpy=True)