In [1]:
import xarray as xr

import pandas as pd
import numpy as np

In [2]:
# Load the .nc file
spco2 = xr.open_dataset('/Users/arup/Documents/ISRO-Project/prediction/raw_data/spco2.nc')

# View the data
spco2

In [3]:
# Check lev (depth) metadata
print(spco2.depth.attrs)
# Output should confirm:
#   units = "meters"
#   positive = "down" (standard for ocean depth)

{'standard_name': 'depth', 'long_name': 'depth', 'units': 'm', 'positive': 'down', 'axis': 'Z'}


In [4]:
# Convert the 4D data to 2D dataframe
spco2 = spco2['spco2'].to_dataframe().reset_index()

# Rename columns for clarity
spco2 = spco2.rename(columns={'spco2': 'spco2'})

spco2

Unnamed: 0,time,j,i,longitude,latitude,depth,spco2
0,2025-01-16 12:00:00,137.0,10.0,87.209916,22.810536,0.0,
1,2025-01-16 12:00:00,137.0,11.0,87.644821,22.818800,0.0,
2,2025-01-16 12:00:00,137.0,12.0,88.079744,22.827879,0.0,
3,2025-01-16 12:00:00,137.0,13.0,88.514688,22.837772,0.0,
4,2025-01-16 12:00:00,137.0,14.0,88.949652,22.848474,0.0,
...,...,...,...,...,...,...,...
5395,2029-12-16 12:00:00,146.0,14.0,89.022688,19.994032,0.0,42.771862
5396,2029-12-16 12:00:00,146.0,15.0,89.463056,20.002688,0.0,42.655754
5397,2029-12-16 12:00:00,146.0,16.0,89.903443,20.011949,0.0,42.701675
5398,2029-12-16 12:00:00,146.0,17.0,90.343850,20.021812,0.0,42.692757


In [5]:
# Get the time range this dataset covers
time_range_start = spco2['time'].min()
time_range_end = spco2['time'].max()

print(f"Time range: {time_range_start} to {time_range_end}")

Time range: 2025-01-16 12:00:00 to 2029-12-16 12:00:00


In [6]:
# Get the time range this dataset covers
time_range_start = spco2['depth'].min()
time_range_end = spco2['depth'].max()

print(f"Depth range: {time_range_start} to {time_range_end}")

Depth range: 0.0 to 0.0


In [7]:
# Filter the dataset to include only rows where depth = 0
spco2 = spco2[spco2['depth'] == 0.0]

# Select only required columns
spco2 = spco2[['time', 'longitude', 'latitude', 'spco2']]

# Remove NaN values from spco2 column
spco2 = spco2.dropna(subset=['spco2'])

# Round off latitude and longitude to 2 decimal places
spco2['longitude'] = spco2['longitude'].round(2)
spco2['latitude'] = spco2['latitude'].round(2)

# Convert time to datetime if not already
spco2['time'] = pd.to_datetime(spco2['time'])

# Create a new column with first day of each month
spco2['time'] = spco2['time'].dt.strftime('%Y-%m-01')

# Group by time, longitude, latitude and calculate mean of spco2
spco2 = spco2.groupby(['time', 'longitude', 'latitude'])['spco2'].mean().reset_index()

# Convert time back to datetime
spco2['time'] = pd.to_datetime(spco2['time'])

# Sort by date
spco2 = spco2.sort_values('time')

print("Shape after monthly aggregation:", spco2.shape)

spco2

Shape after monthly aggregation: (3240, 4)


Unnamed: 0,time,longitude,latitude,spco2
0,2025-01-01,87.24,21.24,30.543810
29,2025-01-01,89.42,21.60,34.912796
30,2025-01-01,89.43,21.28,37.626900
31,2025-01-01,89.44,20.97,37.999607
32,2025-01-01,89.45,20.65,38.253769
...,...,...,...,...
3207,2029-12-01,88.58,19.99,42.774567
3208,2029-12-01,88.58,20.31,42.858105
3209,2029-12-01,88.98,21.59,38.763214
3197,2029-12-01,88.11,21.57,31.998611


In [8]:
# Save the dataset as a CSV file in the processed_data folder
spco2.to_csv('/Users/arup/Documents/ISRO-Project/prediction/processed_data/spco2.csv', index=False)