In [1]:
import xarray as xr

import pandas as pd
import numpy as np

In [2]:
# Load the .nc file
po4 = xr.open_dataset('/Users/arup/Documents/ISRO-Project/prediction/raw_data/po4.nc')

# View the data
po4

In [3]:
# Check lev (depth) metadata
print(po4.lev.attrs)
# Output should confirm:
#   units = "meters"
#   positive = "down" (standard for ocean depth)

{'standard_name': 'depth', 'long_name': 'ocean depth coordinate', 'units': 'm', 'positive': 'down', 'axis': 'Z', 'bounds': 'lev_bnds'}


In [4]:
# Convert the 4D data to 2D dataframe
po4 = po4['po4'].to_dataframe().reset_index()

# Rename columns for clarity
po4 = po4.rename(columns={'po4': 'po4'})

po4

Unnamed: 0,time,lev,j,i,longitude,latitude,po4
0,2025-01-16 12:00:00,6.0,137.0,10.0,87.209916,22.810536,
1,2025-01-16 12:00:00,6.0,137.0,11.0,87.644821,22.818800,
2,2025-01-16 12:00:00,6.0,137.0,12.0,88.079744,22.827879,
3,2025-01-16 12:00:00,6.0,137.0,13.0,88.514688,22.837772,
4,2025-01-16 12:00:00,6.0,137.0,14.0,88.949652,22.848474,
...,...,...,...,...,...,...,...
215995,2029-12-16 12:00:00,5720.0,146.0,14.0,89.022688,19.994032,
215996,2029-12-16 12:00:00,5720.0,146.0,15.0,89.463056,20.002688,
215997,2029-12-16 12:00:00,5720.0,146.0,16.0,89.903443,20.011949,
215998,2029-12-16 12:00:00,5720.0,146.0,17.0,90.343850,20.021812,


In [5]:
# Get the time range this dataset covers
time_range_start = po4['time'].min()
time_range_end = po4['time'].max()

print(f"Time range: {time_range_start} to {time_range_end}")

Time range: 2025-01-16 12:00:00 to 2029-12-16 12:00:00


In [6]:
# Get the time range this dataset covers
time_range_start = po4['lev'].min()
time_range_end = po4['lev'].max()

print(f"Lev range: {time_range_start} to {time_range_end}")

Lev range: 6.0 to 5720.0


In [7]:
# Filter the dataset to include only rows where lev = 6
po4 = po4[po4['lev'] == 6.0]

# Select only required columns
po4 = po4[['time', 'longitude', 'latitude', 'po4']]

# Remove NaN values from po4 column
po4 = po4.dropna(subset=['po4'])

# Round off latitude and longitude to 2 decimal places
po4['longitude'] = po4['longitude'].round(2)
po4['latitude'] = po4['latitude'].round(2)

# Convert time to datetime if not already
po4['time'] = pd.to_datetime(po4['time'])

# Create a new column with first day of each month
po4['time'] = po4['time'].dt.strftime('%Y-%m-01')

# Group by time, longitude, latitude and calculate mean of po4
po4 = po4.groupby(['time', 'longitude', 'latitude'])['po4'].mean().reset_index()

# Convert time back to datetime
po4['time'] = pd.to_datetime(po4['time'])

# Sort by date
po4 = po4.sort_values('time')

print("Shape after monthly aggregation:", po4.shape)

po4

Shape after monthly aggregation: (3240, 4)


Unnamed: 0,time,longitude,latitude,po4
0,2025-01-01,87.24,21.24,0.000013
29,2025-01-01,89.42,21.60,0.000022
30,2025-01-01,89.43,21.28,0.000022
31,2025-01-01,89.44,20.97,0.000030
32,2025-01-01,89.45,20.65,0.000045
...,...,...,...,...
3207,2029-12-01,88.58,19.99,0.000103
3208,2029-12-01,88.58,20.31,0.000079
3209,2029-12-01,88.98,21.59,0.000007
3197,2029-12-01,88.11,21.57,0.000031


In [8]:
# Convert po4 from mol/m3 to mmol/m3
po4['po4'] = po4['po4'] * 1000

po4

Unnamed: 0,time,longitude,latitude,po4
0,2025-01-01,87.24,21.24,0.012663
29,2025-01-01,89.42,21.60,0.022338
30,2025-01-01,89.43,21.28,0.021703
31,2025-01-01,89.44,20.97,0.030130
32,2025-01-01,89.45,20.65,0.044844
...,...,...,...,...
3207,2029-12-01,88.58,19.99,0.103455
3208,2029-12-01,88.58,20.31,0.078776
3209,2029-12-01,88.98,21.59,0.007088
3197,2029-12-01,88.11,21.57,0.031158


In [9]:
# Save the dataset as a CSV file in the processed_data folder
po4.to_csv('/Users/arup/Documents/ISRO-Project/prediction/processed_data/po4.csv', index=False)