In [1]:
import xarray as xr
import pandas as pd
import os

def read_netcdf_to_df(variable_name, raw_dir='raw_data', processed_dir='processed_data'):
    """
    Read a NetCDF file for a specific variable and convert it to a pandas DataFrame.
    Processes the data by:
    - Removing depth column if present
    - Removing rows with NaN values
    - Truncating time to the first day of its month (YYYY-MM-01)
    - Rounding latitude/longitude to 2 decimal places
    - Averaging values that share the same latitude/longitude/month
      (because depth is dropped first, rows that differed only by depth
      are averaged together)
    - Reordering columns to time, latitude, longitude, parameter
    - Sorting by time
    - Saving processed data to CSV in the processed-data folder

    Parameters:
    -----------
    variable_name : str
        Name of the variable (e.g., 'chl', 'fe', 'no3', etc.). The input is
        read from ``{raw_dir}/{variable_name}.nc`` and the result is written
        to ``{processed_dir}/{variable_name}.csv``.
    raw_dir : str, optional
        Directory containing the input NetCDF files (default 'raw_data').
    processed_dir : str, optional
        Directory the CSV is written to; created if missing
        (default 'processed_data').

    Returns:
    --------
    pandas.DataFrame or None
        Processed DataFrame with columns time, latitude, longitude and the
        first data variable of the file, or None if anything goes wrong
        after the file has been located (the error is printed).

    Raises:
    -------
    FileNotFoundError
        If the NetCDF file for ``variable_name`` does not exist.
    """

    # Construct file path
    file_path = f'{raw_dir}/{variable_name}.nc'

    # Check if file exists (raised, not swallowed: it happens before the try)
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"No NetCDF file found for {variable_name}")

    key_columns = ['time', 'latitude', 'longitude']

    try:
        # The context manager closes the underlying file handle even if the
        # conversion fails; to_dataframe() materialises the data in memory,
        # so the frame stays usable after the dataset is closed.
        with xr.open_dataset(file_path) as ds:
            # reset_index turns the coordinate index levels into columns
            df = ds.to_dataframe().reset_index()

        # Remove depth column if it exists, then rows with any NaN value
        df = df.drop(columns='depth', errors='ignore').dropna()

        # Fail with an explicit message instead of an opaque KeyError
        missing = [col for col in key_columns if col not in df.columns]
        if missing:
            raise ValueError(
                f"missing required coordinate column(s): {', '.join(missing)}"
            )

        # Truncate timestamps to the first day of their month without a
        # per-row string round trip.
        time_values = df['time']
        if not pd.api.types.is_datetime64_any_dtype(time_values):
            time_values = pd.to_datetime(time_values)
        month_start = (
            time_values.dt.to_period('M').dt.to_timestamp().astype('datetime64[ns]')
        )

        # Round latitude and longitude to 2 decimal places as float64
        df = df.assign(
            time=month_start,
            latitude=df['latitude'].astype('float64').round(2),
            longitude=df['longitude'].astype('float64').round(2),
        )

        # Group by lat/lon/month and calculate mean of the remaining columns
        df = df.groupby(['latitude', 'longitude', 'time'], as_index=False).mean()

        # The first non-coordinate column is treated as the parameter
        value_columns = [col for col in df.columns if col not in key_columns]
        if not value_columns:
            raise ValueError("dataset contains no data variable")
        param_col = value_columns[0]

        # Reorder columns to put time first; a stable sort keeps the
        # latitude/longitude ordering produced by groupby within each month.
        df = df[key_columns + [param_col]].sort_values('time', kind='mergesort')

        # Create output directory if it doesn't exist
        os.makedirs(processed_dir, exist_ok=True)

        # Save processed data to CSV
        output_path = f'{processed_dir}/{variable_name}.csv'
        df.to_csv(output_path, index=False)
        print(f"Processed data saved to {output_path}")
        print(f"Number of rows in processed {variable_name} data: {len(df)}")

        return df

    except Exception as e:
        # Deliberately broad: callers treat a None return as "this variable
        # failed" and carry on with the next one.
        print(f"Error reading {variable_name}.nc: {str(e)}")
        return None

In [2]:
# Process the expected variables; each one is read from raw_data/<name>.nc
variables = ['thetao', 'so', 'uo', 'vo', 'wo', 'kd', 'ph', 'spco2', 'o2', 'no3', 'po4', 'si', 'fe', 'chl']

# Create a dictionary to store all dataframes
dfs = {}

# Names of variables that could not be processed, reported after the loop
failed_variables = []

# Process each variable
for var in variables:
    print(f"\nProcessing {var}...")
    try:
        df = read_netcdf_to_df(var)
    except FileNotFoundError as exc:
        # A missing input file is raised rather than returned as None;
        # treat it like any other failure so one absent file does not
        # abort the remaining variables.
        print(exc)
        df = None
    if df is not None:
        dfs[var] = df
    else:
        failed_variables.append(var)
        print(f"Failed to process {var}")

print("\nProcessing complete!")
print(f"Successfully processed {len(dfs)} variables")
if failed_variables:
    print(f"Failed variables: {', '.join(failed_variables)}")


Processing thetao...
Processed data saved to processed_data/thetao.csv
Number of rows in processed thetao data: 35200

Processing so...
Processed data saved to processed_data/so.csv
Number of rows in processed so data: 35200

Processing uo...
Processed data saved to processed_data/uo.csv
Number of rows in processed uo data: 35200

Processing vo...
Processed data saved to processed_data/vo.csv
Number of rows in processed vo data: 35200

Processing wo...
Processed data saved to processed_data/wo.csv
Number of rows in processed wo data: 35200

Processing kd...
Processed data saved to processed_data/kd.csv
Number of rows in processed kd data: 2010

Processing ph...
Processed data saved to processed_data/ph.csv
Number of rows in processed ph data: 5226

Processing spco2...
Processed data saved to processed_data/spco2.csv
Number of rows in processed spco2 data: 5226

Processing o2...
Processed data saved to processed_data/o2.csv
Number of rows in processed o2 data: 5226

Processing no3...
P