In [1]:
import pandas as pd
import xarray as xr
import s3fs

def load_data(year):
    """Load selected cast metadata from one year's WOD XBT NetCDF file on S3.

    Opens ``s3://noaa-wod-pds/{year}/wod_xbt_{year}.nc`` anonymously, pulls the
    variables ``country``, ``Institute``, ``dataset``, ``lat``, ``lon`` and
    ``date`` into a DataFrame, prepends a ``year`` column and decodes any
    ``bytes`` cells to ``str``.

    Parameters
    ----------
    year : int or str
        Year to load. Strings such as the value returned by ``input()`` are
        accepted and converted with ``int``.

    Returns
    -------
    pandas.DataFrame
        One row per entry of the dataset's index, with ``year`` as the first
        column (stored as an integer).

    Raises
    ------
    ValueError
        If ``year`` cannot be converted to an integer.
    """
    # Normalise once so the S3 key and the 'year' column agree, and so stray
    # whitespace from interactive input cannot end up in the path.
    year = int(year)

    # Define the path to the NetCDF file on Amazon S3 based on the provided year
    s3_path = f's3://noaa-wod-pds/{year}/wod_xbt_{year}.nc'

    # Extract the variables of interest
    variables = ['country', 'Institute', 'dataset', 'lat', 'lon', 'date']

    # Open the NetCDF file using s3fs without credentials. The dataset is used
    # as a context manager so it is closed together with the remote file handle;
    # to_dataframe() runs inside the block, while both are still open.
    fs = s3fs.S3FileSystem(anon=True)
    with fs.open(s3_path, 'rb') as f, xr.open_dataset(f) as ds:
        df = ds[variables].to_dataframe()

    # Add the 'year' column with the value provided by the user
    df.insert(0, 'year', year)

    # Decode byte strings to regular strings. Series.map is applied per
    # object-dtype column instead of the deprecated DataFrame.applymap.
    for column in df.select_dtypes(include='object').columns:
        df[column] = df[column].map(
            lambda value: value.decode('utf-8') if isinstance(value, bytes) else value
        )

    return df

# Example usage: ask for a year, load that year's XBT file and show the result.
# The typed value is stripped because it is interpolated into the S3 key, where
# surrounding whitespace would point at a non-existent object.
desired_year = input("Enter the desired year: ").strip()
dataframe = load_data(desired_year)
print(dataframe)


Enter the desired year:  2019


       year        country                                          Institute  \
casts                                                                           
0      2019  UNITED STATES                                                      
1      2019      AUSTRALIA  AUSTRALIAN BUREAU OF METEOROLOGY (ABOM) (MELBO...   
2      2019  UNITED STATES                                                      
3      2019  UNITED STATES                                                      
4      2019  UNITED STATES                                                      
...     ...            ...                                                ...   
13978  2019  UNITED STATES  SCRIPPS INSTITUTION OF OCEANOGRAPHY; LA JOLLA; CA   
13979  2019  UNITED STATES  SCRIPPS INSTITUTION OF OCEANOGRAPHY; LA JOLLA; CA   
13980  2019  UNITED STATES  SCRIPPS INSTITUTION OF OCEANOGRAPHY; LA JOLLA; CA   
13981  2019  UNITED STATES  SCRIPPS INSTITUTION OF OCEANOGRAPHY; LA JOLLA; CA   
13982  2019      AUSTRALIA  

  df = df.applymap(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)


In [None]:
import os
import pandas as pd
import xarray as xr
import s3fs

def load_data(year):
    """Load selected cast metadata from every WOD NetCDF file of one year on S3.

    Lists ``s3://noaa-wod-pds/{year}/`` anonymously, keeps the files named
    ``wod_*_{year}.nc``, and for each one extracts ``country``, ``dataset``,
    ``lat``, ``lon``, ``date`` (plus ``Institute`` when the file has it) into a
    DataFrame. Each frame gets a leading integer ``year`` column and has its
    ``bytes`` cells decoded to ``str``; the frames are then concatenated.

    Parameters
    ----------
    year : int or str
        Year to load. Strings such as the value returned by ``input()`` are
        accepted and converted with ``int``.

    Returns
    -------
    pandas.DataFrame
        All per-file frames stacked with a fresh 0..n-1 index. Rows coming from
        files without ``Institute`` hold a missing value in that column.

    Raises
    ------
    ValueError
        If ``year`` is not convertible to an integer, or if no matching file is
        found under the year's prefix.
    """
    # Normalise once so the prefix, the file-name filter and the 'year' column
    # all use the same value (and stray whitespace never reaches the S3 key).
    year = int(year)

    # Define the path to the year's directory on Amazon S3
    s3_path = f's3://noaa-wod-pds/{year}/'

    # Open the S3 filesystem without credentials
    fs = s3fs.S3FileSystem(anon=True)

    # Keep only the files matching the pattern 'wod_xxx_YYYY.nc'. The match is
    # made on the file name itself so other parts of the key cannot satisfy it.
    suffix = f'_{year}.nc'
    files = [
        path for path in fs.ls(s3_path, detail=False)
        if path.rsplit('/', 1)[-1].startswith('wod_') and path.endswith(suffix)
    ]
    if not files:
        # pd.concat([]) would raise a ValueError with an unhelpful message;
        # keep the exception type but say what actually went wrong.
        raise ValueError(f"No 'wod_*{suffix}' files found under {s3_path}")

    # Per-file DataFrames, concatenated once at the end
    dfs = []

    for file in files:
        # The dataset is a context manager too, so it is closed together with
        # the remote file handle; to_dataframe() runs while both are open.
        with fs.open(file, 'rb') as f, xr.open_dataset(f) as ds:
            # Define the variables of interest
            variables = ['country', 'dataset', 'lat', 'lon', 'date']

            # 'Institute' is only requested when the file actually provides it
            if 'Institute' in ds.variables:
                variables.append('Institute')

            # Convert the dataset to a DataFrame
            df = ds[variables].to_dataframe()

        # Add the 'year' column
        df.insert(0, 'year', year)

        # Decode byte strings to regular strings. DataFrame.apply hands the
        # function whole columns, so an isinstance(..., bytes) check there never
        # matches; mapping over each object-dtype column decodes cell by cell.
        for column in df.select_dtypes(include='object').columns:
            df[column] = df[column].map(
                lambda value: value.decode('utf-8') if isinstance(value, bytes) else value
            )

        dfs.append(df)

    # Concatenate all DataFrames in the list into one DataFrame
    combined_df = pd.concat(dfs, ignore_index=True)

    return combined_df

# Example usage: ask for a year, load every matching file for it and show the result.
# The typed value is stripped because it is interpolated into the S3 prefix and
# the file-name filter, where surrounding whitespace would match nothing.
desired_year = input("Enter the desired year: ").strip()
dataframe = load_data(desired_year)
print(dataframe)


In [17]:
# Keep only the rows whose 'Institute' contains "COMMONWEALTH". The selection is
# by institute name, not by country, so rows from several countries can remain.
# na=False makes rows with a missing Institute count as non-matches instead of
# producing a mask with NaN, which boolean indexing rejects.
# NOTE(review): `df` is not assigned at top level by the loader cells shown in
# this notebook (they store their result in `dataframe`) — confirm which frame
# this cell is meant to filter.
australia_data = df[df['Institute'].str.contains('COMMONWEALTH', na=False)]

# Renumber the remaining rows from 0, discarding the previous index
australia_data_reset_index = australia_data.reset_index(drop=True)

# Display the filtered DataFrame with its new index
print(australia_data_reset_index)


                                      country  \
0    FRANCE                                     
1    FRANCE                                     
2    FRANCE                                     
3    FRANCE                                     
4    FRANCE                                     
..                                        ...   
868  AUSTRALIA                                  
869  AUSTRALIA                                  
870  AUSTRALIA                                  
871  AUSTRALIA                                  
872  AUSTRALIA                                  

                                             Institute  \
0    COMMONWEALTH SCIENTIFIC AND INDUSTRIAL RESEARC...   
1    COMMONWEALTH SCIENTIFIC AND INDUSTRIAL RESEARC...   
2    COMMONWEALTH SCIENTIFIC AND INDUSTRIAL RESEARC...   
3    COMMONWEALTH SCIENTIFIC AND INDUSTRIAL RESEARC...   
4    COMMONWEALTH SCIENTIFIC AND INDUSTRIAL RESEARC...   
..                                                 ...   
868  

In [18]:
# Bare last expression: the notebook renders the filtered frame as a table
australia_data_reset_index

Unnamed: 0,country,Institute,dataset,lat,lon
0,FRANCE,COMMONWEALTH SCIENTIFIC AND INDUSTRIAL RESEARC...,XBT ...,-55.153900,144.085495
1,FRANCE,COMMONWEALTH SCIENTIFIC AND INDUSTRIAL RESEARC...,XBT ...,-54.737099,144.241806
2,FRANCE,COMMONWEALTH SCIENTIFIC AND INDUSTRIAL RESEARC...,XBT ...,-54.340302,144.365097
3,FRANCE,COMMONWEALTH SCIENTIFIC AND INDUSTRIAL RESEARC...,XBT ...,-53.923000,144.488205
4,FRANCE,COMMONWEALTH SCIENTIFIC AND INDUSTRIAL RESEARC...,XBT ...,-53.701500,144.547104
...,...,...,...,...,...
868,AUSTRALIA,COMMONWEALTH SCIENTIFIC AND INDUSTRIAL RESEARC...,XBT ...,-39.463402,169.676193
869,AUSTRALIA,COMMONWEALTH SCIENTIFIC AND INDUSTRIAL RESEARC...,XBT ...,-39.648499,170.340805
870,AUSTRALIA,COMMONWEALTH SCIENTIFIC AND INDUSTRIAL RESEARC...,XBT ...,-39.850899,171.026596
871,AUSTRALIA,COMMONWEALTH SCIENTIFIC AND INDUSTRIAL RESEARC...,XBT ...,-40.022598,171.670593
