In [5]:
from pathlib import Path
import xarray as xr
import polars as pl


In [6]:
# Path to the .nc data file, relative to the current working directory.
# The r prefix makes this a raw string, so backslashes are never treated as
# escape characters (not really needed here: the path contains none).
filename = Path(r"workshop_data/local_sea_surface_temperature.nc")
ds = xr.open_dataset(filename)  # opens the file as an xarray Dataset (labelled multi-dimensional data)
print(type(ds))  # confirm what kind of object open_dataset returned

<class 'xarray.core.dataset.Dataset'>


In [4]:
# Convert the xarray Dataset into a pandas DataFrame. The result is a table,
# not a multidimensional array: the dataset's dimensions end up in the
# DataFrame's multi-level index.
df = ds.to_dataframe()

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df.info()  # notice the index levels and the size

In [7]:
# Move the index levels into ordinary columns (lon, lat, time0), leaving a
# plain RangeIndex. Run this cell only once per conversion: the result is
# assigned back to df, so a second run would reset the new RangeIndex too and
# add an extra "index" column.
df = df.reset_index()

In [8]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df.info()  # notice the columns, their dtypes and the memory usage

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 772450560 entries, 0 to 772450559
Data columns (total 4 columns):
 #   Column                   Dtype         
---  ------                   -----         
 0   lon                      float32       
 1   lat                      float32       
 2   time0                    datetime64[ns]
 3   sea_surface_temperature  float32       
dtypes: datetime64[ns](1), float32(3)
memory usage: 14.4 GB
None


In [10]:
# Store the two coordinate columns as 16-bit floats to reduce memory use.
# float16 keeps only about three significant decimal digits, so this is only
# lossless if every coordinate value is exactly representable —
# TODO confirm against the grid spacing of the data.
df[['lon', 'lat']] = df[['lon', 'lat']].astype({'lon': 'float16', 'lat': 'float16'})

In [11]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df.info()  # notice the memory usage after the float16 downcast

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 772450560 entries, 0 to 772450559
Data columns (total 4 columns):
 #   Column                   Dtype         
---  ------                   -----         
 0   lon                      float16       
 1   lat                      float16       
 2   time0                    datetime64[ns]
 3   sea_surface_temperature  float32       
dtypes: datetime64[ns](1), float16(2), float32(1)
memory usage: 11.5 GB
None


In [None]:
# pandas baseline: mean of the whole temperature column. Notice how long this
# cell takes to run, to compare with the same computation in Polars.
print(df['sea_surface_temperature'].mean())

In [12]:
# Build a Polars DataFrame from the pandas DataFrame.
# NOTE(review): df is still referenced afterwards, so both frames may be held
# in memory at the same time — confirm there is enough RAM for this step.
pl_df = pl.from_pandas(df)

In [13]:
# Report the estimated size of the Polars frame in gigabytes, then show its
# first five rows together with the column data types.
# In the recorded run lon/lat appear as f32 and the size is back to ~14.5 GB,
# i.e. the float16 downcast done in pandas did not carry over to Polars.
print(pl_df.estimated_size(unit='gb'))
print(pl_df.head(n=5))

14.47793796658516
shape: (5, 4)
┌─────┬──────┬─────────────────────┬─────────────────────────┐
│ lon ┆ lat  ┆ time0               ┆ sea_surface_temperature │
│ --- ┆ ---  ┆ ---                 ┆ ---                     │
│ f32 ┆ f32  ┆ datetime[ns]        ┆ f32                     │
╞═════╪══════╪═════════════════════╪═════════════════════════╡
│ 0.0 ┆ 90.0 ┆ 2022-01-01 00:00:00 ┆ 271.4375                │
│ 0.0 ┆ 90.0 ┆ 2022-01-01 01:00:00 ┆ 271.4375                │
│ 0.0 ┆ 90.0 ┆ 2022-01-01 02:00:00 ┆ 271.4375                │
│ 0.0 ┆ 90.0 ┆ 2022-01-01 03:00:00 ┆ 271.4375                │
│ 0.0 ┆ 90.0 ┆ 2022-01-01 04:00:00 ┆ 271.4375                │
└─────┴──────┴─────────────────────┴─────────────────────────┘


In [None]:
# Same mean as the pandas cell, computed by Polars. Notice how long this cell
# takes to run and compare it with the pandas timing.
print(pl_df['sea_surface_temperature'].mean())