# Reading Index Files

Pandas, Polars, and Dask can be used to read GRIB2 index files.


In [2]:
import pandas as pd
import polars as pl

## wgrib2-style index files


In [3]:
index_path = "https://noaa-hrrr-bdp-pds.s3.amazonaws.com/hrrr.20231201/conus/hrrr.t00z.wrfsfcf00.grib2.idx"

In [11]:
%%timeit
index_pandas = pd.read_csv(
    index_path,
    sep=":",
    header=None,
)

363 ms ± 16.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [12]:
%%timeit
index_polars = pl.read_csv(
    index_path,
    separator=":",
    truncate_ragged_lines=True,
    has_header=False,
)

388 ms ± 46.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [8]:
index_pandas = pd.read_csv(
    index_path,
    sep=":",
    header=None,
)
index_pandas

Unnamed: 0,0,1,2,3,4,5,6
0,1,0,d=2023120100,REFC,entire atmosphere,anl,
1,2,404023,d=2023120100,RETOP,cloud top,anl,
2,3,640892,d=2023120100,var discipline=0 center=7 local_table=1 parmca...,entire atmosphere,anl,
3,4,1192793,d=2023120100,VIL,entire atmosphere,anl,
4,5,1532460,d=2023120100,VIS,surface,anl,
...,...,...,...,...,...,...,...
165,166,132979980,d=2023120100,ICEC,surface,anl,
166,167,132983690,d=2023120100,SBT123,top of atmosphere,anl,
167,168,134547872,d=2023120100,SBT124,top of atmosphere,anl,
168,169,136934894,d=2023120100,SBT113,top of atmosphere,anl,


In [10]:
index_polars = pl.read_csv(
    index_path,
    separator=":",
    truncate_ragged_lines=True,
    has_header=False,
)
index_polars

column_1,column_2,column_3,column_4,column_5,column_6,column_7
i64,i64,str,str,str,str,str
1,0,"""d=2023120100""","""REFC""","""entire atmosph…","""anl""",
2,404023,"""d=2023120100""","""RETOP""","""cloud top""","""anl""",
3,640892,"""d=2023120100""","""var discipline…","""entire atmosph…","""anl""",
4,1192793,"""d=2023120100""","""VIL""","""entire atmosph…","""anl""",
5,1532460,"""d=2023120100""","""VIS""","""surface""","""anl""",
6,2963754,"""d=2023120100""","""REFD""","""1000 m above g…","""anl""",
7,3239430,"""d=2023120100""","""REFD""","""4000 m above g…","""anl""",
8,3415741,"""d=2023120100""","""REFD""","""263 K level""","""anl""",
9,3677963,"""d=2023120100""","""GUST""","""surface""","""anl""",
10,4879687,"""d=2023120100""","""UGRD""","""250 mb""","""anl""",


## eccodes-style index files


In [13]:
index_filepath = "https://ai4edataeuwest.blob.core.windows.net/ecmwf/20231201/00z/0p4-beta/oper/20231201000000-0h-oper-fc.index"

In [14]:
df = pd.read_json(index_filepath, lines=True)
df

Unnamed: 0,domain,date,time,expver,class,type,stream,step,levelist,levtype,param,_offset,_length
0,g,20231201,0,1,od,fc,oper,0,1000.0,pl,r,0,199377
1,g,20231201,0,1,od,fc,oper,0,700.0,pl,gh,199377,222840
2,g,20231201,0,1,od,fc,oper,0,,sfc,st,422217,276625
3,g,20231201,0,1,od,fc,oper,0,250.0,pl,t,698842,259574
4,g,20231201,0,1,od,fc,oper,0,1000.0,pl,gh,958416,242618
...,...,...,...,...,...,...,...,...,...,...,...,...,...
78,g,20231201,0,1,od,fc,oper,0,925.0,pl,vo,24295901,565867
79,g,20231201,0,1,od,fc,oper,0,200.0,pl,vo,24861768,561754
80,g,20231201,0,1,od,fc,oper,0,250.0,pl,vo,25423522,588929
81,g,20231201,0,1,od,fc,oper,0,50.0,pl,d,26012451,583943


In [17]:
# doesn't look like this function can read remote files yet
# https://github.com/pola-rs/polars/issues/13486
pl.read_ndjson(index_filepath)

FileNotFoundError: No such file or directory (os error 2): ...t.blob.core.windows.net/ecmwf/20231201/00z/0p4-beta/oper/20231201000000-0h-oper-fc.index

In [22]:
# But polars can read a local json lines file

import requests

local_json = "my_json.json"

response = requests.get(index_filepath)
with open(local_json, "wb") as file:
    file.write(response.content)

pl.read_ndjson(local_json)

domain,date,time,expver,class,type,stream,step,levelist,levtype,param,_offset,_length
str,str,str,str,str,str,str,str,str,str,str,i64,i64
"""g""","""20231201""","""0000""","""0001""","""od""","""fc""","""oper""","""0""","""1000""","""pl""","""r""",0,199377
"""g""","""20231201""","""0000""","""0001""","""od""","""fc""","""oper""","""0""","""700""","""pl""","""gh""",199377,222840
"""g""","""20231201""","""0000""","""0001""","""od""","""fc""","""oper""","""0""",,"""sfc""","""st""",422217,276625
"""g""","""20231201""","""0000""","""0001""","""od""","""fc""","""oper""","""0""","""250""","""pl""","""t""",698842,259574
"""g""","""20231201""","""0000""","""0001""","""od""","""fc""","""oper""","""0""","""1000""","""pl""","""gh""",958416,242618
"""g""","""20231201""","""0000""","""0001""","""od""","""fc""","""oper""","""0""","""50""","""pl""","""v""",1201034,271253
"""g""","""20231201""","""0000""","""0001""","""od""","""fc""","""oper""","""0""","""50""","""pl""","""u""",1472287,283796
"""g""","""20231201""","""0000""","""0001""","""od""","""fc""","""oper""","""0""","""1000""","""pl""","""t""",1756083,256981
"""g""","""20231201""","""0000""","""0001""","""od""","""fc""","""oper""","""0""","""500""","""pl""","""r""",2013064,240409
"""g""","""20231201""","""0000""","""0001""","""od""","""fc""","""oper""","""0""","""200""","""pl""","""r""",2253473,193332


## Polars index parser
I need to make parsing data from the index file as fast as possible. My Pandas-based parser takes ~2 seconds. Can Polars do it faster?

In [None]:
index_path = "https://noaa-hrrr-bdp-pds.s3.amazonaws.com/hrrr.20231201/conus/hrrr.t00z.wrfsfcf00.grib2.idx"

In [23]:
from herbie.inventory import Inventory

In [24]:
%%timeit
I = Inventory(index_path)
df = I.dataframe

1.95 s ± 43 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [26]:
index_polars = pl.read_csv(
    index_path,
    separator=":",
    truncate_ragged_lines=True,
    has_header=False,
)
index_polars.head()

column_1,column_2,column_3,column_4,column_5,column_6,column_7
i64,i64,str,str,str,str,str
1,0,"""d=2023120100""","""REFC""","""entire atmosph…","""anl""",
2,404023,"""d=2023120100""","""RETOP""","""cloud top""","""anl""",
3,640892,"""d=2023120100""","""var discipline…","""entire atmosph…","""anl""",
4,1192793,"""d=2023120100""","""VIL""","""entire atmosph…","""anl""",
5,1532460,"""d=2023120100""","""VIS""","""surface""","""anl""",


In [39]:
index_polars.with_columns(
    pl.col("column_3").str.strptime(pl.Datetime, format="d=%Y%m%d%H", strict=True)
)

ComputeError: Invalid format string: Please either specify both hour and minute, or neither.

In [42]:
df = pl.DataFrame({"date_str": ["d=2023120100", "d=2023120112"]})
print(df)

shape: (2, 1)
┌──────────────┐
│ date_str     │
│ ---          │
│ str          │
╞══════════════╡
│ d=2023120100 │
│ d=2023120112 │
└──────────────┘


In [44]:
df.with_columns(pl.col("date_str").str.strptime(pl.Datetime, "d=%Y%m%d%H"))

ComputeError: Invalid format string: Please either specify both hour and minute, or neither.