[![AWS Data Wrangler](_static/logo.png "AWS Data Wrangler")](https://github.com/awslabs/aws-data-wrangler)

# 10 - Parquet Crawler

[Wrangler](https://github.com/awslabs/aws-data-wrangler) can extract only the metadata from Parquet files and their partitions and then add it to the Glue Catalog.

In [1]:
import awswrangler as wr

## Enter your bucket name:

In [2]:
import getpass

# getpass does not echo what is typed. Pass an explicit prompt, because the
# default prompt asks for a "Password: " rather than a bucket name.
bucket = getpass.getpass("Bucket name: ").strip()
if not bucket:
    # An empty name would silently build the invalid path "s3:///data/".
    raise ValueError("A non-empty S3 bucket name is required.")

# Every object written by this notebook is stored under this prefix.
path = f"s3://{bucket}/data/"

 ···········································


### Creating a Parquet Table from NOAA's CSV files

[Reference](https://registry.opendata.aws/noaa-ghcn/)

In [3]:
# Names applied, in order, to the columns of the NOAA GHCN CSV files.
cols = [
    "id",
    "dt",
    "element",
    "value",
    "m_flag",
    "q_flag",
    "s_flag",
    "obs_time",
]

# The "csv/189" prefix selects 10 files from the 1890 decade (~1GB).
noaa_prefix = "s3://noaa-ghcn-pds/csv/189"
date_cols = ["dt", "obs_time"]
df = wr.s3.read_csv(path=noaa_prefix, names=cols, parse_dates=date_cols)

df

Unnamed: 0,id,dt,element,value,m_flag,q_flag,s_flag,obs_time
0,AGE00135039,1890-01-01,TMAX,160,,,E,
1,AGE00135039,1890-01-01,TMIN,30,,,E,
2,AGE00135039,1890-01-01,PRCP,45,,,E,
3,AGE00147705,1890-01-01,TMAX,140,,,E,
4,AGE00147705,1890-01-01,TMIN,74,,,E,
...,...,...,...,...,...,...,...,...
29240014,UZM00038457,1899-12-31,PRCP,16,,,r,
29240015,UZM00038457,1899-12-31,TAVG,-73,,,r,
29240016,UZM00038618,1899-12-31,TMIN,-76,,,r,
29240017,UZM00038618,1899-12-31,PRCP,0,,,r,


In [4]:
# Derive the calendar year of each observation from its date; the "year"
# column is what the dataset gets partitioned by when it is written.
obs_year = df["dt"].dt.year
df["year"] = obs_year

df.head(n=3)

Unnamed: 0,id,dt,element,value,m_flag,q_flag,s_flag,obs_time,year
0,AGE00135039,1890-01-01,TMAX,160,,,E,,1890
1,AGE00135039,1890-01-01,TMIN,30,,,E,,1890
2,AGE00135039,1890-01-01,PRCP,45,,,E,,1890


In [5]:
# Write the frame to `path` as a Parquet dataset partitioned by "year",
# using overwrite mode.
write_kwargs = {
    "dataset": True,
    "mode": "overwrite",
    "partition_cols": ["year"],
}
res = wr.s3.to_parquet(df=df, path=path, **write_kwargs)

In [6]:
# Show each object key relative to the dataset root. Slice off the known
# prefix rather than splitting on "data/": for a bucket whose name ends in
# "data" (e.g. "s3://my-data/data/...") the split would cut inside the bucket
# name and leave a stray "data/" at the front of every entry.
[obj[len(path):] for obj in wr.s3.list_objects(path)]

['year=1890/f66834ded9314208908667b40ccb5b54.snappy.parquet',
 'year=1891/73ee737ebb9144929ee63f6cd2725b8b.snappy.parquet',
 'year=1892/aee80df68614404d957d54f8b36a6143.snappy.parquet',
 'year=1893/159ae23b89b14de499b0312f03aca345.snappy.parquet',
 'year=1894/1694a1fe48194862803d8494c5405ad1.snappy.parquet',
 'year=1895/ba4d698250364922971a7b7dce96dc67.snappy.parquet',
 'year=1896/c2e422d32b2e4cb4a9d38b398845a976.snappy.parquet',
 'year=1897/2ec3223d6f284bfe9b604abbac225996.snappy.parquet',
 'year=1898/ffc78ab36f954d4ba6890892767a3cfb.snappy.parquet',
 'year=1899/c05cd01236a94b158b2b49e924e71431.snappy.parquet']

## Crawling!

In [7]:
%%time

res = wr.s3.store_parquet_metadata(
    path=path,
    database="awswrangler_test",
    table="crawler",
    dataset=True,
    mode="overwrite",
    dtype={"year": "int"}
)

CPU times: user 862 ms, sys: 382 ms, total: 1.24 s
Wall time: 1.45 s


## Checking

In [8]:
# Inspect the columns, types and partition flags now stored in the catalog.
crawler_table = {"database": "awswrangler_test", "table": "crawler"}
wr.catalog.table(**crawler_table)

Unnamed: 0,Column Name,Type,Partition,Comment
0,id,string,False,
1,dt,timestamp,False,
2,element,string,False,
3,value,bigint,False,
4,m_flag,string,False,
5,q_flag,string,False,
6,s_flag,string,False,
7,obs_time,string,False,
8,year,int,True,


In [9]:
%%time

wr.athena.read_sql_query("SELECT * FROM crawler WHERE year=1890", database="awswrangler_test")

CPU times: user 2.08 s, sys: 423 ms, total: 2.5 s
Wall time: 7.23 s


Unnamed: 0,id,dt,element,value,m_flag,q_flag,s_flag,obs_time,year
0,CA006116254,1890-01-01,TMIN,-61,,,C,,1890
1,CA006116254,1890-01-01,PRCP,127,,,C,,1890
2,CA006116254,1890-01-01,SNOW,0,,,C,,1890
3,CA006116705,1890-01-01,PRCP,0,,,C,,1890
4,CA006116705,1890-01-01,SNOW,0,,,C,,1890
...,...,...,...,...,...,...,...,...,...
1276241,CA006120315,1890-12-31,TMAX,11,,,C,,1890
1276242,CA006120315,1890-12-31,TMIN,0,,,C,,1890
1276243,CA006120315,1890-12-31,PRCP,5,,,C,,1890
1276244,CA006120315,1890-12-31,SNOW,0,,,C,,1890


## Cleaning Up S3

In [10]:
# Remove every object this notebook wrote under the dataset prefix.
wr.s3.delete_objects(path=path)

## Cleaning Up the Database

In [11]:
# Drop only the table this notebook registered. Looping over every table in
# "awswrangler_test" would also delete tables created by unrelated work that
# happens to share the database.
wr.catalog.delete_table_if_exists(database="awswrangler_test", table="crawler")