[![AWS Data Wrangler](_static/logo.png "AWS Data Wrangler")](https://github.com/awslabs/aws-data-wrangler)

# 10 - Parquet Crawler

[Wrangler](https://github.com/awslabs/aws-data-wrangler) can read just the metadata (column types and partitions) of Parquet files stored on S3 and register it as a table in the AWS Glue Catalog.

In [1]:
import awswrangler as wr

## Enter your bucket name:

In [2]:
import getpass

# getpass hides the typed value, so the bucket name is not echoed into the notebook output.
# An explicit prompt replaces getpass's default "Password: " label, and strip() removes
# whitespace accidentally pasted around the name, which would otherwise end up in the S3 URI.
bucket = getpass.getpass("S3 bucket name: ").strip()
path = f"s3://{bucket}/data/"  # S3 prefix used for the dataset throughout this notebook

 ··········································


### Creating a Parquet Dataset from NOAA's CSV Files

[Reference](https://registry.opendata.aws/noaa-ghcn/)

In [3]:
# Column names applied to the NOAA GHCN CSV files via `names=`.
cols = [
    "id",
    "dt",
    "element",
    "value",
    "m_flag",
    "q_flag",
    "s_flag",
    "obs_time",
]

# The "189" key prefix selects the 10 files of the 1890 decade (~1GB).
# NOTE(review): obs_time holds values such as 1830 and ends up as a string column
# in the catalog below — confirm whether listing it in parse_dates has any effect.
df = wr.s3.read_csv(
    path="s3://noaa-ghcn-pds/csv/189",
    names=cols,
    parse_dates=["dt", "obs_time"],
)

df

Unnamed: 0,id,dt,element,value,m_flag,q_flag,s_flag,obs_time
0,ASN00070200,1890-01-01,PRCP,0,,,a,
1,SF000782720,1890-01-01,PRCP,0,,,I,
2,CA005022790,1890-01-01,TMAX,-222,,,C,
3,CA005022790,1890-01-01,TMIN,-261,,,C,
4,CA005022790,1890-01-01,PRCP,0,,,C,
...,...,...,...,...,...,...,...,...
29240012,USC00181790,1899-12-31,PRCP,0,P,,6,1830
29240013,ASN00061000,1899-12-31,PRCP,0,,,a,
29240014,ASN00040284,1899-12-31,PRCP,0,,,a,
29240015,ASN00048117,1899-12-31,PRCP,0,,,a,


In [4]:
# Derive the calendar year of each observation date; this column is used
# as the partition key when the dataset is written to S3.
df["year"] = df["dt"].dt.year

# Preview a few rows to confirm the new column was added.
df.head(3)

Unnamed: 0,id,dt,element,value,m_flag,q_flag,s_flag,obs_time,year
0,ASN00070200,1890-01-01,PRCP,0,,,a,,1890
1,SF000782720,1890-01-01,PRCP,0,,,I,,1890
2,CA005022790,1890-01-01,TMAX,-222,,,C,,1890


In [5]:
# Write the DataFrame to S3 as a Parquet dataset, producing one `year=<value>/`
# prefix per distinct year.
# NOTE(review): mode="overwrite" is expected to replace whatever already exists
# under `path` — confirm before pointing this at a shared prefix.
res = wr.s3.to_parquet(
    df=df,
    path=path,
    dataset=True,
    partition_cols=["year"],
    mode="overwrite",
)

In [6]:
# List the written objects relative to the dataset root. Slicing off the known `path`
# prefix (instead of splitting on the first "data/") stays correct when the bucket name
# itself ends in "data", e.g. s3://mydata/data/year=1890/...
[obj[len(path):] for obj in wr.s3.list_objects(path)]

['year=1890/3f77d3d50ab249549bb7915ef9064355.snappy.parquet',
 'year=1891/b5908029aeae483494658e2fe5aa6ec8.snappy.parquet',
 'year=1892/595110757d474ae292a58af2cc4390e4.snappy.parquet',
 'year=1893/e56414be2a2843caae0ee0ed61936807.snappy.parquet',
 'year=1894/57b719a1e0cf4e4d8e378262555b97a5.snappy.parquet',
 'year=1895/2a05f1cfa6fa4ac98ffc477f89379b7e.snappy.parquet',
 'year=1896/ae1c607f0e8d472ea8787faabaaea64e.snappy.parquet',
 'year=1897/a5ad85c32a1046b498741b5c5a36d613.snappy.parquet',
 'year=1898/49cb5ca9100d42d89541c4f9fc382173.snappy.parquet',
 'year=1899/cad5835d6e3a40c39ba838929127983a.snappy.parquet']

## Crawling!

In [7]:
%%time

res = wr.s3.store_parquet_metadata(
    path=path,
    database="awswrangler_test",
    table="crawler",
    dataset=True
)

CPU times: user 321 ms, sys: 46.5 ms, total: 368 ms
Wall time: 1.15 s


## Checking

In [8]:
# Show the column names, types and partition flags registered in the Glue Catalog.
wr.catalog.table(
    database="awswrangler_test",
    table="crawler",
)

Unnamed: 0,Column Name,Type,Partition,Comment
0,id,string,False,
1,dt,timestamp,False,
2,element,string,False,
3,value,bigint,False,
4,m_flag,string,False,
5,q_flag,string,False,
6,s_flag,string,False,
7,obs_time,string,False,
8,year,bigint,True,


In [9]:
%%time

wr.athena.read_sql_query("SELECT * FROM crawler WHERE year=1890", database="awswrangler_test")

CPU times: user 1.11 s, sys: 178 ms, total: 1.29 s
Wall time: 8.96 s


Unnamed: 0,id,dt,element,value,m_flag,q_flag,s_flag,obs_time,year
0,RSM00031369,1890-01-02,PRCP,0,,,r,,1890
1,RSM00031369,1890-01-02,TAVG,-266,,,r,,1890
2,ASN00019036,1890-01-02,PRCP,0,,,a,,1890
3,ASN00075006,1890-01-02,PRCP,0,,,a,,1890
4,ASN00015087,1890-01-02,PRCP,58,,,a,,1890
...,...,...,...,...,...,...,...,...,...
1276241,CA006131910,1890-12-31,SNOW,0,,,C,,1890
1276242,USC00174230,1890-12-31,TMAX,-106,,,6,,1890
1276243,USC00174230,1890-12-31,TMIN,-244,,,6,,1890
1276244,USC00174230,1890-12-31,PRCP,0,P,,6,,1890


## Cleaning Up S3

In [10]:
# Remove the objects stored under `path`, i.e. the Parquet dataset this notebook wrote.
wr.s3.delete_objects(path)

## Cleaning Up the Database

In [11]:
# NOTE: this drops every table registered in "awswrangler_test", not only "crawler".
# Do not run it against a database that holds tables you want to keep.
for tbl in wr.catalog.get_tables(database="awswrangler_test"):
    wr.catalog.delete_table_if_exists(
        database="awswrangler_test",
        table=tbl["Name"],
    )