# Reading and Writing Parquet Files
source:  https://arrow.apache.org/docs/python/parquet.html

In [None]:
!pip install pyarrow

In [None]:
import pyarrow.parquet as pq
import pandas as pd
import pyarrow as pa

In [3]:
# read S3 file into a data frame and show its data & metadata
df = pd.read_csv('https://www1.ncdc.noaa.gov/pub/data/cdo/samples/PRECIP_HLY_sample_csv.csv')
df.head(30)

Unnamed: 0,STATION,STATION_NAME,ELEVATION,LATITUDE,LONGITUDE,DATE,HPCP,Measurement Flag,Quality Flag
0,COOP:310301,ASHEVILLE NC US,682.1,35.5954,-82.5568,20100101 00:00,99999,],
1,COOP:310301,ASHEVILLE NC US,682.1,35.5954,-82.5568,20100101 01:00,0,g,
2,COOP:310301,ASHEVILLE NC US,682.1,35.5954,-82.5568,20100102 06:00,1,,


In [15]:
table = pa.Table.from_pandas(df)
pq.write_table(table, 'bigdata/examples/example.parquet')
#pq.write.parquet("v3io://bigdata/examples/my-parquet-table")

FileNotFoundError: [Errno 2] No such file or directory: 'bigdata/examples/example.parquet'

In [5]:
table2 = pq.read_table('example.parquet')
table2.to_pandas().head()

Unnamed: 0,STATION,STATION_NAME,ELEVATION,LATITUDE,LONGITUDE,DATE,HPCP,Measurement Flag,Quality Flag
0,COOP:310301,ASHEVILLE NC US,682.1,35.5954,-82.5568,20100101 00:00,99999,],
1,COOP:310301,ASHEVILLE NC US,682.1,35.5954,-82.5568,20100101 01:00,0,g,
2,COOP:310301,ASHEVILLE NC US,682.1,35.5954,-82.5568,20100102 06:00,1,,


In [6]:
# read specific columns
pq.read_table('example.parquet', columns=['age', 'job']).to_pandas().head()

In [7]:
# read specific columns, if previously written by Pandas (read extra metadata)
pq.read_pandas('example.parquet', columns=['age', 'job']).to_pandas().head()

0
1
2


In [8]:
parquet_file = pq.ParquetFile('example.parquet')
parquet_file.schema

<pyarrow._parquet.ParquetSchema object at 0x7f08ad858da0>
STATION: BYTE_ARRAY UTF8
STATION_NAME: BYTE_ARRAY UTF8
ELEVATION: DOUBLE
LATITUDE: DOUBLE
LONGITUDE: DOUBLE
DATE: BYTE_ARRAY UTF8
HPCP: INT64
Measurement Flag: BYTE_ARRAY UTF8
Quality Flag: BYTE_ARRAY UTF8
__index_level_0__: INT64
 

In [9]:
parquet_file.metadata

<pyarrow._parquet.FileMetaData object at 0x7f08b1be43b8>
  created_by: parquet-cpp version 1.5.1-SNAPSHOT
  num_columns: 10
  num_rows: 3
  num_row_groups: 1
  format_version: 1.0
  serialized_size: 2448

In [10]:
# write to multiple partitioned files 
pq.write_to_dataset(table, root_path='my_parq', partition_cols=['job'])

KeyError: 'job'

In [None]:
# read partitioned table
table3 = pq.read_table('my_parq')
table3.to_pandas().head()

In [None]:
# read with multiple threads
pq.read_table('my_parq', nthreads=4).to_pandas().head()