In [34]:
import glob
import shutil

import fastparquet as fsp
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

In [2]:
# Load the two weather CSV extracts, parsing ObservationDate as datetimes
# while reading.
date_columns = ["ObservationDate"]
df201 = pd.read_csv("weather.20160201.csv", parse_dates=date_columns)
df301 = pd.read_csv("weather.20160301.csv", parse_dates=date_columns)

In [3]:
# Show (rows, columns) for each frame as a quick check on the load.
print(*(frame.shape for frame in (df201, df301)))

(93255, 15) (101442, 15)


## Parquet supported packages
There are two widely used packages for working with Parquet in the Python ecosystem: pyarrow and fastparquet. Pyarrow provides Python bindings to a C++ implementation of the Parquet file format, which makes reading and writing efficient from Python. Fastparquet, on the other hand, is a Python-based implementation of the Parquet format that takes advantage of the Numba JIT compiler. Although the pandas library can convert other file formats to Parquet using either of these packages, more granular customization of the I/O operations can be achieved by using the packages explicitly.

## Pyarrow conversion

In [4]:
# Convert each DataFrame to an Arrow table; preserve_index=False keeps the
# pandas index out of the resulting table.
tbl201, tbl301 = (
    pa.Table.from_pandas(frame, preserve_index=False) for frame in (df201, df301)
)

In [7]:
# write_to_dataset only ever adds files: every call writes a new, uniquely
# named part file under root_path. Re-running this cell would therefore
# duplicate all rows in "pyarrow/basic" and break the row-count check made
# after reading the dataset back. Clear the output directory first so the
# cell gives the same result on every run (ignore_errors covers the first
# run, when the directory does not exist yet).
shutil.rmtree("pyarrow/basic", ignore_errors=True)

# Each call adds the table's rows as a Parquet file under the shared root.
pq.write_to_dataset(tbl201, root_path="pyarrow/basic")
pq.write_to_dataset(tbl301, root_path="pyarrow/basic")

In [9]:
# Open the whole "pyarrow/basic" directory as one dataset so that both files
# written there are covered. Despite its name, pyarrow_tbl is a
# ParquetDataset handle rather than a Table; the rows are pulled out with
# .read() in a later cell.
pyarrow_tbl = pq.ParquetDataset("pyarrow/basic")

In [10]:
# Read only the columns the analysis needs, then convert the result to a
# pandas DataFrame.
needed_columns = ["ObservationDate", "ScreenTemperature", "Region"]
full_df = pyarrow_tbl.read(columns=needed_columns).to_pandas()

In [11]:
# Sanity check: the combined frame should hold exactly the rows of both inputs.
expected_rows = len(df201) + len(df301)
len(full_df) == expected_rows

True

In [12]:
# Confirm that only the requested columns were loaded.
full_df.columns

Index(['ObservationDate', 'ScreenTemperature', 'Region'], dtype='object')

In [13]:
# Date(s) of the row(s) holding the highest ScreenTemperature value.
is_hottest = full_df["ScreenTemperature"].eq(full_df["ScreenTemperature"].max())
full_df.loc[is_hottest, ["ObservationDate"]]

Unnamed: 0,ObservationDate
147768,2016-03-17


In [14]:
# Date and value of the highest ScreenTemperature in the combined frame.
peak_temperature = full_df["ScreenTemperature"].max()
full_df.loc[
    full_df["ScreenTemperature"] == peak_temperature,
    ["ObservationDate", "ScreenTemperature"],
]

Unnamed: 0,ObservationDate,ScreenTemperature
147768,2016-03-17,15.8


In [15]:
# Region of the row(s) holding the highest ScreenTemperature value.
screen_temperature = full_df["ScreenTemperature"]
full_df.loc[screen_temperature == screen_temperature.max(), ["Region"]]

Unnamed: 0,Region
147768,Highland & Eilean Siar


## Fastparquet 

In [46]:
# Start the fastparquet dataset with the first frame, leaving the pandas
# index out of the written data.
fsp.write(
    filename="fastparquet/basic",
    data=df201,
    write_index=False,
    file_scheme='hive',
)

In [51]:
# Add the second frame to the existing dataset; append=True extends the
# dataset instead of starting it over.
fsp.write(
    filename="fastparquet/basic/",
    data=df301,
    append=True,
    write_index=False,
    file_scheme="hive",
)

In [56]:
# Open the dataset through its _metadata file; the statistics used to find
# the maximum temperature are read from this object in the following cells.
mdata_max = fsp.ParquetFile("fastparquet/basic/_metadata")

In [61]:
# Maximum ScreenTemperature values recorded in the dataset statistics
# (presumably one entry per row group -- confirm against the fastparquet docs).
max_statistics = mdata_max.statistics['max']
max_temp_list = max_statistics['ScreenTemperature']


In [63]:
# Position of the largest statistic; list.index returns the first match on ties.
overall_max_temp = max(max_temp_list)
max_index = max_temp_list.index(overall_max_temp)

In [64]:
# max_index is a position in the metadata statistics, i.e. it identifies a
# row group of the dataset. Resolve that row group's data file from the
# metadata itself. Indexing glob.glob("fastparquet/basic/*") is not reliable:
# glob returns directory entries in arbitrary order, and the wildcard also
# matches non-data files such as _metadata, so the position in the glob
# result does not correspond to the row-group position.
hottest_row_group = mdata_max.row_groups[max_index]
related_path = mdata_max.row_group_filename(hottest_row_group)

# Load only the columns needed to report the hottest observation.
related_file = fsp.ParquetFile(related_path).to_pandas(
    columns=['ObservationDate', 'ScreenTemperature', 'Region']
)

In [0]:
# Rows of the selected file that hold its highest ScreenTemperature value.
top_temperature = related_file['ScreenTemperature'].max()
related_file.loc[related_file['ScreenTemperature'] == top_temperature]