# Data Formats - Exercise 1

Prove that reading a single column from a Parquet file is faster than reading all of the file's columns.


In [2]:
import logging
import sys

# Every record is rendered as "<LEVEL> - <message>".
formatter = logging.Formatter('%(levelname)s - %(message)s')

# Handler that writes to STDERR using the formatter above.
handler = logging.StreamHandler(sys.stderr)
handler.setFormatter(formatter)

# Configure the root logger: emit INFO and above, and make the STDERR
# handler the only handler by replacing the existing handler list.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.handlers = [handler]

# Smoke test so the reader can see the configured output format.
logger.info("Test Logging Output")

INFO - Test Logging Output


## Create a Parquet File

In [3]:
import pandas as pd
from fastparquet import write

# How many times the four base records are repeated in the dataset.
MULTIPLIER = 10_000

# Destination of the generated parquet file.
file_path = '/tmp/outfile.parquet'

# Base readings as (station, temp, time) tuples; expanded into dicts below.
base_readings = (
    ('011990-99999', 0, 1433269388),
    ('011990-99999', 22, 1433270389),
    ('011990-99999', -11, 1433273379),
    ('012650-99999', 111, 1433275478),
)

# List of record dicts: the four base records repeated MULTIPLIER times.
records = [
    {'station': station_id, 'temp': temp_value, 'time': timestamp}
    for station_id, temp_value, timestamp in base_readings
] * MULTIPLIER

# One DataFrame row per record, with columns station / temp / time.
pd_df = pd.DataFrame(records)

# Persist the frame as a parquet file using fastparquet.
write(file_path, pd_df)

logger.info(f"Wrote a parquet file containing {len(pd_df.index)} records at {file_path}")


INFO - Wrote a parquet file containing 40000 records at /tmp/outfile.parquet


## Read the Parquet File

In [4]:
import time
from fastparquet import ParquetFile

# Benchmark: compare reading every column of the parquet file against
# reading a single column. `file_path` and `logger` are expected to be
# defined earlier in the notebook.

# Number of repeated reads per measurement.
NUM_ITER = 1_000

pf = ParquetFile(file_path)

# Time NUM_ITER reads of all columns. time.perf_counter() is used instead
# of time.time() because it is a monotonic, high-resolution clock meant for
# measuring elapsed intervals; time.time() can jump if the system clock is
# adjusted while the benchmark is running.
start = time.perf_counter()
for i in range(NUM_ITER):
    pd_df = pf.to_pandas()
end = time.perf_counter()

logger.info(f"Reading all columns took {end-start} seconds")

# Time NUM_ITER reads restricted to the 'temp' column.
# NOTE: pd_df is rebound on every iteration, so after this cell it holds
# the single-column frame from the last read.
start = time.perf_counter()
for i in range(NUM_ITER):
    pd_df = pf.to_pandas(['temp'])
end = time.perf_counter()

# Reported through the logger (not print) so both measurements are emitted
# on the same stream with the same format.
logger.info(f"Reading only 1 column took {end-start} seconds")

INFO - Reading all columns took 6.8668129444122314 seconds


Reading only 1 column took 1.1175260543823242 seconds
