In [29]:
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np
import os
import pyarrow as pa
import pyarrow.parquet as pq

%matplotlib inline

pd.set_option('max_colwidth', 512)
pd.set_option('max_columns', 127)
pd.set_option('display.max_rows', 100)
pd.set_option('precision', 2)
pd.options.display.float_format = '{:,.0f}'.format

def show(n):         
    pd.set_option('display.max_rows', n)
    
# Expand cell width to take up more space on the display
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; } </style>"))

# Enable multiple outputs from each cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


In [30]:
def convert_csv_to_parquet(input_file_path, output_file_path, drop_option, delimiter=None, columns_file_path=None):
    # Read CSV file into a Pandas DataFrame
    df = pd.read_csv(input_file_path, delimiter=delimiter)
    if columns_file_path is not None:
        df.columns = list(pd.read_csv(columns_file_path, header=None)[0])

    # Remove rows or columns with NaN fields based on the drop_option argument
    if drop_option == 'row':
        df = df.dropna()
    elif drop_option == 'column':
        df = df.dropna(axis=1)

    # Convert Pandas DataFrame to PyArrow Table
    table = pa.Table.from_pandas(df)

    # Write PyArrow Table to Parquet file
    pq.write_table(table, output_file_path)

    # Open the Parquet file
    table = pq.read_table(output_file_path)

    # Convert the table to a Pandas DataFrame
    df = table.to_pandas()

    # Print the DataFrame
    print(df.head(5))


In [31]:
!ls /Users/adarshrp/Projects/tpch-data/sf0.01

customer.parquet lineitem.tbl     part.tbl         supplier.tbl
customer.tbl     nation.tbl       partsupp.tbl
lineitem.parquet orders.tbl       region.tbl


In [32]:
input_file_path = '/Users/adarshrp/Projects/tpch-data/sf0.01/lineitem.tbl'
output_file_path = '/Users/adarshrp/Projects/tpch-data/sf0.01/lineitem.parquet'
columns_file_path = '/Users/adarshrp/Projects/tpch-data/columns/lineitem'
drop_option = 'none'  # options: 'row' or 'column' or 'none'

convert_csv_to_parquet(input_file_path, output_file_path, drop_option, delimiter="|", columns_file_path=columns_file_path)

   L_ORDERKEY  L_PARTKEY  L_SUPPKEY  L_LINENUMBER  L_QUANTITY  \
0           1        674         75             2          36   
1           1        637         38             3           8   
2           1         22         48             4          28   
3           1        241         23             5          24   
4           1        157         10             6          32   

   L_EXTENDEDPRICE  L_DISCOUNT  L_TAX L_RETURNFLAG L_LINESTATUS  L_SHIPDATE  \
0           56,688           0      0            N            O  1996-04-12   
1           12,301           0      0            N            O  1996-01-29   
2           25,817           0      0            N            O  1996-04-21   
3           27,390           0      0            N            O  1996-03-30   
4           33,829           0      0            N            O  1996-01-30   

  L_COMMITDATE L_RECEIPTDATE     L_SHIPINSTRUCT L_SHIPMODE  \
0   1996-02-28    1996-04-20   TAKE BACK RETURN       MAIL   
1   1996-0