In [3]:
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np
import os
import pyarrow as pa
import pyarrow.parquet as pq

# Render matplotlib figures inline in the notebook's cell output
%matplotlib inline

# Display configuration for pandas.
# Option keys are spelled out in full: abbreviated keys are resolved by regex
# and raise OptionError as soon as they match more than one option
# (e.g. 'precision' also matches 'styler.format.precision' on recent pandas).
pd.set_option('display.max_colwidth', 512)
pd.set_option('display.max_columns', 127)
pd.set_option('display.max_rows', 100)
pd.set_option('display.precision', 2)
# Floats are rendered with thousands separators and no decimals; this formatter
# takes precedence over 'display.precision' when floats are displayed.
pd.options.display.float_format = '{:,.0f}'.format

def show(n):
    """Set pandas' ``display.max_rows`` option to ``n``.

    Convenience helper for changing, from any cell, how many rows a frame may
    show before pandas truncates its output.
    """
    pd.options.display.max_rows = n
    
# Expand cell width to take up more space on the display.
# `display` and `HTML` are imported from the public IPython.display module;
# importing `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; } </style>"))

# Enable multiple outputs from each cell: every top-level expression in a cell
# is echoed, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


In [16]:
def convert_csv_to_parquet(input_file_path, output_file_path, drop_option, delimiter=None, columns_file_path=None):
    """Convert a delimited text file to Parquet and print a preview of the result.

    Parameters
    ----------
    input_file_path : str
        Path of the delimited text file; read with ``pd.read_csv``.
    output_file_path : str
        Path the Parquet file is written to.
    drop_option : str
        ``'row'`` drops every row that contains a NaN, ``'column'`` drops every
        column that contains a NaN. Any other value (e.g. ``'none'``) leaves
        the data untouched.
    delimiter : str, optional
        Field separator forwarded to ``pd.read_csv``; ``None`` uses the pandas
        default.
    columns_file_path : str, optional
        Path of a header-less file whose first column lists the column names,
        one per line. When given, these names replace the frame's columns.

    Returns
    -------
    None
        The dtypes of the converted frame and the first five rows read back
        from the written Parquet file are printed.

    Raises
    ------
    ValueError
        Raised by pandas when the number of names in ``columns_file_path``
        differs from the number of columns in the input file.
    """
    # Read CSV file into a Pandas DataFrame
    df = pd.read_csv(input_file_path, delimiter=delimiter)
    if columns_file_path is not None:
        # The names file has no header row of its own, so read it with
        # header=None and take column 0. (header=True is rejected by pandas
        # with a TypeError, which made this branch unusable.)
        column_names = pd.read_csv(columns_file_path, header=None)[0]
        df.columns = list(column_names)
        # NOTE(review): the first line of the input file is still consumed as
        # a header by the read above; if the input is header-less that row is
        # lost -- confirm against the actual input files.

    # Remove rows or columns with NaN fields based on the drop_option argument
    if drop_option == 'row':
        df = df.dropna()
    elif drop_option == 'column':
        df = df.dropna(axis=1)

    print("---- datatypes ---")
    print(df.dtypes)

    # Convert Pandas DataFrame to PyArrow Table and write it out as Parquet
    table = pa.Table.from_pandas(df)
    pq.write_table(table, output_file_path)

    # Read the file back so the preview shows what was actually written
    round_trip = pq.read_table(output_file_path).to_pandas()
    print(round_trip.head(5))


In [17]:
# Shell escape: list the contents of the data directory
!ls /Users/adarshrp/Projects/yard/data

Convert.ipynb     datatypes.csv     emp.parquet       userdata1.parquet
R.csv             dept.csv          empdir
S.csv             dept_details.csv  northwind
T.csv             emp.csv           tpch0.01


In [20]:
# Known conversion targets: (input file, output Parquet file, column-names file or None).
# Keeping them in one table replaces the earlier pattern of assigning the
# lineitem paths and then immediately overwriting them with the emp paths.
conversion_targets = {
    'lineitem': (
        '/Users/adarshrp/Projects/tpch-data/sf0.01/lineitem.tbl',
        '/Users/adarshrp/Projects/tpch-data/sf0.01/lineitem.parquet',
        '/Users/adarshrp/Projects/tpch-data/columns/lineitem',
    ),
    'emp': (
        '/Users/adarshrp/Projects/yard/data/emp.csv',
        '/Users/adarshrp/Projects/yard/data/emp.parquet',
        None,
    ),
}

# Select the dataset to convert; these three names are also read by later cells.
# NOTE(review): the call below always passes delimiter="," -- confirm that it
# suits the .tbl input before switching to 'lineitem'.
input_file_path, output_file_path, columns_file_path = conversion_targets['emp']

drop_option = 'none'  # options: 'row' or 'column' or 'none'

convert_csv_to_parquet(input_file_path, output_file_path, drop_option, delimiter=",", columns_file_path=columns_file_path)

---- datatypes ---
name            object
age              int64
emp_dept_id      int64
salary         float64
dtype: object
    name  age  emp_dept_id  salary
0   john   37            1      10
1  sarah   53            2      22
2   ruby   53            2      22
3    joe   45            3      18
4   hari   25            3      45


In [19]:
# Load the currently selected input file (input_file_path is set in the
# conversion cell) and display the resulting frame.
df = pd.read_csv(input_file_path, sep=",")
df

Unnamed: 0,name,age,emp_dept_id,salary
0,john,37,1,10
1,sarah,53,2,22
2,ruby,53,2,22
3,joe,45,3,18
4,hari,25,3,45
5,kumar,28,3,23
6,james,45,4,25
7,rani,25,4,31
8,piyush,28,4,34
9,ajay,28,4,19
