In [2]:
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np
import os
import pyarrow as pa
import pyarrow.parquet as pq

%matplotlib inline

# Display options are addressed by their full "display.*" keys. The short
# forms ('max_columns', 'precision') are regex patterns that also match the
# "styler.*" options introduced in pandas 1.4 and raise OptionError there.
pd.set_option('display.max_colwidth', 512)
pd.set_option('display.max_columns', 127)
pd.set_option('display.max_rows', 100)
pd.set_option('display.precision', 2)
# Render floats with thousands separators and no decimal places.
pd.options.display.float_format = '{:,.0f}'.format

def show(n):
    """Set the maximum number of rows pandas displays to ``n``."""
    pd.options.display.max_rows = n
    
# Expand cell width to take up more space on the display.
# display/HTML are imported from IPython.display, their public location;
# importing them from IPython.core.display is deprecated since IPython 7.14.
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; } </style>"))

# Enable multiple outputs from each cell: with "all", the value of every
# expression statement in a cell is displayed, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


In [3]:
def convert_csv_to_parquet(input_file_path, output_file_path, drop_option='none'):
    """Convert a CSV file to Parquet and print a preview of what was written.

    The CSV is loaded with pandas, optionally stripped of rows or columns
    that contain NaN values, written to ``output_file_path`` as Parquet via
    PyArrow, then read back so the printed preview reflects the file on disk.

    Parameters
    ----------
    input_file_path : str or path-like
        CSV file to read.
    output_file_path : str or path-like
        Parquet file to write.
    drop_option : {'row', 'column', 'none'} or None, default 'none'
        'row' drops every row containing a NaN, 'column' drops every column
        containing a NaN, 'none' (or ``None``) keeps the data unchanged.

    Raises
    ------
    ValueError
        If ``drop_option`` is not one of the accepted values. The check runs
        before any file is read or written, so a typo such as 'rows' no
        longer silently produces an unfiltered Parquet file.
    """
    valid_options = ('row', 'column', 'none')
    if drop_option is None:
        drop_option = 'none'
    if drop_option not in valid_options:
        raise ValueError(
            "drop_option must be one of {}, got {!r}".format(valid_options, drop_option)
        )

    # Read CSV file into a Pandas DataFrame
    df = pd.read_csv(input_file_path)

    # Remove rows or columns with NaN fields based on the drop_option argument
    if drop_option == 'row':
        df = df.dropna()
    elif drop_option == 'column':
        df = df.dropna(axis=1)

    # Convert Pandas DataFrame to PyArrow Table. The CSV carries no index, so
    # none is stored: otherwise the gapped index left behind by dropna() would
    # be written to the file as an extra "__index_level_0__" column.
    table = pa.Table.from_pandas(df, preserve_index=False)

    # Write PyArrow Table to Parquet file
    pq.write_table(table, output_file_path)

    # Read the Parquet file back so the preview shows what is actually on disk
    written_df = pq.read_table(output_file_path).to_pandas()

    # Print the first 100 rows of the round-tripped DataFrame
    print(written_df.head(100))


In [4]:
# Shell escape: list the current working directory to see which CSV inputs are available
!ls

[31mConvert.ipynb[m[m    T.csv            dept_details.csv [34mnorthwind[m[m
R.csv            datatypes.csv    emp.csv          [34mtpch0.01[m[m
S.csv            dept.csv         [34mempdir[m[m


In [5]:
# Source CSV and destination Parquet file (relative paths, resolved against
# the current working directory)
input_file_path = 'emp.csv'
output_file_path = 'emp.parquet'
# NaN handling: 'row' drops rows, 'column' drops columns, 'none' keeps everything
drop_option = 'none'

convert_csv_to_parquet(
    input_file_path=input_file_path,
    output_file_path=output_file_path,
    drop_option=drop_option,
)

      name  age  emp_dept_id
0     john   37            1
1    sarah   53            2
2     ruby   53            2
3      joe   45            3
4     hari   25            3
5    kumar   28            3
6    james   45            4
7     rani   25            4
8   piyush   28            4
9     ajay   28            4
10  ramani   24            5
11  adarsh   27            2
12    kate   42            5
13   shawn   39            5
14  ramesh   22            5
15   kiran   61            4


In [6]:
# Shell escape: list the working directory again; emp.parquet should now appear next to emp.csv
!ls

[31mConvert.ipynb[m[m    T.csv            dept_details.csv [34mempdir[m[m
R.csv            datatypes.csv    emp.csv          [34mnorthwind[m[m
S.csv            dept.csv         emp.parquet      [34mtpch0.01[m[m
