In [1]:
import warnings

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

In [2]:
def convert_csv_to_parquet(input_file_path, output_file_path, drop_option, preview_rows=100):
    """Convert a CSV file to Parquet and print a preview of the stored data.

    The CSV is loaded with pandas, optionally stripped of rows or columns
    that contain NaN, written to ``output_file_path`` through PyArrow, and
    then read back from that file so the printed preview shows what was
    actually stored rather than the in-memory frame.

    Args:
        input_file_path: Path of the CSV file to read.
        output_file_path: Path the Parquet file is written to.
        drop_option: ``'row'`` drops every row that contains a NaN;
            ``'column'`` drops every column that contains a NaN. ``None``
            keeps the data as read. Any other value also keeps the data as
            read but emits a ``UserWarning``, so a misspelled option does
            not go unnoticed.
        preview_rows: Number of leading rows printed from the file that was
            read back. Defaults to 100.

    Returns:
        None. The preview is printed to standard output.
    """
    csv_frame = pd.read_csv(input_file_path)

    # Remove rows or columns with NaN fields based on the drop_option argument.
    if drop_option == 'row':
        csv_frame = csv_frame.dropna()
    elif drop_option == 'column':
        csv_frame = csv_frame.dropna(axis=1)
    elif drop_option is not None:
        # Historically an unknown option meant "no cleaning"; keep that
        # result but make it visible instead of failing silently.
        warnings.warn(
            f"Unrecognized drop_option {drop_option!r}; expected 'row' or "
            "'column'. No NaN rows or columns were dropped.",
            UserWarning,
            stacklevel=2,
        )

    # Convert to a PyArrow Table and write it out as Parquet.
    pq.write_table(pa.Table.from_pandas(csv_frame), output_file_path)

    # Read the file back from disk and preview its first rows.
    stored_frame = pq.read_table(output_file_path).to_pandas()
    print(stored_frame.head(preview_rows))

In [3]:
# Source CSV and destination Parquet file (relative paths resolve against
# the current working directory).
input_file_path = 'behavior_factor.csv'
output_file_path = 'bf.parquet'

# NaN handling passed to the converter: 'row' or 'column'.
drop_option = 'column'

convert_csv_to_parquet(
    input_file_path=input_file_path,
    output_file_path=output_file_path,
    drop_option=drop_option,
)

    Unnamed: 0  HeightInMeters  WeightInKilograms    BMI      AgeCategory  \
0            0            1.60              71.67  27.99     Age 65 to 69   
1            1            1.78              95.25  30.13     Age 70 to 74   
2            2            1.85             108.86  31.66     Age 75 to 79   
3            3            1.70              90.72  31.32  Age 80 or older   
4            4            1.55              79.38  33.07  Age 80 or older   
..         ...             ...                ...    ...              ...   
95          95            1.80              77.11  23.71     Age 75 to 79   
96          96            1.68              90.72  32.28     Age 50 to 54   
97          97            1.70              72.57  25.06  Age 80 or older   
98          98            1.73             120.20  40.29     Age 45 to 49   
99          99            1.65              93.89  34.45     Age 60 to 64   

     SmokerStatus ECigaretteUsage AlcoholDrinkers PhysicalActivities  \
0  

In [12]:
# Load 'bf.parquet' into a DataFrame, reading it with the pyarrow engine.
bf = pd.read_parquet(path='bf.parquet', engine='pyarrow')

In [13]:
# Display the frame loaded from 'bf.parquet'; the rendered output elides the
# middle rows of a long frame.
bf

Unnamed: 0.1,Unnamed: 0,HeightInMeters,WeightInKilograms,BMI,AgeCategory,SmokerStatus,ECigaretteUsage,AlcoholDrinkers,PhysicalActivities,SleepHours,HadHeartAttack
0,0,1.60,71.67,27.99,Age 65 to 69,Former smoker,Never,No,Yes,9.0,No
1,1,1.78,95.25,30.13,Age 70 to 74,Former smoker,Never,No,Yes,6.0,No
2,2,1.85,108.86,31.66,Age 75 to 79,Former smoker,Never,Yes,No,8.0,No
3,3,1.70,90.72,31.32,Age 80 or older,Never smoked,Never,No,Yes,9.0,No
4,4,1.55,79.38,33.07,Age 80 or older,Never smoked,Never,No,Yes,5.0,No
...,...,...,...,...,...,...,...,...,...,...,...
234314,234314,1.78,102.06,32.28,Age 60 to 64,Never smoked,Never,Yes,Yes,6.0,No
234315,234315,1.93,90.72,24.34,Age 25 to 29,Never smoked,Never,No,Yes,7.0,No
234316,234316,1.68,83.91,29.86,Age 65 to 69,Never smoked,Never,Yes,Yes,7.0,No
234317,234317,1.70,83.01,28.66,Age 50 to 54,Never smoked,Never,No,Yes,7.0,No


In [14]:
# Remove the leading column only when it is a leftover CSV index column
# (pandas labels header-less CSV columns "Unnamed: N"). Checking the name
# keeps a re-run of this cell from deleting real data columns, which a purely
# positional drop of column 0 would do.
if len(bf.columns) > 0 and str(bf.columns[0]).startswith('Unnamed:'):
    bf = bf.drop(columns=bf.columns[0])

In [15]:
# Display the frame again to inspect which columns remain.
bf

Unnamed: 0,HeightInMeters,WeightInKilograms,BMI,AgeCategory,SmokerStatus,ECigaretteUsage,AlcoholDrinkers,PhysicalActivities,SleepHours,HadHeartAttack
0,1.60,71.67,27.99,Age 65 to 69,Former smoker,Never,No,Yes,9.0,No
1,1.78,95.25,30.13,Age 70 to 74,Former smoker,Never,No,Yes,6.0,No
2,1.85,108.86,31.66,Age 75 to 79,Former smoker,Never,Yes,No,8.0,No
3,1.70,90.72,31.32,Age 80 or older,Never smoked,Never,No,Yes,9.0,No
4,1.55,79.38,33.07,Age 80 or older,Never smoked,Never,No,Yes,5.0,No
...,...,...,...,...,...,...,...,...,...,...
234314,1.78,102.06,32.28,Age 60 to 64,Never smoked,Never,Yes,Yes,6.0,No
234315,1.93,90.72,24.34,Age 25 to 29,Never smoked,Never,No,Yes,7.0,No
234316,1.68,83.91,29.86,Age 65 to 69,Never smoked,Never,Yes,Yes,7.0,No
234317,1.70,83.01,28.66,Age 50 to 54,Never smoked,Never,No,Yes,7.0,No
