In [8]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

In [9]:
def convert_csv_to_parquet(input_file_path, output_file_path, drop_option):
    """Convert a CSV file to Parquet, optionally discarding incomplete data.

    The CSV is loaded with pandas, rows or columns containing NaN are removed
    according to ``drop_option``, and the result is written with PyArrow. The
    written file is then read back and its first 100 rows are printed as a
    quick check that it can be loaded.

    Args:
        input_file_path: Path of the CSV file to read.
        output_file_path: Path of the Parquet file to write.
        drop_option: ``'row'`` drops every row that contains a NaN,
            ``'column'`` drops every column that contains a NaN, and ``None``
            leaves the data untouched.

    Raises:
        ValueError: If ``drop_option`` is not ``'row'``, ``'column'`` or
            ``None``.
    """
    # Validate before doing any I/O so that a misspelled option (e.g. 'rows')
    # fails loudly instead of silently writing the data with nothing dropped.
    if drop_option not in ('row', 'column', None):
        raise ValueError(
            f"drop_option must be 'row', 'column' or None, got {drop_option!r}"
        )

    # Read CSV file into a Pandas DataFrame
    df = pd.read_csv(input_file_path)

    # Remove rows or columns with NaN fields based on the drop_option argument
    if drop_option == 'row':
        df = df.dropna()
    elif drop_option == 'column':
        df = df.dropna(axis=1)

    # Convert Pandas DataFrame to PyArrow Table
    table = pa.Table.from_pandas(df)

    # Write PyArrow Table to Parquet file
    pq.write_table(table, output_file_path)

    # Read the file back so the preview reflects what is actually on disk
    # rather than the in-memory frame.
    table = pq.read_table(output_file_path)

    # Convert the table to a Pandas DataFrame
    df = table.to_pandas()

    # Print the first 100 rows as a sanity check
    print(df.head(100))

In [10]:
# Source CSV and destination Parquet file for the conversion.
input_file_path = 'behavior_factor.csv'
output_file_path = 'bf.parquet'

# Which incomplete data to discard: 'row' or 'column'.
drop_option = 'column'

convert_csv_to_parquet(
    input_file_path=input_file_path,
    output_file_path=output_file_path,
    drop_option=drop_option,
)

    Unnamed: 0  HeightInMeters  WeightInKilograms    BMI   AgeCategory  \
0         1627            1.52              62.14  26.76  Age 55 to 59   
1        38323            1.75              58.97  19.20  Age 60 to 64   
2       233161            1.63              78.02  29.52  Age 50 to 54   
3          748            1.57             113.40  45.73  Age 35 to 39   
4       229966            1.83              99.79  29.84  Age 55 to 59   
..         ...             ...                ...    ...           ...   
95      205711            1.63              70.76  26.78  Age 65 to 69   
96      226583            1.65              72.57  26.63  Age 55 to 59   
97      162657            1.75              81.65  26.58  Age 50 to 54   
98        2678            1.85             111.13  32.32  Age 55 to 59   
99      217331            1.83              98.88  29.57  Age 25 to 29   

     SmokerStatus ECigaretteUsage AlcoholDrinkers PhysicalActivities  \
0    Never smoked           Never      

In [11]:
# Load the converted Parquet file back into a DataFrame using the PyArrow engine.
bf = pd.read_parquet('bf.parquet', engine='pyarrow')

In [12]:
# Show the loaded frame to inspect its columns.
bf

Unnamed: 0.1,Unnamed: 0,HeightInMeters,WeightInKilograms,BMI,AgeCategory,SmokerStatus,ECigaretteUsage,AlcoholDrinkers,PhysicalActivities,SleepHours,HadHeartAttack
0,1627,1.52,62.14,26.76,Age 55 to 59,Never smoked,Never,No,Yes,6.0,No
1,38323,1.75,58.97,19.20,Age 60 to 64,Never smoked,Never,No,Yes,7.0,No
2,233161,1.63,78.02,29.52,Age 50 to 54,Never smoked,Never,No,No,6.0,No
3,748,1.57,113.40,45.73,Age 35 to 39,Never smoked,Never,Yes,Yes,9.0,No
4,229966,1.83,99.79,29.84,Age 55 to 59,Former smoker,Never,Yes,Yes,6.0,No
...,...,...,...,...,...,...,...,...,...,...,...
58990,174938,1.75,68.04,22.15,Age 35 to 39,Former smoker,Somedays,Yes,Yes,7.0,No
58991,74272,1.57,95.25,38.41,Age 55 to 59,Never smoked,Never,Yes,Yes,6.0,No
58992,127009,1.78,86.18,27.26,Age 55 to 59,Never smoked,Never,Yes,Yes,8.0,No
58993,192281,1.68,90.72,32.28,Age 65 to 69,Former smoker,Never,No,Yes,7.0,No


In [13]:
# Remove the leftover 'Unnamed: 0' column, which is the first column of the
# loaded frame (presumably a row index that was saved into the CSV).
# Dropping by name instead of by position keeps this cell idempotent:
# re-running it is a no-op rather than removing the next real column.
bf = bf.drop(columns=['Unnamed: 0'], errors='ignore')

In [14]:
# Show the frame again to check the result of the column drop.
bf

Unnamed: 0,HeightInMeters,WeightInKilograms,BMI,AgeCategory,SmokerStatus,ECigaretteUsage,AlcoholDrinkers,PhysicalActivities,SleepHours,HadHeartAttack
0,1.52,62.14,26.76,Age 55 to 59,Never smoked,Never,No,Yes,6.0,No
1,1.75,58.97,19.20,Age 60 to 64,Never smoked,Never,No,Yes,7.0,No
2,1.63,78.02,29.52,Age 50 to 54,Never smoked,Never,No,No,6.0,No
3,1.57,113.40,45.73,Age 35 to 39,Never smoked,Never,Yes,Yes,9.0,No
4,1.83,99.79,29.84,Age 55 to 59,Former smoker,Never,Yes,Yes,6.0,No
...,...,...,...,...,...,...,...,...,...,...
58990,1.75,68.04,22.15,Age 35 to 39,Former smoker,Somedays,Yes,Yes,7.0,No
58991,1.57,95.25,38.41,Age 55 to 59,Never smoked,Never,Yes,Yes,6.0,No
58992,1.78,86.18,27.26,Age 55 to 59,Never smoked,Never,Yes,Yes,8.0,No
58993,1.68,90.72,32.28,Age 65 to 69,Former smoker,Never,No,Yes,7.0,No
