# Parquet Conversion File


In [None]:
# required imports
import pandas as pd
import os
import base64
# import pyarrow 
# import fastparquet

**Functions to add images to the DataFrame and save it as a Parquet file**

In [None]:
# helper function
def image_to_base64(filepath):
    '''Return the contents of the file at ``filepath`` as a base64 string.

    The file is read in binary mode and the encoded bytes are decoded to
    UTF-8 text. Any exception raised along the way is reported with
    ``print`` and ``None`` is returned instead of propagating the error.
    '''
    try:
        with open(filepath, "rb") as handle:
            raw_bytes = handle.read()
            encoded_bytes = base64.b64encode(raw_bytes)
            return encoded_bytes.decode('utf-8')
    except Exception as err:
        # Best-effort helper: report the problem and signal failure with None.
        print(f"Error converting {filepath} to base64: {err}")
        return None

# main/core function
def add_images_to_dataframe(input_tsv, output, image_folder):
    '''Attach base64-encoded images to a TSV dataset and save it as parquet.

    Reads ``input_tsv`` as a tab-separated file (ISO-8859-1 encoded), encodes
    ``<image_folder>/<Image Id>.jpg`` for every row, adds the ``image`` and
    ``image_path`` columns and writes the selected columns to ``output``.
    A confirmation line is printed once the file has been written.

    Args:
        input_tsv: Path of the tab-separated input file. It must contain the
            columns "Image Id", "Prompt" and "Rewritten Question"; a
            "Category" column is kept as well when it is present.
        output: Destination path of the parquet file.
        image_folder: Directory that holds the ``<Image Id>.jpg`` files.

    Raises:
        KeyError: If a required column is missing from the input file.
        FileNotFoundError: If an image could not be read and encoded.
    '''
    required_columns = ["Image Id", "Prompt", "Rewritten Question"]

    # read the input as TSV file
    df = pd.read_csv(input_tsv, sep='\t', encoding='ISO-8859-1')

    # Fail fast: without this check a missing column would only surface at
    # the final column selection, after every image had been encoded.
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise KeyError(
            f"Missing required column(s) in {input_tsv}: {missing_columns}"
        )

    # Several rows may reference the same image; encode each file only once.
    # The cache is keyed by file name because that is what identifies the
    # file on disk.
    encoded_by_file_name = {}
    image_base64_list = []
    image_path_list = []
    for image_id in df['Image Id']:
        file_name = f"{image_id}.jpg"
        image_base64 = encoded_by_file_name.get(file_name)
        if image_base64 is None:
            image_path = os.path.join(image_folder, file_name)
            image_base64 = image_to_base64(image_path)
            if not image_base64:
                # The helper returns None on failure; an empty string (a
                # zero-byte file) is rejected too, as in the original check.
                raise FileNotFoundError(f"IMG file not found: {image_id}")
            encoded_by_file_name[file_name] = image_base64
        image_base64_list.append(image_base64)
        image_path_list.append(file_name)

    df['image'] = image_base64_list
    df['image_path'] = image_path_list

    columns_to_save = ["image", "image_path"] + required_columns
    if 'Category' in df.columns:
        columns_to_save.append('Category')  # include this if needed
    df = df[columns_to_save]

    # save the updated DataFrame to a .parquet file
    df.to_parquet(output)
    print(f"File: {output} updated")


In [None]:
# Convert the test split: initialize input and output file paths.
# NOTE(review): "C:/Users//..." contains an empty path segment (possibly a
# removed user name) - confirm the paths before running.
input_tsv = "C:/Users//ReVision/data/product_data/test_product_data_rewrite.tsv"
# Despite its name, `valid_output` holds the test split's parquet path here.
valid_output = "C:/Users//ReVision/data/product_data/test_product_data_rewrite.parquet"
image_folder = "C:/Users//ReVision/data/product_data/product_images"

# Encode the images referenced by the TSV and write the parquet file.
add_images_to_dataframe(input_tsv, valid_output, image_folder)

In [None]:
# Convert the train split: set the input TSV, output parquet and image folder.
# NOTE(review): "C:/Users//..." contains an empty path segment (possibly a
# removed user name) - confirm the paths before running.
input_tsv = "C:/Users//ReVision/data/product_data/train_product_data_rewrite.tsv"
# Despite its name, `valid_output` holds the train split's parquet path here.
valid_output = "C:/Users//ReVision/data/product_data/train_product_data_rewrite.parquet"
image_folder = "C:/Users//ReVision/data/product_data/product_images"

# Encode the images referenced by the TSV and write the parquet file.
add_images_to_dataframe(input_tsv, valid_output, image_folder)

In [None]:
# Convert the validation split: set the input TSV, output parquet and image folder.
# NOTE(review): "C:/Users//..." contains an empty path segment (possibly a
# removed user name) - confirm the paths before running.
input_tsv = "C:/Users//ReVision/data/product_data/valid_product_data_rewrite.tsv"
valid_output = "C:/Users//ReVision/data/product_data/valid_product_data_rewrite.parquet"
image_folder = "C:/Users//ReVision/data/product_data/product_images"

# Encode the images referenced by the TSV and write the parquet file.
add_images_to_dataframe(input_tsv, valid_output, image_folder)

In [None]:
# Read the train split's parquet file back into a DataFrame as a sanity check.
# NOTE(review): this relative path resolves against the current working
# directory, while the train conversion cell writes to an absolute path under
# "C:/Users//ReVision/data/product_data/" - confirm both point to the same file.
df = pd.read_parquet('train_product_data_rewrite.parquet', engine='auto')
# Show the first 10 rows.
df.head(10)

In [None]:
# Print a summary (columns, non-null counts, dtypes) of the DataFrame in `df`.
df.info()