In [10]:
import pandas as pd
import re
import csv

# Read the CSV file
filepath = 'resource/csv/completed_recipes.csv'
df = pd.read_csv(filepath)

# Display the first few rows to check the data
print("Original DataFrame head:")
print(df.head(2))

# Function to extract the first URL from image_link column and remove all quotes
def extract_first_url(image_link_value):
    if pd.isna(image_link_value):
        return ""
    
    # Convert to string if not already
    if not isinstance(image_link_value, str):
        return str(image_link_value).strip('"\'')
    
    # Check if the string starts with c(
    if image_link_value.startswith('c('):
        # Use regex to extract the first URL without quotes
        match = re.search(r'c\(\s*"?(https?://[^",\s]+)"?', image_link_value)
        if match:
            # Get the URL and remove any remaining quotes
            url = match.group(1)
            url = url.replace('"', '').replace("'", "").strip()
            return url
    
    # If not in c() format or no URL found, remove any quotes and return
    return image_link_value.replace('"', '').replace("'", "").strip()

# Function to clean c( format from any column
def clean_c_format(value):
    if pd.isna(value):
        return ""
    
    # Convert to string if not already
    if not isinstance(value, str):
        return str(value)
    
    # Check if the string starts with c(
    if value.startswith('c('):
        # Remove c( at the beginning and ) at the end
        value = re.sub(r'^c\(\s*', '', value)
        value = re.sub(r'\s*\)$', '', value)
        
        # Remove quotes around values
        value = value.replace('\"', '').replace("\'", "")
        
        # Clean up any \n or excessive whitespace
        value = re.sub(r'\s*\\n\s*', ' ', value)
        value = re.sub(r'\s+', ' ', value)
    
    return value.strip()

# Apply the function to extract the first URL
df['image_link'] = df['image_link'].apply(extract_first_url)

# Apply clean_c_format to the specified columns
columns_to_clean = ['RecipeInstructions', 'Keywords', 'RecipeIngredientQuantities', 'RecipeIngredientParts']
for column in columns_to_clean:
    if column in df.columns:
        df[column] = df[column].apply(clean_c_format)

# Remove the "Images" column
df = df.drop(columns=['Images'])

# Rename "image_link" to "Images"
df = df.rename(columns={'image_link': 'Images'})

# Double-check to make sure no quotes remain in the Images column
df['Images'] = df['Images'].str.replace('"', '').str.replace("'", "")

# Display the modified DataFrame
print("\nModified DataFrame head:")
print(df.head(2))

# Show examples of the cleaned columns
print("\nExamples of cleaned columns:")
for column in columns_to_clean:
    if column in df.columns:
        print(f"\n{column} (first row):")
        print(df[column].iloc[0])

# Save the modified DataFrame back to CSV
output_filepath = 'resource/csv/completed_recipes_modified.csv'
df.to_csv(output_filepath, index=False, quoting=csv.QUOTE_MINIMAL, escapechar='\\')

print(f"\nModified CSV saved to {output_filepath}")

Original DataFrame head:
   RecipeId                               Name  AuthorId AuthorName CookTime  \
0        38  Low-Fat Berry Blue Frozen Dessert      1533     Dancer    PT24H   
1        39                            Biryani      1567   elly9812    PT25M   

  PrepTime TotalTime         DatePublished  \
0    PT45M  PT24H45M  1999-08-09T21:46:00Z   
1     PT4H   PT4H25M  1999-08-29T13:12:00Z   

                                         Description  \
0  Make and share this Low-Fat Berry Blue Frozen ...   
1  Make and share this Biryani recipe from Food.com.   

                                              Images  ... SodiumContent  \
0  c("https://img.sndimg.com/food/image/upload/w_...  ...          29.8   
1  c("https://img.sndimg.com/food/image/upload/w_...  ...         368.4   

  CarbohydrateContent FiberContent SugarContent  ProteinContent  \
0                37.1          3.6         30.2             3.2   
1                84.4          9.0         20.4            63.4   

In [14]:
import os

# Convert to Parquet
parquet_output_filepath = 'resource/csv/completed_recipes.parquet'
df.to_parquet(parquet_output_filepath, index=False)
print(f"Parquet file saved to {parquet_output_filepath}")

# Print file sizes for comparison
csv_size = os.path.getsize(output_filepath) / (1024 * 1024)  # Size in MB
parquet_size = os.path.getsize(parquet_output_filepath) / (1024 * 1024)  # Size in MB
print(f"\nFile size comparison:")
print(f"CSV file: {csv_size:.2f} MB")
print(f"Parquet file: {parquet_size:.2f} MB")
print(f"Compression ratio: {csv_size/parquet_size:.2f}x")

Parquet file saved to resource/csv/completed_recipes.parquet

File size comparison:
CSV file: 808.57 MB
Parquet file: 377.71 MB
Compression ratio: 2.14x
