In [2]:
import pandas as pd
import json
import ast

# Function to clean and parse the CSV data
def clean_data(row):
    """Parse the stringified ``regions`` and ``Size`` fields of one row.

    Args:
        row: A mapping-like row (for example the Series passed by
            ``DataFrame.apply(..., axis=1)``) whose ``'regions'`` entry is
            the text form of a list of dicts and whose ``'Size'`` entry is
            the text form of a Python literal such as ``"(1009, 762)"``.

    Returns:
        The same ``row`` object, with ``row['regions']`` replaced by the
        parsed list and ``row['Size']`` replaced by the evaluated literal.

    Raises:
        ValueError: If ``regions`` is neither a valid Python literal nor
            valid JSON after quote substitution (``json.JSONDecodeError``
            is a ``ValueError`` subclass), or if ``Size`` is malformed.
        SyntaxError: If ``Size`` is not syntactically a Python literal.
    """
    raw_regions = row['regions']
    if raw_regions == '[]':
        row['regions'] = []
    else:
        try:
            # Parse the Python repr directly: blindly swapping quote
            # characters breaks on values such as "don't".
            row['regions'] = ast.literal_eval(raw_regions)
        except (ValueError, SyntaxError):
            # Fall back to the JSON route for input that uses JSON-only
            # tokens (true / false / null).
            row['regions'] = json.loads(raw_regions.replace("'", '"'))

    # Convert the string representation of size to the actual tuple
    row['Size'] = ast.literal_eval(row['Size'])

    return row

# Function to calculate the text-image ratio
def calculate_text_image_ratio(row):
    """Return the text-box area as a percentage of the image area.

    Args:
        row: A mapping-like row whose ``'Size'`` entry holds the two image
            dimensions at positions 0 and 1, and whose ``'regions'`` entry
            is an iterable of dicts with ``'L'``, ``'T'``, ``'R'`` and
            ``'B'`` edge coordinates.

    Returns:
        The summed bounding-box area divided by the image area, times 100,
        formatted with two decimals and a trailing ``%`` (e.g. ``"1.97%"``).
        ``"0.00%"`` is returned when the image area is zero.

    Note:
        Each box contributes its full area, so overlapping boxes are counted
        more than once and the result can exceed 100%.
    """
    size = row['Size']
    image_area = size[0] * size[1]

    # A degenerate image has no area to cover; avoid dividing by zero.
    if image_area == 0:
        return "0.00%"

    total_bbox_area = sum(
        (bbox['R'] - bbox['L']) * (bbox['B'] - bbox['T'])
        for bbox in row['regions']
    )

    ratio = (total_bbox_area / image_area) * 100
    return "{:.2f}%".format(ratio)

# Read the input CSV into a DataFrame
file_path = 'test2.csv'
data = pd.read_csv(file_path)

# Parse the stringified columns, one row at a time
data_cleaned = data.apply(clean_data, axis=1)

# Compute the per-row ratio first, then attach it as a new column
ratio_column = data_cleaned.apply(calculate_text_image_ratio, axis=1)
data_cleaned['text_image_ratio'] = ratio_column

# Show only the identifying columns next to the computed ratio
summary_columns = ['id', 'Size', 'text_image_ratio']
print(data_cleaned[summary_columns])


# Save the full cleaned frame, without the index column
data_cleaned.to_csv('output.csv', index=False)


        id         Size text_image_ratio
0        0  (1009, 762)            1.97%
1        1  (1017, 708)            1.78%
2        2  (1023, 764)            0.00%
3        3   (859, 747)            0.00%
4        4   (760, 763)            0.00%
...    ...          ...              ...
1795  1795   (858, 768)           18.28%
1796  1796   (814, 768)           16.36%
1797  1797  (1024, 702)           18.82%
1798  1798  (1023, 357)           40.04%
1799  1799  (1017, 681)           33.09%

[1800 rows x 3 columns]
