In [24]:
import os
import requests
import pandas as pd
import numpy as np
import signal
from PIL import Image
from io import BytesIO
from paddleocr import PaddleOCR
from ppocr.utils.logging import get_logger

In [25]:
import logging
# Take the logger returned by ppocr's get_logger() and raise its threshold to
# ERROR so that lower-severity records are not emitted.
logger = get_logger()
logger.setLevel(logging.ERROR)

In [26]:
# Build the OCR engine once; the same instance is reused for every image.
ocr = PaddleOCR(use_angle_cls=True, lang='en')

# Path to your CSV file. Built with os.path.join so it resolves on POSIX
# systems as well as on Windows (a literal backslash path only works on
# Windows).
csv_file_path = os.path.join('student_resource3', 'dataset', 'train.csv')

# Output folder for the per-chunk OCR result CSVs
output_folder = 'paddle_ocr_output'
os.makedirs(output_folder, exist_ok=True)

In [37]:
# Read the CSV, keeping only the first 1000 rows
df = pd.read_csv(csv_file_path).head(1000)

In [47]:
# Row offset at which processing starts; rows before it are dropped below.
start_row = 200

# Fail fast when the offset would leave no rows to process.
if start_row >= len(df):
    raise ValueError("Starting row index exceeds the number of rows in the DataFrame")
# NOTE(review): this rebinds `df`, so running this cell again without first
# re-running the cell that loads the CSV slices off another `start_row` rows.
df = df.iloc[start_row:]

In [51]:
# Buffer of successfully processed rows waiting to be written to a chunk CSV
processed_data = []
chunk_size = 50  # Save the CSV once 50 processed rows have accumulated
chunk_counter = 0
# Chunk number derived from the starting row offset
initial_chunk_number = start_row // chunk_size

In [52]:
def process_row(row, timeout=30):
    """Download the image referenced by one dataset row and run OCR on it.

    Args:
        row: Mapping-like record (for example a row yielded by
            ``DataFrame.iterrows()``) providing the keys ``image_link``,
            ``group_id``, ``entity_name`` and ``entity_value``.
        timeout: Seconds to wait on the image server before giving up.
            Without a timeout a single unresponsive host would stall the
            whole run indefinitely.

    Returns:
        A dict with the keys ``group_id``, ``text``, ``entity_name`` and
        ``entity_value``, where ``text`` is every detected string joined
        with ``"; "`` (empty when nothing was detected). Returns ``None``
        when downloading, decoding or OCR raised an exception; the error is
        logged rather than propagated.
    """
    img_url = row['image_link']
    group_id = row['group_id']
    entity_name = row['entity_name']
    entity_value = row['entity_value']

    ocr_result_text = ""

    try:
        # Download the image from the URL
        response = requests.get(img_url, timeout=timeout)
        response.raise_for_status()  # Ensure we notice bad responses
        img = Image.open(BytesIO(response.content)).convert('RGB')

        # Convert the image to a NumPy array
        img_np = np.array(img)

        # Perform OCR on the image
        result = ocr.ocr(img_np, cls=True)

        if result is not None:
            # The per-image entry can itself be None/empty when no text was
            # found; skip those so a text-free image yields an empty string
            # instead of a TypeError that would discard the row.
            # Concatenate all detected texts with ";" delimiter
            ocr_result_text = "; ".join(
                line[1][0] for res in result if res for line in res
            )

    except Exception as e:
        # Best-effort by design: log the error and let the caller skip the row
        logger.error(f"Error processing image {img_url}: {e}")
        return None  # Return None to indicate an error occurred

    # Return the processed row as a dictionary
    return {
        'group_id': group_id,
        'text': ocr_result_text.strip(),  # Clean up surrounding whitespace
        'entity_name': entity_name,
        'entity_value': entity_value
    }

In [53]:

def get_next_chunk_filename(chunk_number):
    """Return the first unused chunk CSV path at or after ``chunk_number``.

    Candidate paths are ``ocr_results_chunk_<n>.csv`` inside ``output_folder``;
    ``n`` is bumped one at a time until a path that does not yet exist on
    disk is found.
    """
    candidate = os.path.join(output_folder, f'ocr_results_chunk_{chunk_number}.csv')
    while os.path.exists(candidate):
        # Taken already: try the next chunk number.
        chunk_number += 1
        candidate = os.path.join(output_folder, f'ocr_results_chunk_{chunk_number}.csv')
    return candidate

In [54]:
def save_chunk(processed_data, chunk_number):
    """Write the buffered rows to a new chunk CSV and empty the buffer.

    Does nothing when ``processed_data`` is empty. Otherwise the rows are
    written (without the index) to the path chosen by
    ``get_next_chunk_filename(chunk_number)``, the path is printed, and the
    caller's list is cleared in place.
    """
    if not processed_data:
        return

    destination = get_next_chunk_filename(chunk_number)
    chunk_frame = pd.DataFrame(processed_data)
    chunk_frame.to_csv(destination, index=False)
    print(f"Chunk saved to {destination}")

    # Clear in place so the caller's buffer starts empty for the next chunk.
    processed_data.clear()

In [56]:
# Run OCR over every remaining row, flushing results to disk in chunks.
# `finished` records whether the loop ran to the end, so the closing message
# does not claim completion after an interrupt or an unexpected error.
finished = False
try:
    chunk_counter = initial_chunk_number
    for _, row in df.iterrows():
        processed_row = process_row(row)
        if processed_row is not None:  # Only append if processing was successful
            processed_data.append(processed_row)

        # Save if we have enough rows
        if len(processed_data) >= chunk_size:
            save_chunk(processed_data, chunk_counter)
            chunk_counter += 1  # Increment chunk counter after saving

    finished = True

except KeyboardInterrupt:
    print("\nKeyboardInterrupt detected! Saving current progress...")

finally:
    # Flush whatever is still buffered; save_chunk is a no-op on an empty buffer
    save_chunk(processed_data, chunk_counter)

    if finished:
        print("OCR processing complete. All data saved.")
    else:
        print("OCR processing stopped early. Partial results saved.")


KeyboardInterrupt detected! Saving current progress...
OCR processing complete. All data saved.


In [34]:
import pandas as pd
import os
folder_path = r'paddle_ocr_output'

# os.listdir returns names in arbitrary order. Sorting by (length, name) puts
# ocr_results_chunk_2.csv before ocr_results_chunk_10.csv, so the merged row
# order is reproducible and follows the chunk numbering.
csv_files = sorted(
    (f for f in os.listdir(folder_path) if f.endswith('.csv')),
    key=lambda name: (len(name), name),
)
if not csv_files:
    # pd.concat would otherwise fail with an opaque "No objects to concatenate"
    raise ValueError(f"No CSV files found in {folder_path!r}; nothing to merge")

# Read each chunk without rebinding the notebook-level `df`, which holds the
# input rows used by the OCR loop.
dataframes = [pd.read_csv(os.path.join(folder_path, file)) for file in csv_files]

merged_df = pd.concat(dataframes, ignore_index=True)
merged_df.to_csv('./merged_output.csv', index=False)

print("All CSV files have been merged successfully!")


All CSV files have been merged successfully!
