In [1]:
!pip install paddlepaddle
!pip install paddleocr

Collecting paddlepaddle
  Downloading paddlepaddle-2.6.2-cp310-cp310-manylinux1_x86_64.whl.metadata (8.6 kB)
Collecting astor (from paddlepaddle)
  Downloading astor-0.8.1-py2.py3-none-any.whl.metadata (4.2 kB)
Downloading paddlepaddle-2.6.2-cp310-cp310-manylinux1_x86_64.whl (126.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 MB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading astor-0.8.1-py2.py3-none-any.whl (27 kB)
Installing collected packages: astor, paddlepaddle
Successfully installed astor-0.8.1 paddlepaddle-2.6.2
Collecting paddleocr
  Downloading paddleocr-2.8.1-py3-none-any.whl.metadata (19 kB)
Collecting lmdb (from paddleocr)
  Downloading lmdb-1.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.1 kB)
Collecting rapidfuzz (from paddleocr)
  Downloading rapidfuzz-3.9.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting python-docx (from paddleocr)
  Down

In [2]:
import pandas as pd  # For handling CSV data
import requests  # To fetch image data from URLs
from PIL import Image  # To open and manipulate images
from io import BytesIO  # For handling byte data in memory
import re  # For regular expressions and text pattern matching
import time  # To track and print elapsed time
import csv  # For handling CSV input and output
from paddleocr import PaddleOCR  # PaddleOCR for Optical Character Recognition

# Initialize PaddleOCR with angle classification and English language model
ocr = PaddleOCR(use_angle_cls=True, lang='en')

# Define the entity unit map: a mapping of entity types to their valid measurement units
entity_unit_map = {
    'width': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'depth': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'height': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'item_weight': {'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'},
    'maximum_weight_recommendation': {'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'},
    'voltage': {'kilovolt', 'millivolt', 'volt'},
    'wattage': {'kilowatt', 'watt'},
    'item_volume': {'centilitre', 'cubic foot', 'cubic inch', 'cup', 'decilitre', 'fluid ounce', 'gallon', 'imperial gallon', 'litre', 'microlitre', 'millilitre', 'pint', 'quart'}
}

# Function to clean extracted text by removing extra spaces
def clean_text(text):
    return re.sub(r'\s+', ' ', text).strip()

# Function to extract numeric terms followed by a word (for example, "5 kg" or "10 inches")
def extract_numeric_with_word(text):
    pattern = r'(\d+\.?\d*\s?\w+)'  # Matches numbers followed by words
    matches = re.findall(pattern, text)
    return ' '.join(matches)  # Joins the matches into a single string

# Function to download an image from a URL and extract text using PaddleOCR
def extract_text_from_image(url):
    try:
        # Download the image from the given URL
        response = requests.get(url)
        img = Image.open(BytesIO(response.content))
        img.save("/tmp/temp_image.jpg")  # Temporarily save the image for processing

        # Use PaddleOCR to extract text from the image
        result = ocr.ocr('/tmp/temp_image.jpg', cls=True)
        text = ' '.join([line[1][0] for line in result[0]])  # Extract recognized text

        # Clean and extract numeric terms with one word after them
        extracted_text = extract_numeric_with_word(text)
        return clean_text(extracted_text)
    except Exception as e:
        return f"Error: {e}"

# Function to replace abbreviated units with full forms based on the entity name
def extract_and_replace_abbreviated_units(extracted_text, entity_name, entity_unit_map):
    # Get the set of valid units for the given entity name
    unit_set = entity_unit_map.get(entity_name, set())
    
    # Mapping of abbreviations to full unit names
    abbreviation_to_full_unit = {
        'mg': 'milligram', 'g': 'gram', 'G': 'gram', 'mcg': 'microgram', 'MCG': 'microgram',
        'GSM': 'gram', 'gm': 'gram', 'kg': 'kilogram', 'KG': 'kilogram', 'KGS': 'kilogram',
        'Kg': 'kilogram', 'ug': 'microgram', 'oz': 'ounce', 'OZ': 'ounce', 'Oz': 'ounce',
        'ounces': 'ounce', 'lb': 'pound', 'LB': 'pound', 'LBS': 'pound', 'Lbs': 'pound',
        'Lb': 'pound', 'cm': 'centimetre', 'cms': 'centimetre', 'CM': 'centimetre',
        'Cm': 'centimetre', 'm': 'metre', 'M': 'metre', 'in': 'inch', 'IN': 'inch',
        'INCH': 'inch', 'In': 'inch', 'ft': 'foot', 'feet': 'foot', 'FT': 'foot',
        'FEET': 'foot', 'Ft': 'foot', 'grams': 'gram', 't': 'ton', 'T': 'ton',
        'ml': 'millilitre', 'mL': 'millilitre', 'ML': 'millilitre', 'mm': 'millimetre',
        'l': 'litre', 'cl': 'centilitre', 'kV': 'kilovolt', 'mV': 'millivolt',
        'V': 'volt', 'W': 'watt', 'kW': 'kilowatt'
    }
    
    # Regex pattern to extract numeric value and unit abbreviation
    pattern = r'(\d+(\.\d+)?)(\s*|\b)(grams|gm|G|g|GSM|kg|Kg|KGS|cms|ug|mcg|MCG|mg|µg|oz|OZ|Oz|ounces|lb|Lbs|LBS|LB|Lb|t|T|ml|mL|ML|mm|l|cl|cms|cm|CM|Cm|m|M|in|INCH|IN|In|Ft|feet|FT|FEET|kV|mV|V|W|kW)'
    
    # Search for a match using the pattern
    match = re.search(pattern, extracted_text)
    
    if match:
        value = match[1]  # Extract the numeric value
        abbrev = match[4]  # Extract the unit abbreviation

        # If the abbreviation is in the mapping and valid for the entity, return the full form
        full_unit = abbreviation_to_full_unit.get(abbrev)
        if full_unit and full_unit in unit_set:
            return f'{value} {full_unit}'  # Return "value unit" format

    # If no valid match is found, return an empty string
    return ''

# Start timing the process
start_time = time.time()

# Load the dataset and limit to 200 rows for processing
df = pd.read_csv('/kaggle/input/testset/test.csv')  # Replace with actual dataset
df = df.iloc[115000:120000]  # Adjust row range as needed

# List to store extracted text from images
extracted_texts = []
for i, link in enumerate(df['image_link']):
    extracted_texts.append(extract_text_from_image(link))
    
    # Print elapsed time after every 100 rows
    if (i + 1) % 100 == 0:
        elapsed_time = time.time() - start_time
        print(f"Processed {i + 1} rows. Time elapsed: {elapsed_time:.2f} seconds.")

# Add extracted text to the DataFrame
df['extracted_text'] = extracted_texts

# Save the updated DataFrame to a new CSV file
intermediate_csv = '/kaggle/working/updated_dataset_ONTEST_22k-35k.csv'
df.to_csv(intermediate_csv, index=False)

print("Text extraction complete. Intermediate dataset saved.")

# Function to process the CSV and replace abbreviations in the extracted text
def process_csv(input_file, output_file, entity_unit_map):
    with open(input_file, mode='r', newline='', encoding='utf-8') as infile:
        reader = csv.DictReader(infile)
        rows = list(reader)  # Read all rows into a list

    with open(output_file, mode='w', newline='', encoding='utf-8') as outfile:
        # Define the output columns
        fieldnames = ['index', 'prediction']
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()
        
        # Process each row and update the extracted text with standardized units
        for row in rows:
            index = row['index']  # Original index
            entity_name = row['entity_name']  # Entity type
            extracted_text = row['extracted_text']  # Extracted text
            
            # Replace unit abbreviations with full units
            updated_value = extract_and_replace_abbreviated_units(extracted_text, entity_name, entity_unit_map)
            
            # Write updated values to the output CSV
            writer.writerow({'index': index, 'prediction': updated_value})

# Define input and output file paths for final processing
final_input_file = intermediate_csv  # Path to intermediate CSV
final_output_file = '/kaggle/working/updated_output_starting_final_115k-120k.csv'  # Path for final output

# Process the CSV and save the updated version
process_csv(final_input_file, final_output_file, entity_unit_map)

print(f"CSV processing complete. Final dataset saved as '{final_output_file}'.")


download https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar to /root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer/en_PP-OCRv3_det_infer.tar


100%|██████████| 4.00M/4.00M [00:00<00:00, 7.95MiB/s]


download https://paddleocr.bj.bcebos.com/PP-OCRv4/english/en_PP-OCRv4_rec_infer.tar to /root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer/en_PP-OCRv4_rec_infer.tar


100%|██████████| 10.2M/10.2M [00:00<00:00, 18.4MiB/s]


download https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar to /root/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer/ch_ppocr_mobile_v2.0_cls_infer.tar


100%|██████████| 2.19M/2.19M [00:00<00:00, 5.52MiB/s]

[2024/09/16 04:42:51] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_text_length=25, rec_c




[2024/09/16 04:42:53] ppocr DEBUG: dt_boxes num : 10, elapsed : 0.5540115833282471
[2024/09/16 04:42:53] ppocr DEBUG: cls num  : 10, elapsed : 0.06502437591552734
[2024/09/16 04:42:54] ppocr DEBUG: rec_res num  : 10, elapsed : 0.8348281383514404
[2024/09/16 04:42:55] ppocr DEBUG: dt_boxes num : 10, elapsed : 0.2560100555419922
[2024/09/16 04:42:55] ppocr DEBUG: cls num  : 10, elapsed : 0.024482250213623047
[2024/09/16 04:42:55] ppocr DEBUG: rec_res num  : 10, elapsed : 0.42424631118774414
[2024/09/16 04:42:55] ppocr DEBUG: dt_boxes num : 4, elapsed : 0.25008296966552734
[2024/09/16 04:42:55] ppocr DEBUG: cls num  : 4, elapsed : 0.010318279266357422
[2024/09/16 04:42:56] ppocr DEBUG: rec_res num  : 4, elapsed : 0.22098875045776367
[2024/09/16 04:42:56] ppocr DEBUG: dt_boxes num : 4, elapsed : 0.2485954761505127
[2024/09/16 04:42:56] ppocr DEBUG: cls num  : 4, elapsed : 0.010169029235839844
[2024/09/16 04:42:56] ppocr DEBUG: rec_res num  : 4, elapsed : 0.16176152229309082
[2024/09/16 04: