In [None]:
!pip install torch torchvision torchaudio
!pip install easyocr
!pip install transformers
!pip install opencv-python-headless

Collecting easyocr
  Downloading easyocr-1.7.1-py3-none-any.whl.metadata (11 kB)
Collecting python-bidi (from easyocr)
  Downloading python_bidi-0.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.6 kB)
Collecting pyclipper (from easyocr)
  Downloading pyclipper-1.3.0.post5-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (9.0 kB)
Collecting ninja (from easyocr)
  Downloading ninja-1.11.1.1-py2.py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl.metadata (5.3 kB)
Downloading easyocr-1.7.1-py3-none-any.whl (2.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m40.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ninja-1.11.1.1-py2.py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl (307 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m307.2/307.2 kB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyclipper-1.3.0.post5-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (

In [None]:
import os
import pandas as pd
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import time
import easyocr
import re
from tqdm import tqdm

# Initialize EasyOCR reader for English language (GPU inference requested)
reader = easyocr.Reader(['en'], gpu=True)

# Input/output locations (ensure paths are correct for the current runtime)
test_csv_file_path = r'/content/test.csv'
# NOTE(review): the file name advertises 65000-80000 while the slice processed
# below is START_INDEX..END_INDEX (85001-105000) — confirm the intended name.
output_file_path = r'/content/result_out_65000_80000.csv'

# Inclusive bounds of the `index` column values handled by this run
START_INDEX = 85001
END_INDEX = 105000

# Load test data
test_df = pd.read_csv(test_csv_file_path)

# Keep only rows whose `index` lies in [START_INDEX, END_INDEX] (both ends inclusive)
filtered_df = test_df[test_df['index'].between(START_INDEX, END_INDEX)]

# Create the output CSV with its header row if it doesn't exist yet;
# an existing file is left alone so results can be appended to it.
if not os.path.exists(output_file_path):
    with open(output_file_path, 'w') as f:
        f.write("prediction index,prediction\n")

# Build a requests session with automatic retries and a larger connection pool
def create_session():
    """Return a ``requests.Session`` whose HTTP and HTTPS traffic share one adapter.

    The adapter keeps up to 100 pooled connections and retries a request up to
    3 times (backoff factor 1) when the server answers 502, 503 or 504.
    """
    retry_policy = Retry(total=3, backoff_factor=1, status_forcelist=[502, 503, 504])
    # Handing the policy to the constructor is equivalent to assigning
    # `max_retries` on the mounted adapter afterwards.
    pooled_adapter = HTTPAdapter(
        pool_connections=100,
        pool_maxsize=100,
        max_retries=retry_policy,
    )

    http_session = requests.Session()
    for scheme in ('http://', 'https://'):
        http_session.mount(scheme, pooled_adapter)
    return http_session

# Function to download image using requests session
def download_image(image_link, save_folder, session, retries=3, delay=1):
    """Download ``image_link`` into ``save_folder`` and return the saved path.

    The file is named after the last path component of ``image_link``. If a
    file with that name already exists in ``save_folder`` it is returned
    without touching the network.

    Args:
        image_link: URL of the image.
        save_folder: Existing directory to store the image in.
        session: Object exposing ``get(url, stream=..., timeout=...)`` like a
            ``requests.Session``; the returned response must provide
            ``status_code``, ``iter_content`` and ``close``.
        retries: Maximum number of attempts.
        delay: Seconds to wait after an attempt that raised, before retrying.

    Returns:
        The path of the saved image, or ``None`` if no attempt produced a
        complete download (non-200 status or an error on every attempt).
    """
    import tempfile  # local import: only needed for the atomic-write temp file

    filename = Path(image_link).name
    image_save_path = os.path.join(save_folder, filename)

    if os.path.exists(image_save_path):
        return image_save_path

    for attempt in range(retries):
        temp_path = None
        try:
            response = session.get(image_link, stream=True, timeout=10)
            try:
                if response.status_code == 200:
                    # Stream into a temp file in the same folder and rename it
                    # only once complete, so an interrupted transfer can never
                    # leave a truncated image that the "already exists"
                    # shortcut above would later accept as valid.
                    fd, temp_path = tempfile.mkstemp(dir=save_folder, suffix='.part')
                    with os.fdopen(fd, 'wb') as f:
                        for chunk in response.iter_content(1024):
                            f.write(chunk)
                    os.replace(temp_path, image_save_path)
                    return image_save_path
            finally:
                # A streamed response holds its connection until closed.
                response.close()
        except Exception:
            # Best effort: discard the partial file, pause, then retry.
            if temp_path is not None and os.path.exists(temp_path):
                os.remove(temp_path)
            if attempt < retries - 1:  # no point sleeping after the last attempt
                time.sleep(delay)

    return None

# Parallel image download using one shared session with an enlarged connection pool
def download_images_concurrently(image_links, save_folder):
    """Download every link in ``image_links`` into ``save_folder`` using 20 threads.

    All workers share the session returned by ``create_session()``. Failures
    are reported on stdout and never raised: both exceptions escaping a worker
    and downloads for which ``download_image`` returned ``None``.

    Args:
        image_links: Iterable of image URLs.
        save_folder: Existing directory passed through to ``download_image``.
    """
    # The session is entered first so it is closed only after the executor has
    # shut down, i.e. after every worker has finished using it.
    with create_session() as session, ThreadPoolExecutor(max_workers=20) as executor:
        futures = {
            executor.submit(download_image, link, save_folder, session): link
            for link in image_links
        }
        for future in as_completed(futures):
            link = futures[future]
            try:
                saved_path = future.result()
            except Exception as exc:
                print(f"Failed to download image {link}: {exc}")
            else:
                # download_image signals failure by returning None rather than
                # raising, so surface that case as well.
                if saved_path is None:
                    print(f"Failed to download image {link}: no successful response after retries")

# OCR helper: image path in, single text string out
def extract_text_from_image(image_path):
    """Run the module-level EasyOCR ``reader`` on ``image_path``.

    Returns one string made of element 1 of every detection returned by
    ``reader.readtext`` (used here as the recognised text), joined with
    single spaces in the order the reader returned them.
    """
    detections = reader.readtext(image_path)
    return ' '.join(detection[1] for detection in detections)

# Regex used to pull a "<number><optional space><unit>" value for each entity name.
# Every pattern has the full value as group 1, the fractional part as group 2
# and the unit as group 3.
# NOTE(review): units are not followed by a word boundary, so e.g. "2 games"
# yields "2 g" for item_weight — confirm this is acceptable.
_LENGTH_PATTERN = r'(\d+(\.\d+)?\s?(cm|mm|in|ft))'  # shared by height / width / depth

entity_patterns = dict(
    item_weight=r'(\d+(\.\d+)?\s?(g|kg|mg|lb))',
    item_volume=r'(\d+(\.\d+)?\s?(ml|l|fl oz|gal))',
    height=_LENGTH_PATTERN,
    width=_LENGTH_PATTERN,
    depth=_LENGTH_PATTERN,
    voltage=r'(\d+(\.\d+)?\s?(V|kV|mV))',
    wattage=r'(\d+(\.\d+)?\s?(W|kW))',
    maximum_weight_recommendation=r'(\d+(\.\d+)?\s?(g|kg|lb))',
)

def categorize_text_by_entity(extracted_text, entity_name):
    """Return the first value in ``extracted_text`` matching ``entity_name``'s pattern.

    ``entity_name`` is lower-cased and looked up in ``entity_patterns``; the
    search is case-insensitive. Returns the matched "<number><unit>" text
    (group 1 of the pattern), or ``None`` when the entity has no pattern or
    nothing in the text matches.
    """
    regex = entity_patterns.get(entity_name.lower())
    if not regex:
        return None

    # Only the leftmost match is used, so a single search is enough.
    first_hit = re.search(regex, extracted_text, re.IGNORECASE)
    if first_hit is None:
        return None
    return first_hit.group(1)

# Process the dataset row by row, appending each result as soon as it is known
def process_and_save_results(df, save_folder, output_file):
    """OCR every row's image and append ``index,prediction`` lines to ``output_file``.

    For each row of ``df`` (columns ``image_link``, ``entity_name`` and
    ``index`` are read) the image is looked up in ``save_folder`` by the last
    path component of its URL and downloaded if missing. Its text is then
    extracted and matched against the pattern for the row's entity.

    Rows whose image cannot be downloaded are skipped and produce no output
    line. When no value is found the literal text ``None`` is written as the
    prediction.

    Args:
        df: DataFrame with the columns listed above.
        save_folder: Directory holding (or receiving) the images.
        output_file: Path of the CSV file to append to.
    """
    # One session for all fallback downloads (closed on exit, instead of an
    # unclosed Session per missing image) and one file handle for all rows.
    with requests.Session() as session, open(output_file, 'a') as out:
        for _, row in tqdm(df.iterrows(), total=df.shape[0], desc="Processing Images"):
            image_url = row['image_link']
            entity_name = row['entity_name']
            index = row['index']

            image_filename = Path(image_url).name
            image_path = os.path.join(save_folder, image_filename)

            # Download image if not already present
            if not os.path.exists(image_path):
                image_path = download_image(image_url, save_folder, session)

            if not image_path:
                continue  # Skip if the image could not be downloaded

            # Extract text from the image
            extracted_text = extract_text_from_image(image_path)

            # Categorize text based on entity name
            predicted_value = categorize_text_by_entity(extracted_text, entity_name)

            # Store the result; flush per row so finished work survives an
            # interruption of this long-running loop.
            out.write(f"{index},{predicted_value}\n")
            out.flush()

# Define save folder for images
save_folder = '/content/images'
os.makedirs(save_folder, exist_ok=True)

# Pre-fetch every distinct image in parallel so the OCR loop below mostly
# finds its files already on disk
image_links = filtered_df['image_link'].unique()
download_images_concurrently(image_links, save_folder)

# Start processing the filtered dataset and storing results
process_and_save_results(filtered_df, save_folder, output_file_path)

# Report the file that was actually written (the old message named result_out.csv)
print(f"Processing complete. Results saved in {output_file_path}")




Progress: |██████████████████████████████████████████████████| 100.0% Complete



Progress: |██████████████████████████████████████████████████| 100.0% Complete

  net.load_state_dict(copyStateDict(torch.load(trained_model, map_location=device)))
  model.load_state_dict(torch.load(model_path, map_location=device))
Processing Images: 100%|██████████| 19993/19993 [3:28:38<00:00,  1.60it/s]

Processing complete. Results saved in result_out.csv



