In [3]:
!pip install pytesseract
!apt-get update
!apt-get install -y tesseract-ocr

Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13
Hit:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:4 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Ign:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Get:6 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:7 https://r2u.stat.illinois.edu/ubuntu jammy Release [5,713 B]
Get:8 https://r2u.stat.illinois.edu/ubuntu jammy Release.gpg [793 B]
Get:9 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Hit:10 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:11 https://ppa.launchpadcontent.net/grap

In [4]:
!pip install tabulate



In [5]:
import shutil

import pytesseract

# Point pytesseract at the Tesseract executable.  The attribute is spelled
# `tesseract_cmd`; a misspelled name would just create an unused attribute and
# leave the real setting untouched.  Prefer whatever `tesseract` is on PATH and
# fall back to the default Windows install location when none is found.
pytesseract.pytesseract.tesseract_cmd = (
    shutil.which("tesseract") or "C:\\Program Files\\Tesseract-OCR\\tesseract.exe"
)

In [6]:
import pandas as pd
import cv2
import pytesseract
import numpy as np
import requests
import re

# Executable used by pytesseract for every OCR call made below.
TESSERACT_BINARY = r'/usr/bin/tesseract'
# CSV listing the work items; the processing loop reads its `image_link` and
# `entity_name` columns.
INPUT_CSV_PATH = '/content/test1.csv'

df_input = pd.read_csv(INPUT_CSV_PATH)

pytesseract.pytesseract.tesseract_cmd = TESSERACT_BINARY
# A well-formed decimal literal: "42", "4.5", "5." or ".5".  A looser character
# class of digits-or-dots would also accept dot-only runs such as "." or "...",
# which then leak out as "values" (e.g. "height. mm" -> ". mm").
_NUMBER = r'(?:\d+(?:\.\d*)?|\.\d+)'

# Per-entity patterns: a keyword, optional whitespace/colons, then the captured
# "<number><optional space><unit>" group.
# NOTE: unit alternatives are tried left to right and nothing anchors the end of
# the unit, so a long spelling can be captured as its short prefix
# ("5 grams" -> "5 g", "2 liters" -> "2 l").
patterns = {
    'item_weight': rf'(?:item weight|weight)[\s:]*({_NUMBER}\s?(?:lbs?|pounds?|kg|kilograms?|g|grams?))',
    'depth': rf'depth[\s:]*({_NUMBER}\s?(?:in(?:ches)?|cm|mm|m))',
    'width': rf'width[\s:]*({_NUMBER}\s?(?:in(?:ches)?|cm|mm|m))',
    'height': rf'height[\s:]*({_NUMBER}\s?(?:in(?:ches)?|cm|mm|m))',
    'voltage': rf'voltage[\s:]*({_NUMBER}\s?(?:v|volts?))',
    'wattage': rf'wattage[\s:]*({_NUMBER}\s?(?:w|watts?))',
    'item_volume': rf'(?:item )?volume[\s:]*({_NUMBER}\s?(?:l|liters?|ml|fl\.?\s?oz))',
    'maximum_weight_recommendation': rf'(?:maximum weight|weight capacity|max load)[\s:]*({_NUMBER}\s?(?:lbs?|pounds?|kg|kilograms?))'
}

# Keyword-free catch-all: any number followed by any known unit.  The lookbehind
# stops the match from starting in the middle of a malformed number such as
# "1.2.3", which would otherwise yield its tail ("3 kg").
_GENERAL_PATTERN = (
    rf'((?<![\d.]){_NUMBER}\s?'
    r'(?:lbs?|pounds?|kg|kilograms?|g|grams?|in(?:ches)?|cm|mm|m|v|volts?|w|watts?|l|liters?|ml|fl\.?\s?oz))'
)


def extract_specific_value(text, entity_name):
    """Pull a "<number> <unit>" snippet for ``entity_name`` out of OCR text.

    Patterns are tried in decreasing order of specificity and the first hit
    wins:

    1. the pattern registered for ``entity_name`` (if there is one),
    2. every registered pattern, in dictionary order -- so a value labelled
       for a *different* entity can be returned as a best-effort guess,
    3. the keyword-free catch-all pattern.

    Matching is case-insensitive.

    Args:
        text: OCR output to search.
        entity_name: Key into ``patterns``; unknown names skip step 1.

    Returns:
        The matched snippet with surrounding whitespace stripped, or ``None``
        when nothing matches or ``text`` is not a string.
    """
    if not isinstance(text, str):
        return None

    candidates = []
    own_pattern = patterns.get(entity_name)
    if own_pattern is not None:
        candidates.append(own_pattern)
    candidates.extend(patterns.values())
    candidates.append(_GENERAL_PATTERN)

    for pattern in candidates:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            return match.group(1).strip()

    return None

results = []

# image_url -> whitespace-normalised OCR text.  Several rows can point at the
# same image (one per entity), so each image is downloaded and OCR'd only once.
# Failures are not cached, so a later row with the same URL retries.
ocr_text_by_url = {}

for _, row in df_input.iterrows():
    image_url = row['image_link']
    entity_name = row['entity_name']
    matched_value = None

    try:
        if image_url not in ocr_text_by_url:
            response = requests.get(image_url, timeout=10)
            # Fail on 4xx/5xx instead of feeding an error page to the decoder.
            response.raise_for_status()

            img_arr = np.asarray(bytearray(response.content), dtype=np.uint8)
            img = cv2.imdecode(img_arr, cv2.IMREAD_COLOR)
            if img is None:
                # Make an undecodable payload an explicit, readable error.
                raise ValueError("response body could not be decoded as an image")
            img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

            text = pytesseract.image_to_string(img_rgb)
            # Collapse newlines and runs of whitespace into single spaces.
            ocr_text_by_url[image_url] = ' '.join(text.split())

        matched_value = extract_specific_value(ocr_text_by_url[image_url], entity_name)
    except Exception as e:
        # Best effort: report the failure and still emit a row with no value.
        print(f"Error processing {image_url}: {str(e)}")

    results.append({
        'entity_name': entity_name,
        'image_link': image_url,
        'extracted_value': matched_value
    })

df_results = pd.DataFrame(results)

print(df_results)

not_null_count = df_results['extracted_value'].notnull().sum()
null_count = df_results['extracted_value'].isnull().sum()

print(f"Count of non-null values: {not_null_count}")
print(f"Count of null values: {null_count}")

df_results.to_csv('extraction_results.csv', index=False)
print("Results saved to 'extraction_results.csv'")

                        entity_name  \
0                            height   
1                             width   
2                            height   
3                             depth   
4                             depth   
...                             ...   
1494                          depth   
1495                          width   
1496                          width   
1497  maximum_weight_recommendation   
1498                    item_weight   

                                             image_link extracted_value  
0     https://m.media-amazon.com/images/I/110EibNycl...            None  
1     https://m.media-amazon.com/images/I/11TU2clswz...            42cm  
2     https://m.media-amazon.com/images/I/11TU2clswz...            42cm  
3     https://m.media-amazon.com/images/I/11TU2clswz...            42cm  
4     https://m.media-amazon.com/images/I/11gHj8dhhr...            90cm  
...                                                 ...             ...  
1494  https:/

In [7]:
import tabulate

# Render every row of the extraction results as an ASCII grid, using the
# DataFrame's column names as the header row.
results_table = tabulate.tabulate(df_results, headers='keys', tablefmt='pretty')
print(results_table)

+------+-------------------------------+-----------------------------------------------------+-----------------+
|      |          entity_name          |                     image_link                      | extracted_value |
+------+-------------------------------+-----------------------------------------------------+-----------------+
|  0   |            height             | https://m.media-amazon.com/images/I/110EibNyclL.jpg |                 |
|  1   |             width             | https://m.media-amazon.com/images/I/11TU2clswzL.jpg |      42cm       |
|  2   |            height             | https://m.media-amazon.com/images/I/11TU2clswzL.jpg |      42cm       |
|  3   |             depth             | https://m.media-amazon.com/images/I/11TU2clswzL.jpg |      42cm       |
|  4   |             depth             | https://m.media-amazon.com/images/I/11gHj8dhhrL.jpg |      90cm       |
|  5   |            height             | https://m.media-amazon.com/images/I/11gHj8dhhrL.jpg |  

In [8]:
import pandas as pd
import re

# Canonical unit names accepted for each entity.
entity_unit_map = {
    'width': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'depth': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'height': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'item_weight': {'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'},
    'maximum_weight_recommendation': {'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'},
    'voltage': {'kilovolt', 'millivolt', 'volt'},
    'wattage': {'kilowatt', 'watt'},
    'item_volume': {'centilitre', 'cubic foot', 'cubic inch', 'cup', 'decilitre', 'fluid ounce', 'gallon', 'imperial gallon', 'litre', 'microlitre', 'millilitre', 'pint', 'quart'}
}

# Union of every canonical unit, regardless of entity.
allowed_units = {unit for units in entity_unit_map.values() for unit in units}

# Abbreviations and alternative spellings -> canonical unit name.
unit_mapping = {
    'g': 'gram',
    'gm': 'gram',
    'gms': 'gram',
    'mg': 'milligram',
    'kg': 'kilogram',
    'kgs': 'kilogram',
    'cm': 'centimetre',
    'cms': 'centimetre',
    'centimeter': 'centimetre',
    'm': 'metre',
    'meter': 'metre',
    'mm': 'millimetre',
    'millimeter': 'millimetre',
    'l': 'litre',
    'liter': 'litre',
    'ml': 'millilitre',
    'milliliter': 'millilitre',
    'fl oz': 'fluid ounce',
    'fl. oz': 'fluid ounce',
    'fl.oz': 'fluid ounce',
    'floz': 'fluid ounce',
    'v': 'volt',
    'kv': 'kilovolt',
    'mv': 'millivolt',
    'w': 'watt',
    'kw': 'kilowatt',
    'oz': 'ounce',
    'lb': 'pound',
    'lbs': 'pound',
    'ft': 'foot',
    'feet': 'foot',
    'in': 'inch',
    # Irregular plural: stripping the trailing "s" gives "inche", not "inch".
    'inches': 'inch',
    'yd': 'yard',
}

def convert_unit(text):
    """Normalise a "<number><unit>" string to "<number> <canonical unit>".

    The unit is lower-cased, internal whitespace is collapsed and a trailing
    period is dropped.  It is then looked up exactly as written and, failing
    that, with trailing "s" characters removed (simple plurals); each lookup
    checks ``allowed_units`` first and ``unit_mapping`` second.

    Args:
        text: Raw value such as "42cm", "12 inches" or "8 fl oz".

    Returns:
        "<number> <canonical unit>" with the number text kept as written, or
        "" when ``text`` is not a string, does not start with a valid number,
        or carries an unrecognised unit.
    """
    if not isinstance(text, str):
        return ""

    match = re.match(r'([\d.]+)\s*(.+)', text.strip())
    if not match:
        return ""

    value, unit = match.groups()

    # The character class also admits things like "1.2.3" or "."; reject
    # anything that is not actually a number.
    try:
        float(value)
    except ValueError:
        return ""

    unit = ' '.join(unit.lower().split()).rstrip('.')

    # Exact spelling first so irregular plurals in unit_mapping are honoured,
    # then the plural-stripped form.
    for candidate in (unit, unit.rstrip('s')):
        if candidate in allowed_units:
            return f"{value} {candidate}"
        if candidate in unit_mapping:
            return f"{value} {unit_mapping[candidate]}"

    return ""

# Load the raw extraction output; empty cells come back as NaN.
df = pd.read_csv('extraction_results.csv')

# Missing values become "" rather than being passed on as the text "nan".
# NOTE(review): the converted unit is not checked against the units listed for
# the row's entity_name in entity_unit_map -- confirm whether a value such as
# "42 centimetre" should be kept for a voltage row.
df['converted_value'] = df['extracted_value'].apply(
    lambda raw: convert_unit(str(raw)) if pd.notna(raw) else ""
)

df.to_csv('converted_submissions.csv', index=False)

print("Conversion complete. Results saved to 'converted_submissions.csv'.")

total_rows = len(df)
converted_rows = (df['converted_value'] != "").sum()
# A header-only CSV yields zero rows; report 0% instead of dividing by zero.
conversion_rate = converted_rows / total_rows if total_rows else 0.0
print(f"Total rows: {total_rows}")
print(f"Rows with converted units: {converted_rows}")
print(f"Conversion rate: {conversion_rate:.2%}")

print("\nExample conversions:")
examples = df[['extracted_value', 'converted_value']].head(10)
print(examples.to_string(index=False))

Conversion complete. Results saved to 'converted_submissions.csv'.
Total rows: 1499
Rows with converted units: 736
Conversion rate: 49.10%

Example conversions:
extracted_value converted_value
            NaN                
           42cm   42 centimetre
           42cm   42 centimetre
           42cm   42 centimetre
           90cm   90 centimetre
           90cm   90 centimetre
           90cm   90 centimetre
            NaN                
          40 cm   40 centimetre
          40 cm   40 centimetre


In [9]:
import tabulate

# Render every row of the converted frame as an ASCII grid, using the
# DataFrame's column names as the header row.
converted_table = tabulate.tabulate(df, headers='keys', tablefmt='pretty')
print(converted_table)

+------+-------------------------------+-----------------------------------------------------+-----------------+-------------------+
|      |          entity_name          |                     image_link                      | extracted_value |  converted_value  |
+------+-------------------------------+-----------------------------------------------------+-----------------+-------------------+
|  0   |            height             | https://m.media-amazon.com/images/I/110EibNyclL.jpg |       nan       |                   |
|  1   |             width             | https://m.media-amazon.com/images/I/11TU2clswzL.jpg |      42cm       |   42 centimetre   |
|  2   |            height             | https://m.media-amazon.com/images/I/11TU2clswzL.jpg |      42cm       |   42 centimetre   |
|  3   |             depth             | https://m.media-amazon.com/images/I/11TU2clswzL.jpg |      42cm       |   42 centimetre   |
|  4   |             depth             | https://m.media-amazon.com/i

In [10]:
import pandas as pd

converted = pd.read_csv('converted_submissions.csv')

# Build the two-column submission.
# - 'index' is the row position.  The CSV was written without an index column,
#   so its first column is `entity_name`, which is not a row identifier.
#   NOTE(review): if the input sheet carries its own id column, propagate that
#   through the pipeline instead -- confirm against the expected format.
# - 'prediction' is the normalised `converted_value`, not the raw OCR snippet in
#   `extracted_value`; rows without a conversion are written as empty strings.
df = pd.DataFrame({
    'index': converted.index,
    'prediction': converted['converted_value'].fillna(''),
})

df.to_csv('test_out.csv', index=False)

print("File updated and saved as 'test_out.csv'.")


File updated and saved as 'test_out.csv'.
