In [16]:
import pandas as pd
import re

# Read the raw LLM predictions from the CSV file in the working directory.
# The code below indexes the 'prediction' and 'entity_name' columns of this frame.
df = pd.read_csv(filepath_or_buffer="./inference0tolast.csv")

# Define the entity_unit_map
# Maps each entity name to the set of canonical units that are valid for it.
entity_unit_map = {
    "width": {"centimetre", "foot", "millimetre", "metre", "inch", "yard"},
    "depth": {"centimetre", "foot", "millimetre", "metre", "inch", "yard"},
    "height": {"centimetre", "foot", "millimetre", "metre", "inch", "yard"},
    "item_weight": {"milligram", "kilogram", "microgram", "gram", "ounce", "ton", "pound"},
    "maximum_weight_recommendation": {"milligram", "kilogram", "microgram", "gram", "ounce", "ton", "pound"},
    "voltage": {"millivolt", "kilovolt", "volt"},
    "wattage": {"kilowatt", "watt"},
    "item_volume": {
        "cubic foot", "microlitre", "cup", "fluid ounce", "centilitre", "imperial gallon",
        "pint", "decilitre", "litre", "millilitre", "quart", "cubic inch", "gallon"
    }
}

# Canonical unit -> every lowercase spelling/abbreviation recognised for it.
_UNIT_SPELLINGS = {
    "inch": ("in", '"', "inch", "inches"),
    "centimetre": ("cm", "centimetre", "centimetres", "centimeter", "centimeters"),
    "foot": ("ft", "foot", "feet"),
    "millimetre": ("mm", "millimetre", "millimetres", "millimeter", "millimeters"),
    "metre": ("m", "metre", "metres", "meter", "meters"),
    "yard": ("yd", "yds", "yard", "yards"),
    "kilogram": ("kg", "kgs", "kilogram", "kilograms"),
    "milligram": ("mg", "milligram", "milligrams"),
    "microgram": ("mcg", "microgram", "micrograms"),
    "gram": ("g", "gram", "grams"),
    "fluid ounce": ("fl", "fl oz", "fluid ounce", "fluid ounces"),
    "ounce": ("oz", "ounce", "ounces"),
    "ton": ("ton", "tons"),
    "pound": ("lb", "lbs", "pound", "pounds"),
    "kilowatt": ("kw", "kilowatt", "kilowatts"),
    "watt": ("w", "watt", "watts", "wattage"),
    "kilovolt": ("kv", "kilovolt", "kilovolts"),
    "millivolt": ("mv", "millivolt", "millivolts"),
    "volt": ("v", "volt", "volts", "voltage"),
    "millilitre": ("ml", "millilitre", "millilitres", "milliliter", "milliliters"),
    "litre": ("l", "litre", "litres", "liter", "liters"),
    "centilitre": ("cl", "centilitre", "centilitres", "centiliter", "centiliters"),
    "decilitre": ("dl", "decilitre", "decilitres", "deciliter", "deciliters"),
    "cup": ("cup", "cups"),
    "pint": ("pint", "pints"),
    "quart": ("quart", "quarts"),
    "gallon": ("gallon", "gallons"),
    "imperial gallon": ("imperial gallon", "imperial gallons"),
    "cubic foot": ("cubic foot", "cubic feet"),
    "cubic inch": ("cubic inch", "cubic inches"),
    "microlitre": ("microlitre", "microlitres", "microliter", "microliters"),
}

# Reverse lookup: spelling -> canonical unit.
_ALIAS_TO_UNIT = {
    alias: canonical
    for canonical, aliases in _UNIT_SPELLINGS.items()
    for alias in aliases
}

# Longest spellings first so "cubic feet" wins over "feet" and "mm" over "m".
_UNIT_ALTERNATION = "|".join(
    re.escape(alias) for alias in sorted(_ALIAS_TO_UNIT, key=len, reverse=True)
)

# Same numeric shape as before: optional sign, digits, optional decimal part.
_NUMBER_PATTERN = r'-?\d+(?:\.\d+)?'
_NUMBER_RE = re.compile(_NUMBER_PATTERN)

# A number directly followed by a unit ("42cm", "2.65 m").  The lookbehind stops
# the match from starting inside a longer number ("1,000" or the "-20" of
# "10-20"); the lookahead stops a unit from matching the start of a longer word
# ("m" in "min").
_QUANTITY_RE = re.compile(
    rf'(?<![\d.,])({_NUMBER_PATTERN})\s*({_UNIT_ALTERNATION})(?![a-z])'
)

# A unit spelling standing on its own, i.e. not embedded in a longer word.
_UNIT_RE = re.compile(rf'(?<![a-z])({_UNIT_ALTERNATION})(?![a-z])')


# Define the cleaning function
def clean_prediction(row):
    """Normalise a raw prediction into the form "<number> <canonical unit>".

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing the keys
            'prediction' (raw text) and 'entity_name' (key into
            ``entity_unit_map``).

    Returns:
        * ``None`` when the prediction contains no number at all
          (this includes missing values, whose string form is "nan").
        * ``""`` when the prediction is an image description rather than a
          measurement, or when no known unit can be found.
        * Otherwise ``"<number> <unit>"``. A unit that is valid for the
          entity is preferred; if none of the units found is valid for the
          entity, the first one found is used.
    """
    text = str(row['prediction']).strip().lower()
    entity_name = row['entity_name']

    # Free-text answers describing the picture are not measurements.
    if "the image" in text or "this image provided" in text:
        return ""

    first_number = _NUMBER_RE.search(text)
    if not first_number:
        return None

    allowed_units = entity_unit_map.get(entity_name, ())

    # Preferred path: keep each number together with the unit that follows it,
    # so "width 10cm, weight 5kg" yields 5 kilogram for a weight entity.
    quantities = [
        (match.group(1), _ALIAS_TO_UNIT[match.group(2)])
        for match in _QUANTITY_RE.finditer(text)
    ]
    if quantities:
        number, unit = next(
            (pair for pair in quantities if pair[1] in allowed_units),
            quantities[0],
        )
        return f"{number} {unit}"

    # Fallback: the unit is mentioned somewhere other than right after a number
    # (e.g. "voltage: 12"); combine the first number with the best unit found.
    mentioned_units = [
        _ALIAS_TO_UNIT[match.group(1)] for match in _UNIT_RE.finditer(text)
    ]
    if not mentioned_units:
        return ""
    unit = next(
        (candidate for candidate in mentioned_units if candidate in allowed_units),
        mentioned_units[0],
    )
    return f"{first_number.group()} {unit}"

# Run clean_prediction once per row and keep the result in a new column
df['cleaned_prediction'] = df.apply(func=clean_prediction, axis="columns")

# Write the whole frame, including the new column, to disk without the index
output_path = './cleaned_predictions_model.csv'
df.to_csv(path_or_buf=output_path, index=False)

# Print the first five rows as a quick sanity check
print(df.head(n=5))

   index         image_id entity_name prediction cleaned_prediction
0      0  110EibNyclL.jpg      height      2.65m         2.65 metre
1      1  11TU2clswzL.jpg       width       42cm      42 centimetre
2      2  11TU2clswzL.jpg      height      200cm     200 centimetre
3      3  11TU2clswzL.jpg       depth        NaN               None
4      4  11gHj8dhhrL.jpg       depth        NaN               None


In [17]:
# Columns kept in the final, reduced export
columns_to_save = [
    'index',
    'cleaned_prediction',
]


In [19]:
# Write to the same file path again, this time restricted to columns_to_save
output_path = './cleaned_predictions_model.csv'
df.to_csv(path_or_buf=output_path, index=False, columns=columns_to_save)

# Print the first rows of just those columns
print(df.loc[:, columns_to_save].head())


   index cleaned_prediction
0      0         2.65 metre
1      1      42 centimetre
2      2     200 centimetre
3      3               None
4      4               None
