In [208]:
import pandas as pd
from tqdm.notebook import tqdm

units_dict = {
    'item_weight': ['kilogram', 'gram', 'pound', 'ounce'],
    'maximum_weight_recommendation': ['kilogram', 'gram', 'pound', 'ounce'],
    'item_volume': ['millilitre', 'litre', 'cubic foot', 'cubic inch', 'fluid ounce', 'gallon', 'quart', 'pint'],
    'depth': ['centimetre', 'millimetre', 'metre', 'inch', 'foot'],
    'width': ['centimetre', 'millimetre', 'metre', 'inch', 'foot'],
    'height': ['centimetre', 'millimetre', 'metre', 'inch', 'foot'],
    'voltage': ['volt'],
    'wattage': ['watt', 'kilowatt'],
}

unit_normalization = {
    # Weight Units
    'gram':'gram',
    'kg': 'kilogram',
    'kq': 'kilogram',
    'kgs': 'kilogram',
    'kilograms': 'kilogram',
    'kilogram': 'kilogram',
    'g': 'gram',
    'grams': 'gram',
    'mg': 'milligram',
    'milligrams': 'milligram',
    'milligram': 'milligram',
    'lb': 'pound',
    'ibs':'pound',
    'ib': 'pound',
    'lbs': 'pound',
    'bs':'pound',
    'pound' : 'pound',
    'pounds': 'pound',
    'oz': 'ounce',
    'ounces': 'ounce',
    'ounce': 'ounce',
    'fl oz': 'fluid ounce',
    'fluid ounce': 'fluid ounce',
    'fluid ounces': 'fluid ounce',
    'gal': 'gallon',
    'gallon': 'gallon',
    'qt': 'quart',
    'quart': 'quart',
    'pt': 'pint',
    'pint': 'pint',

    # Volume Units
    'ml': 'millilitre',
    'milliliters': 'millilitre',
    'millilitres': 'millilitre',
    'milliliter':'millilitre',
    'l': 'litre',
    'liters': 'litre',
    'litres': 'litre',
    'cu ft': 'cubic foot',
    'cubic feet': 'cubic foot',
    'ft3': 'cubic foot',
    'ft³': 'cubic foot',
    'cu in': 'cubic inch',
    'cubic inches': 'cubic inch',
    'in3': 'cubic inch',
    'in³': 'cubic inch',
    'fl oz': 'fluid ounce',
    'floz': 'fluid ounce',
    'fluid ounces': 'fluid ounce',
    'gallon': 'gallon',
    'gallons': 'gallon',
    'qt': 'quart',
    'quarts': 'quart',
    'pt': 'pint',
    'pint': 'pint',
    'pints': 'pint',
    'quart': 'quart',


    # Length Units
    'cm': 'centimetre',
    'centimeter': 'centimetre',
    'centimetre': 'centimetre',
    'centimeters': 'centimetre',
    'mm': 'millimetre',
    'millimeter': 'millimetre',
    'millimeters': 'millimetre',
    'm': 'metre',
    'meter': 'metre',
    'meters': 'metre',
    'in': 'inch',
    'cubic inch':'cubic inch',
    'inch': 'inch',
    'inches': 'inch',
    '"': 'inch',
    'ft': 'foot',
    'feet': 'foot',
    "'": 'foot',
    'foot':'foot',

    # Voltage Units
    'v': 'volt',
    'volt': 'volt',
    'volts': 'volt',

    # Wattage Units
    'w': 'watt',
    'watt':'watt',
    'watts': 'watt',
    'kw': 'kilowatt',
    'kilowatts': 'kilowatt',
    'kilowatt': 'kilowatt',
     'fl':'fluid ounce',
    'cubicfeet':'cubic foot',
    'fluidounces':'fluid ounce',
    'inche':'inch',
    'liter':'litre',
    'metre':'metre',
    'metres':'metre',

}

In [152]:
data = pd.read_csv(r"D:\OneDrive - NITT\Custom_Download\final_prediction_submission4.csv")
data1 = pd.read_csv(r"D:\OneDrive - NITT\Custom_Download\result_3000.csv")
data2 = pd.read_csv(r"D:\OneDrive - NITT\Custom_Download\result_3000_10k.csv")
data3 = pd.read_csv(r"D:\OneDrive - NITT\Custom_Download\empty10k_15k.csv")
data4 = pd.read_csv(r"D:\OneDrive - NITT\Custom_Download\result_15k_20k.csv")

In [317]:
data1.head(12)

Unnamed: 0,index,image_link,group_id,entity_name,image_id,question,pred
0,15,https://m.media-amazon.com/images/I/216rjgJHAeL.jpg,279307,item_weight,216rjgJHAeL.jpg,What is the weight of the item?,4 lbs
1,94,https://m.media-amazon.com/images/I/31+zdbOuiTL.jpg,279307,item_weight,31+zdbOuiTL.jpg,What is the weight of the item?,3 pack
2,113,https://m.media-amazon.com/images/I/3101lgy28BL.jpg,219211,item_weight,3101lgy28BL.jpg,What is the weight of the item?,2.4 kg
3,114,https://m.media-amazon.com/images/I/3105qskWRcL.jpg,219211,item_weight,3105qskWRcL.jpg,What is the weight of the item?,unanswerable
4,117,https://m.media-amazon.com/images/I/3106iDqsfQL.jpg,267482,item_weight,3106iDqsfQL.jpg,What is the weight of the item?,0.6
5,190,https://m.media-amazon.com/images/I/313ZWJXgMXL.jpg,276075,item_weight,313ZWJXgMXL.jpg,What is the weight of the item?,0.53 oz
6,276,https://m.media-amazon.com/images/I/317V84vTmRL.jpg,152057,item_weight,317V84vTmRL.jpg,What is the weight of the item?,24
7,298,https://m.media-amazon.com/images/I/319PRYi9q9L.jpg,276700,wattage,319PRYi9q9L.jpg,What is the wattage of this item?,unanswerable
8,299,https://m.media-amazon.com/images/I/319PRYi9q9L.jpg,276700,voltage,319PRYi9q9L.jpg,What is the operating voltage of the item?,unanswerable
9,362,https://m.media-amazon.com/images/I/31CUkGWrSHL.jpg,254449,item_weight,31CUkGWrSHL.jpg,What is the weight of the item?,1050 g


In [153]:
df = pd.concat([data1,data2,data3,data4])

In [154]:
df = df[['index','entity_name','pred']]

In [155]:
df.columns = ['index','entity_name','prediction']

In [156]:
df.reset_index(drop=True,inplace=True)

In [157]:
df.to_csv('a.csv')

In [158]:
import re
def no_digits(text):
    return not re.search(r'\d', text)

In [159]:
def is_numeric(text):
    # Regex to match integers or floats (with optional sign and decimal point)
    pattern = r'^-?\d+(\.\d+)?$'
    return bool(re.match(pattern, text))

In [163]:
def convert_inches(text):
    # Regex to find numbers followed by a double quote (")
    # It matches a digit (or float), followed by a quote symbol
    pattern = r'(\d+(\.\d+)?)["]'
    
    # Replace the matched pattern with the number followed by ' inch'
    converted_text = re.sub(pattern, r'\1 inch', text)
    
    return converted_text

In [160]:
df['prediction'] = df['prediction'].apply(lambda x:'' if no_digits(x) else x)

In [161]:
df['prediction']

0             4 lbs
1            3 pack
2            2.4 kg
3                  
4               0.6
            ...    
19271      24 fl oz
19272      3.75 lbs
19273       4.3 lbs
19274    11.5 fl oz
19275      3.75 lbs
Name: prediction, Length: 19276, dtype: object

In [162]:
df['prediction'] = df['prediction'].apply(lambda x:'' if 'pack' in x else x)

In [164]:
df['prediction'] = df['prediction'].apply(convert_inches)

In [165]:
df

Unnamed: 0,index,entity_name,prediction
0,15,item_weight,4 lbs
1,94,item_weight,
2,113,item_weight,2.4 kg
3,114,item_weight,
4,117,item_weight,0.6
...,...,...,...
19271,131165,item_volume,24 fl oz
19272,131241,item_weight,3.75 lbs
19273,131245,item_weight,4.3 lbs
19274,131265,item_volume,11.5 fl oz


In [189]:
def clean(x):
    if '-' in x:
        return ''
    if x:
        if is_numeric(x):
            return x
        else:
            t = re.findall(r'(\d+\.?\d*)\s*([a-zA-Z]+)', x)
            
            if len(t)>=1:
#                 print(t)
                return t[0][0] + ' ' + t[0][1]
            else:
#                 print(x,t)
                return ''
    else:
        return x

In [191]:
df['prediction'] = df['prediction'].apply(clean)

In [192]:
# df['prediction'].apply(clean)

In [201]:
def get(x):
    return unit_normalization.get(x,x)

In [211]:
df['prediction'] = df['prediction'].apply(lambda x:x.strip())

In [216]:
df['new'] = df['prediction'].apply(lambda x:x.split(' ')[-1])

In [219]:
df['new'] = df['new'].apply(get)

In [224]:
df['new_pred'] = df['prediction'].apply(lambda x: x.split(' ')[0])

In [231]:
df['prediction'] = df.apply(lambda x:x['new_pred']+' '+x['new'] if isinstance(x['new'],str) else x['new_pred'],axis=1)

In [238]:
df['prediction'] = df['prediction'].apply(lambda x: ' '.join(list(set(x.split()))))

In [241]:
t = df[['index' ,'entity_name' ,'prediction']]

In [242]:
t.to_csv('t.csv',index=False)

In [243]:
t

Unnamed: 0,index,entity_name,prediction
0,15,item_weight,4 pound
1,94,item_weight,
2,113,item_weight,2.4 kilogram
3,114,item_weight,
4,117,item_weight,0.6
...,...,...,...
19271,131165,item_volume,24 ounce fluid
19272,131241,item_weight,3.75 pound
19273,131245,item_weight,4.3 pound
19274,131265,item_volume,11.5 ounce fluid


In [244]:
df

Unnamed: 0,index,entity_name,prediction,new,new_pred
0,15,item_weight,4 pound,pound,4
1,94,item_weight,,,
2,113,item_weight,2.4 kilogram,kilogram,2.4
3,114,item_weight,,,
4,117,item_weight,0.6,0.6,0.6
...,...,...,...,...,...
19271,131165,item_volume,24 ounce fluid,fluid ounce,24
19272,131241,item_weight,3.75 pound,pound,3.75
19273,131245,item_weight,4.3 pound,pound,4.3
19274,131265,item_volume,11.5 ounce fluid,fluid ounce,11.5


In [245]:
import re

def bring_numeric_to_front(text):
    # Regex to find numeric values (including floats)
    pattern = r'(-?\d+(\.\d+)?)'
    
    # Find all numeric values
    numeric_part = re.findall(pattern, text)
    
    if numeric_part:
        # Extract the first numeric value found
        number = numeric_part[0][0]
        
        # Remove the numeric value from the original text
        text_without_number = re.sub(pattern, '', text).strip()
        
        # Return the number followed by the rest of the text
        return f"{number} {text_without_number}"
    
    return text  # Return the original text if no numeric value is found

# # Example usage
# text1 = "inch 1"
# text2 = "5 kg"
# text3 = "3.5 pounds"
# text4 = "grams 200"

# print(bring_numeric_to_front(text1))  # Output: "1 inch"
# print(bring_numeric_to_front(text2))  # Output: "5 kg"
# print(bring_numeric_to_front(text3))  # Output: "3.5 pounds"
# print(bring_numeric_to_front(text4))  # Output: "200 grams"


In [248]:
df['prediction'] = df['prediction'].apply(bring_numeric_to_front)

In [270]:
df[['index','entity_name' ,'prediction']].to_csv('paligemma.csv',index=False)

In [266]:
df['prediction'] = df['prediction'].apply(lambda x: x.replace('ounce fluid','fluid ounce'))

In [273]:
set(sum(df['prediction'].apply(lambda x:x.split()[1:]),[]))

{'A',
 'Hz',
 'V',
 'VAC',
 'W',
 'a',
 'aa',
 'aaa',
 'adults',
 'amp',
 'awg',
 'b',
 'bit',
 'br',
 'btu',
 'c',
 'can',
 'cc',
 'centimetre',
 'd',
 'db',
 'filters',
 'fluid',
 'foot',
 'gallon',
 'gb',
 'ghz',
 'gm',
 'gram',
 'h',
 'hour',
 'hp',
 'hz',
 'inch',
 'k',
 'kilogram',
 'kilowatt',
 'litre',
 'lm',
 'loads',
 'lumens',
 'mA',
 'mAh',
 'ma',
 'mah',
 'mb',
 'mega',
 'metre',
 'mhz',
 'milligram',
 'millilitre',
 'millimetre',
 'mins',
 'mv',
 'or',
 'ounce',
 'p',
 'pa',
 'phase',
 'pin',
 'pint',
 'plus',
 'point',
 'pound',
 'psi',
 'quart',
 'riders',
 'rpm',
 'servings',
 'sheet',
 'sq',
 'steel',
 't',
 'tb',
 'tbw',
 'ti',
 'to',
 'ton',
 'tons',
 'u',
 'vac',
 'volt',
 'watt',
 'way',
 'x',
 'z'}

In [276]:
print(set(sum(df['prediction'].apply(lambda x:x.split()[1:]),[])))

{'ma', 'loads', 'z', 'riders', 'mhz', 'kilowatt', 'x', 'bit', 'ghz', 'mins', 'adults', 'amp', 'servings', 'u', 'V', 'metre', 'to', 'mv', 'rpm', 'quart', 'vac', 'h', 'kilogram', 'mA', 'a', 'hp', 'millimetre', 'aaa', 'psi', 'mAh', 'aa', 'd', 'mega', 'k', 'way', 'tons', 'point', 'watt', 'ounce', 'centimetre', 'A', 't', 'btu', 'pa', 'fluid', 'pint', 'plus', 'litre', 'hz', 'b', 'W', 'phase', 'or', 'filters', 'sheet', 'inch', 'db', 'ti', 'lumens', 'steel', 'tbw', 'foot', 'mah', 'awg', 'pin', 'p', 'tb', 'sq', 'millilitre', 'can', 'Hz', 'milligram', 'VAC', 'br', 'volt', 'lm', 'hour', 'gallon', 'ton', 'gram', 'gb', 'c', 'mb', 'pound', 'gm', 'cc'}


In [274]:
df[df['prediction'].apply(lambda x:'z' in x)]

Unnamed: 0,index,entity_name,prediction,new,new_pred
854,35487,voltage,60 hz,hz,60.0
1318,61141,voltage,60 Hz,Hz,60.0
2109,114538,voltage,667 mhz,mhz,667.0
2341,119088,voltage,50 hz,hz,50.0
2541,123053,voltage,3200 mhz,mhz,3200.0
2546,123166,voltage,4800 mhz,mhz,4800.0
2720,126792,voltage,2133 mhz,mhz,2133.0
2721,126828,voltage,2400 mhz,mhz,2400.0
2745,127135,voltage,2933 mhz,mhz,2933.0
2851,128760,voltage,3200 mhz,mhz,3200.0


In [275]:
df

Unnamed: 0,index,entity_name,prediction,new,new_pred
0,15,item_weight,4 pound,pound,4
1,94,item_weight,,,
2,113,item_weight,2.4 kilogram,kilogram,2.4
3,114,item_weight,,,
4,117,item_weight,0.6,0.6,0.6
...,...,...,...,...,...
19271,131165,item_volume,24 fluid ounce,fluid ounce,24
19272,131241,item_weight,3.75 pound,pound,3.75
19273,131245,item_weight,4.3 pound,pound,4.3
19274,131265,item_volume,11.5 fluid ounce,fluid ounce,11.5


In [292]:
import google.generativeai as genai
# 
genai.configure(api_key="----------")

In [289]:
# pip install google-generativeai
from tqdm import tqdm

In [320]:
# t = []
# for line,row in tqdm(data1.iterrows()):
#     path = rf"D:\Images_ml\test\{row['image_id']}"
myfile = genai.upload_file(r'D:\OneDrive - NITT\Custom_Download\51iBwliQ43L.jpg')
model = genai.GenerativeModel("gemini-1.5-flash") #("gemini-1.5-flash")
result = model.generate_content(
  [myfile, "\n\n", row['question']]
)
# t.append(result.parts)
print(f"{result.text=}")

result.text='The provided context does not contain information about the weight of the item. \n'


In [None]:
13000*4/3600

In [305]:
t.index("text: "The wattage is 10W.')

ValueError: 'text: "The wattage is 10W.' is not in list

In [311]:
[i for i in t if 'wattage is 10W' in i[0]]

AttributeError: wattage is 10W

In [310]:
t[0][0]

text: "The weight of the item is not specified in the image. \n"

In [314]:
for i in t:
    print(i[0])

text: "The weight of the item is not specified in the image. \n"

text: "The weight of the item is not mentioned in the image. \n"

text: "The weight of the item is not mentioned in the provided context."

text: "The provided image does not contain information about the weight of the item. \n"

text: "The item weighs 5 oz (142g)."

text: "The item weighs 3.5 oz (100g)."

text: "The weight of the item is 8 oz (227 g). \n"

text: "The provided text does not contain information about the wattage of the item. It describes the size of blades that the item fits.  To find the wattage, you need to look at the product description or packaging. \n"

text: "The text provided does not mention the operating voltage of the item. It only mentions the size of the blades it fits."

text: "The item weighs 1050g."



IndexError: list index out of range