In [1]:
import pandas as pd
import numpy as np
import os

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
from tqdm import tqdm
import re

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('omw-1.4')  

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...


True

# load data

In [2]:
datasets = os.listdir('/kaggle/input/amazon-ml-images')
datasets

['file1.csv', 'file2.csv', 'file4.csv', 'file3.csv']

In [3]:
megaframe = pd.DataFrame()

for file in datasets:
    df = pd.read_csv(f'/kaggle/input/amazon-ml-images/{file}')
    megaframe = pd.concat((megaframe,df),axis = 0)

In [4]:
megaframe = megaframe.drop(columns = ['Unnamed: 0'])
megaframe

Unnamed: 0,image_link,group_id,entity_name,entity_value,textfromimg
0,https://m.media-amazon.com/images/I/61I9XdN6OF...,748919,item_weight,500.0 gram,PROPOS' NATUREI INGRÉDIENT MÉNAGER MULTI-USAGE...
1,https://m.media-amazon.com/images/I/71gSRbyXmo...,916768,item_volume,1.0 cup,<ss ( € ess< RRIFIC! LEBENSMITTELECHT Cur DAY ...
2,https://m.media-amazon.com/images/I/61BZ4zrjZX...,459516,item_weight,0.709 gram,COMPOSITION Serving Size: Tablet (0.709 g) Eac...
3,https://m.media-amazon.com/images/I/612mrlqiI4...,459516,item_weight,0.709 gram,"2 5 1 1 J{ 858858{e 1 59š8 1 5 1 1 1 1 H "" [ 1..."
4,https://m.media-amazon.com/images/I/617Tl40LOX...,731432,item_weight,1400 milligram,Horbäach' HIG H S TR E NGTh PSYLLIUM HUSK PLAN...
...,...,...,...,...,...
995,https://m.media-amazon.com/images/I/81uget9aH4...,639090,item_weight,470 gram,Milk CHoCoLATE FLAVORED 1) Qe N2 UTeoz 4702) Q...
996,https://m.media-amazon.com/images/I/91QvLtiYaB...,365637,item_weight,454 gram,TRADITIONAL BUTTER OOOKIES 80 COPENHAGEN Oani...
997,https://m.media-amazon.com/images/I/81lazo6sxl...,601746,item_weight,4.0 ounce,98jv 0 @ eSUNS 2 !qpg @ 28a8 08p0 #84y s8UN8 0...
998,https://m.media-amazon.com/images/I/81fYO+HP5Q...,186035,voltage,0.5 volt,~ CQBMIELD] apagaoo J20v UMioo AL= Non-S Lee ...


In [5]:
megaframe

Unnamed: 0,image_link,group_id,entity_name,entity_value,textfromimg
0,https://m.media-amazon.com/images/I/61I9XdN6OF...,748919,item_weight,500.0 gram,PROPOS' NATUREI INGRÉDIENT MÉNAGER MULTI-USAGE...
1,https://m.media-amazon.com/images/I/71gSRbyXmo...,916768,item_volume,1.0 cup,<ss ( € ess< RRIFIC! LEBENSMITTELECHT Cur DAY ...
2,https://m.media-amazon.com/images/I/61BZ4zrjZX...,459516,item_weight,0.709 gram,COMPOSITION Serving Size: Tablet (0.709 g) Eac...
3,https://m.media-amazon.com/images/I/612mrlqiI4...,459516,item_weight,0.709 gram,"2 5 1 1 J{ 858858{e 1 59š8 1 5 1 1 1 1 H "" [ 1..."
4,https://m.media-amazon.com/images/I/617Tl40LOX...,731432,item_weight,1400 milligram,Horbäach' HIG H S TR E NGTh PSYLLIUM HUSK PLAN...
...,...,...,...,...,...
995,https://m.media-amazon.com/images/I/81uget9aH4...,639090,item_weight,470 gram,Milk CHoCoLATE FLAVORED 1) Qe N2 UTeoz 4702) Q...
996,https://m.media-amazon.com/images/I/91QvLtiYaB...,365637,item_weight,454 gram,TRADITIONAL BUTTER OOOKIES 80 COPENHAGEN Oani...
997,https://m.media-amazon.com/images/I/81lazo6sxl...,601746,item_weight,4.0 ounce,98jv 0 @ eSUNS 2 !qpg @ 28a8 08p0 #84y s8UN8 0...
998,https://m.media-amazon.com/images/I/81fYO+HP5Q...,186035,voltage,0.5 volt,~ CQBMIELD] apagaoo J20v UMioo AL= Non-S Lee ...


# NLTK 

In [6]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
text = "PROPOS' NATUREI INGRÉDIENT MÉNAGER MULTI-USAGE... and for entity name weight"

def preprocess_text(text):
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word.isalnum()]
    tokens = [word for word in tokens if word not in stop_words]
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

In [7]:
cleaned_text = preprocess_text(text)
print(cleaned_text)

propos naturei ingrédient ménager entity name weight


In [8]:
listtext = list(megaframe['textfromimg'])
new_text = []
for lt in tqdm(listtext):
    text = preprocess_text(str(lt))
    text = text.lower()
    new_text.append(text)
megaframe['processed text'] = new_text
megaframe

100%|██████████| 4000/4000 [00:03<00:00, 1328.32it/s]


Unnamed: 0,image_link,group_id,entity_name,entity_value,textfromimg,processed text
0,https://m.media-amazon.com/images/I/61I9XdN6OF...,748919,item_weight,500.0 gram,PROPOS' NATUREI INGRÉDIENT MÉNAGER MULTI-USAGE...,propos naturei ingrédient ménager terre de som...
1,https://m.media-amazon.com/images/I/71gSRbyXmo...,916768,item_volume,1.0 cup,<ss ( € ess< RRIFIC! LEBENSMITTELECHT Cur DAY ...,ss ess rrific lebensmittelecht cur day geprägt...
2,https://m.media-amazon.com/images/I/61BZ4zrjZX...,459516,item_weight,0.709 gram,COMPOSITION Serving Size: Tablet (0.709 g) Eac...,composition serving size tablet g serving cont...
3,https://m.media-amazon.com/images/I/612mrlqiI4...,459516,item_weight,0.709 gram,"2 5 1 1 J{ 858858{e 1 59š8 1 5 1 1 1 1 H "" [ 1...",2 5 1 1 j 858858 e 1 59š8 1 5 1 1 1 1 h 1 ihh ...
4,https://m.media-amazon.com/images/I/617Tl40LOX...,731432,item_weight,1400 milligram,Horbäach' HIG H S TR E NGTh PSYLLIUM HUSK PLAN...,horbäach hig h tr e ngth psyllium husk plantag...
...,...,...,...,...,...,...
995,https://m.media-amazon.com/images/I/81uget9aH4...,639090,item_weight,470 gram,Milk CHoCoLATE FLAVORED 1) Qe N2 UTeoz 4702) Q...,milk chocolate flavored 1 qe n2 uteoz 4702 q 2...
996,https://m.media-amazon.com/images/I/91QvLtiYaB...,365637,item_weight,454 gram,TRADITIONAL BUTTER OOOKIES 80 COPENHAGEN Oani...,traditional butter oookies 80 copenhagen oanis...
997,https://m.media-amazon.com/images/I/81lazo6sxl...,601746,item_weight,4.0 ounce,98jv 0 @ eSUNS 2 !qpg @ 28a8 08p0 #84y s8UN8 0...,98jv 0 esuns 2 qpg 28a8 08p0 84y s8un8 0 b0 w ...
998,https://m.media-amazon.com/images/I/81fYO+HP5Q...,186035,voltage,0.5 volt,~ CQBMIELD] apagaoo J20v UMioo AL= Non-S Lee ...,cqbmield apagaoo j20v umioo lee ete aoeauaf au...


# uniques

In [9]:
unique_units = []
for item in megaframe['entity_value'].unique():
    item = item.split(' ')[-1]
    if item not in unique_units:
        unique_units.append(item)
unique_units

['gram',
 'cup',
 'milligram',
 'kilogram',
 'ounce',
 'gallon',
 'volt',
 'watt',
 'pound',
 'millilitre',
 'foot',
 'ton',
 'decilitre',
 'inch',
 'litre',
 'microgram',
 'centimetre',
 'quart',
 'horsepower',
 'kilowatt']

# cover full form conflict

In [10]:
unit_dictionary = {
    'g': 'gram',
    'grams': 'gram',
    'kg': 'kilogram',
    'mg': 'milligram',
    'cup': 'cup',
    'mg': 'milligram',
    'kg': 'kilogram',
    'oz': 'ounce',
    'gallon': 'gallon',
    'v': 'volt',
    'volt': 'volt',
    'w': 'watt',
    'watt': 'watt',
    'lb': 'pound',
    'pound': 'pound',
    'ml': 'millilitre',
    'millilitre': 'millilitre',
    'ft': 'foot',
    'foot': 'foot',
    'ton': 'ton',
    'dl': 'decilitre',
    'decilitre': 'decilitre',
    'in': 'inch',
    'inch': 'inch',
    'l': 'litre',
    'litre': 'litre',
    'mcg': 'microgram',
    'microgram': 'microgram',
    'cm': 'centimetre',
    'centimetre': 'centimetre',
    'qt': 'quart',
    'quart': 'quart',
    'hp': 'horsepower',
    'horsepower': 'horsepower',
    'kw': 'kilowatt',
    'kilowatt': 'kilowatt'
}

In [11]:
def handle_shorters(text):
    textparts = text.split(' ')
    for key in unit_dictionary.keys():
        if key in textparts:
            index = textparts.index(key)
            textparts[index] = unit_dictionary[key]
            
    textparts = ' '.join(textparts)
    return textparts

handle_shorters('my weight is 500 g')

'my weight is 500 gram'

In [12]:
processed_text = megaframe['processed text']

In [13]:
new_processed_text = []

for text in processed_text:
    new_text = handle_shorters(text)
    new_processed_text.append(new_text)
    
megaframe['new processed text'] = new_processed_text

In [14]:
megaframe.to_csv('dataset.csv')