In [90]:
import re
from decimal import Decimal

from nltk.corpus import stopwords # !pip install nltk
from nltk.tokenize import word_tokenize
from word2number import w2n # !pip install word2number

In [99]:
# Adjectives that carry no product information and are stripped from the text.
_DESCRIPTORS = frozenset([
    'fresh', 'beautiful', 'big', 'small', 'vibrant', 'fragrant', 'colorful',
    'delicate', 'crisp', 'luscious', 'elegant', 'juicy', 'tender', 'aromatic',
    'radiant', 'exquisite', 'flavorful', 'picturesque', 'charming',
])

# Unit names (some of them multi-word phrases) that are stripped from the text.
_UNITS = (
    'kilos', 'grams', 'pounds', 'liters', 'milliliters', 'pieces', 'bunches',
    'cups', 'teaspoons', 'tablespoons', 'slices', 'servings', 'bottles', 'cans',
    'packets', 'boxes', 'bags', 'jars', 'bars', 'pots', 'containers', 'plates',
    'bowls', 'glasses', 'pints', 'quarts', 'gallons', 'ounces', 'milligrams',
    'micrograms', 'centimeters', 'inches', 'feet', 'meters', 'yards',
    'kilometers', 'miles', 'square meters', 'square feet', 'cubic meters',
    'cubic feet', 'degrees Celsius', 'degrees Fahrenheit', 'seconds', 'minutes',
    'hours', 'days', 'weeks', 'months', 'years',
)

# Filtering happens one lowercased token at a time, so a phrase such as
# 'degrees Celsius' can never equal a token. Lowercasing and splitting every
# entry into its words makes those phrases effective.
_UNIT_TOKENS = frozenset(word for unit in _UNITS for word in unit.lower().split())


def preprocess_text(text):
    """Normalize a free-text purchase description for regex-based extraction.

    Steps, in order: lowercase the text, rewrite the words "dollar"/"dollars"
    as "$", turn decimal commas between digits into dots ("1,2" -> "1.2"),
    tokenize, replace single number words with digits, and drop English
    stopwords, descriptive adjectives and unit words.

    Args:
        text (str): Raw input sentence(s).

    Returns:
        str: The remaining tokens joined by single spaces.

    Note:
        Presumably requires the NLTK tokenizer and stopword corpora to be
        downloaded beforehand -- confirm for the target environment.
    """
    text = text.lower()

    # Replace the whole word "dollar(s)" with "$". The lookahead accepts
    # whitespace, end of text, or closing punctuation that itself ends the
    # word (so "dollars," and "dollar." match, but "dollar.com" does not).
    text = re.sub(r'\bdollars?(?=\s|[.,;:!?)](?:\s|$)|$)', '$', text)

    # A comma between two digit runs is treated as a decimal separator.
    text = re.sub(r'(\d+),(\d+)', r'\1.\2', text)

    tokens = word_tokenize(text)

    # Tokens are converted one at a time, so multi-word numbers are not
    # combined. 'point' is excluded explicitly: it is listed in w2n's lookup
    # table as the decimal marker, and a lone 'point' is not a number
    # (word_to_num appears to map it to 0 -- verify against the library).
    number_words = w2n.american_number_system
    tokens = [
        str(w2n.word_to_num(token)) if token in number_words and token != 'point' else token
        for token in tokens
    ]

    # One combined set gives a single O(1) membership test per token.
    removable = set(stopwords.words('english')) | _DESCRIPTORS | _UNIT_TOKENS
    tokens = [token for token in tokens if token not in removable]

    return ' '.join(tokens)

# Demo sentence mixing number words, comma decimals and both "$" / "dollar" spellings
input_text = "I bought three Samsung smartphones 150 $ each, four kilos of fresh banana for 1,2 dollar a kilogram, one Hamburger with 4,5 dollar."

# Run the sample through the normalization step
cleaned_text = preprocess_text(input_text)

# Heading and result, each on its own line
print("Cleaned Text:", cleaned_text, sep="\n")

Cleaned Text:
bought 3 samsung smartphones 150 $ , 4 banana 1.2 $ kilogram , 1 hamburger 4.5 $ .


In [100]:
def extract_product_details(text):
    """Extract "<quantity> <product name> <unit price> $" groups from text.

    The input is expected to be lowercase text in which prices are written as
    a number followed by "$" (the form produced by the preprocessing step in
    this notebook).

    Args:
        text (str): Text to scan.

    Returns:
        list[dict]: One dict per match, in order of appearance, with keys
        'Product Name' (str), 'Quantity' (int), 'Unit Price' (float) and
        'Total Price' (float, quantity * unit price). Empty if nothing matches.
    """
    # Pattern: an integer quantity, the product name, then the price and "$".
    #   (?<![\d.])      - the quantity must not start inside another number,
    #                     otherwise "1.5 kilogram 3 $" would yield quantity 5
    #   [a-z][a-z\s]*?  - the name must contain at least one letter, so a run
    #                     of spaces alone is never reported as a product
    #   \s*\$           - accepts both "150 $" and "150$"
    pattern = r"(?<![\d.])(\d+)\s+([a-z][a-z\s]*?)\s+(\d+(?:\.\d+)?)\s*\$"

    product_details = []
    for quantity_text, name, price_text in re.findall(pattern, text):
        quantity = int(quantity_text)
        # Multiply in decimal arithmetic so the total carries no binary
        # floating-point artefacts (3 * 0.1 gives 0.3, not 0.30000000000000004).
        total_price = float(Decimal(price_text) * quantity)
        product_details.append({
            'Product Name': name.strip(),
            'Quantity': quantity,
            'Unit Price': float(price_text),
            'Total Price': total_price
        })

    return product_details

# Parse the normalized sentence into one record per purchased product
product_details = extract_product_details(cleaned_text)

# Table header with a 60-character rule underneath it
print(f"{'Product':<20} {'Quantity':<10} {'Unit Price':<12} {'Total Price':<12}", '-' * 60, sep='\n')

# One left-aligned row per product, using the same column widths as the header
for detail in product_details:
    print(f"{detail['Product Name']:<20} {detail['Quantity']:<10} {detail['Unit Price']:<12} {detail['Total Price']:<12}")

Product              Quantity   Unit Price   Total Price 
------------------------------------------------------------
samsung smartphones  3          150.0        450.0       
banana               4          1.2          4.8         
hamburger            1          4.5          4.5         
