In [1]:
#pip install pandas openpyxl nltk langdetect

In [2]:
import pandas as pd
import re
import string
from langdetect import detect
from nltk.corpus import words
from nltk.stem import WordNetLemmatizer
import nltk

# Download necessary corpora (only needed once)
#nltk.download('words')
#nltk.download('wordnet')
#nltk.download('omw-1.4')

# English words set
english_vocab = set(words.words())
lemmatizer = WordNetLemmatizer()

# Common quantity/measurement units
units = {
    "g", "gram", "grams", "kg", "ml", "l", "tsp", "tbsp", "cup", "cups", "oz", "pound", "pinch",
    "teaspoon", "teaspoons", "tablespoon", "tablespoons", "litre", "litres", "millilitre", "millilitres",
    "inch", "inches", "clove", "cloves", "slice", "slices", "can", "cans", "bottle", "bottles",
    "packet", "packets", "piece", "pieces"
}

# Load Excel file
df = pd.read_excel("output_with_ingredients.xlsx")

# Combine all ingredient strings into one
all_ingredients = " ".join(df["ingredients"].dropna().astype(str).tolist())

# Extract tokens (words)
tokens = re.findall(r'\b\w+\b', all_ingredients)

non_english_words = set()

for word in tokens:
    word_lower = word.lower()

    # Skip numbers, units, or words starting with digits
    if (word_lower.isdigit() or
        re.match(r'^\d', word_lower) or
        word_lower in units):
        continue

    # Lemmatize the word to its singular/base form
    lemma = lemmatizer.lemmatize(word_lower)

    # Skip if it's an English word (after lemmatizing)
    if lemma in english_vocab:
        continue

    # Detect and include only non-English words
    try:
        if detect(word_lower) != 'en':
            non_english_words.add(word_lower)
    except:
        # If language detection fails, include if it's non-ASCII
        if not word_lower.isascii():
            non_english_words.add(word_lower)

# Output the final result
print("Non-English / Hindi-like words:")
for word in sorted(non_english_words):
    print(word)

Non-English / Hindi-like words:
achari
afza
ajinomoto
ajwain
almondette
aloo
amchur
amul
ararot
arbi
asafoetida
baati
bafla
baingan
barfi
basmati
bathua
bhaji
bhajji
bharta
bhatura
bhindi
bhujia
bhujiya
bhurji
biryani
boondi
bundi
bura
chaap
chaat
chakli
chana
chane
chapati
charoli
chatpata
chawal
cheeselings
chenna
cherrie
chhena
chhole
chikoo
chiku
chilies
chilli
chillies
chironji
choco
chokha
chole
choori
chura
cilantro
daal
dabeli
dahi
dahibada
dalia
daliya
dana
desiccated
dhaniya
dhokla
eno
fafda
falafel
falhari
faluda
flattened
frankie
fruitspistachio
frutti
frying
gajar
gappe
garam
gatte
gm
gobhi
gond
gran
gujarati
gujiya
gulab
gulkand
halwa
hara
hummus
idli
imli
indian
jalapeno
jamun
java
jeera
jeerawan
kaale
kabab
kachha
kadai
kaddu
kadhai
kadhi
kaju
karonda
karonde
karounda
kashmere
kashmiri
kasoori
kasturi
kasuri
katira
ke
keri
kevada
kevda
kevra
kewada
kewra
khaman
khand
khari
khas
khathi
kheel
kheer
khichadi
khoya
khurmi
khus
ki
kulcha
kulfa
kulfi
kuttu
laal
lachha
laddoo
