In [5]:
import pandas as pd
import re

Loading dataset

In [6]:
file_path = 'nlp_dataset.csv.gz'

In [7]:
chunk_size = 200000  # Adjust based on memory availability
filtered_chunks = []

for chunk in pd.read_csv(file_path, chunksize=chunk_size, on_bad_lines='skip', encoding='utf-8',compression="gzip", low_memory=False):
    filtered_chunks.append(chunk)

# Concatenate all chunks into a single DataFrame
filtered_df = pd.concat(filtered_chunks, ignore_index=True)
del filtered_chunks

In [52]:
beverage_keywords = [
    "Beverages and beverages preparations", "sodas", "drinks", "juices", "teas",
    "artificially sweetened beverages", "basil seeds drinks", "carbonated drinks", "coffee drinks", "dairy drinks",
    "dehydrated beverages", "energy drinks", "fermented drinks", "flavoured drinks", "herbal tea beverages",
    "hot beverages", "milsubstitutes", "mixed drinks", "non-alcoholic beverages", "still soft drinks",
    "plant-based beverages", "sweetened beverages", "tea-based beverages", "unsweetened beverages",
    "waters", "fr:Citronnades", "Boissons"
]

# Common global drink-related terms for substring matching in main_category
beverage_substrings = [
    "water", "drink","drinks" "juice", "soda", "teas", "coffee", "nectar", "carbonated", "milkshake", 
    "smoothie", "lemonade", "coconut-waters", "non-alcoholic", "infusion", "isotonic",
    "cold-brew", "plant-based-drinks", "kombucha", "protein-shake","tonic", 
    "ginger-beers", "electrolyte-drinks", "beverages", "ciders" 
]

# Convert to lowercase for case-insensitive matching
beverage_keywords_lower = {kw.lower() for kw in beverage_keywords}
beverage_substrings_lower = {kw.lower() for kw in beverage_substrings}

def check_categories(categories_str):
    """Check if any non-alcoholic beverage keyword exists in the 'categories' column."""
    if pd.isna(categories_str):
        return False
    category_list = [c.strip().lower() for c in categories_str.split(",")]
    return any(cat in beverage_keywords_lower for cat in category_list)

def check_main_category(main_category_str):
    """Check if 'main_category' contains any beverage-related substring."""
    if pd.isna(main_category_str):
        return False
    main_category = main_category_str.split(":", 1)[-1].strip().lower()
    parts = main_category.replace("-", " ").split()
    return any(any(sub in part for sub in beverage_substrings_lower) for part in parts)

# Apply conditions to each column
categories_condition = filtered_df['categories'].apply(check_categories)
main_category_condition = filtered_df['main_category'].apply(check_main_category)

# Combine conditions using logical OR
mask = categories_condition | main_category_condition
filtered_df2 = filtered_df[mask]
filtered_df2.shape

(168796, 6)

In [76]:
new_df = filtered_df2.dropna(subset=['ingredients_tags', 'nutriscore_grade'])
new_df = new_df[['ingredients_tags', 'nutriscore_grade']]
new_df2 = new_df[~new_df['nutriscore_grade'].isin(['unknown', 'not-applicable'])]
new_df2.shape

(68424, 2)

In [87]:
import re

def clean_ingredients(text):
    # Keep only ingredients that start with 'en:'
    ingredients = [word for word in text.split(',') if word.strip().startswith('en:')]
    
    # Remove 'en:' prefix
    ingredients = [re.sub(r'^en:', '', ing) for ing in ingredients]
    
    # Remove numbers, special characters, and extra spaces
    clean_text = ' '.join(ingredients)
    clean_text = re.sub(r'[^a-zA-Z\s]', ' ', clean_text).strip().lower()  # Keep only letters
    # clean_text = re.sub(r'\s+', ' ', clean_text).strip().lower()
    
    return clean_text

new_df2['cleaned_ingredients'] = new_df2['ingredients_tags'].apply(clean_ingredients)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df2['cleaned_ingredients'] = new_df2['ingredients_tags'].apply(clean_ingredients)


In [91]:
new_df2.sample(2)

Unnamed: 0,ingredients_tags,nutriscore_grade,cleaned_ingredients
922622,"en:water,en:e501ii,en:e501,en:e170i,en:e170,en...",b,water e ii e e i e e e
1572739,"en:water,en:grape-juice,en:fruit,en:berries,en...",e,water grape juice fruit berries juice fruit ju...


In [80]:

from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=8000, stop_words='english')

# Apply TF-IDF on the cleaned ingredients text
X = tfidf.fit_transform(new_df2['cleaned_ingredients'])

# Check the shape of the resulting matrix (features)
print(X.shape)


(68424, 8000)


In [81]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(new_df2["nutriscore_grade"])

In [82]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Split data into training and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the classifier
clf = RandomForestClassifier(n_estimators=200, random_state=42)

# Train the model
clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate the model
print("Classification Report:\n", classification_report(y_test, y_pred, target_names=label_encoder.classes_))
print("Accuracy: ", accuracy_score(y_test, y_pred))


Classification Report:
               precision    recall  f1-score   support

           a       0.79      0.66      0.72       598
           b       0.85      0.77      0.80      3120
           c       0.72      0.73      0.73      3628
           d       0.61      0.42      0.50      2046
           e       0.69      0.85      0.76      4293

    accuracy                           0.73     13685
   macro avg       0.73      0.69      0.70     13685
weighted avg       0.73      0.73      0.72     13685

Accuracy:  0.7272195834855681


In [92]:
# [1659593]
new_df2['ingredients_tags'][922622]

'en:water,en:e501ii,en:e501,en:e170i,en:e170,en:e509,en:e511'

In [36]:
import re

def clean_text(text):
    # 1️⃣ Ensure there is a space after removing language prefixes (en:, fr:, ro:)
    text = re.sub(r'\b[a-z]{2,3}:\s*', ' ', text)  

    # 2️⃣ Remove E-numbers (e300, e322i, etc.)
    text = re.sub(r'\b[eE]\d{3,}[a-zA-Z]*\b', ' ', text)  

    # 3️⃣ Remove emojis
    text = re.sub(r'[^\w\s,]', '', text)  

    # 4️⃣ Remove numbers (like 9.2, 3.4, etc.)
    text = re.sub(r'\b\d+(\.\d+)?\b', ' ', text)  

    # 5️⃣ Replace dashes and underscores with spaces
    text = re.sub(r'[-_,]+', ' ', text)  

    # 6️⃣ Remove extra spaces (so words don’t stick together)
    text = re.sub(r'\s+', ' ', text).strip()  

    # 7️⃣ Convert to lowercase
    text = text.lower()  
    
    return text

# Apply to DataFrame
new_df2['processed_ingre'] = new_df2['ingredients_tags'].apply(clean_text)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df2['processed_ingre'] = new_df2['ingredients_tags'].apply(clean_text)


In [37]:
new_df2['processed_ingre'][1659593]

'boissonrafraichissanteauxjusdagrumes ingredients jusdepamplemousse92sucedecannejusdemandarine34jusdecitron31jusdecitronvert28antioxydantacideascorbiquepreparions apresouvertureconservationlaaurefrigerateuretconsommezladansles3jourpourunedegustationoptimaleaconsommerdepreferenceavantlenelotsurlehautdebouteille water servizfrais conservation'