## Pre-processing

In [4]:
import pandas as pd
import numpy as np

In [5]:
# Load dataset (only first chunk for testing)
chunk_size = 1000000  # Adjust based on memory constraints
chunks = pd.read_csv("en.openfoodfacts.org.products.csv", low_memory=False, chunksize=chunk_size, sep="\t", on_bad_lines='skip')
# chunks = pd.read_csv("en.openfoodfacts.org.products.csv", low_memory=False, sep="\t", on_bad_lines='skip')

df = next(chunks)  # Process only the first chunk

# 1. Handling Missing Values
df.fillna(method='ffill', inplace=True)  # Forward fill for missing values

# 2. Data Type Conversion
datetime_cols = ['created_datetime', 'last_modified_datetime', 'last_updated_datetime']
numeric_cols = [col for col in df.columns if '_100g' in col or col in ['nutriscore_score', 'nova_group']]

df[datetime_cols] = df[datetime_cols].apply(lambda x: pd.to_datetime(x, errors='coerce') if x.name in df.columns else x)
df[numeric_cols] = df[numeric_cols].apply(lambda x: pd.to_numeric(x, errors='coerce') if x.name in df.columns else x)

# 3. Reducing Memory Usage
df[numeric_cols] = df[numeric_cols].astype('float32')

# 4. Filtering Rows: Keep only products sold in India
if 'countries' in df.columns:
    df = df[df['countries'].str.contains('India', na=False, case=False)]

# 5. Cleaning Text Fields
def clean_text(text):
    if isinstance(text, str):
        return text.strip().replace('\n', ' ').replace('\r', '').lower()
    return text

df = df.applymap(clean_text)

# 6. Handling Duplicates
df.drop_duplicates(inplace=True)

# 7. Standardizing Units: Convert kcal to kJ (1 kcal = 4.184 kJ)
if 'energy-kcal_100g' in df.columns:
    df['energy-kj_100g'] = df['energy-kcal_100g'] * 4.184

# Save preprocessed data
df.to_csv("cleaned_dataset_chunk.csv", index=False)
df.to_csv("preprocessed_data_chunk.csv", index=False)

print("Data preprocessing complete for first chunk. Saved as cleaned_dataset_chunk.csv and preprocessed_data_chunk.csv")

  df.fillna(method='ffill', inplace=True)  # Forward fill for missing values
  df = df.applymap(clean_text)


Data preprocessing complete for first chunk. Saved as cleaned_dataset_chunk.csv and preprocessed_data_chunk.csv


## ML Model

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.neighbors import NearestNeighbors
import numpy as np

In [5]:
# Load dataset
df = pd.read_csv("cleaned_dataset_chunk.csv", dtype={'code': str})

# Drop columns with all NaN values
df = df.dropna(axis=1, how='all')

# Identify numeric features
numeric_features = df.select_dtypes(include=[np.number]).columns.tolist()

# Fill missing values in numeric features with the median
df[numeric_features] = df[numeric_features].fillna(df[numeric_features].median())

# Normalize numeric features
scaler = StandardScaler()
df[numeric_features] = scaler.fit_transform(df[numeric_features])

# Train Nearest Neighbors model
nn_model = NearestNeighbors(n_neighbors=5, metric='euclidean')
nn_model.fit(df[numeric_features])

def suggest_alternative(barcode):
    barcode = str(barcode)  # Ensure barcode is treated as a string
    product = df[df['code'] == barcode]
    if product.empty:
        return "Product not found"
    
    print(f"Product Name: {product['product_name'].values[0]}")
    
    product_features = product[numeric_features].fillna(0)
    product_features = scaler.transform(product_features)
    
    # Check for NaN values
    if np.isnan(product_features).any():
        print("Warning: NaN values found in product_features after filling.")
        product_features = np.nan_to_num(product_features)
    
    distances, indices = nn_model.kneighbors(product_features)
    
    original_palm_oil = product['palmitic-acid_100g'].values[0]
    alternatives = []
    
    for idx in indices[0]:
        alternative = df.iloc[idx]
        alternative_palm_oil = alternative['palmitic-acid_100g']
        if 0 < alternative_palm_oil < original_palm_oil:  # Relatively less palm oil
            alternatives.append(alternative[['code', 'product_name', 'palmitic-acid_100g']])
    
    return alternatives if alternatives else "No suitable alternative found"

In [6]:
# Example usage
barcode_input = "804410415009"  # Example barcode
print(suggest_alternative(barcode_input))

Product Name: kitkat chunky peanut butter
No suitable alternative found


