In [190]:
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

In [192]:
# Read the cleaned food-contamination measurements; the path is relative to
# the notebook's working directory.
data_path = "food contamination data_cleaned2.csv"
df = pd.read_csv(data_path)

In [194]:
# Validate the schema up front so that a missing column is reported before
# any column of `df` has been modified.
if "ContaminantName" not in df.columns or "ResultValue" not in df.columns:
    raise KeyError("Dataset must contain 'ContaminantName' and 'ResultValue' columns.")

if "FoodGroupName" not in df.columns:
    raise KeyError("Dataset must contain 'FoodGroupName' column.")

# log1p yields NaN / -inf for inputs at or below -1 and propagates NaN, which
# would only surface later as an error when the clustering model is fitted.
# Fail here instead, where the cause is obvious.
if df["ResultValue"].isna().any() or df["ResultValue"].le(-1).any():
    raise ValueError("'ResultValue' must be non-missing and greater than -1 for the log1p transform.")

# Standardize text format. Missing names in BOTH categorical columns become
# the explicit category "unknown", so neither encoder is fitted on NaN.
for text_column in ("ContaminantName", "FoodGroupName"):
    df[text_column] = df[text_column].fillna("Unknown").str.strip().str.lower()

# Initialize and fit LabelEncoders (one per categorical column; both are
# reused later to encode new samples).
label_encoder_contaminant = LabelEncoder()
label_encoder_foodgroup = LabelEncoder()

df["ContaminantEncoded"] = label_encoder_contaminant.fit_transform(df["ContaminantName"])
df["FoodGroupEncoded"] = label_encoder_foodgroup.fit_transform(df["FoodGroupName"])

# Log transformation (log(1 + x)); scaling happens in the next cell.
df["LogResultValue"] = np.log1p(df["ResultValue"])

In [196]:
# Standardize the log-transformed result. The fitted scaler is kept so the
# same transform can be applied to new samples later.
log_result_frame = df[["LogResultValue"]]
scaler = StandardScaler().fit(log_result_frame)
df["ScaledLogResult"] = scaler.transform(log_result_frame)

In [198]:
# Feature matrix for clustering: the two integer label codes plus the
# standardized log result, in the column order the model is trained on.
# NOTE(review): the label codes enter the model unscaled, next to a
# standardized column -- confirm this weighting is intended.
feature_columns = ["ContaminantEncoded", "ScaledLogResult", "FoodGroupEncoded"]
X = df.loc[:, feature_columns]


In [200]:
# Fit a 3-cluster K-Means model (fixed seed, 50 initialisations) and store
# each row's cluster id on the frame.
kmeans = KMeans(n_clusters=3, n_init=50, random_state=42).fit(X)
df["Cluster"] = kmeans.labels_

In [202]:
# Cluster ids ordered by their mean raw ResultValue, lowest first.
cluster_means = (
    df.groupby("Cluster")["ResultValue"]
    .mean()
    .sort_values()
    .index
)

# Name the three clusters by that ordering.
cluster_mapping = {
    cluster_means[rank]: level
    for rank, level in enumerate(("Low", "Medium", "High"))
}


In [204]:
# Persist the fitted encoders and scaler next to the notebook.
for fitted_object, artifact_path in (
    (label_encoder_contaminant, "contaminant_encoder.pkl"),
    (label_encoder_foodgroup, "foodgroup_encoder.pkl"),
    (scaler, "scaler.pkl"),
):
    joblib.dump(fitted_object, artifact_path)

# The model is written last as a bare expression so the cell still echoes
# the list of files joblib wrote.
joblib.dump(kmeans, "kmeans_model.pkl")


['kmeans_model.pkl']

In [206]:
# Mean silhouette coefficient of the fitted clustering over the rows of X.
# NOTE(review): the silhouette measures cluster cohesion/separation; it is
# not a classification accuracy, despite the label printed below.
silhouette = silhouette_score(X, labels=kmeans.labels_)
print(f"Silhouette Score (Clustering Accuracy): {silhouette:.2f}")

Silhouette Score (Clustering Accuracy): 0.60


In [216]:
# Remove the intermediate modelling columns from df.
columns_to_drop = ["ScaledLogResult", "ContaminantEncoded", "Cluster"]

# Split the requested columns into those present and those absent, so the
# cell can be re-run after the columns have already been dropped.
existing_columns = [col for col in columns_to_drop if col in df.columns]
missing_columns = [col for col in columns_to_drop if col not in df.columns]

if existing_columns:  # Only drop if columns exist
    df.drop(columns=existing_columns, inplace=True)

# Warn whenever ANY requested column is absent (previously the warning was
# printed only when none of them existed), and say which ones.
if missing_columns:
    print(f"Warning: Some columns to drop were not found in df: {missing_columns}")

print(df.head())  # Check if df is correct


   ContaminationIndividualID  FoodID    CountryName       FoodGroupName  \
0                       1644     121  HONG KONG SAR  legumes and pulses   
1                       1645     121  HONG KONG SAR  legumes and pulses   
2                       1646     121  HONG KONG SAR  legumes and pulses   
3                       1647     121  HONG KONG SAR  legumes and pulses   
4                       1648     121  HONG KONG SAR  legumes and pulses   

  GEMSFoodName  ContaminantID  ContaminantName  ResultValue  Year  \
0       PULSES             36  ethyl carbamate          1.0  2008   
1       PULSES             36  ethyl carbamate          2.8  2008   
2       PULSES             36  ethyl carbamate          3.5  2008   
3       PULSES             36  ethyl carbamate          0.8  2008   
4       PULSES             36  ethyl carbamate         44.0  2008   

   FoodGroupEncoded  LogResultValue  
0                 3        0.693147  
1                 3        1.335001  
2                 3 

In [218]:
def _encode_label(encoder, label, kind):
    """Return the integer code of ``label`` without modifying ``encoder``.

    A label present in ``encoder.classes_`` is encoded with
    ``encoder.transform``. Any other label gets the code
    ``len(encoder.classes_)`` (the first code not used by a known class) and a
    ``UserWarning`` is emitted; ``kind`` is only used in that message.
    """
    known_classes = encoder.classes_
    if label in known_classes:
        return encoder.transform([label])[0]

    warnings.warn(
        f"Unseen {kind} {label!r}: it was not present when the encoder was fitted, "
        "so the predicted contamination level may be unreliable.",
        UserWarning,
        stacklevel=3,
    )
    return len(known_classes)


def classify_contamination(new_contaminant, new_foodgroup, new_value, kmeans_model, scaler, label_encoder_contaminant, label_encoder_foodgroup, cluster_mapping):
    """Classify one measurement into a contamination level.

    The inputs are prepared as ``[contaminant code, scaled log1p(value),
    food-group code]`` and passed to ``kmeans_model.predict``; the predicted
    cluster id is then looked up in ``cluster_mapping``.

    Parameters
    ----------
    new_contaminant, new_foodgroup : str
        Names to classify; they are stripped and lower-cased before encoding.
    new_value : number
        Measured value; must be finite and greater than -1 so that
        ``log1p`` is defined.
    kmeans_model
        Fitted model exposing ``predict``.
    scaler
        Fitted scaler exposing ``transform``; applied to ``log1p(new_value)``.
    label_encoder_contaminant, label_encoder_foodgroup
        Fitted encoders exposing ``classes_`` and ``transform``. They are
        never modified by this function.
    cluster_mapping : dict
        Maps cluster ids to level names.

    Returns
    -------
    str
        The mapped level, or ``"Unknown"`` when the predicted cluster id is
        not a key of ``cluster_mapping``.

    Raises
    ------
    ValueError
        If ``log1p(new_value)`` is not finite (value is NaN, infinite or <= -1).

    Notes
    -----
    A name the encoder has not seen is encoded as ``len(encoder.classes_)``
    and triggers a ``UserWarning``. The encoder's ``classes_`` are left
    untouched, so the result for a given input does not depend on which
    unseen names were classified earlier.
    """
    # Standardize input text
    contaminant_key = new_contaminant.strip().lower()
    foodgroup_key = new_foodgroup.strip().lower()

    # Encode inputs; unseen labels are handled without touching the encoders
    encoded_contaminant = _encode_label(label_encoder_contaminant, contaminant_key, "contaminant")
    encoded_foodgroup = _encode_label(label_encoder_foodgroup, foodgroup_key, "food group")

    # Log-transform the value. NumPy's floating-point warnings are silenced
    # here because invalid input is reported just below as a ValueError.
    with np.errstate(invalid="ignore", divide="ignore"):
        log_value = np.log1p(new_value)
    if not np.all(np.isfinite(log_value)):
        raise ValueError(f"new_value must be a finite number greater than -1, got {new_value!r}")

    # Scale the log-transformed result value
    scaled_value = scaler.transform(np.array(log_value).reshape(-1, 1))[0][0]

    # Predict cluster
    cluster = kmeans_model.predict([[encoded_contaminant, scaled_value, encoded_foodgroup]])[0]

    # Return contamination level
    return cluster_mapping.get(cluster, "Unknown")

# Reload the persisted encoders, scaler and model for testing.
# NOTE: joblib.load unpickles its input, so only point it at trusted files.
# cluster_mapping is not reloaded here; it is the dict built in an earlier cell.
label_encoder_contaminant, label_encoder_foodgroup, scaler, kmeans = (
    joblib.load(artifact_path)
    for artifact_path in (
        "contaminant_encoder.pkl",
        "foodgroup_encoder.pkl",
        "scaler.pkl",
        "kmeans_model.pkl",
    )
)

# Example: classify one new measurement
new_contaminant = "Tin"
new_foodgroup = "Milk and Dairy Products"
new_value = 100  # Contaminant level

classification = classify_contamination(
    new_contaminant=new_contaminant,
    new_foodgroup=new_foodgroup,
    new_value=new_value,
    kmeans_model=kmeans,
    scaler=scaler,
    label_encoder_contaminant=label_encoder_contaminant,
    label_encoder_foodgroup=label_encoder_foodgroup,
    cluster_mapping=cluster_mapping,
)

print(f"Predicted Contamination Level for {new_contaminant} in {new_foodgroup} with value {new_value}: {classification}")

Predicted Contamination Level for Tin in Milk and Dairy Products with value 100: Low


