In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso
import matplotlib.pyplot as plt
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Load your data (replace 'your_data.csv' with your actual file)
data = pd.read_csv(r"Datasets\food-contamination-data-cleaned-2.csv")

# Define X and Y
X = data.drop('ResultValue', axis=1) # All columns except ResultValue
y = data['ResultValue'] # The ResultValue column

In [3]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply log transformation to the target variable (y_train and y_test)
y_train_log = np.log1p(y_train)
y_test_log = np.log1p(y_test)

# Identify categorical and numerical columns (assuming you've done this)
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numerical_cols = X.select_dtypes(exclude=['object']).columns.tolist()

In [4]:
# Create preprocessor (assuming you've done this)
numerical_transformer = 'passthrough'
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])
preprocessor = ColumnTransformer(transformers=[('num', numerical_transformer, numerical_cols),
                                               ('cat', categorical_transformer, categorical_cols)])

# Create the Decision Tree Regressor pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', DecisionTreeRegressor(random_state=42))])

# Define the hyperparameter grid to search
param_grid = {
    'regressor__max_depth': [None, 5, 10, 15, 20],
    'regressor__min_samples_split': [2, 5, 10, 20],
    'regressor__min_samples_leaf': [1, 3, 5, 10],
    'regressor__max_features': ['sqrt', 'log2', None]  # None means consider all features
}

# Create the GridSearchCV object
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Fit the GridSearchCV object to the training data (using the log-transformed target)
grid_search.fit(X_train, y_train_log)

# Print the best hyperparameters found
print(f"Best hyperparameters: {grid_search.best_params_}")

# Get the best model from GridSearchCV
best_decision_tree_model = grid_search.best_estimator_

# Make predictions on the test set using the best model
y_pred_log_tuned = best_decision_tree_model.predict(X_test)

# Inverse transform the predictions back to the original scale
y_pred_original_scale_tuned = np.expm1(y_pred_log_tuned)

Best hyperparameters: {'regressor__max_depth': None, 'regressor__max_features': None, 'regressor__min_samples_leaf': 10, 'regressor__min_samples_split': 2}


In [5]:
# Evaluate the best model on the original scale predictions
mse_tuned = mean_squared_error(y_test, y_pred_original_scale_tuned)
r2_tuned = r2_score(y_test, y_pred_original_scale_tuned)

print("\nResults with Tuned Decision Tree (Log Transformed Target):")
print(f'Decision Tree MSE (Original Scale): {mse_tuned}')
print(f'Decision Tree R-squared (Original Scale): {r2_tuned}')


Results with Tuned Decision Tree (Log Transformed Target):
Decision Tree MSE (Original Scale): 691.1721707067279
Decision Tree R-squared (Original Scale): 0.6134339947857661


In [7]:
#User testing block

def predict_contamination(user_input_dict, best_model, numerical_cols, categorical_cols):
    """
    Predicts the contamination level based on user input.

    Args:
        user_input_dict (dict): A dictionary containing the user's input
                                 for the features (keys should match column names).
        best_model (Pipeline): The trained best Decision Tree Regressor model.
        numerical_cols (list): List of numerical column names.
        categorical_cols (list): List of categorical column names.

    Returns:
        float: The predicted contamination level (original scale).
    """
    user_input_df = pd.DataFrame([user_input_dict])

    # Ensure all necessary columns are present (handle missing if needed)
    for col in numerical_cols:
        if col not in user_input_df.columns:
            user_input_df[col] = 0  # Or some other appropriate default

    for col in categorical_cols:
        if col not in user_input_df.columns:
            user_input_df[col] = 'unknown'  # Or some other appropriate default

    # Preprocess the user input using the same preprocessor in the pipeline
    preprocessed_input = best_model.named_steps['preprocessor'].transform(user_input_df)

    # Make prediction (which will be on the log scale)
    predicted_log = best_model.named_steps['regressor'].predict(preprocessed_input)

    # Inverse transform the prediction to the original scale
    predicted_original_scale = np.expm1(predicted_log)[0]  # [0] to get the single prediction

    return predicted_original_scale

# Example user input
user_input = {
    'FoodID': 123,
    'CountryName': 'USA',
    'FoodGroupName': 'Fruits',
    'GEMSFoodName': 'Apples',
    'ContaminantID': 45,
    'ContaminantName': 'Pesticide X',
    'Year': 2024,
    'ContaminationIndividualID': 789  # Include if it was part of your features
}

#update before every test
model_to_be_tested = y_pred_original_scale_tuned

# Get the prediction
prediction = predict_contamination(user_input, model_to_be_tested, numerical_cols, categorical_cols)

print(f"Predicted Contamination Level: {prediction:.2f} micrograms/kg")

AttributeError: 'numpy.ndarray' object has no attribute 'named_steps'

In [None]:
# Assuming you have a specific sample you want to classify (e.g., the first one in your test set)
sample_index = 7
sample_features = X_test.iloc[[sample_index]]  # Get the features for the sample
predicted_log = best_decision_tree_model.predict(sample_features)
predicted_value = np.expm1(predicted_log)[0] # Inverse transform if needed

# Get the contaminant name for this sample (assuming your test set still has this info)
contaminant = X_test.iloc[sample_index]['ContaminantName'] # Adjust if needed

thresholds = {
    "Ethyl carbamate": 400,
    "Cesium 134": 1000,
    "Cesium 137": 1000,
    "Iodine 131": 100,
    "Cesium total": 1000,
    "Dioxins (WHO TEFs)": 2,
    "Dioxin like PCBs (WHO TEFs)": 3,
    "Lead": 100,
    "Cadmium": 100,
    "Aflatoxin (total)": 15,
    "Aflatoxin B1": 5,
    "Aflatoxin B2": 2,
    "Aflatoxin G1": 2,
    "Aflatoxin G2": 2,
    "Aflatoxin M1": 0.5,
    "Tin": 250,
    "Copper": 10000,
    "Mercury": 50,
    "Methyl mercury": 100,
    "Fumonisin B1": 1000,
    "Fumonisin B2": 1000,
    "Fumonisin B3": 1000,
    "Patulin": 50,
    "Nitrite": 20,
    "Arsenic (total)": 200,
    "Arsenic (inorganic)": 100,
    "Deoxynivalenol": 1000,
    "3-Chloro-1,2-propanediol": 20,
    "Ochratoxin A": 5,
    "Zearalenone": 100,
    "Hexachlorobenzene": 10,
    "Hexachlorocyclohexanes (HCH)": 50,
    "Pyrrolizidine alkaloids":100
}

safety_threshold = thresholds.get(contaminant, float('inf'))
safety_label = "Safe" if predicted_value <= safety_threshold else "Unsafe"

print("Sample is classified as:", safety_label)
print(f"Predicted value for {contaminant}: {predicted_value:.2f}")
print(f"Safety threshold for {contaminant}: {safety_threshold}")