In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, RandomizedSearchCV
import numpy as np
from joblib import dump
import pandas as pd

In [5]:
# Load the dataset
df = pd.read_csv(r'cleaned_encode_food_adulteration.csv')


In [6]:
# Drop the dates column
df = df.drop(columns=['detection_date'])

In [9]:
# Check for missing values and handle them (drop or fill)
df = df.dropna()  # You can also use df.fillna(method='ffill') if needed

# Define input (X) and output (y) variables
X = df.drop(columns=['brand','severity','adulteration_id', 'health_risk', 'action_taken'])  # Replace with actual target column
y = df['severity']  # Replace with actual target column


In [10]:

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define hyperparameter grid for RandomizedSearchCV
param_dist = {
    "n_estimators": np.arange(100, 1000, 100),  # Increased range
    "max_depth": np.arange(10, 100, 10),       # Increased range
    "min_samples_split": [2, 5, 10, 20],       # Added more options
    "min_samples_leaf": [1, 2, 4, 8],          # Added more options
    "max_features": ["sqrt", "log2", None],    # Added None for all features
    "bootstrap": [True, False]
}

# Perform RandomizedSearchCV
random_search = RandomizedSearchCV(
    RandomForestClassifier(random_state=42, class_weight="balanced"),  # Added class_weight
    param_distributions=param_dist, 
    n_iter=50,  # Increased iterations
    cv=5, 
    scoring="accuracy", 
    n_jobs=-1, 
    random_state=42
)
random_search.fit(X_train, y_train)

# Get best model
best_model = random_search.best_estimator_

# Predict and evaluate
y_pred = best_model.predict(X_test)

# Print results
print("Best Parameters:", random_search.best_params_)
print("Accuracy Score:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Best Parameters: {'n_estimators': np.int64(800), 'min_samples_split': 20, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': np.int64(20), 'bootstrap': True}
Accuracy Score: 0.3924050632911392
Classification Report:
               precision    recall  f1-score   support

           1       0.55      0.50      0.52        36
           2       0.11      0.10      0.10        20
           3       0.41      0.48      0.44        23

    accuracy                           0.39        79
   macro avg       0.35      0.36      0.35        79
weighted avg       0.39      0.39      0.39        79



In [11]:
dump(best_model, "adulteration-prediction-model.joblib")

['adulteration-prediction-model.joblib']