In [89]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, RandomizedSearchCV
import numpy as np
from joblib import dump
import pandas as pd

In [91]:
# Load the dataset
df = pd.read_csv(r'cleaned_encode_food_adulteration.csv')


In [93]:
# Drop the dates column
df = df.drop(columns=['detection_date'])

In [95]:
df_encoded = pd.get_dummies(df[['brand', 'category', 'product_name', 'adulterant', 'detection_method', 'health_risk']], drop_first=True)

# Compute Pearson correlation
correlation_matrix = df_encoded.corr()

# Display the correlation matrix
print(correlation_matrix)


                     brand  category  product_name  adulterant  \
brand             1.000000 -0.059554      0.028361   -0.034726   
category         -0.059554  1.000000     -0.060926    0.083208   
product_name      0.028361 -0.060926      1.000000   -0.056560   
adulterant       -0.034726  0.083208     -0.056560    1.000000   
detection_method  0.074250 -0.016434      0.106940    0.054394   
health_risk       0.006836 -0.024052     -0.015533   -0.035851   

                  detection_method  health_risk  
brand                     0.074250     0.006836  
category                 -0.016434    -0.024052  
product_name              0.106940    -0.015533  
adulterant                0.054394    -0.035851  
detection_method          1.000000    -0.003260  
health_risk              -0.003260     1.000000  


In [97]:
import pandas as pd

# Compute correlation matrix
correlation_matrix = df.corr()

# Display correlation of all features with 'severity'
print(correlation_matrix['severity'].sort_values(ascending=False))


severity            1.000000
Unnamed: 0          0.081316
adulteration_id     0.081316
brand               0.028481
action_taken        0.028245
detection_method    0.019787
health_risk         0.013633
product_name        0.008972
adulterant         -0.005722
category           -0.035889
Name: severity, dtype: float64


In [99]:
print(df.isnull().sum())


Unnamed: 0          0
adulteration_id     0
product_name        0
brand               0
category            0
adulterant          0
detection_method    0
severity            0
health_risk         0
action_taken        0
dtype: int64


In [101]:
df.drop_duplicates(inplace=True)


In [129]:
X = df[['product_name','adulterant']]  # Independent variables
y = df['severity']  # Target variable


In [131]:

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define hyperparameter grid for RandomizedSearchCV
param_dist = {
   'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Perform RandomizedSearchCV
random_search = RandomizedSearchCV(
    RandomForestClassifier(random_state=42, class_weight="balanced"),  # Added class_weight
    param_distributions=param_dist, 
    n_iter=50,  # Increased iterations
    cv=5, 
    scoring="accuracy", 
    n_jobs=-1, 
    random_state=42
)
random_search.fit(X_train, y_train)

# Get best model
best_model = random_search.best_estimator_

# Predict and evaluate
y_pred = best_model.predict(X_test)

# Print results
print("Best Parameters:", random_search.best_params_)
print("Accuracy Score:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Best Parameters: {'n_estimators': 50, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_depth': None, 'bootstrap': True}
Accuracy Score: 0.45569620253164556
Classification Report:
               precision    recall  f1-score   support

           1       0.57      0.44      0.50        36
           2       0.32      0.50      0.39        20
           3       0.50      0.43      0.47        23

    accuracy                           0.46        79
   macro avg       0.46      0.46      0.45        79
weighted avg       0.49      0.46      0.46        79



In [11]:
dump(best_model, "adulteration-prediction-model.joblib")

['adulteration-prediction-model.joblib']