In [1]:
import pandas as pd
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
import numpy as np

In [3]:
# Path to dataset
DATASET_PATH = r"C:\\Users\\ycong\\Documents\\FYP\\Dataset\\Final_Augmented_dataset_Diseases_and_Symptoms.csv"

# Load dataset
df = pd.read_csv(DATASET_PATH)

if 'diseases' not in df.columns:
    raise ValueError("Dataset must contain 'diseases' column")

# Features and target
symptom_cols = [col for col in df.columns if col != 'diseases']
X = df[symptom_cols]
y = df['diseases']

# Split for evaluation (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train model
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
print("Naïve Bayes model trained successfully.")

# ---- Evaluation ----
y_pred = nb_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Prediction Accuracy: {accuracy:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Save model and symptom columns to a file
with open(r"C:\\Users\\ycong\\Documents\\FYP\\FYP_Code\\nb_model.pkl", "wb") as f:
    pickle.dump({'model': nb_model, 'symptom_cols': symptom_cols}, f)

print("Model saved to nb_model.pkl")


Naïve Bayes model trained successfully.
Prediction Accuracy: 0.8363

Classification Report:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


                                                          precision    recall  f1-score   support

                               abdominal aortic aneurysm       1.00      1.00      1.00        20
                                        abdominal hernia       0.90      0.98      0.93        81
                                         abscess of nose       1.00      0.74      0.85        58
                                     abscess of the lung       0.00      0.00      0.00         6
                                  abscess of the pharynx       0.85      0.83      0.84        63
                                    acanthosis nigricans       0.00      0.00      0.00         6
                                               acariasis       1.00      0.60      0.75         5
                                               achalasia       1.00      0.70      0.82        20
                                                    acne       0.97      0.66      0.79       103
                   

In [2]:
# Path to dataset
DATASET_PATH = r"C:\\Users\\ycong\\Documents\\FYP\\Dataset\\Final_Augmented_dataset_Diseases_and_Symptoms.csv"

# Load dataset
df = pd.read_csv(DATASET_PATH)

if 'diseases' not in df.columns:
    raise ValueError("Dataset must contain 'diseases' column")

# Features and target
symptom_cols = [col for col in df.columns if col != 'diseases']
X = df[symptom_cols]
y = df['diseases']

# Split for evaluation (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train model
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
print("Naïve Bayes model trained successfully.")

# ---- Cross Validation ----
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(nb_model, X, y, cv=5)  # 5-fold CV
print("\nCross-Validation Accuracy Scores:", cv_scores)
print("Mean CV Accuracy: {:.4f}".format(cv_scores.mean()))

# ---- Evaluation on Test Set ----
y_pred = nb_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"\nTest Set Prediction Accuracy: {accuracy:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# ---- Summary Classification Report ----
report = classification_report(y_test, y_pred, output_dict=True)
overall_precision = report['weighted avg']['precision']
overall_recall = report['weighted avg']['recall']
overall_f1 = report['weighted avg']['f1-score']

print("\n----- Summary Report -----")
print(f"Overall Precision : {overall_precision:.4f}")
print(f"Overall Recall    : {overall_recall:.4f}")
print(f"Overall F1-score  : {overall_f1:.4f}")
print("---------------------------")

# ---- Save Model ----
with open(r"C:\\Users\\ycong\\Documents\\FYP\\FYP_Code\\nb_model1.pkl", "wb") as f:
    pickle.dump({'model': nb_model, 'symptom_cols': symptom_cols}, f)

print("Model saved to nb_model1.pkl")

Naïve Bayes model trained successfully.





Cross-Validation Accuracy Scores: [0.83320173 0.84014659 0.8353277  0.83931645 0.83625909]
Mean CV Accuracy: 0.8369

Test Set Prediction Accuracy: 0.8363

Classification Report:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


                                                          precision    recall  f1-score   support

                               abdominal aortic aneurysm       1.00      1.00      1.00        20
                                        abdominal hernia       0.90      0.98      0.93        81
                                         abscess of nose       1.00      0.74      0.85        58
                                     abscess of the lung       0.00      0.00      0.00         6
                                  abscess of the pharynx       0.85      0.83      0.84        63
                                    acanthosis nigricans       0.00      0.00      0.00         6
                                               acariasis       1.00      0.60      0.75         5
                                               achalasia       1.00      0.70      0.82        20
                                                    acne       0.97      0.66      0.79       103
                   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])



----- Summary Report -----
Overall Precision : 0.8442
Overall Recall    : 0.8363
Overall F1-score  : 0.8310
---------------------------
Model saved to nb_model1.pkl


In [4]:
# Path to dataset
DATASET_PATH = r"C:\\Users\\ycong\\Documents\\FYP\\Dataset\\Final_Augmented_dataset_Diseases_and_Symptoms.csv"

# Load dataset
df = pd.read_csv(DATASET_PATH)

if 'diseases' not in df.columns:
    raise ValueError("Dataset must contain 'diseases' column")

# Features and target
symptom_cols = [col for col in df.columns if col != 'diseases']
X = df[symptom_cols]
y = df['diseases']

# Split for evaluation (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train Logistic Regression model
log_reg = LogisticRegression(
    max_iter=5000,        # ensure convergence for many classes
    multi_class='multinomial',
    solver='lbfgs'
)

log_reg.fit(X_train, y_train)
print("Logistic Regression model trained successfully.")

# ---- Evaluation ----
y_pred = log_reg.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Prediction Accuracy: {accuracy:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Save model and symptom columns to a file
with open(r"C:\\Users\\ycong\\Documents\\FYP\\FYP_Code\\log_reg_model.pkl", "wb") as f:
    pickle.dump({'model': log_reg, 'symptom_cols': symptom_cols}, f)

print("Model saved to log_reg_model.pkl")



Logistic Regression model trained successfully.
Prediction Accuracy: 0.8655

Classification Report:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


                                                          precision    recall  f1-score   support

                               abdominal aortic aneurysm       0.74      1.00      0.85        20
                                        abdominal hernia       0.92      0.99      0.95        81
                                         abscess of nose       0.89      0.86      0.88        58
                                     abscess of the lung       1.00      1.00      1.00         6
                                  abscess of the pharynx       0.94      0.98      0.96        63
                                    acanthosis nigricans       0.86      1.00      0.92         6
                                               acariasis       0.83      1.00      0.91         5
                                               achalasia       0.73      0.80      0.76        20
                                                    acne       0.66      0.92      0.77       103
                   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [4]:
# Path to dataset
DATASET_PATH = r"C:\\Users\\ycong\\Documents\\FYP\\Dataset\\Final_Augmented_dataset_Diseases_and_Symptoms.csv"

# Load dataset
df = pd.read_csv(DATASET_PATH)

if 'diseases' not in df.columns:
    raise ValueError("Dataset must contain 'diseases' column")

# Features and target
symptom_cols = [col for col in df.columns if col != 'diseases']
X = df[symptom_cols]
y = df['diseases']

# Split for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Base model
log_reg = LogisticRegression(
    max_iter=5000,
    multi_class='multinomial',
    solver='lbfgs'
)

# Hyperparameter search space
param_dist = {
    'C': np.logspace(-3, 3, 20),       # regularization strength
    'penalty': ['l2'],                 # only 'l2' supported by lbfgs
    'solver': ['lbfgs']                # fixed
}

# Randomized Search (memory-friendly)
rand_search = RandomizedSearchCV(
    estimator=log_reg,
    param_distributions=param_dist,
    n_iter=10,              # try 10 random combinations
    scoring='accuracy',
    cv=3,                   # 3-fold CV
    verbose=1,
    random_state=42,
    n_jobs=1                # avoid memory issues
)

# Fit RandomizedSearchCV
rand_search.fit(X_train, y_train)
print("\nBest Hyperparameters:", rand_search.best_params_)

# ---- Evaluation ----
best_model = rand_search.best_estimator_
y_pred = best_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"\nPrediction Accuracy (Best Model): {accuracy:.4f}")

print("\nFull Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# ---- Summary Classification Report ----
report = classification_report(y_test, y_pred, output_dict=True)
overall_precision = report['weighted avg']['precision']
overall_recall = report['weighted avg']['recall']
overall_f1 = report['weighted avg']['f1-score']

print("\n----- Summary Report -----")
print(f"Overall Precision : {overall_precision:.4f}")
print(f"Overall Recall    : {overall_recall:.4f}")
print(f"Overall F1-score  : {overall_f1:.4f}")
print("---------------------------")

# ---- Save model ----
with open(r"C:\\Users\\ycong\\Documents\\FYP\\FYP_Code\\log_reg_model_tuned1.pkl", "wb") as f:
    pickle.dump({'model': best_model, 'symptom_cols': symptom_cols}, f)

print("\nTuned Logistic Regression model saved to log_reg_model_tuned.pkl")

Fitting 3 folds for each of 10 candidates, totalling 30 fits





Best Hyperparameters: {'solver': 'lbfgs', 'penalty': 'l2', 'C': np.float64(112.88378916846884)}

Prediction Accuracy (Best Model): 0.8662

Full Classification Report:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


                                                          precision    recall  f1-score   support

                               abdominal aortic aneurysm       0.77      1.00      0.87        20
                                        abdominal hernia       0.92      0.99      0.95        81
                                         abscess of nose       0.76      0.88      0.82        58
                                     abscess of the lung       1.00      1.00      1.00         6
                                  abscess of the pharynx       0.94      0.97      0.95        63
                                    acanthosis nigricans       0.75      1.00      0.86         6
                                               acariasis       0.38      1.00      0.56         5
                                               achalasia       0.64      0.90      0.75        20
                                                    acne       0.70      0.89      0.78       103
                   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])



----- Summary Report -----
Overall Precision : 0.8768
Overall Recall    : 0.8662
Overall F1-score  : 0.8669
---------------------------

Tuned Logistic Regression model saved to log_reg_model_tuned.pkl


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [None]:

# Path to dataset
DATASET_PATH = r"C:\\Users\\ycong\\Documents\\FYP\\Dataset\\Final_Augmented_dataset_Diseases_and_Symptoms.csv"

# Load dataset
df = pd.read_csv(DATASET_PATH)

if 'diseases' not in df.columns:
    raise ValueError("Dataset must contain 'diseases' column")

# Features and target
symptom_cols = [col for col in df.columns if col != 'diseases']
X = df[symptom_cols]
y = df['diseases']

# Split for evaluation (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train Random Forest model (memory optimized)
rf_model = RandomForestClassifier(
    n_estimators=100,      # reduce number of trees
    max_depth=20,          # limit depth of trees
    max_features='sqrt',   # reduce number of features considered per split
    random_state=42,
    n_jobs=2               # use fewer cores to avoid RAM overload
)

rf_model.fit(X_train, y_train)
print("Random Forest model trained successfully.")

# ---- Evaluation ----
y_pred = rf_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Prediction Accuracy: {accuracy:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Save model and symptom columns to a file
with open(r"C:\\Users\\ycong\\Documents\\FYP\\FYP_Code\\rf_model.pkl", "wb") as f:
    pickle.dump({'model': rf_model, 'symptom_cols': symptom_cols}, f)

print("Model saved to rf_model.pkl")

Random Forest model trained successfully.
Prediction Accuracy: 0.6513

Classification Report:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


                                                          precision    recall  f1-score   support

                               abdominal aortic aneurysm       1.00      0.30      0.46        20
                                        abdominal hernia       1.00      0.52      0.68        81
                                         abscess of nose       0.95      0.67      0.79        58
                                     abscess of the lung       0.00      0.00      0.00         6
                                  abscess of the pharynx       0.00      0.00      0.00        63
                                    acanthosis nigricans       0.00      0.00      0.00         6
                                               acariasis       0.00      0.00      0.00         5
                                               achalasia       0.00      0.00      0.00        20
                                                    acne       1.00      0.65      0.79       103
                   

In [2]:
# Path to dataset
DATASET_PATH = r"C:\\Users\\ycong\\Documents\\FYP\\Dataset\\Final_Augmented_dataset_Diseases_and_Symptoms.csv"

# Load dataset
df = pd.read_csv(DATASET_PATH)

if 'diseases' not in df.columns:
    raise ValueError("Dataset must contain 'diseases' column")

# Features and target
symptom_cols = [col for col in df.columns if col != 'diseases']
X = df[symptom_cols]
y = df['diseases']

# Split for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Base Random Forest
rf = RandomForestClassifier(
    random_state=42,
    n_jobs=2  # avoid using too much RAM
)

# Randomized search space
param_dist = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'max_features': ['sqrt', 'log2'],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# RandomizedSearchCV
rand_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=10,        # try 10 random combinations
    scoring='accuracy',
    cv=3,             # 3-fold CV
    verbose=1,
    random_state=42,
    n_jobs=1          # memory-friendly
)

# Fit RandomizedSearchCV
rand_search.fit(X_train, y_train)
print("\nBest Hyperparameters:", rand_search.best_params_)

# ---- Evaluation ----
best_rf = rand_search.best_estimator_
y_pred = best_rf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"\nPrediction Accuracy (Best RF Model): {accuracy:.4f}")

print("\nFull Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# ---- Summary Classification Report ----
report = classification_report(y_test, y_pred, output_dict=True)
overall_precision = report['weighted avg']['precision']
overall_recall = report['weighted avg']['recall']
overall_f1 = report['weighted avg']['f1-score']

print("\n----- Summary Report -----")
print(f"Overall Precision : {overall_precision:.4f}")
print(f"Overall Recall    : {overall_recall:.4f}")
print(f"Overall F1-score  : {overall_f1:.4f}")
print("---------------------------")

# ---- Save model ----
with open(r"C:\\Users\\ycong\\Documents\\FYP\\FYP_Code\\rf_model_random_tuned1.pkl", "wb") as f:
    pickle.dump({'model': best_rf, 'symptom_cols': symptom_cols}, f)

print("\nRandomized-tuned Random Forest model saved to rf_model_random_tuned.pkl")

Fitting 3 folds for each of 10 candidates, totalling 30 fits





Best Hyperparameters: {'n_estimators': 50, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': None}

Prediction Accuracy (Best RF Model): 0.8412

Full Classification Report:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


                                                          precision    recall  f1-score   support

                               abdominal aortic aneurysm       1.00      1.00      1.00        20
                                        abdominal hernia       0.93      0.94      0.93        81
                                         abscess of nose       0.77      0.86      0.81        58
                                     abscess of the lung       1.00      1.00      1.00         6
                                  abscess of the pharynx       0.90      0.89      0.90        63
                                    acanthosis nigricans       0.71      0.83      0.77         6
                                               acariasis       0.83      1.00      0.91         5
                                               achalasia       0.67      0.90      0.77        20
                                                    acne       0.71      0.73      0.72       103
                   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])



----- Summary Report -----
Overall Precision : 0.8435
Overall Recall    : 0.8412
Overall F1-score  : 0.8410
---------------------------

Randomized-tuned Random Forest model saved to rf_model_random_tuned.pkl


In [None]:
# Path to dataset
DATASET_PATH = r"C:\\Users\\ycong\\Documents\\FYP\\Dataset\\Final_Augmented_dataset_Diseases_and_Symptoms.csv"

# Load dataset
df = pd.read_csv(DATASET_PATH)

if 'diseases' not in df.columns:
    raise ValueError("Dataset must contain 'diseases' column")

# Features and target
symptom_cols = [col for col in df.columns if col != 'diseases']
X = df[symptom_cols]
y = df['diseases']

# Split for evaluation (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train Linear SVM model
svm_model = LinearSVC(
    C=1.0,          # regularization strength
    max_iter=5000   # ensure convergence
)

svm_model.fit(X_train, y_train)
print("Linear SVM model trained successfully.")

# ---- Evaluation ----
y_pred = svm_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Prediction Accuracy: {accuracy:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Save model and symptom columns to a file
with open(r"C:\\Users\\ycong\\Documents\\FYP\\FYP_Code\\svm_model.pkl", "wb") as f:
    pickle.dump({'model': svm_model, 'symptom_cols': symptom_cols}, f)

print("Model saved to svm_model.pkl")

Linear SVM model trained successfully.
Prediction Accuracy: 0.8613

Classification Report:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


                                                          precision    recall  f1-score   support

                               abdominal aortic aneurysm       0.91      1.00      0.95        20
                                        abdominal hernia       0.94      0.99      0.96        81
                                         abscess of nose       0.83      0.86      0.85        58
                                     abscess of the lung       1.00      1.00      1.00         6
                                  abscess of the pharynx       0.94      0.95      0.94        63
                                    acanthosis nigricans       1.00      0.50      0.67         6
                                               acariasis       1.00      1.00      1.00         5
                                               achalasia       0.88      0.75      0.81        20
                                                    acne       0.75      0.80      0.77       103
                   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [5]:
# Path to dataset
DATASET_PATH = r"C:\\Users\\ycong\\Documents\\FYP\\Dataset\\Final_Augmented_dataset_Diseases_and_Symptoms.csv"

# Load dataset
df = pd.read_csv(DATASET_PATH)

if 'diseases' not in df.columns:
    raise ValueError("Dataset must contain 'diseases' column")

# Features and target
symptom_cols = [col for col in df.columns if col != 'diseases']
X = df[symptom_cols]
y = df['diseases']

# Split for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Base Linear SVM
svm_model = LinearSVC(
    max_iter=5000,
    random_state=42
)

# Randomized search space
param_dist = {
    'C': np.logspace(-3, 3, 20),       # regularization strength
    'loss': ['hinge', 'squared_hinge']
}

# RandomizedSearchCV
rand_search = RandomizedSearchCV(
    estimator=svm_model,
    param_distributions=param_dist,
    n_iter=10,        # try 10 random combinations
    scoring='accuracy',
    cv=3,             # 3-fold CV
    verbose=1,
    random_state=42,
    n_jobs=1          # memory-friendly
)

# Fit RandomizedSearchCV
rand_search.fit(X_train, y_train)
print("\nBest Hyperparameters:", rand_search.best_params_)

# ---- Evaluation ----
best_svm = rand_search.best_estimator_
y_pred = best_svm.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"\nPrediction Accuracy (Best SVM Model): {accuracy:.4f}")

print("\nFull Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# ---- Summary Classification Report ----
report = classification_report(y_test, y_pred, output_dict=True)
overall_precision = report['weighted avg']['precision']
overall_recall = report['weighted avg']['recall']
overall_f1 = report['weighted avg']['f1-score']

print("\n----- Summary Report -----")
print(f"Overall Precision : {overall_precision:.4f}")
print(f"Overall Recall    : {overall_recall:.4f}")
print(f"Overall F1-score  : {overall_f1:.4f}")
print("---------------------------")

# ---- Save model ----
with open(r"C:\\Users\\ycong\\Documents\\FYP\\FYP_Code\\svm_model_random_tuned1.pkl", "wb") as f:
    pickle.dump({'model': best_svm, 'symptom_cols': symptom_cols}, f)

print("\nRandomized-tuned SVM model saved to svm_model_random_tuned.pkl")

Fitting 3 folds for each of 10 candidates, totalling 30 fits





Best Hyperparameters: {'loss': 'squared_hinge', 'C': np.float64(0.6951927961775606)}

Prediction Accuracy (Best SVM Model): 0.8611

Full Classification Report:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


                                                          precision    recall  f1-score   support

                               abdominal aortic aneurysm       0.91      1.00      0.95        20
                                        abdominal hernia       0.94      0.99      0.96        81
                                         abscess of nose       0.83      0.86      0.85        58
                                     abscess of the lung       1.00      1.00      1.00         6
                                  abscess of the pharynx       0.94      0.95      0.94        63
                                    acanthosis nigricans       1.00      0.50      0.67         6
                                               acariasis       1.00      1.00      1.00         5
                                               achalasia       0.88      0.75      0.81        20
                                                    acne       0.75      0.80      0.77       103
                   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])



----- Summary Report -----
Overall Precision : 0.8689
Overall Recall    : 0.8611
Overall F1-score  : 0.8609
---------------------------

Randomized-tuned SVM model saved to svm_model_random_tuned.pkl


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
