<a href="https://colab.research.google.com/github/acesur/Machine-Learning-/blob/main/03_model_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, precision_recall_curve
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

import kagglehub

# Download the latest version of the HAM10000 dataset. `path` is the local
# directory kagglehub resolved for it; that location differs between
# environments (Kaggle mount, Colab, local cache), so every dataset path
# below is derived from it instead of being hard-coded.
path = kagglehub.dataset_download("kmader/skin-cancer-mnist-ham10000")

print("Path to dataset files:", path)

# Mount Google Drive, which holds the pre-extracted ABCD feature CSV
from google.colab import drive
drive.mount('/content/drive/')

# Dataset locations inside the downloaded directory.
# (A copy stored on Drive can be used instead by pointing these three
# constants at that folder.)
IMAGES_PATH_PART1 = f"{path}/HAM10000_images_part_1"
IMAGES_PATH_PART2 = f"{path}/HAM10000_images_part_2"
METADATA_PATH = f"{path}/HAM10000_metadata.csv"

# Load the extracted features
feature_df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/dataset/abcd_features.csv')
print(f"Loaded feature data with shape: {feature_df.shape}")
print(f"Feature columns: {feature_df.columns.tolist()}")

# Load metadata
metadata = pd.read_csv(METADATA_PATH)
print(f"Loaded metadata with shape: {metadata.shape}")
print(f"Metadata columns: {metadata.columns.tolist()}")

# Merge features with metadata to get additional info (age, sex, localization).
# The inner join keeps only the rows that have extracted features.
df = feature_df.merge(metadata, on=['image_id', 'dx'], how='inner')
print(f"After merging, dataframe shape: {df.shape}")
print(f"Final dataframe columns: {df.columns.tolist()}")

# Check the class distribution
print("\nClass distribution:")
print(df['dx'].value_counts())

# Prepare features and target.
# Every column that is not an identifier, the label, or raw metadata is
# treated as a model feature.
non_feature_cols = ['image_id', 'dx', 'lesion_id', 'dx_type', 'age', 'sex', 'localization']
feature_cols = [col for col in df.columns if col not in non_feature_cols]

# A stratified split needs at least two samples of every class (one for each
# side of the split). Drop classes that cannot satisfy this instead of
# letting train_test_split fail with an opaque error.
MIN_SAMPLES_PER_CLASS = 2
class_counts = df['dx'].value_counts()
rare_classes = class_counts[class_counts < MIN_SAMPLES_PER_CLASS].index.tolist()
if rare_classes:
    print(f"\nDropping classes with fewer than {MIN_SAMPLES_PER_CLASS} samples: {rare_classes}")
model_df = df[~df['dx'].isin(rare_classes)]

# A classifier cannot be trained on a single class; fail early with an
# actionable message rather than deep inside SMOTE or SVC.
if model_df['dx'].nunique() < 2:
    raise ValueError(
        "Need at least 2 classes with "
        f"{MIN_SAMPLES_PER_CLASS}+ samples each to train a classifier, but the "
        f"class distribution is {class_counts.to_dict()}. "
        "Extract features for more images so that several classes are represented."
    )

# Handle missing values if any
X = model_df[feature_cols].fillna(0)
y = model_df['dx']

print(f"\nFeature columns used: {feature_cols}")

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(f"Training set size: {X_train.shape[0]}, Test set size: {X_test.shape[0]}")

# Scale features (fit on the training data only to avoid leaking test statistics)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Handle class imbalance with SMOTE (apply only to training data).
# SMOTE interpolates between a sample and its k nearest same-class
# neighbours, so k must be smaller than the smallest training class.
smallest_train_class = int(y_train.value_counts().min())
smote_neighbors = min(5, smallest_train_class - 1)
if smote_neighbors >= 1:
    smote = SMOTE(random_state=42, k_neighbors=smote_neighbors)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)
else:
    # A class with a single training sample has no neighbour to interpolate with.
    print("Skipping SMOTE: the smallest training class has only one sample.")
    X_train_resampled, y_train_resampled = X_train_scaled, y_train
print(f"After SMOTE, training set size: {X_train_resampled.shape[0]}")
print("Class distribution after SMOTE:")
print(pd.Series(y_train_resampled).value_counts())

# Base classifiers. probability=True lets the SVM expose predict_proba,
# which soft voting and the ROC analysis rely on.
svm = SVC(probability=True, random_state=42)
knn = KNeighborsClassifier()
rf = RandomForestClassifier(random_state=42)

# Fit each base classifier on the resampled training data, announcing each
# one before it starts.
for banner, classifier in (
    ("\nTraining SVM model...", svm),
    ("Training KNN model...", knn),
    ("Training Random Forest model...", rf),
):
    print(banner)
    classifier.fit(X_train_resampled, y_train_resampled)

# Soft-voting ensemble over the three base classifiers
print("Training Ensemble model...")
base_estimators = [('svm', svm), ('knn', knn), ('rf', rf)]
ensemble = VotingClassifier(estimators=base_estimators, voting='soft')
ensemble.fit(X_train_resampled, y_train_resampled)

# Evaluate models
def evaluate_model(model, X, y, model_name="Model"):
    """Print a classification report and plot the confusion matrix for a model.

    Args:
        model: Fitted estimator exposing ``predict``.
        X: Feature matrix passed to ``model.predict``.
        y: True labels corresponding to the rows of ``X``.
        model_name: Label used in the printed header and the plot title.
    """
    y_pred = model.predict(X)
    print(f"\n{model_name} Evaluation:")
    print(classification_report(y, y_pred))

    # Use one label list for both the matrix and the tick labels. The model
    # may predict a class that never occurs in ``y``; labelling the axes with
    # only the classes in ``y`` would then misalign ticks and matrix cells.
    labels = np.union1d(y, y_pred)

    # Confusion Matrix
    cm = confusion_matrix(y, y_pred, labels=labels)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=labels,
                yticklabels=labels)
    plt.title(f'Confusion Matrix - {model_name}')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.show()

print("\nEvaluating models on test set...")
# Report every fitted model on the held-out (scaled) test split
for fitted_model, display_name in (
    (svm, "SVM"),
    (knn, "KNN"),
    (rf, "Random Forest"),
    (ensemble, "Ensemble"),
):
    evaluate_model(fitted_model, X_test_scaled, y_test, display_name)

# The melanoma ROC analysis only makes sense when 'mel' is among the labels
if not (y == 'mel').any():
    print("\nNote: 'mel' class not found in the dataset. Cannot create melanoma-specific ROC curve.")
else:
    # One-vs-rest ROC curve: melanoma (positive) against every other class
    plt.figure(figsize=(10, 8))

    # Binary ground truth: 1 for melanoma, 0 otherwise
    y_test_binary = (y_test == 'mel').astype(int)

    roc_candidates = {
        'SVM': svm,
        'KNN': knn,
        'Random Forest': rf,
        'Ensemble': ensemble,
    }
    for name, model in roc_candidates.items():
        y_score = model.predict_proba(X_test_scaled)

        # Skip models that never saw the melanoma class during training
        known_classes = list(model.classes_)
        if 'mel' not in known_classes:
            continue

        # Probability column belonging to the melanoma class
        mel_scores = y_score[:, known_classes.index('mel')]

        # ROC curve and the area under it
        fpr, tpr, _ = roc_curve(y_test_binary, mel_scores)
        plt.plot(fpr, tpr, lw=2, label=f'{name} (AUC = {auc(fpr, tpr):.3f})')

    # Diagonal reference line, then axis limits, labels and legend
    plt.plot([0, 1], [0, 1], 'k--', lw=2)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve for Melanoma Detection')
    plt.legend(loc="lower right")
    plt.show()

# Plot the Random Forest's per-feature importances, largest first
rf_importances = getattr(rf, 'feature_importances_', None)
if rf_importances is not None:
    plt.figure(figsize=(12, 6))

    importance_table = pd.DataFrame({
        'Feature': feature_cols,
        'Importance': rf_importances
    })
    feature_importance = importance_table.sort_values('Importance', ascending=False)

    sns.barplot(data=feature_importance, x='Importance', y='Feature')
    plt.title('Feature Importance (Random Forest)')
    plt.tight_layout()
    plt.show()

Path to dataset files: /kaggle/input/skin-cancer-mnist-ham10000
Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
Loaded feature data with shape: (100, 8)
Feature columns: ['asymmetry', 'border_irregularity', 'color_variance_r', 'color_variance_g', 'color_variance_b', 'diameter', 'image_id', 'dx']
Loaded metadata with shape: (10015, 7)
Metadata columns: ['lesion_id', 'image_id', 'dx', 'dx_type', 'age', 'sex', 'localization']
After merging, dataframe shape: (100, 13)
Final dataframe columns: ['asymmetry', 'border_irregularity', 'color_variance_r', 'color_variance_g', 'color_variance_b', 'diameter', 'image_id', 'dx', 'lesion_id', 'dx_type', 'age', 'sex', 'localization']

Class distribution:
dx
bkl    99
nv      1
Name: count, dtype: int64

Feature columns used: ['asymmetry', 'border_irregularity', 'color_variance_r', 'color_variance_g', 'color_variance_b', 'diameter']


ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.