In [None]:
pip install medmnist torch torchvision matplotlib seaborn scikit-learn umap-learn

In [None]:
import numpy as np
import pandas as pd

# Load the npz file directly. np.load returns an NpzFile that keeps the archive
# open, so read it inside a `with` block to release the file handle as soon as
# the arrays have been copied out by np.concatenate.
with np.load('pathmnist.npz') as data:
    # Combine all splits (train / val / test) into a single image array
    all_images = np.concatenate([
        data['train_images'],
        data['val_images'],
        data['test_images']
    ], axis=0)

    # Same for the labels, collapsed to a 1D vector
    all_labels = np.concatenate([
        data['train_labels'],
        data['val_labels'],
        data['test_labels']
    ], axis=0).flatten()

# Every image must have exactly one label; fail early with a clear message
# instead of a misaligned or failing column insert below.
if all_images.shape[0] != all_labels.shape[0]:
    raise ValueError(
        f"Image/label count mismatch: {all_images.shape[0]} images "
        f"vs {all_labels.shape[0]} labels"
    )

# Flatten each image (e.g. (28, 28, 3)) → 1D vector, one row per image
flat_images = all_images.reshape((all_images.shape[0], -1))

# Create DataFrame: first column is label, rest are pixels
df_full = pd.DataFrame(flat_images)
df_full.insert(0, "label", all_labels)

# Preview
print(df_full.head())

# Optional: Save to CSV
# df_full.to_csv("pathmnist_full_dataset.csv", index=False)

In [None]:
# Step 1: Define the mapping from label numbers to medical class names
label_map = {
    0: 'ADI',
    1: 'BACK',
    2: 'DEB',
    3: 'LYM',
    4: 'MUC',
    5: 'MUS',
    6: 'NORM',
    7: 'STR',
    8: 'TUM'
}

# Step 2: Apply stratified sampling (same number of rows from every class).
# Each class is sampled on its own and the pieces are concatenated in sorted
# label order. This avoids DataFrameGroupBy.apply operating on the grouping
# column (deprecated in pandas 2.2), which would otherwise end up discarding
# 'label' after reset_index(drop=True). Every class uses the same seed, so the
# selection is reproducible.
SAMPLES_PER_CLASS = 4500
SAMPLING_SEED = 42

df_sampled = pd.concat(
    [
        class_rows.sample(n=SAMPLES_PER_CLASS, random_state=SAMPLING_SEED)
        for _, class_rows in df_full.groupby('label')
    ],
    ignore_index=True,
)

# Step 3: Replace numeric labels with string labels using the mapping
# (a label missing from label_map becomes NaN)
df_sampled['label'] = df_sampled['label'].map(label_map)

# Save to CSV
df_sampled.to_csv("pathmnist_stratified_4500_per_class_labeled.csv", index=False)

# Confirm shape
print(df_sampled.shape)

In [None]:
# Step 4: Count NaN cells, drop every row containing at least one, count again
missing_before = df_sampled.isna().sum().sum()
df_sampled = df_sampled.dropna(axis=0, how='any')
missing_after = df_sampled.isna().sum().sum()

# Step 5: Count rows that repeat an earlier row, keep only first occurrences,
# then count again
duplicates_before = df_sampled.duplicated(keep='first').sum()
df_sampled = df_sampled.drop_duplicates(keep='first')
duplicates_after = df_sampled.duplicated(keep='first').sum()

# Write the cleaned dataset to its own CSV
df_sampled.to_csv("pathmnist_balanced_labeled_cleaned.csv", index=False)

# Summary of what the cleaning removed and the resulting frame size
summary_lines = [
    f"Missing values removed: {missing_before - missing_after}",
    f"Duplicate rows removed: {duplicates_before - duplicates_after}",
    f"Final shape: {df_sampled.shape}",
]
print("\n".join(summary_lines))



In [None]:
# Step 6: One-hot encode the label column and keep the original label.
# Fail fast if the column is missing: continuing would either hit an undefined
# df_encoded or silently save a stale one left over from an earlier run.
if 'label' not in df_sampled.columns:
    raise KeyError("'label' column not found in df_sampled.")

df_encoded = pd.get_dummies(df_sampled, columns=['label'])
df_encoded['label'] = df_sampled['label']  # reattach the original label

# Save the final one-hot encoded dataset. The path is defined once so the
# confirmation message always names the file that was actually written.
output_path = "pathmnist_preprocessed_final.csv"
df_encoded.to_csv(output_path, index=False)

# Confirmation message
print(f"\n✅ Final one-hot encoded dataset saved as '{output_path}'.")
print(f"Final DataFrame shape: {df_encoded.shape}")