In [2]:
# Load the ISIC 2020 training tables (ground truth and metadata) with pandas.
import os

import pandas as pd

# Dataset root. Defaults to the original H: drive location; set the
# ISIC_DATA_DIR environment variable to run the notebook against a copy of the
# data stored elsewhere. Trailing separators are stripped so the joined paths
# below never contain a doubled slash.
DATA_DIR = os.environ.get(
    'ISIC_DATA_DIR',
    'H:/My Drive/A Multimodal Generative AI System for Skin Lesion Diagnosis and Explanation/data',
).rstrip('/\\')

# Both CSVs live directly under the dataset root.
ground_truth_path = f'{DATA_DIR}/ISIC_2020_Training_GroundTruth.csv'
metadata_path = f'{DATA_DIR}/ISIC_2020_Training_Metadata_v2.csv'

# pd.read_csv raises FileNotFoundError here if the drive is not available.
ground_truth_df = pd.read_csv(ground_truth_path)
metadata_df = pd.read_csv(metadata_path)

# `ground_truth_df` and `metadata_df` are used by the cells below.

In [3]:
# Preview the first rows of each table, ground truth first, then metadata.
for frame in (ground_truth_df, metadata_df):
    display(frame.head())

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target
0,ISIC_2637011,IP_7279968,male,45.0,head/neck,unknown,benign,0
1,ISIC_0015719,IP_3075186,female,45.0,upper extremity,unknown,benign,0
2,ISIC_0052212,IP_2842074,female,50.0,lower extremity,nevus,benign,0
3,ISIC_0068279,IP_6890425,female,45.0,head/neck,unknown,benign,0
4,ISIC_0074268,IP_8723313,female,55.0,upper extremity,unknown,benign,0


Unnamed: 0,image_name,patient_id,lesion_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target
0,ISIC_2637011,IP_7279968,IL_7972535,male,45.0,head/neck,unknown,benign,0
1,ISIC_0015719,IP_3075186,IL_4649854,female,45.0,upper extremity,unknown,benign,0
2,ISIC_0052212,IP_2842074,IL_9087444,female,50.0,lower extremity,nevus,benign,0
3,ISIC_0068279,IP_6890425,IL_4255399,female,45.0,head/neck,unknown,benign,0
4,ISIC_0074268,IP_8723313,IL_6898037,female,55.0,upper extremity,unknown,benign,0


In [7]:
%pip install pillow

import os
from PIL import Image
import numpy as np

# Directory containing the images
image_dir = 'H:/My Drive/A Multimodal Generative AI System for Skin Lesion Diagnosis and Explanation/data/images/train'

# List all files in the directory
image_files = [os.path.join(image_dir, f) for f in os.listdir(image_dir) if f.endswith('.jpg')]

print(f"Number of image files found: {len(image_files)}")

# Attempt to open the first image to confirm access
if image_files:
    first_image_file = image_files[0]
    try:
        img = Image.open(first_image_file)
        print(f"\nSuccessfully opened the first image: {first_image_file}")
        # You can optionally display the image using matplotlib
        # import matplotlib.pyplot as plt
        # plt.imshow(img)
        # plt.axis('off')
        # plt.show()
    except Exception as e:
        print(f"\nError opening the first image {first_image_file}: {e}")
else:
    print("\nNo image files found.")

Defaulting to user installation because normal site-packages is not writeable
Collecting pillow
  Downloading pillow-11.3.0-cp313-cp313-win_amd64.whl.metadata (9.2 kB)
Downloading pillow-11.3.0-cp313-cp313-win_amd64.whl (7.0 MB)
   ---------------------------------------- 0.0/7.0 MB ? eta -:--:--
   - -------------------------------------- 0.3/7.0 MB ? eta -:--:--
   --- ------------------------------------ 0.5/7.0 MB 1.4 MB/s eta 0:00:05
   ---- ----------------------------------- 0.8/7.0 MB 1.3 MB/s eta 0:00:05
   ------ --------------------------------- 1.0/7.0 MB 1.3 MB/s eta 0:00:05
   ------- -------------------------------- 1.3/7.0 MB 1.3 MB/s eta 0:00:05
   --------- ------------------------------ 1.6/7.0 MB 1.3 MB/s eta 0:00:05
   ---------- ----------------------------- 1.8/7.0 MB 1.3 MB/s eta 0:00:05
   ------------ --------------------------- 2.1/7.0 MB 1.3 MB/s eta 0:00:04
   ------------- -------------------------- 2.4/7.0 MB 1.3 MB/s eta 0:00:04
   --------------- ------

In [None]:
import seaborn as sns

import matplotlib.pyplot as plt

# Exploratory plots over `ground_truth_df` (loaded in an earlier cell).
# Every figure is drawn on an explicit Axes (`fig, ax = plt.subplots(...)`)
# rather than through pyplot's implicit "current figure", so each section
# stays correct if it is moved, re-run on its own, or combined into a grid.

# Set up the plotting style. `sns.set` is only a legacy alias of `set_theme`.
sns.set_theme(style="whitegrid")

# 1. Distribution of target (benign vs malignant)
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='target', data=ground_truth_df, ax=ax)
ax.set(
    title='Distribution of Benign vs Malignant Lesions',
    xlabel='Target (0: Benign, 1: Malignant)',
    ylabel='Count',
)
plt.show()

# 2. Age distribution (rows with a missing age are excluded)
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(ground_truth_df['age_approx'].dropna(), bins=30, kde=True, ax=ax)
ax.set(
    title='Age Distribution of Patients',
    xlabel='Approximate Age',
    ylabel='Frequency',
)
plt.show()

# 3. Sex distribution
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='sex', data=ground_truth_df, ax=ax)
ax.set(title='Sex Distribution', xlabel='Sex', ylabel='Count')
plt.show()

# 4. Anatomical site distribution, most frequent site first
site_order = ground_truth_df['anatom_site_general_challenge'].value_counts().index
fig, ax = plt.subplots(figsize=(10, 4))
sns.countplot(
    y='anatom_site_general_challenge',
    data=ground_truth_df,
    order=site_order,
    ax=ax,
)
ax.set(
    title='Distribution of Anatomical Sites',
    xlabel='Count',
    ylabel='Anatomical Site',
)
plt.show()

# 5. Diagnosis distribution (top 10)
top_diagnoses = ground_truth_df['diagnosis'].value_counts().nlargest(10)
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=top_diagnoses.index, y=top_diagnoses.values, ax=ax)
ax.set(title='Top 10 Diagnoses', xlabel='Diagnosis', ylabel='Count')
ax.tick_params(axis='x', labelrotation=45)  # long diagnosis labels overlap otherwise
plt.show()

# 6. Correlation heatmap for numeric columns
# 'number' selects every numeric dtype, not only float64/int64.
# NOTE(review): if the frame carries a saved index column such as
# 'Unnamed: 0', it will appear here as a feature - confirm against
# ground_truth_df.columns.
numeric_cols = ground_truth_df.select_dtypes(include='number')
fig, ax = plt.subplots(figsize=(6, 4))
# Pin the colour scale to the full correlation range so the diverging
# 'coolwarm' map is centred on 0; otherwise the scale follows the data's own
# min/max and the colours cannot be read as sign/strength.
sns.heatmap(
    numeric_cols.corr(),
    annot=True,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    ax=ax,
)
ax.set_title('Correlation Heatmap')
plt.show()