In [1]:
import pandas as pd
import numpy as np
import os

In [19]:
# general file directory 
!tree -L 1 ../data/raw/ham_1000_archive/

[1;36m../data/raw/ham_1000_archive/[0m
├── [1;36mHAM10000_images_part_1[0m
├── [1;36mHAM10000_images_part_2[0m
├── HAM10000_metadata.csv
├── hmnist_28_28_L.csv
├── hmnist_28_28_RGB.csv
├── hmnist_8_8_L.csv
└── hmnist_8_8_RGB.csv

3 directories, 5 files


In [None]:
# view ham1000 info files
!du -h ../data/raw/ham_1000_archive/*.csv

552K	../data/raw/ham_1000_archive/HAM10000_metadata.csv
 29M	../data/raw/ham_1000_archive/hmnist_28_28_L.csv
 88M	../data/raw/ham_1000_archive/hmnist_28_28_RGB.csv
2.4M	../data/raw/ham_1000_archive/hmnist_8_8_L.csv
7.2M	../data/raw/ham_1000_archive/hmnist_8_8_RGB.csv


In [2]:
df_meta = pd.read_csv("../data/raw/ham_1000_archive/HAM10000_metadata.csv")
df_meta.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear


In [None]:
df_28 = pd.read_csv("../data/raw/ham_1000_archive/hmnist_28_28_L.csv")
# columns = 28 * 28 + 1 (label)
df_28.head(2)

Unnamed: 0,pixel0000,pixel0001,pixel0002,pixel0003,pixel0004,pixel0005,pixel0006,pixel0007,pixel0008,pixel0009,...,pixel0775,pixel0776,pixel0777,pixel0778,pixel0779,pixel0780,pixel0781,pixel0782,pixel0783,label
0,169,171,170,177,181,182,181,185,194,192,...,184,186,185,180,157,140,140,159,165,2
1,19,57,105,140,149,148,144,155,170,170,...,172,175,160,144,114,89,47,18,18,2


In [20]:
df_8 = pd.read_csv("../data/raw/ham_1000_archive/hmnist_8_8_L.csv")
df_8.head(2)

Unnamed: 0,pixel0000,pixel0001,pixel0002,pixel0003,pixel0004,pixel0005,pixel0006,pixel0007,pixel0008,pixel0009,...,pixel0055,pixel0056,pixel0057,pixel0058,pixel0059,pixel0060,pixel0061,pixel0062,pixel0063,label
0,172,182,191,183,180,181,165,164,173,192,...,159,171,181,201,192,184,183,171,157,2
1,98,149,170,193,183,162,164,100,137,175,...,135,83,159,186,185,192,181,143,58,2


In [26]:
df_28_RGB = pd.read_csv("../data/raw/ham_1000_archive/hmnist_28_28_RGB.csv")
df_28_RGB.head(2)

Unnamed: 0,pixel0000,pixel0001,pixel0002,pixel0003,pixel0004,pixel0005,pixel0006,pixel0007,pixel0008,pixel0009,...,pixel2343,pixel2344,pixel2345,pixel2346,pixel2347,pixel2348,pixel2349,pixel2350,pixel2351,label
0,192,153,193,195,155,192,197,154,185,202,...,173,124,138,183,147,166,185,154,177,2
1,25,14,30,68,48,75,123,93,126,158,...,60,39,55,25,14,28,25,14,27,2


In [27]:
from glob import glob

def create_base_df(base_dir='../data/raw/ham_1000_archive/'):
    
    # Load the metadata file

    df = pd.read_csv(os.path.join(base_dir, 'HAM10000_metadata.csv'))

    # map lesion types to their full names
    lesion_type_dict = {
        'nv': 'Melanocytic nevi',
        'mel': 'Melanoma',
        'bkl': 'Benign keratosis-like lesions ',
        'bcc': 'Basal cell carcinoma',
        'akiec': 'Actinic keratoses',
        'vasc': 'Vascular lesions',
        'df': 'Dermatofibroma'
    }

    # Merge images from both folders into one dictionary
    imageid_path_dict = {os.path.splitext(os.path.basename(x))[0]: x
                        for x in glob(os.path.join(base_dir, '*', '*.jpg'))}


    df['path'] = df['image_id'].map(imageid_path_dict.get)
    df['cell_type'] = df['dx'].map(lesion_type_dict.get) 
    df['cell_type_idx'] = pd.Categorical(df['cell_type']).codes
    df['target'] = np.where(df['cell_type'] == 'Melanoma', 1, 0)
    return df

df = create_base_df()

In [None]:
df["ta"]

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,path,cell_type,cell_type_idx,target
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,../data/raw/ham_1000_archive/HAM10000_images_p...,Benign keratosis-like lesions,2,0
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,../data/raw/ham_1000_archive/HAM10000_images_p...,Benign keratosis-like lesions,2,0
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,../data/raw/ham_1000_archive/HAM10000_images_p...,Benign keratosis-like lesions,2,0
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,../data/raw/ham_1000_archive/HAM10000_images_p...,Benign keratosis-like lesions,2,0
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,../data/raw/ham_1000_archive/HAM10000_images_p...,Benign keratosis-like lesions,2,0
...,...,...,...,...,...,...,...,...,...,...,...
10010,HAM_0002867,ISIC_0033084,akiec,histo,40.0,male,abdomen,../data/raw/ham_1000_archive/HAM10000_images_p...,Actinic keratoses,0,0
10011,HAM_0002867,ISIC_0033550,akiec,histo,40.0,male,abdomen,../data/raw/ham_1000_archive/HAM10000_images_p...,Actinic keratoses,0,0
10012,HAM_0002867,ISIC_0033536,akiec,histo,40.0,male,abdomen,../data/raw/ham_1000_archive/HAM10000_images_p...,Actinic keratoses,0,0
10013,HAM_0000239,ISIC_0032854,akiec,histo,80.0,male,face,../data/raw/ham_1000_archive/HAM10000_images_p...,Actinic keratoses,0,0


In [25]:
df_larger = pd.merge(df_meta, df_28, left_on='image_id', right_on='id')
df_larger.head()

KeyError: 'id'