<a href="https://colab.research.google.com/github/c4bath/cf860/blob/main/AptosSamplerTrain.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# APTOS 2019 Blindness Detection Dataset

Sampling (for compute and memory resource constraints):

* 10% of the 3,662 images from the initial APTOS train_images

* creates a .csv file, `train_small2.csv`, with the 'id_code' and 'diagnosis' columns for the corresponding sample set

* Original class balances preserved

A clinician has rated each image for the severity of diabetic retinopathy on a scale of 0 to 4:

0 - No DR

1 - Mild

2 - Moderate

3 - Severe

4 - Proliferative DR


Output folder `train_small2`: 366 image files (matching the 366 rows of the sampled .csv)



In [1]:
from google.colab import drive

# Attach Google Drive at the path the rest of the notebook reads from and writes to.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Import libraries
import os
import pandas as pd
import numpy as np
import shutil
from sklearn.model_selection import train_test_split


In [3]:
# Load the full label table (columns: id_code, diagnosis) from Drive.
labels_csv_path = '/content/drive/MyDrive/cfPublicData/aptos2/train.csv'
df = pd.read_csv(labels_csv_path)


In [4]:
# Print the column names, non-null counts and dtypes of the loaded label table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3662 entries, 0 to 3661
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id_code    3662 non-null   object
 1   diagnosis  3662 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 57.3+ KB


In [5]:
# Folder holding the full image set, and the folder that receives the sampled subset.
source_dir = '/content/drive/MyDrive/cfPublicData/aptos2/train_images/'
destination_dir = '/content/drive/MyDrive/cfPublicData/aptos2/train_small2/'
# exist_ok=True keeps this cell safe to re-run and removes the gap between
# checking for the folder and creating it.
os.makedirs(destination_dir, exist_ok=True)


In [6]:
# Fraction of each diagnosis class to keep in the reduced dataset.
SAMPLE_FRACTION = 0.1

# Number of rows per diagnosis grade in the full label table.
class_counts = df['diagnosis'].value_counts()
# Per-class sample size: the class count scaled by SAMPLE_FRACTION, rounded
# to a whole number so it can be used as an exact row count.
small_set_counts = (class_counts * SAMPLE_FRACTION).round().astype(int)

In [7]:
# Draw the per-class samples one diagnosis grade at a time, then combine them once.
sampled_subsets = []
for diagnosis in small_set_counts.index:
    # Rows of the label table that carry the current diagnosis grade
    subset_df = df[df['diagnosis'] == diagnosis]
    # An integer test_size requests exactly that many rows; only the "test"
    # part of the split is kept as this class's sample.
    # NOTE(review): stratify cannot rebalance anything on a single-class
    # subset. It is kept so the call stays exactly as it was; removing it may
    # change which rows are drawn, so verify before dropping it.
    _, small_subset_df = train_test_split(
        subset_df,
        test_size=small_set_counts[diagnosis],
        random_state=27,
        stratify=subset_df['diagnosis']
    )
    sampled_subsets.append(small_subset_df)

# One concat at the end avoids re-copying the growing frame on every iteration
# and avoids concatenating onto an empty DataFrame. With no classes at all,
# fall back to an empty frame that still has the label columns.
train_small_df = pd.concat(sampled_subsets) if sampled_subsets else df.iloc[0:0]

# Each class must contribute exactly the number of rows requested for it.
assert len(train_small_df) == small_set_counts.sum()

In [8]:
# Copy the .png file of every sampled id_code from the full image folder
# into the reduced-dataset folder.
for image_id in train_small_df['id_code']:
    image_name = image_id + '.png'
    src_path = os.path.join(source_dir, image_name)
    dst_path = os.path.join(destination_dir, image_name)
    shutil.copy(src_path, dst_path)


In [9]:
# Save the sampled label table to Drive; the DataFrame index is not written.
small_labels_csv_path = '/content/drive/MyDrive/cfPublicData/aptos2/train_small2.csv'
train_small_df.to_csv(small_labels_csv_path, index=False)

In [14]:
# Compare the class proportions of the full label table with those of the sample.
print("Class distribution in the original data:")
print(df['diagnosis'].value_counts(normalize=True))

print("\nClass distribution in the sampled data (10%):")
print(train_small_df['diagnosis'].value_counts(normalize=True))
# info() prints its report itself and returns None, so wrapping it in print()
# would add a stray "None" line after the summary.
train_small_df.info()

Class distribution in the original data:
0    0.492900
2    0.272802
1    0.101038
4    0.080557
3    0.052703
Name: diagnosis, dtype: float64

Class distribution in the sampled data (10%):
0    0.491803
2    0.273224
1    0.101093
4    0.081967
3    0.051913
Name: diagnosis, dtype: float64
<class 'pandas.core.frame.DataFrame'>
Int64Index: 366 entries, 2779 to 3531
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id_code    366 non-null    object
 1   diagnosis  366 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 8.6+ KB
None
