In [2]:
import pandas as pd
import numpy as np
import os 
import random


In [3]:
DATA_DIR = "/apps/local/shared/HC701/assessment/assignment_2/task_2/TBX11K"
USED_CLASSES = ["tb", "health"]
# PLEASE NOTE THAT CLASS LABEL 0 WILL BE FOR "TB" AND CLASS LABEL 1 WILL BE FOR "HEALTHY"

In [4]:
sorted_imgs_dict = {}

for i in USED_CLASSES:
    print(f'{i}: {len(os.listdir(os.path.join(DATA_DIR, "imgs", i)))}')
    sorted_imgs_dict[i] = sorted(os.listdir(os.path.join(DATA_DIR, "imgs", i)))

tb: 800
health: 3800


- Split data for training and testing. You need to use the first (sorted in an ascending order by ID)
20% of images per class for testing and the remaining 80% for training. 

In [5]:
imgs_list = {} 

for j in USED_CLASSES:
    # test_samples = random.sample(sorted_imgs_dict[j], k =int(0.2*len(sorted_imgs_dict[j])))
    test_samples = sorted_imgs_dict[j][:int(0.2*len(sorted_imgs_dict[j]))]
    train_samples = sorted_imgs_dict[j][int(0.2*len(sorted_imgs_dict[j])):]

    # with full path
    train_samples = [os.path.join(DATA_DIR, "imgs", j, i) for i in train_samples]
    test_samples = [os.path.join(DATA_DIR, "imgs", j, i) for i in test_samples]
    imgs_list[j] = [test_samples, train_samples] # e.g., tb : [[test_imgs], [train_imgs]] 



In [6]:
for i in USED_CLASSES:
    print(f'{i} ==> Train: {len(imgs_list[i][0])}, Test: {len(imgs_list[i][1])}')

tb ==> Train: 160, Test: 640
health ==> Train: 760, Test: 3040


### Just some sanity checks

In [7]:
from os.path import exists

for x in range(2):
    for i in imgs_list['tb'][x]:
        if exists(os.path.join(DATA_DIR,"imgs", "tb",  i)):
            pass
        else: 
            print(os.path.join(DATA_DIR, "tb", "imgs", i))

## Creating Dataframes

In [8]:
complete_df_dict = {} # test: {img:[img_1_url, img_2_url, ...], label:[img_1_label, img_1_label, ...]}

for i, mydataset in enumerate(['test', 'train']):
    temp_imgs_array = []
    temp_labels_array = []
    for j, cls in enumerate(USED_CLASSES):
        temp_imgs_array = temp_imgs_array + imgs_list[cls][i]
        temp_labels_array = temp_labels_array + [j for _ in imgs_list[cls][i]]
    complete_df_dict[mydataset] = {'img':temp_imgs_array, "label":temp_labels_array}        

In [9]:
# more sanity checks
from os.path import exists
for i in complete_df_dict['train']['img']:
    if exists(os.path.join(DATA_DIR, "imgs", "tb", i)):
        pass
    else: 
        print(i)

In [10]:
train_df = pd.DataFrame.from_dict(complete_df_dict["train"])
test_df = pd.DataFrame.from_dict(complete_df_dict["test"])

In [11]:
train_df.head()

Unnamed: 0,img,label
0,/apps/local/shared/HC701/assessment/assignment...,0
1,/apps/local/shared/HC701/assessment/assignment...,0
2,/apps/local/shared/HC701/assessment/assignment...,0
3,/apps/local/shared/HC701/assessment/assignment...,0
4,/apps/local/shared/HC701/assessment/assignment...,0


In [12]:
# saving in the cwd
train_df.to_csv("./csv/train.csv")
test_df.to_csv("./csv/test.csv")

### Report the number and the range of filenames for each class in training and testing sets

In [13]:
for label, cls_name in enumerate(USED_CLASSES):
    train_imgs = train_df[train_df['label'] == label]['img'].to_list()
    test_imgs = test_df[test_df['label'] == label]['img'].to_list()
    print(f"Class: {cls_name}")
    print(" ")
    print(f"Number of images used for training: {len(train_imgs)}")
    print(f"Number of images used for testing: {len(test_imgs)}")
    print(" ")
    # print first 5 images and last 5 images
    print(f"Images range: ")
    print(f"First 5 training images: {[i[-10:] for i in train_imgs[:5]]}")
    print(f"Last 5 training images: {[i[-10:] for i in train_imgs[-5:]]}")
    print("")
    print(f"First 5 testing images: {[i[-10:] for i in test_imgs[:5]]}")
    print(f"Last 5 testing images: {[i[-10:] for i in test_imgs[-5:]]}")

    print("")
    print("==============================================================")
    print("")

Class: tb
 
Number of images used for training: 640
Number of images used for testing: 160
 
Images range: 
First 5 training images: ['tb0250.png', 'tb0251.png', 'tb0252.png', 'tb0253.png', 'tb0255.png']
Last 5 training images: ['tb1192.png', 'tb1194.png', 'tb1196.png', 'tb1197.png', 'tb1199.png']

First 5 testing images: ['tb0003.png', 'tb0004.png', 'tb0005.png', 'tb0006.png', 'tb0007.png']
Last 5 testing images: ['tb0241.png', 'tb0242.png', 'tb0244.png', 'tb0246.png', 'tb0248.png']


Class: health
 
Number of images used for training: 3040
Number of images used for testing: 760
 
Images range: 
First 5 training images: ['/h0995.png', '/h0996.png', '/h0997.png', '/h0998.png', '/h0999.png']
Last 5 training images: ['/h4995.png', '/h4996.png', '/h4998.png', '/h4999.png', '/h5000.png']

First 5 testing images: ['/h0001.png', '/h0003.png', '/h0004.png', '/h0005.png', '/h0006.png']
Last 5 testing images: ['/h0989.png', '/h0990.png', '/h0991.png', '/h0992.png', '/h0993.png']




In [14]:
data_stats = {}
data_stats["class_name"] = []
data_stats["dataset"] = []
data_stats["image_count"] = []
data_stats["first_img"] = []
data_stats["last_img"] = []

for label, cls_name in enumerate(USED_CLASSES):
    train_imgs = train_df[train_df['label'] == label]['img'].to_list()
    test_imgs = test_df[test_df['label'] == label]['img'].to_list()

    # TEST SET ================================

    data_stats["class_name"] = data_stats["class_name"]+[cls_name]
    data_stats["dataset"] = data_stats["dataset"]+["test"]
    data_stats["image_count"] = data_stats["image_count"]+[len(test_imgs)]

    data_stats["first_img"] = data_stats["first_img"]+[test_imgs[0][-10:]]
    data_stats["last_img"] = data_stats["last_img"]+[test_imgs[-1][-10:]]

    # TRAIN SET ================================

    data_stats["class_name"] = data_stats["class_name"]+[cls_name]
    data_stats["dataset"] = data_stats["dataset"]+["train"]
    data_stats["image_count"] = data_stats["image_count"]+[len(train_imgs)]

    data_stats["first_img"] = data_stats["first_img"]+[train_imgs[0][-10:]]
    data_stats["last_img"] = data_stats["last_img"]+[train_imgs[-1][-10:]]



data_stats


{'class_name': ['tb', 'tb', 'health', 'health'],
 'dataset': ['test', 'train', 'test', 'train'],
 'image_count': [160, 640, 760, 3040],
 'first_img': ['tb0003.png', 'tb0250.png', '/h0001.png', '/h0995.png'],
 'last_img': ['tb0248.png', 'tb1199.png', '/h0993.png', '/h5000.png']}

In [15]:
data_stats_df = pd.DataFrame.from_dict(data_stats).set_index(["class_name", "dataset"])
# print(data.to_latex())
data_stats_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,image_count,first_img,last_img
class_name,dataset,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
tb,test,160,tb0003.png,tb0248.png
tb,train,640,tb0250.png,tb1199.png
health,test,760,/h0001.png,/h0993.png
health,train,3040,/h0995.png,/h5000.png


In [16]:
print(data_stats_df.to_latex())

\begin{tabular}{llrll}
\toprule
       &       &  image\_count &   first\_img &    last\_img \\
class\_name & dataset &              &             &             \\
\midrule
tb & test &          160 &  tb0003.png &  tb0248.png \\
       & train &          640 &  tb0250.png &  tb1199.png \\
health & test &          760 &  /h0001.png &  /h0993.png \\
       & train &         3040 &  /h0995.png &  /h5000.png \\
\bottomrule
\end{tabular}



  print(data_stats_df.to_latex())
