<a href="https://colab.research.google.com/github/alfiannnas/tbc-detection-app/blob/main/TBC_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TBC Detection using CNN

## Import Libraries

In [1]:
import shutil
import kagglehub
import tensorflow as tf
import pandas as pd
import numpy as np
import os
import random

## Import Dataset

In [2]:
# Fetch the most recent release of the dataset through KaggleHub
KAGGLE_DATASET = "tawsifurrahman/tuberculosis-tb-chest-xray-dataset"
path = kagglehub.dataset_download(KAGGLE_DATASET)

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/tawsifurrahman/tuberculosis-tb-chest-xray-dataset?dataset_version_number=3...


100%|██████████| 663M/663M [00:08<00:00, 77.4MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/tawsifurrahman/tuberculosis-tb-chest-xray-dataset/versions/3


In [3]:
# Resolve the dataset folder through KaggleHub instead of hard-coding the cache
# location: the cached folder name contains the dataset version number, so a
# literal ".../versions/3" path stops matching once a newer version is published.
# The download cell has already fetched the files, so this call is expected to be
# served from KaggleHub's local cache.
src_path = kagglehub.dataset_download("tawsifurrahman/tuberculosis-tb-chest-xray-dataset")

# Destination Path
dst_path = "/content/dataset"

# Copy the dataset out of the KaggleHub cache.
# dirs_exist_ok=True lets this cell be re-run when the destination already exists.
shutil.copytree(src_path, dst_path, dirs_exist_ok=True)

print("Dataset copied to:", dst_path)


Dataset copied to: /content/dataset


## Load Dataset

### Check Dataset

In [10]:
# Count how many files each class folder of the dataset contains
path = '/content/dataset/TB_Chest_Radiography_Database'
counter = 0  # not used in this cell
dir_names=['Normal', 'Tuberculosis']

# The loop names deliberately avoid `dir`: assigning to it would shadow
# Python's builtin dir() for every cell that runs afterwards.
for class_name in dir_names:
    file_names = os.listdir(f"{path}/{class_name}")

    print(f"Data for {class_name} is {len(file_names)}")

Data for Normal is 3500
Data for Tuberculosis is 700


### Load Metadata

In [4]:
# Read the per-class metadata spreadsheets (Normal first, then Tuberculosis)
df_norm, df_tb = (
    pd.read_excel(metadata_file)
    for metadata_file in (
        '/content/dataset/TB_Chest_Radiography_Database/Normal.metadata.xlsx',
        '/content/dataset/TB_Chest_Radiography_Database/Tuberculosis.metadata.xlsx',
    )
)

In [5]:
# Preview the Normal metadata frame (shown as the cell's rich output)
df_norm

Unnamed: 0,FILE NAME,FORMAT,SIZE,URL
0,Normal-1,PNG,512*512,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...
1,Normal-2,PNG,512*512,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...
2,Normal-3,PNG,512*512,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...
3,Normal-4,PNG,512*512,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...
4,Normal-5,PNG,512*512,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...
...,...,...,...,...
3495,Normal-3496,PNG,512*512,https://www.kaggle.com/c/rsna-pneumonia-detect...
3496,Normal-3497,PNG,512*512,https://www.kaggle.com/c/rsna-pneumonia-detect...
3497,Normal-3498,PNG,512*512,https://www.kaggle.com/c/rsna-pneumonia-detect...
3498,Normal-3499,PNG,512*512,https://www.kaggle.com/c/rsna-pneumonia-detect...


In [6]:
# Preview the Tuberculosis metadata frame (shown as the cell's rich output)
df_tb

Unnamed: 0,FILE NAME,FORMAT,SIZE,URL
0,Tuberculosis-1,PNG,512*512,http://tuberculosis.by/
1,Tuberculosis-2,PNG,512*512,http://tuberculosis.by/
2,Tuberculosis-3,PNG,512*512,http://tuberculosis.by/
3,Tuberculosis-4,PNG,512*512,http://tuberculosis.by/
4,Tuberculosis-5,PNG,512*512,http://tuberculosis.by/
...,...,...,...,...
695,Tuberculosis-696,PNG,512*512,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...
696,Tuberculosis-697,PNG,512*512,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...
697,Tuberculosis-698,PNG,512*512,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...
698,Tuberculosis-699,PNG,512*512,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...


In [7]:
# Label every metadata row in a 'Tuberculosis' column (0.0 for the Normal frame,
# 1.0 for the Tuberculosis frame) and remove the columns that are not used afterwards.
#
# New frames are built with assign/drop instead of dropping in place, so the cell
# can be re-run safely: errors='ignore' skips columns that an earlier run of this
# cell has already removed (an in-place drop would raise KeyError the second time).
unused_columns = ['FORMAT', 'SIZE', 'URL']

df_norm = df_norm.assign(Tuberculosis=float(0)).drop(columns=unused_columns, errors='ignore')
df_tb = df_tb.assign(Tuberculosis=float(1)).drop(columns=unused_columns, errors='ignore')


In [8]:
# Merge the Normal and Tuberculosis frames into one DataFrame and shuffle the rows.
# random_state pins the shuffle so the row order is the same on every run
# (2 is the same seed value split_data uses for the file split).
df = (
    pd.concat([df_norm, df_tb])
    .sample(frac=1, random_state=2)
    .reset_index(drop=True)
)

# Set Label
label = ['Tuberculosis']

In [9]:
# Inspect the merged and shuffled DataFrame
df

Unnamed: 0,FILE NAME,Tuberculosis
0,Normal-3114,0.0
1,Tuberculosis-214,1.0
2,Tuberculosis-236,1.0
3,Normal-2151,0.0
4,Normal-2417,0.0
...,...,...
4195,Normal-2948,0.0
4196,Normal-2509,0.0
4197,Normal-293,0.0
4198,Normal-1624,0.0


## Split Data

In [11]:
# Create the training, validation, and testing folders

root_path = './'
train_dir = os.path.join(root_path, 'training')
val_dir = os.path.join(root_path, 'validation')
test_dir = os.path.join(root_path, 'testing')

# Create one sub-folder per class inside every split folder.
# The inner loop variable is intentionally not called `label`: that name already
# holds the label-column list (label = ['Tuberculosis']) and a loop variable with
# the same name would silently replace it with the string 'Tuberculosis'.

for split in ['training', 'validation', 'testing']:
    for class_name in ['Normal', 'Tuberculosis']:
        os.makedirs(os.path.join(root_path, split, class_name), exist_ok=True)

os.makedirs(os.path.join(root_path, 'Images'), exist_ok=True)

In [12]:
# Locations of the data folders created in the previous cell

IMAGE_DIR = os.path.join('./', 'Images')

TRAINING_DIR, VAL_DIR, TESTING_DIR = "./training", "./validation", "./testing"

# Per-class sub-folders, listed in training / validation / testing order
TRAINING_NORM_DIR, VAL_NORM_DIR, TESTING_NORM_DIR = (
    os.path.join(split_root, "Normal/")
    for split_root in (TRAINING_DIR, VAL_DIR, TESTING_DIR)
)

TRAINING_TB_DIR, VAL_TB_DIR, TESTING_TB_DIR = (
    os.path.join(split_root, "Tuberculosis/")
    for split_root in (TRAINING_DIR, VAL_DIR, TESTING_DIR)
)

In [13]:
# Source folders that hold the original images of each class.
# `path` is the dataset root assigned in the "Check Dataset" cell.

PATH_NORM, PATH_TB = (
    os.path.join(path, class_dir) for class_dir in ('Normal', 'Tuberculosis')
)

In [14]:
# Function to split data: the files of one class are copied from the source folder
# into the training, validation, and testing folders.

def split_data(SOURCE, TRAINING, VAL, TESTING, SPLIT_SIZE_TRAIN = 0.8, SPLIT_SIZE_VAL = 0.1):
    """Copy the files in SOURCE into TRAINING, VAL and TESTING folders.

    Zero-length files are reported and skipped; sub-directories are ignored.
    The usable files are sorted by name before sampling so that, together with
    the fixed seed, the same files land in the same split on every machine
    (os.listdir returns entries in arbitrary order).

    Args:
        SOURCE: Folder containing the files of one class.
        TRAINING: Destination folder for the training share.
        VAL: Destination folder for the validation share.
        TESTING: Destination folder for whatever is left after the other two.
        SPLIT_SIZE_TRAIN: Fraction of the usable files copied to TRAINING.
        SPLIT_SIZE_VAL: Fraction of the usable files copied to VAL.

    Raises:
        ValueError: If a fraction is negative or the two fractions exceed 1.

    Note:
        Seeds Python's global `random` generator with 2 on every call.
        Existing files in the destination folders are not removed.
    """
    # Small tolerance so fraction pairs that sum to 1 are not rejected by float rounding.
    if SPLIT_SIZE_TRAIN < 0 or SPLIT_SIZE_VAL < 0 or SPLIT_SIZE_TRAIN + SPLIT_SIZE_VAL > 1 + 1e-9:
        raise ValueError(
            "SPLIT_SIZE_TRAIN and SPLIT_SIZE_VAL must be non-negative and sum to at most 1"
        )

    use_dir = []
    for fn in sorted(os.listdir(SOURCE)):
        src_file = os.path.join(SOURCE, fn)
        if not os.path.isfile(src_file):
            # shutil.copy cannot copy directories, so only regular files take part
            continue
        if os.path.getsize(src_file) == 0:
            print(fn,'Skipping, Ignore Zero Length!')
            continue
        use_dir.append(fn)

    # Both counts are truncated fractions of the number of usable files.
    n_train = int(SPLIT_SIZE_TRAIN * len(use_dir))
    n_val = int(SPLIT_SIZE_VAL * len(use_dir))

    random.seed(2)
    train_dt = random.sample(use_dir, n_train)
    # Sets give constant-time membership checks while the lists keep the order.
    train_names = set(train_dt)
    remaining = [fn for fn in use_dir if fn not in train_names]
    val_dt = random.sample(remaining, n_val)
    val_names = set(val_dt)
    test_dt = [fn for fn in remaining if fn not in val_names]

    for destination, file_names in ((TRAINING, train_dt), (VAL, val_dt), (TESTING, test_dt)):
        # Make sure the target exists so shutil.copy places files inside it
        os.makedirs(destination, exist_ok=True)
        for fn in file_names:
            shutil.copy(os.path.join(SOURCE, fn), destination)


# Run the split once per class, each with its own source and destination folders
for _split_args in (
    (PATH_NORM, TRAINING_NORM_DIR, VAL_NORM_DIR, TESTING_NORM_DIR),
    (PATH_TB, TRAINING_TB_DIR, VAL_TB_DIR, TESTING_TB_DIR),
):
    split_data(*_split_args)

In [15]:
# Helper that reports how many entries a directory holds
def count_files(path):
    """Return the number of entries found directly inside `path`."""
    with os.scandir(path) as entries:
        return sum(1 for _ in entries)

# Report the number of images per split, one section per class
_split_summary = (
    ("Normal Data Split:", TRAINING_NORM_DIR, VAL_NORM_DIR, TESTING_NORM_DIR),
    ("Tuberculosis Data Split:", TRAINING_TB_DIR, VAL_TB_DIR, TESTING_TB_DIR),
)

for _position, (_heading, _train_path, _val_path, _test_path) in enumerate(_split_summary):
    if _position:
        print()  # blank line separating the class sections
    print(_heading)
    print(f"Training: {count_files(_train_path)} images")
    print(f"Validation: {count_files(_val_path)} images")
    print(f"Testing: {count_files(_test_path)} images")


Normal Data Split:
Training: 2800 images
Validation: 350 images
Testing: 350 images

Tuberculosis Data Split:
Training: 560 images
Validation: 70 images
Testing: 70 images
