Referenced: https://www.kaggle.com/datasets/masoudnickparvar/brain-tumor-mri-dataset/discussion/482896

In [2]:
import hashlib
import os
import pandas as pd

In [4]:
def compute_hash(file):
    hasher = hashlib.md5()
    with open(file, 'rb') as f:
        buf = f.read()
        hasher.update(buf)
    return hasher.hexdigest()

LABELS = ['glioma', 'meningioma', 'notumor', 'pituitary']
def list_files(hash_dict):
    for data_type in ['Training', 'Testing']:
        for label in LABELS:
            folder_path = os.path.join(data_type, label)
            for root, dirs, files in os.walk(folder_path):
                for file in files:
                    if file.endswith(".jpg"):
                        file_path = os.path.join(root, file)
                        file_hash = compute_hash(file_path)
                        if file_hash in hash_dict:
                            hash_dict[file_hash].append(file_path)
                        else:
                            hash_dict[file_hash] = [file_path]

hash_dict = {}
list_files(hash_dict)


In [5]:
len(hash_dict)

6726

In [11]:
def remove_duplicates(hash_dict):
    duplicate_count = 0
    for hash_value, file_paths in hash_dict.items():
        if len(file_paths) > 1:
            for file_path in file_paths[1:]:
                print(f"Removing duplicate (hash : {hash_value}) : {file_path}")
                os.remove(file_path)
                duplicate_count += 1
    print(f"Number of duplicates : {duplicate_count}")

In [12]:
remove_duplicates(hash_dict)

Removing duplicate (hash : b0481c04cf82055b53bed325556f744e) : Testing\meningioma\Te-me_0135.jpg
Removing duplicate (hash : ab525628d00898aa6b1088d9c1d5a785) : Training\meningioma\Tr-me_0281.jpg
Removing duplicate (hash : af3a4cf5d54bd454b733bfe3d2177b4a) : Training\meningioma\Tr-me_0211.jpg
Removing duplicate (hash : af3a4cf5d54bd454b733bfe3d2177b4a) : Training\meningioma\Tr-me_0366.jpg
Removing duplicate (hash : a501e6ea3748d3aebea8e8735268ec5b) : Training\meningioma\Tr-me_0238.jpg
Removing duplicate (hash : 333ca271271562e025951545e76a8b0b) : Training\meningioma\Tr-me_0251.jpg
Removing duplicate (hash : 3cb1d6594ea1a7391608e9bc8245529c) : Testing\meningioma\Te-me_0143.jpg
Removing duplicate (hash : 832c3e0243e8fafe4313d899e233285d) : Training\meningioma\Tr-me_0617.jpg
Removing duplicate (hash : 7be661c7ec1d3c648894b51bb7ffdf3e) : Training\notumor\Tr-no_1059.jpg
Removing duplicate (hash : 713df8b9d91cb6721c6e2478df82fc02) : Training\notumor\Tr-no_0994.jpg
Removing duplicate (hash : 5

In [9]:
def get_class_paths(path):
    """
    path: should be either "Training" or "Testing"
    """
    classes = []
    class_paths = []
    
    for label in os.listdir(path):
        label_path = os.path.join(path, label)

        if os.path.isdir(label_path):
            for image in os.listdir(label_path):
                image_path = os.path.join(label_path, image)

                classes.append(label)
                class_paths.append(image_path)
    
    df = pd.DataFrame({
        'Class Path': class_paths,
        'Class': classes
    })

    return df

In [None]:
train_df = get_class_paths("Training")
val_counts = pd.DataFrame(train_df['Class'].value_counts())
total_count = len(train_df)
print()

Ratio of class Class to total count: pituitary     0.261728
notumor       0.257562
meningioma    0.241442
glioma        0.239268
Name: Class, dtype: float64


In [29]:
test_df = get_class_paths("Testing")
val_counts = pd.DataFrame(test_df['Class'].value_counts())
total_count = len(test_df)
for c in val_counts:
    print(f"Ratio of class {c} to total count: {val_counts[c]/total_count}")

Ratio of class Class to total count: notumor       0.256432
meningioma    0.250622
glioma        0.248133
pituitary     0.244813
Name: Class, dtype: float64


In [None]:
# move all data to one folder

for path in ["Training", "Testing"]:
    for label in os.listdir(path):
        label_path = os.path.join(path, label)
        new_label_path = os.path.join("allData", label)
        if not os.path.isdir(new_label_path):
            os.makedirs(new_label_path)
        if os.path.isdir(label_path):
            for image in os.listdir(label_path):
                image_path = os.path.join(label_path, image)
                new_image_path = os.path.join(new_label_path, image)
                os.rename(image_path, new_image_path)


In [10]:
df = get_class_paths("allData")

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['Class'])

In [15]:
train_df['Class'].value_counts()/len(train_df)

pituitary     0.258736
notumor       0.257249
meningioma    0.243123
glioma        0.240892
Name: Class, dtype: float64

In [16]:
test_df['Class'].value_counts()/len(test_df)

pituitary     0.258544
notumor       0.257801
meningioma    0.242942
glioma        0.240713
Name: Class, dtype: float64