### Download and extract the labeled images from the [Hyper Kvasir Dataset](https://datasets.simula.no/hyper-kvasir/).

In [1]:
# Create the extraction folder and download the labeled-images archive.
# NOTE(review): both commands use paths relative to the current working directory,
# while the extraction code that follows uses absolute /content/... paths. This
# presumably assumes the notebook is run from /content (e.g. on Colab) -- confirm
# before running it anywhere else.
!mkdir kvasir
!wget https://datasets.simula.no/downloads/hyper-kvasir/hyper-kvasir-labeled-images.zip

import zipfile
# Extract the downloaded archive into /content/kvasir. The context manager closes
# the zip handle even if extraction fails part-way, which the manual
# open/extract/close sequence did not guarantee.
with zipfile.ZipFile('/content/hyper-kvasir-labeled-images.zip') as zip_ref:
    zip_ref.extractall('/content/kvasir')

--2024-01-05 13:07:56--  https://datasets.simula.no/downloads/hyper-kvasir/hyper-kvasir-labeled-images.zip
Resolving datasets.simula.no (datasets.simula.no)... 128.39.36.14
Connecting to datasets.simula.no (datasets.simula.no)|128.39.36.14|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3928814344 (3.7G) [application/zip]
Saving to: ‘hyper-kvasir-labeled-images.zip’


2024-01-05 13:10:35 (23.7 MB/s) - ‘hyper-kvasir-labeled-images.zip’ saved [3928814344/3928814344]



### Removing unnecessary classes. We only need the 'esophagitis-a' and 'esophagitis-b-d' classes, so the 'barretts' and 'barretts-short-segment' folders are deleted.

In [2]:
# Recursively delete the two Barrett's class folders from pathological-findings;
# the esophagitis folders in the same directory are left untouched.
!rm -r /content/kvasir/labeled-images/upper-gi-tract/pathological-findings/barretts
!rm -r /content/kvasir/labeled-images/upper-gi-tract/pathological-findings/barretts-short-segment

### Randomly moving 10% of the images from each sub-directory to a new validation folder (`validation_dir` in the code below).

This held-out 10% of the images is never used for training the model. It is used only for testing the trained models on unseen data.

In [3]:
import os
import random
import shutil

# Define paths
# NOTE: the validation folder name is spelled 'eshphagitis' (sic). The zip and
# download cells further down use the same spelling, so it is kept unchanged here.
source_dir = '/content/kvasir/labeled-images/upper-gi-tract/pathological-findings'
validation_dir = '/content/eshphagitis-severity-validation'

# Create validation directory if it doesn't exist (exist_ok replaces the
# separate exists() check and is safe to re-run).
os.makedirs(validation_dir, exist_ok=True)

# Get list of category folders in the source directory. Only real directories
# are kept (a stray file would make the per-category listing fail), and the
# list is sorted so categories are always processed in the same order.
categories = sorted(
    entry
    for entry in os.listdir(source_dir)
    if os.path.isdir(os.path.join(source_dir, entry))
)

# Function to move a percentage of images from source to destination
def move_random_images(category, percentage=0.1, seed=None):
    """Move a random fraction of one category's images into the validation set.

    Files are taken from ``<source_dir>/<category>`` and moved (not copied) to
    ``<validation_dir>/<category>``; both base paths are module-level variables.
    Because files are moved, calling this again on the same category removes a
    further fraction of whatever is left.

    Args:
        category: Name of the category sub-folder inside ``source_dir``.
        percentage: Fraction of the category's files to move, in the range
            [0, 1]. The resulting count is truncated with ``int()``.
        seed: Optional seed for the selection. With the default ``None`` the
            shared ``random`` module state is used, exactly as before; with a
            value, a private ``random.Random(seed)`` makes the selection
            repeatable for the same set of file names.

    Returns:
        None.

    Raises:
        ValueError: If ``percentage`` is outside [0, 1].
    """
    if not 0 <= percentage <= 1:
        raise ValueError(f"percentage must be between 0 and 1, got {percentage!r}")

    source_category_path = os.path.join(source_dir, category)
    validation_category_path = os.path.join(validation_dir, category)

    # Create category folder in validation directory
    os.makedirs(validation_category_path, exist_ok=True)

    # Only regular files are candidates: a nested folder must be neither counted
    # nor moved as if it were an image. Sorting removes the dependence on the
    # arbitrary order returned by os.listdir, so a seed gives a stable result.
    images = sorted(
        name
        for name in os.listdir(source_category_path)
        if os.path.isfile(os.path.join(source_category_path, name))
    )
    num_images_to_move = int(percentage * len(images))

    # Randomly select images
    rng = random if seed is None else random.Random(seed)
    random_images = rng.sample(images, num_images_to_move)

    # Move selected images to validation directory
    for image in random_images:
        shutil.move(
            os.path.join(source_category_path, image),
            os.path.join(validation_category_path, image),
        )

# Move 10% of images from each category to the validation directory.
# The shared RNG is seeded first so that a fresh "Restart & Run All" on the same
# extracted data picks the same held-out images (given the same listing order).
random.seed(42)
for category in categories:
    move_random_images(category, percentage=0.1)

### Verifying the correct distribution of images

In [4]:
def count_images_in_directory(directory):
    """Return the total number of files found anywhere under ``directory``.

    The whole tree rooted at ``directory`` is walked and every file is counted,
    whatever its extension.
    """
    return sum(len(filenames) for _folder, _subdirs, filenames in os.walk(directory))

# Files now sitting in the validation directory (all categories together)
total_validation_images = count_images_in_directory(validation_dir)

# Files remaining in the source directory, tallied category by category
per_category_totals = []
for category in categories:
    source_category_path = os.path.join(source_dir, category)
    per_category_totals.append(count_images_in_directory(source_category_path))
total_source_images = sum(per_category_totals)

# Report both totals
print(f"Total source images: {total_source_images}")
print(f"Total validation images: {total_validation_images}")


Total source images: 597
Total validation images: 66


### Verifying data distribution for each class

In [5]:
def count_images_in_directory_recursive_with_subdirectories(directory):
    """Map every sub-directory below ``directory`` to its own file count.

    Keys are paths relative to ``directory``; values are the number of files
    located directly inside that sub-directory. The top-level directory itself
    (relative path ``'.'``) is left out of the result.
    """
    per_folder = (
        (os.path.relpath(folder, directory), len(filenames))
        for folder, _subdirs, filenames in os.walk(directory)
    )
    # Drop the entry for the root folder, keep everything else in walk order
    return {relative: total for relative, total in per_folder if relative != '.'}

# Per-sub-directory file counts for the source and the validation datasets
source_counts = count_images_in_directory_recursive_with_subdirectories(source_dir)
validation_counts = count_images_in_directory_recursive_with_subdirectories(validation_dir)

# Print one section per dataset: a heading followed by one line per sub-directory
for heading, counts in (
    ("Source Dataset:", source_counts),
    ("\nValidation Dataset:", validation_counts),
):
    print(heading)
    for subdirectory, count in counts.items():
        print(f"{subdirectory}: {count} images.")

Source Dataset:
esophagitis-a: 363 images.
esophagitis-b-d: 234 images.

Validation Dataset:
esophagitis-a: 40 images.
esophagitis-b-d: 26 images.


### The following part is optional
We compress (zip) the data and upload it to our personal drives. After separating out the 10% validation images, we have made both datasets available on Google Drive as well as OneDrive.

### Microsoft OneDrive Links

Esophagitis Severity Dataset: https://pern-my.sharepoint.com/:u:/g/personal/m21f0034ai010_fecid_paf-iast_edu_pk/ETUX9oiTyy1Dobrv87loEoQB8DTIkZpeCpZdhyGgW9a5MQ?e=stvKpT&download=1

Esophagitis Severity Validation Dataset: https://pern-my.sharepoint.com/:u:/g/personal/m21f0034ai010_fecid_paf-iast_edu_pk/EeQQ4EsiCa9DqXpKH04T_lgB29wtQVSBnPvABYtFaVL8hw?e=V3m0Bo&download=1


### Google Drive Links

Esophagitis Severity Dataset: https://drive.google.com/file/d/12VX4JNZCdQeKYjbl8hCZhyNRCBR-wFLA/view?usp=sharing

Esophagitis Severity Validation Dataset: https://drive.google.com/file/d/18M-VYljIJuWxKam2gf15jcY_cRQyvJNK/view?usp=sharing

In [4]:
# `files` supplies the download helper used by the download cell that follows.
# NOTE(review): the recorded output of this cell is a ModuleNotFoundError for
# 'google.colab', so this import presumably only works inside a Colab runtime.
from google.colab import files

# Zip the source dataset (-r recurses into the category sub-folders)
!zip -r esophagitis-severity.zip /content/kvasir/labeled-images/upper-gi-tract/pathological-findings

# Zip the validation dataset
# The 'eshphagitis' spelling matches validation_dir and the path used when downloading.
!zip -r eshphagitis-severity-validation.zip /content/eshphagitis-severity-validation

ModuleNotFoundError: No module named 'google.colab'

In [8]:
# Hand each archive to the download helper, source dataset first
for archive_path in (
    "/content/esophagitis-severity.zip",
    "/content/eshphagitis-severity-validation.zip",
):
    files.download(archive_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>