### Download and extract the [Kvasir Dataset-v2](https://datasets.simula.no/kvasir/).

In [1]:
!mkdir kvasir
!wget https://datasets.simula.no/downloads/kvasir/kvasir-dataset-v2.zip

#unzip the data
import zipfile
zip_ref = zipfile.ZipFile('/content/kvasir-dataset-v2.zip')
zip_ref.extractall('/content/kvasir')
zip_ref.close()

--2024-01-03 16:17:18--  https://datasets.simula.no/downloads/kvasir/kvasir-dataset-v2.zip
Resolving datasets.simula.no (datasets.simula.no)... 128.39.36.14
Connecting to datasets.simula.no (datasets.simula.no)|128.39.36.14|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2489312085 (2.3G) [application/zip]
Saving to: ‘kvasir-dataset-v2.zip’


2024-01-03 16:19:35 (17.5 MB/s) - ‘kvasir-dataset-v2.zip’ saved [2489312085/2489312085]



### Randomly moving 10% of the images from each sub-directory to a new validation folder (`validation_dir`, i.e. `/content/kvasir-validation`).

This 10% of the images is never used for training the model. It is held out and used only for testing the trained models on unseen data.

In [2]:
import os
import random
import shutil

# Define paths
source_dir = '/content/kvasir/kvasir-dataset-v2'
validation_dir = '/content/kvasir-validation'

# Create validation directory if it doesn't exist (no-op when it already does)
os.makedirs(validation_dir, exist_ok=True)

# Get list of category folders in the source directory.
# Only sub-directories are kept, so a stray file is not treated as a class, and
# the names are sorted because os.listdir returns entries in arbitrary order.
categories = sorted(
    entry
    for entry in os.listdir(source_dir)
    if os.path.isdir(os.path.join(source_dir, entry))
)

# Function to move a percentage of images from source to destination
def move_random_images(category, percentage=0.1, source_root=None, validation_root=None):
    """Move a random share of one category's images into the validation tree.

    The share is computed over the category's full size (files still in the
    source folder plus files already in the validation folder) and only the
    shortfall is moved. Re-running the cell therefore does not keep shrinking
    the training data; on a first run the number moved is
    ``int(percentage * number_of_source_files)``.

    Args:
        category: Name of the category sub-folder.
        percentage: Fraction (0.0 to 1.0) of the category to hold out.
        source_root: Folder containing the category sub-folders. Defaults to
            the notebook-level ``source_dir``.
        validation_root: Folder receiving the held-out images. Defaults to
            the notebook-level ``validation_dir``.

    Raises:
        ValueError: If ``percentage`` is outside the range [0, 1].
    """
    if not 0 <= percentage <= 1:
        raise ValueError(f"percentage must be between 0 and 1, got {percentage}")

    # Fall back to the notebook globals only when no explicit root is given.
    if source_root is None:
        source_root = source_dir
    if validation_root is None:
        validation_root = validation_dir

    source_category_path = os.path.join(source_root, category)
    validation_category_path = os.path.join(validation_root, category)

    # Create category folder in validation directory
    os.makedirs(validation_category_path, exist_ok=True)

    # Get list of images in the category folder: regular files only, sorted so
    # that a seeded RNG samples from the same ordered population each time.
    images = sorted(
        name
        for name in os.listdir(source_category_path)
        if os.path.isfile(os.path.join(source_category_path, name))
    )

    # Files held out by an earlier run count towards the target.
    already_held_out = len(os.listdir(validation_category_path))
    target_held_out = int(percentage * (len(images) + already_held_out))
    num_images_to_move = max(0, target_held_out - already_held_out)

    # Randomly select images
    random_images = random.sample(images, num_images_to_move)

    # Move selected images to validation directory
    for image in random_images:
        source_path = os.path.join(source_category_path, image)
        destination_path = os.path.join(validation_category_path, image)
        shutil.move(source_path, destination_path)

# Move 10% of images from each category to the validation directory.
# The global RNG is seeded because move_random_images draws with random.sample,
# and categories are visited in sorted order so the draws happen in a fixed sequence.
SPLIT_SEED = 42
random.seed(SPLIT_SEED)
for category in sorted(categories):
    move_random_images(category, percentage=0.1)

### Verifying the correct distribution of images

In [3]:
def count_images_in_directory(directory):
    """Return the total number of files found anywhere beneath ``directory``.

    Every file in the tree is counted, whatever its extension.
    """
    return sum(len(filenames) for _root, _subdirs, filenames in os.walk(directory))

# Total for the source side: add up the file count of every category folder
per_category_totals = [
    count_images_in_directory(os.path.join(source_dir, category_name))
    for category_name in categories
]
total_source_images = sum(per_category_totals)

# Total for the validation side: a single walk over the whole validation tree
total_validation_images = count_images_in_directory(validation_dir)

# Show both totals
print(f"Total source images: {total_source_images}")
print(f"Total validation images: {total_validation_images}")


Total source images: 7200
Total validation images: 800


### Verifying data distribution for each class

In [4]:
def count_images_in_directory_recursive_with_subdirectories(directory):
    """Map each sub-directory of ``directory`` to the number of files directly inside it.

    Keys are paths relative to ``directory``; the top-level directory itself
    is left out of the result.
    """
    walked = (
        (os.path.relpath(folder, directory), filenames)
        for folder, _subdirs, filenames in os.walk(directory)
    )
    # '.' is the relative path of the top-level directory, which is not reported.
    return {
        relative: len(filenames)
        for relative, filenames in walked
        if relative != '.'
    }

# Per-folder file counts for the source tree and for the validation tree
source_counts = count_images_in_directory_recursive_with_subdirectories(source_dir)
validation_counts = count_images_in_directory_recursive_with_subdirectories(validation_dir)

# Print one section per dataset; the second heading carries the blank separator line
report_sections = (
    ("Source Dataset:", source_counts),
    ("\nValidation Dataset:", validation_counts),
)
for heading, per_folder in report_sections:
    print(heading)
    for subdirectory, count in per_folder.items():
        print(f"{subdirectory}: {count} images.")

Source Dataset:
polyps: 900 images.
normal-cecum: 900 images.
dyed-lifted-polyps: 900 images.
dyed-resection-margins: 900 images.
esophagitis: 900 images.
normal-pylorus: 900 images.
normal-z-line: 900 images.
ulcerative-colitis: 900 images.

Validation Dataset:
polyps: 100 images.
normal-cecum: 100 images.
dyed-lifted-polyps: 100 images.
dyed-resection-margins: 100 images.
esophagitis: 100 images.
normal-pylorus: 100 images.
normal-z-line: 100 images.
ulcerative-colitis: 100 images.


### Following part is Optional
We compress (zip) both datasets and upload them to our personal drives. After separating out the 10% of held-out images, we have made both datasets available on Google Drive as well as OneDrive.

### Microsoft Onedrive Links

kvasir Dataset: https://pern-my.sharepoint.com/:u:/g/personal/m21f0034ai010_fecid_paf-iast_edu_pk/ERzdk7nD_aBEgnA0NS-DlzwBjM2v0jo8iH-laBgi1YOkEg?e=KEupm9&download=1

Kvasir Validation Dataset: https://pern-my.sharepoint.com/:u:/g/personal/m21f0034ai010_fecid_paf-iast_edu_pk/EenAReYD4WdPiNrvXXFJdMEBOIgB4xt0GGvGQsPeMDQY5Q?e=eqmfXg&download=1

### Google Drive Links

kvasir Dataset: https://drive.google.com/file/d/1RRRNHm4q4JDpkYweTg_fkgpQ1uzsdU6c/view?usp=drive_link

Kvasir Validation Dataset: https://drive.google.com/file/d/1-1HDEF8eyB9imvt4CC1EcE5kgW_39H7_/view?usp=drive_link

In [5]:
import shutil
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Zip the source dataset
!zip -r kvasir-dataset.zip /content/kvasir/kvasir-dataset-v2

# Zip the validation dataset
!zip -r kvasir-validation-dataset.zip /content/kvasir-validation

# Move the zip files to Google Drive
shutil.move('/content/kvasir-dataset.zip', '/content/drive/MyDrive/kvasir-dataset.zip')
shutil.move('/content/kvasir-validation-dataset.zip', '/content/drive/MyDrive/kvasir-validation-dataset.zip')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  adding: content/kvasir/kvasir-dataset-v2/dyed-resection-margins/3f338dcc-eaca-43f1-bb81-8b21ad73b816.jpg (deflated 1%)
  adding: content/kvasir/kvasir-dataset-v2/dyed-resection-margins/3c735bf5-7d7b-47dc-9956-f8228927e9d7.jpg (deflated 1%)
  adding: content/kvasir/kvasir-dataset-v2/dyed-resection-margins/b7af9552-741b-4246-b61c-ad60d7b88827.jpg (deflated 2%)
  adding: content/kvasir/kvasir-dataset-v2/dyed-resection-margins/71e96dbd-ddc7-463c-a34c-492ad913de23.jpg (deflated 2%)
  adding: content/kvasir/kvasir-dataset-v2/dyed-resection-margins/b58caa3b-a8fd-4a80-9de1-494db4830b28.jpg (deflated 1%)
  adding: content/kvasir/kvasir-dataset-v2/dyed-resection-margins/67adc1dd-dbe4-46aa-86f9-0bebb1a4f5cd.jpg (deflated 1%)
  adding: content/kvasir/kvasir-dataset-v2/dyed-resection-margins/026c7b7a-f09e-424a-a27f-3b6081ed8187.jpg (deflated 3%)
  adding: content/kvasir/kvasir-dataset-v2/dyed-resection-margins/221ed5dd-e8c0-4c67-aa1

'/content/drive/MyDrive/kvasir-validation-dataset.zip'