## 🔹 Load and Verify Dataset Structure

In [None]:
import os
import cv2
import matplotlib.pyplot as plt
import numpy as np
from glob import glob

# Root folder of the training split; each class has its own subfolder below it
dataset_path = r"C:\Users\anujp\Desktop\Brain Tumor MRI Dataset\archive\Train"

# Class subfolder names, listed explicitly rather than discovered from disk
classes = [
    "Glioma",
    "Meningioma",
    "Pituitary",
    "No Tumor",
]


In [None]:
# Report how many entries sit in each class's images/ and labels/ folders
for cls in classes:
    class_root = f"{dataset_path}/{cls}"
    # Unpacking evaluates images/ first, then labels/
    img_count, lbl_count = (
        len(os.listdir(f"{class_root}/{sub}")) for sub in ("images", "labels")
    )
    print(f"📂 {cls}: {img_count} images, {lbl_count} labels")

## 🔹 Count Images per Class

In [None]:
# Tally the entries in every class's images/ folder
data_distribution = {}
for cls in classes:
    data_distribution[cls] = len(os.listdir(f"{dataset_path}/{cls}/images"))

# Bar chart with one bar per class
bar_colors = ["blue", "green", "red", "purple"]
plt.figure(figsize=(8, 5))
plt.bar(data_distribution.keys(), data_distribution.values(), color=bar_colors)
plt.title("Class Distribution")
plt.xlabel("Tumor Type")
plt.ylabel("Number of Images")
plt.show()


## 🔹 Visualize a Sample Image from Each Class

In [None]:
# Function to display one sample image from a class
def display_sample_images(class_name):
    """Show the first ``.jpg`` that glob returns for a class.

    Images are looked up in ``{dataset_path}/{class_name}/images``, where
    ``dataset_path`` is the module-level dataset root.

    Args:
        class_name: Name of the class subfolder; also used as the plot title.

    Returns:
        None. If the folder holds no ``.jpg`` files, or the file cannot be
        decoded, a warning is printed and nothing is plotted.
    """
    image_files = glob(f"{dataset_path}/{class_name}/images/*.jpg")
    if not image_files:
        # Avoid an opaque IndexError from indexing an empty glob result.
        print(f"⚠️ No .jpg images found for class '{class_name}'")
        return

    image_path = image_files[0]
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread reports an unreadable file by returning None, not raising.
        print(f"⚠️ Could not read image: {image_path}")
        return

    # OpenCV loads pixels as BGR; matplotlib expects RGB.
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    plt.figure(figsize=(4, 4))
    plt.imshow(img)
    plt.axis("off")
    plt.title(class_name)
    plt.show()

# Show a sample image for every class in turn
for class_name in classes:
    display_sample_images(class_name)


## 🔹 Check Sample Bounding Box Annotation

In [None]:
# Read a sample annotation file (the first Glioma label that glob returns)
glioma_label_files = glob(f"{dataset_path}/Glioma/labels/*.txt")
if not glioma_label_files:
    # Fail with a message that names the folder instead of a bare IndexError.
    raise FileNotFoundError(f"No .txt label files found in {dataset_path}/Glioma/labels")
sample_label = glioma_label_files[0]

with open(sample_label, "r") as file:
    annotations = file.readlines()

# Print the first few annotations.
# readlines() keeps each line's trailing newline, so strip it before joining;
# otherwise every annotation is followed by an empty line.
print("\nBounding Box Annotations (YOLO Format):")
print("\n".join(line.rstrip("\n") for line in annotations[:5]))


## 🔹 Overlay Bounding Box on Image

In [None]:
def plot_bounding_boxes(image_path, label_path):
    """Draw the boxes from a YOLO-format label file on top of an image.

    Every non-blank label line is read as
    ``class_id x_center y_center width height``; the four box values are
    treated as fractions of the image size and scaled to pixels.

    Args:
        image_path: Path of the image to display.
        label_path: Path of the text file holding the image's annotations.

    Raises:
        FileNotFoundError: If the image cannot be read, or the label file
            does not exist.
        ValueError: If a non-blank label line does not hold exactly five
            numeric fields.
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread returns None for a missing or undecodable file.
        raise FileNotFoundError(f"Could not read image: {image_path}")
    # OpenCV loads pixels as BGR; matplotlib expects RGB.
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    h, w = img.shape[:2]

    with open(label_path, "r") as file:
        labels = file.readlines()

    for label in labels:
        fields = label.split()
        if not fields:
            continue  # blank line (e.g. a trailing newline at end of file)

        class_id, x_center, y_center, width, height = map(float, fields)
        x1, y1 = int((x_center - width / 2) * w), int((y_center - height / 2) * h)
        x2, y2 = int((x_center + width / 2) * w), int((y_center + height / 2) * h)

        # Fall back to the raw id when it has no entry in the module-level
        # `classes` list, rather than raising or wrapping around on a negative id.
        class_index = int(class_id)
        caption = classes[class_index] if 0 <= class_index < len(classes) else str(class_index)

        # Keep the caption inside the image when the box touches the top edge.
        caption_y = max(y1 - 10, 15)

        cv2.rectangle(img, (x1, y1), (x2, y2), (255, 0, 0), 2)
        cv2.putText(img, caption, (x1, caption_y), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 0, 0), 2)

    plt.figure(figsize=(6, 6))
    plt.imshow(img)
    plt.axis("off")
    plt.show()

# Test with a sample image
sample_img = glob(f"{dataset_path}/Glioma/images/*.jpg")[0]
# Build the label path from path components instead of str.replace, which would
# also rewrite "images" or ".jpg" wherever else they occur in the path or name.
sample_images_dir, sample_name = os.path.split(sample_img)
sample_lbl = os.path.join(
    os.path.dirname(sample_images_dir),
    "labels",
    os.path.splitext(sample_name)[0] + ".txt",
)
plot_bounding_boxes(sample_img, sample_lbl)


## 🔹 Check for Missing Labels

In [None]:
# Collect the expected label path of every image that has no label file
missing_labels = []

for cls in classes:
    image_files = glob(f"{dataset_path}/{cls}/images/*.jpg")
    labels_dir = f"{dataset_path}/{cls}/labels"
    for img in image_files:
        # Derive the label name from the file stem; str.replace on the whole
        # path would also rewrite "images" or ".jpg" elsewhere in the path.
        stem = os.path.splitext(os.path.basename(img))[0]
        label_path = os.path.join(labels_dir, stem + ".txt")
        if not os.path.exists(label_path):
            missing_labels.append(label_path)

print(f"❌ Missing Labels: {len(missing_labels)}")
if missing_labels:
    print("Examples:", missing_labels[:5])


In [None]:
import os
import cv2
import numpy as np
from glob import glob

# Root folder of the training split
dataset_path = r"C:\Users\anujp\Desktop\Brain Tumor MRI Dataset\archive\Train"

# Every .jpg found under <class>/images
image_paths = glob(f"{dataset_path}/*/images/*.jpg")

# Distinct (height, width) pairs seen across the readable images
unique_sizes = set()

for img_path in image_paths:
    img = cv2.imread(img_path)
    if img is None:
        continue  # cv2.imread returns None when a file cannot be read
    height, width = img.shape[:2]
    unique_sizes.add((height, width))

# Report each distinct size on its own line
print("📏 Unique Image Sizes Found:")
for size in unique_sizes:
    print(size)

In [14]:
# Check color channels
# cv2.imread defaults to IMREAD_COLOR, which converts every file to 3-channel
# BGR, so the stored channel count is only visible with IMREAD_UNCHANGED.
color_channels = set()

for img_path in image_paths:
    img = cv2.imread(img_path, cv2.IMREAD_UNCHANGED)
    if img is not None:
        # A single-channel image decodes to a 2-D array with no channel axis.
        color_channels.add(1 if img.ndim == 2 else img.shape[2])  # Number of channels

print("🎨 Color Channels Found:")
print(color_channels)

🎨 Color Channels Found:
{3}


In [None]:
import matplotlib.pyplot as plt

# Collect pixel values
# Keep a running count per 8-bit intensity (0-255) instead of a Python list
# holding one object per pixel of every image, which exhausts memory quickly.
pixel_counts = np.zeros(256, dtype=np.int64)

for img_path in image_paths:
    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)  # Read as grayscale
    if img is not None:
        pixel_counts += np.bincount(img.ravel(), minlength=256)

# Plot histogram of pixel values
# Plotting only the intensities that occur, weighted by their counts, gives the
# same 50 bins (spanning min..max observed value) as plotting every pixel.
present_values = np.flatnonzero(pixel_counts)
plt.hist(present_values, bins=50, weights=pixel_counts[present_values], color='blue', alpha=0.7)
plt.title("Pixel Intensity Distribution")
plt.xlabel("Pixel Value")
plt.ylabel("Frequency")
plt.show()

In [None]:
# Class subfolder names, listed explicitly
classes = [
    "Glioma",
    "Meningioma",
    "Pituitary",
    "No Tumor",
]

# Report the number of entries in each class's images/ folder
for cls in classes:
    class_images_dir = f"{dataset_path}/{cls}/images"
    img_count = len(os.listdir(class_images_dir))
    print(f"📂 {cls}: {img_count} images")

In [None]:
# Bounding box validity check
def is_bbox_valid(bbox, img_width, img_height):
    """Tell whether a centre-format box has positive area and fits the image.

    Args:
        bbox: ``(x_center, y_center, width, height)``. The values are compared
            directly with ``img_width`` / ``img_height``, so they must be in
            the same units as those two arguments.
        img_width: Largest allowed x coordinate.
        img_height: Largest allowed y coordinate.

    Returns:
        True when ``0 <= x_min < x_max <= img_width`` and
        ``0 <= y_min < y_max <= img_height``; False otherwise.
    """
    cx, cy, box_w, box_h = bbox
    half_w, half_h = box_w / 2, box_h / 2
    left, right = cx - half_w, cx + half_w
    top, bottom = cy - half_h, cy + half_h

    # Reject on the horizontal extent first, then decide on the vertical one.
    if not (0 <= left < right <= img_width):
        return False
    return 0 <= top < bottom <= img_height

# Count annotations whose box is degenerate or extends past the image
invalid_bboxes = 0

for cls in classes:
    label_paths = glob(f"{dataset_path}/{cls}/labels/*.txt")
    class_images_dir = f"{dataset_path}/{cls}/images"
    for label_path in label_paths:
        with open(label_path, 'r') as f:
            lines = f.readlines()
        # Derive the image name from the file stem; str.replace on the whole
        # path would also rewrite "labels" or ".txt" elsewhere in the path.
        stem = os.path.splitext(os.path.basename(label_path))[0]
        img_path = os.path.join(class_images_dir, stem + ".jpg")
        img = cv2.imread(img_path)
        if img is not None:
            img_height, img_width = img.shape[:2]
            for line in lines:
                fields = line.split()
                if not fields:
                    continue  # blank line
                bbox = list(map(float, fields[1:]))  # Skip class_id
                if len(bbox) != 4:
                    # Not a "class x y w h" line; count it instead of crashing.
                    invalid_bboxes += 1
                    continue
                # The labels hold YOLO-format fractions of the image size (the
                # overlay code scales them the same way), while is_bbox_valid
                # compares against pixel dimensions - convert to pixels first,
                # otherwise the upper-bound checks can never fail.
                x_center, y_center, width, height = bbox
                pixel_bbox = (
                    x_center * img_width,
                    y_center * img_height,
                    width * img_width,
                    height * img_height,
                )
                if not is_bbox_valid(pixel_bbox, img_width, img_height):
                    invalid_bboxes += 1

print(f"❌ Invalid Bounding Boxes: {invalid_bboxes}")

In [None]:
import os
import cv2
from collections import defaultdict

def analyze_image_shapes(image_dir, image_ext=".jpg"):
    """Print how often each image size and channel count occurs in a folder.

    Args:
        image_dir: Directory whose files are inspected (not recursive).
        image_ext: Only file names ending with this suffix are read.

    Returns:
        None. The two frequency tables are printed. Files that cannot be
        decoded are skipped.
    """
    shapes_count = defaultdict(int)
    channel_count = defaultdict(int)

    for fname in os.listdir(image_dir):
        if not fname.endswith(image_ext):
            continue
        path = os.path.join(image_dir, fname)
        # IMREAD_UNCHANGED keeps the file's own channel layout; the default
        # flag converts everything to 3-channel BGR, which would make the
        # channel table below report 3 for every image.
        img = cv2.imread(path, cv2.IMREAD_UNCHANGED)
        if img is None:
            continue  # unreadable file
        h, w = img.shape[:2]
        # A single-channel image decodes to a 2-D array with no channel axis.
        c = 1 if img.ndim == 2 else img.shape[2]
        shapes_count[(h, w)] += 1
        channel_count[c] += 1

    print("Unique (height, width) combos and their frequencies:")
    for shape, count in shapes_count.items():
        print(f"  {shape}: {count}")

    print("\nChannel distributions (e.g., 1=grayscale, 3=color):")
    for c, count in channel_count.items():
        print(f"  {c}-channel: {count}")
# analyze_image_shapes lists the files of a single directory, so passing the
# image_paths list would fail in os.listdir; run it on each class's images/
# folder instead.
for cls in classes:
    print(f"\n📂 {cls}")
    analyze_image_shapes(os.path.join(dataset_path, cls, "images"))