# Computer Vision VOC Dataset Analysis - E4S Container Version

**Run this notebook inside your E4S Singularity container to access the VOC dataset.**

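Based on the system analysis, the VOC dataset should be accessible through container-specific paths. The first code cell probes a list of candidate locations, and the remaining cells use the first one that contains the expected `VOC2012_train_val/Annotations` folder.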

In [None]:
# Environment Debug - Find the correct path inside the container
import os
import subprocess

print("=== E4S CONTAINER ENVIRONMENT DEBUG ===")
print(f"Current working directory: {os.getcwd()}")
print(f"User: {os.environ.get('USER', 'unknown')}")
print(f"Home directory: {os.environ.get('HOME', 'unknown')}")

# Folder expected *inside* every candidate path; the Annotations directory lives below it.
VOC_SUBDIR = 'VOC2012_train_val'
# A candidate is accepted only if it holds more than this many XML annotation files.
MIN_XML_FILES = 1000


def find_voc_dataset(candidate_paths, min_xml_files=MIN_XML_FILES):
    """Return the first candidate path that contains a usable VOC annotation folder.

    A candidate qualifies when ``<candidate>/VOC2012_train_val/Annotations`` is a
    readable directory holding more than ``min_xml_files`` ``.xml`` files.
    Every decision (missing path, missing folder, unreadable folder, too few
    files) is printed so the cell output explains why a path was rejected.

    Args:
        candidate_paths: Iterable of directory paths, checked in order.
        min_xml_files: Exclusive lower bound on the number of XML files.

    Returns:
        The first qualifying path, or ``None`` when no candidate qualifies.
    """
    for path in candidate_paths:
        if not os.path.exists(path):
            print(f"❌ NOT FOUND: {path}")
            continue

        print(f"✅ EXISTS: {path}")
        annotations_path = os.path.join(path, VOC_SUBDIR, 'Annotations')
        # isdir (not exists): a plain file named "Annotations" would make listdir raise.
        if not os.path.isdir(annotations_path):
            print("   ❌ No Annotations folder found")
            continue

        try:
            xml_count = sum(1 for name in os.listdir(annotations_path) if name.endswith('.xml'))
        except OSError as err:
            # Unreadable folder: report it and keep probing the remaining candidates.
            print(f"   ⚠️ Cannot read Annotations folder: {err}")
            continue

        print(f"   ✅ Annotations folder found with {xml_count} XML files")
        if xml_count > min_xml_files:
            print(f"   🎯 USING THIS PATH: {path}")
            return path
        print(f"   ⚠️ Too few XML files (need more than {min_xml_files}); skipping this path")

    return None


def report_voc_like_entries(root_dirs, keywords=('voc', 'pascal', '2012')):
    """Print directory entries whose name hints at a VOC dataset.

    Only the direct children of each root are inspected (no recursion). For
    matching directories the first five entries (sorted) are printed as well.
    An unreadable directory is reported and skipped without aborting the scan
    of its siblings.

    Args:
        root_dirs: Iterable of directories to inspect.
        keywords: Lower-case substrings that mark an entry name as interesting.

    Returns:
        List of the full paths that matched, in the order they were printed.
    """
    matches = []
    for root_dir in root_dirs:
        if not os.path.exists(root_dir):
            continue
        try:
            entries = sorted(os.listdir(root_dir))
        except OSError as err:
            print(f"⚠️ Cannot list {root_dir}: {err}")
            continue

        for item in entries:
            lowered = item.lower()
            if not any(keyword in lowered for keyword in keywords):
                continue
            full_path = os.path.join(root_dir, item)
            matches.append(full_path)
            print(f"🔍 FOUND: {full_path}")
            if os.path.isdir(full_path):
                try:
                    subcontents = sorted(os.listdir(full_path))[:5]
                except OSError as err:
                    print(f"   ⚠️ Cannot list contents: {err}")
                    continue
                print(f"   Contents (first 5): {subcontents}")
    return matches


print("\n=== CHECKING POSSIBLE VOC DATASET PATHS ===")
possible_paths = [
    '/opt/hccs_shared/Share/VOC2012_train_val',                    # Direct host path
    '/SharedContent/LocalShare/VOC2012_train_val',                 # Container shared content
    '/home/hccsadmin1/SharedContent/LocalShare/VOC2012_train_val', # User shared content
    '/e4sonpremvm/instructor_data/hccsadmin1/Share/VOC2012_train_val', # Instructor data
    '/mnt/VOC2012_train_val',                                      # Mount point
    '/data/VOC2012_train_val',                                     # Data directory
    './VOC2012_train_val'                                          # Current directory
]

# Later cells read `working_path`; it stays None when no candidate qualifies.
working_path = find_voc_dataset(possible_paths)

if working_path:
    print(f"\n🎉 SUCCESS! Using path: {working_path}")
else:
    print("\n❌ VOC dataset not found in any expected location")
    print("Searching filesystem for VOC-related directories...")

    # Extended search: shallow scan of a few likely parent directories
    report_voc_like_entries(['/', '/opt', '/mnt', '/data', '/home', '/SharedContent'])

In [None]:
# Import necessary libraries for VOC dataset analysis
import pandas as pd
import xml.etree.ElementTree as ET
from PIL import Image, ImageDraw, ImageFont

# Prefer the location discovered by the debug cell; otherwise use the manual default below
shared_folder_path = locals().get('working_path')
if not shared_folder_path:
    # Manual fallback - update this based on the debug output above
    shared_folder_path = '/opt/hccs_shared/Share/VOC2012_train_val'  # UPDATE THIS LINE
    print(f"⚠️  Using fallback path: {shared_folder_path}")
    print("If the path above doesn't work, run the debug cell and update this line with the correct path.")

# Derive the dataset root and its two sub-folders used by the rest of the notebook
voc_root = os.path.join(shared_folder_path, 'VOC2012_train_val')
annotations_dir, images_dir = (
    os.path.join(voc_root, subfolder) for subfolder in ('Annotations', 'JPEGImages')
)

for caption, location in (
    ("Using dataset path", voc_root),
    ("Annotations directory", annotations_dir),
    ("Images directory", images_dir),
):
    print(f"{caption}: {location}")

# Report whether both sub-folders are present (informational only; nothing is raised)
has_annotations = os.path.exists(annotations_dir)
has_images = os.path.exists(images_dir)
if not (has_annotations and has_images):
    print("❌ Path verification failed!")
    print(f"Annotations exists: {has_annotations}")
    print(f"Images exists: {has_images}")
else:
    print("✅ All paths verified!")

In [None]:
# Column order of the annotation table; also used so empty results keep a stable schema.
_VOC_COLUMNS = ['image_name', 'label', 'xmin', 'ymin', 'xmax', 'ymax']


def _parse_annotation_file(xml_path):
    """Return one record dict per ``<object>`` found in a single annotation file.

    Raises:
        ValueError: A required element (``filename``, ``name``, ``bndbox`` or a
            coordinate) is missing or empty, or a coordinate is not numeric.
        xml.etree.ElementTree.ParseError: The file is not well-formed XML.
        OSError: The file cannot be read.
    """
    root = ET.parse(xml_path).getroot()

    image_name = root.findtext('filename')
    if not image_name:
        raise ValueError("missing <filename> element")

    records = []
    # Only direct <object> children of the root are read.
    for obj in root.findall('object'):
        label = obj.findtext('name')
        bbox = obj.find('bndbox')
        if not label or bbox is None:
            raise ValueError("<object> without <name> or <bndbox>")

        record = {'image_name': image_name, 'label': label}
        for key in ('xmin', 'ymin', 'xmax', 'ymax'):
            raw = bbox.findtext(key)
            if raw is None:
                raise ValueError(f"<bndbox> without <{key}>")
            # Coordinates may be written as floats; truncate them to int.
            record[key] = int(float(raw))
        records.append(record)
    return records


def parse_voc_annotations(annotations_dir):
    """Parse all XML files in the Annotations directory into one DataFrame.

    Files are processed in sorted name order so the resulting row order is
    reproducible between runs. A file that cannot be parsed completely is
    reported and contributes no rows at all, so an image never appears with
    only part of its annotations.

    Args:
        annotations_dir: Directory containing the ``.xml`` annotation files.

    Returns:
        DataFrame with columns ``image_name``, ``label``, ``xmin``, ``ymin``,
        ``xmax``, ``ymax`` (one row per annotated object). The frame is empty,
        but still has these columns, when the directory is missing or nothing
        could be parsed.
    """
    if not os.path.isdir(annotations_dir):
        print(f"❌ Annotations directory not found: {annotations_dir}")
        return pd.DataFrame(columns=_VOC_COLUMNS)

    xml_files = sorted(f for f in os.listdir(annotations_dir) if f.endswith('.xml'))
    print(f"Found {len(xml_files)} XML annotation files")

    xml_data = []
    # Process files with progress indication
    for i, xml_file in enumerate(xml_files):
        if i % 5000 == 0:  # Progress indicator
            print(f"Processing file {i+1}/{len(xml_files)}")

        try:
            file_records = _parse_annotation_file(os.path.join(annotations_dir, xml_file))
        except Exception as e:
            # Deliberately broad: this is a best-effort bulk load, and every
            # failure is reported together with the offending file name.
            print(f"Error processing {xml_file}: {e}")
            continue
        xml_data.extend(file_records)

    return pd.DataFrame(xml_data, columns=_VOC_COLUMNS)

# Load every annotation file into a single DataFrame and report the outcome
print(f"\nParsing annotations from: {annotations_dir}")
voc_df = parse_voc_annotations(annotations_dir)

if voc_df.empty:
    print("❌ No annotations were parsed. Check the path configuration above.")
else:
    annotation_total = len(voc_df)
    image_total = voc_df['image_name'].nunique()
    print(f"Successfully parsed {annotation_total} annotations from {image_total} images.")
    print("\nHere are the first 5 entries in the DataFrame:")
    display(voc_df.head())

In [None]:
def visualize_image(image_name, dataframe, image_root=None):
    """Draw the annotated bounding boxes on one image and display it.

    Args:
        image_name: File name of the image, as stored in the ``image_name``
            column of ``dataframe``.
        dataframe: Annotation table with ``image_name``, ``label``, ``xmin``,
            ``ymin``, ``xmax`` and ``ymax`` columns.
        image_root: Directory that holds the image files. When omitted, the
            notebook-level ``images_dir`` variable is used.

    Returns:
        None. The image is shown with ``display`` and the object list is
        printed; if the image file does not exist a message is printed instead.
    """
    if image_root is None:
        image_root = images_dir  # notebook-level default from the path setup cell
    img_path = os.path.join(image_root, image_name)

    if not os.path.isfile(img_path):
        print(f"❌ Image not found: {img_path}")
        return

    # Open inside a context manager so the file handle is released right away;
    # convert() returns an independent in-memory copy that outlives the file.
    with Image.open(img_path) as source:
        img = source.convert("RGB")
    draw = ImageDraw.Draw(img)

    # Get all annotations for this specific image
    image_annotations = dataframe[dataframe['image_name'] == image_name]

    # Draw a rectangle and label for each object
    colors = ['red', 'blue', 'green', 'yellow', 'purple', 'orange', 'cyan', 'magenta']
    for i, (_, row) in enumerate(image_annotations.iterrows()):
        color = colors[i % len(colors)]
        # Pillow requires x1 >= x0 and y1 >= y0, so order each coordinate pair
        # instead of failing on an inverted annotation.
        left, right = sorted((row['xmin'], row['xmax']))
        top, bottom = sorted((row['ymin'], row['ymax']))
        draw.rectangle([left, top, right, bottom], outline=color, width=3)

        # Put the label above the box, clamped so it stays inside the image
        text_y = max(0, top - 25)
        draw.text((left, text_y), row['label'], fill=color)

    print(f"Displaying image '{image_name}' with {len(image_annotations)} bounding boxes:")
    display(img)

    # Show object details (coordinates are printed exactly as stored)
    print("Objects detected:")
    for _, row in image_annotations.iterrows():
        print(f"  - {row['label']}: ({row['xmin']}, {row['ymin']}) to ({row['xmax']}, {row['ymax']})")

# Show one randomly picked image, then print summary statistics for the whole dataset
if 'voc_df' not in locals() or voc_df.empty:
    print("⚠️ No dataset loaded. Please run the previous cells to load the data.")
else:
    sample_image = voc_df['image_name'].sample(1).iloc[0]
    visualize_image(sample_image, voc_df)

    total_annotations = len(voc_df)
    unique_images = voc_df['image_name'].nunique()
    separator = "=" * 60

    print("\n" + separator)
    print("DATASET STATISTICS")
    print(separator)
    print(f"Total annotations: {total_annotations:,}")
    print(f"Unique images: {unique_images:,}")
    print(f"Average annotations per image: {total_annotations / unique_images:.1f}")

    # Most frequent labels with their share of all annotations
    print("\nTop 10 object classes:")
    class_counts = voc_df['label'].value_counts().head(10)
    for class_name, count in class_counts.items():
        percentage = count / total_annotations * 100
        print(f"  {class_name:12}: {count:5,} annotations ({percentage:4.1f}%)")

    # Images carrying the largest number of annotated objects
    print("\nImages with most annotations:")
    image_counts = voc_df['image_name'].value_counts().head(5)
    for image_name, count in image_counts.items():
        print(f"  {image_name}: {count} objects")

In [None]:
# Interactive Analysis - pick a class and look at the images that contain it
if 'voc_df' not in locals() or voc_df.empty:
    print("⚠️ No dataset loaded. Please run the previous cells first.")
else:
    print("=== INTERACTIVE ANALYSIS ===")

    # List every label present in the dataset
    unique_classes = sorted(voc_df['label'].unique())
    all_classes_text = ', '.join(unique_classes)
    print(f"Available object classes ({len(unique_classes)}): {all_classes_text}")

    # Example: Find images with specific objects
    target_class = 'person'  # Change this to any class you want to explore

    # Per-image instance count of the chosen class, most instances first
    is_target = voc_df['label'] == target_class
    class_images = voc_df[is_target]['image_name'].value_counts()
    print(f"\nImages with '{target_class}' (showing top 5):")
    for image, count in class_images.head(5).items():
        print(f"  {image}: {count} {target_class}(s)")

    # Visualize the image holding the most instances of the class, if there is one
    if len(class_images) > 0:
        target_image = class_images.index[0]
        print(f"\nVisualizing image with most '{target_class}' instances:")
        visualize_image(target_image, voc_df)

    print("\n💡 TIP: To explore different classes, change the 'target_class' variable above and re-run this cell.")
    preview_text = ', '.join(unique_classes[:10])
    print(f"Available classes: {preview_text}...")

In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Computer Vision VOC Dataset Analysis - E4S Container Version\n",
    "\n",
    "**Run this notebook inside your E4S Singularity container to access the VOC dataset.**\n",
    "\n",
    "Based on the system analysis, the VOC dataset should be accessible through container-specific paths."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Environment Debug - Find the correct path inside the container\n",
    "import os\n",
    "import subprocess\n",
    "\n",
    "print(\"=== E4S CONTAINER ENVIRONMENT DEBUG ===\")\n",
    "print(f\"Current working directory: {os.getcwd()}\")\n",
    "print(f\"User: {os.environ.get('USER', 'unknown')}\")\n",
    "print(f\"Home directory: {os.environ.get('HOME', 'unknown')}\")\n",
    "\n",
    "print(\"\\n=== CHECKING POSSIBLE VOC DATASET PATHS ===\")\n",
    "possible_paths = [\n",
    "    '/opt/hccs_shared/Share/VOC2012_train_val',                    # Direct host path\n",
    "    '/SharedContent/LocalShare/VOC2012_train_val',                 # Container shared content\n",
    "    '/home/hccsadmin1/SharedContent/LocalShare/VOC2012_train_val', # User shared content\n",
    "    '/e4sonpremvm/instructor_data/hccsadmin1/Share/VOC2012_train_val', # Instructor data\n",
    "    '/mnt/VOC2012_train_val',                                      # Mount point\n",
    "    '/data/VOC2012_train_val',                                     # Data directory\n",
    "    './VOC2012_train_val'                                          # Current directory\n",
    "]\n",
    "\n",
    "working_path = None\n",
    "for path in possible_paths:\n",
    "    if os.path.exists(path):\n",
    "        print(f\"✅ EXISTS: {path}\")\n",
    "        # Check if it has the expected structure\n",
    "        annotations_path = os.path.join(path, 'VOC2012_train_val', 'Annotations')\n",
    "        if os.path.exists(annotations_path):\n",
    "            xml_files = [f for f in os.listdir(annotations_path) if f.endswith('.xml')]\n",
    "            xml_count = len(xml_files)\n",
    "            print(f\"   ✅ Annotations folder found with {xml_count} XML files\")\n",
    "            if xml_count > 1000:  # Reasonable threshold for VOC dataset\n",
    "                working_path = path\n",
    "                print(f\"   🎯 USING THIS PATH: {path}\")\n",
    "                break\n",
    "        else:\n",
    "            print(f\"   ❌ No Annotations folder found\")\n",
    "    else:\n",
    "        print(f\"❌ NOT FOUND: {path}\")\n",
    "\n",
    "if working_path:\n",
    "    print(f\"\\n🎉 SUCCESS! Using path: {working_path}\")\n",
    "else:\n",
    "    print(f\"\\n❌ VOC dataset not found in any expected location\")\n",
    "    print(\"Searching filesystem for VOC-related directories...\")\n",
    "    \n",
    "    # Extended search\n",
    "    for root_dir in ['/', '/opt', '/mnt', '/data', '/home', '/SharedContent']:\n",
    "        if os.path.exists(root_dir):\n",
    "            try:\n",
    "                for item in os.listdir(root_dir):\n",
    "                    if any(keyword in item.lower() for keyword in ['voc', 'pascal', '2012']):\n",
    "                        full_path = os.path.join(root_dir, item)\n",
    "                        print(f\"🔍 FOUND: {full_path}\")\n",
    "                        if os.path.isdir(full_path):\n",
    "                            subcontents = os.listdir(full_path)[:5]\n",
    "                            print(f\"   Contents (first 5): {subcontents}\")\n",
    "            except (PermissionError, OSError):\n",
    "                pass"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import necessary libraries for VOC dataset analysis\n",
    "import pandas as pd\n",
    "import xml.etree.ElementTree as ET\n",
    "from PIL import Image, ImageDraw, ImageFont\n",
    "\n",
    "# Use the working path found above, or set manually if needed\n",
    "if 'working_path' in locals() and working_path:\n",
    "    shared_folder_path = working_path\n",
    "else:\n",
    "    # Manual fallback - update this based on the debug output above\n",
    "    shared_folder_path = '/opt/hccs_shared/Share/VOC2012_train_val'  # UPDATE THIS LINE\n",
    "    print(f\"⚠️  Using fallback path: {shared_folder_path}\")\n",
    "    print(\"If the path above doesn't work, run the debug cell and update this line with the correct path.\")\n",
    "\n",
    "# Set up paths\n",
    "voc_root = os.path.join(shared_folder_path, 'VOC2012_train_val')\n",
    "annotations_dir = os.path.join(voc_root, 'Annotations')\n",
    "images_dir = os.path.join(voc_root, 'JPEGImages')\n",
    "\n",
    "print(f\"Using dataset path: {voc_root}\")\n",
    "print(f\"Annotations directory: {annotations_dir}\")\n",
    "print(f\"Images directory: {images_dir}\")\n",
    "\n",
    "# Verify paths exist\n",
    "if os.path.exists(annotations_dir) and os.path.exists(images_dir):\n",
    "    print(\"✅ All paths verified!\")\n",
    "else:\n",
    "    print(\"❌ Path verification failed!\")\n",
    "    print(f\"Annotations exists: {os.path.exists(annotations_dir)}\")\n",
    "    print(f\"Images exists: {os.path.exists(images_dir)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def parse_voc_annotations(annotations_dir):\n",
    "    \"\"\"Parses all XML files in the Annotations directory.\"\"\"\n",
    "    xml_data = []\n",
    "    \n",
    "    if not os.path.exists(annotations_dir):\n",
    "        print(f\"❌ Annotations directory not found: {annotations_dir}\")\n",
    "        return pd.DataFrame()\n",
    "    \n",
    "    xml_files = [f for f in os.listdir(annotations_dir) if f.endswith('.xml')]\n",
    "    print(f\"Found {len(xml_files)} XML annotation files\")\n",
    "    \n",
    "    # Process files with progress indication\n",
    "    for i, xml_file in enumerate(xml_files):\n",
    "        if i % 5000 == 0:  # Progress indicator\n",
    "            print(f\"Processing file {i+1}/{len(xml_files)}\")\n",
    "        \n",
    "        try:\n",
    "            tree = ET.parse(os.path.join(annotations_dir, xml_file))\n",
    "            root = tree.getroot()\n",
    "            \n",
    "            image_name = root.find('filename').text\n",
    "            \n",
    "            # Find every object in the image\n",
    "            for obj in root.findall('object'):\n",
    "                label = obj.find('name').text\n",
    "                bbox = obj.find('bndbox')\n",
    "                \n",
    "                # Get bounding box coordinates (handle float values properly)\n",
    "                xmin = int(float(bbox.find('xmin').text))\n",
    "                ymin = int(float(bbox.find('ymin').text))\n",
    "                xmax = int(float(bbox.find('xmax').text))\n",
    "                ymax = int(float(bbox.find('ymax').text))\n",
    "                \n",
    "                xml_data.append({\n",
    "                    'image_name': image_name,\n",
    "                    'label': label,\n",
    "                    'xmin': xmin,\n",
    "                    'ymin': ymin,\n",
    "                    'xmax': xmax,\n",
    "                    'ymax': ymax\n",
    "                })\n",
    "        except Exception as e:\n",
    "            print(f\"Error processing {xml_file}: {e}\")\n",
    "    \n",
    "    return pd.DataFrame(xml_data)\n",
    "\n",
    "# Parse the annotations\n",
    "print(f\"\\nParsing annotations from: {annotations_dir}\")\n",
    "voc_df = parse_voc_annotations(annotations_dir)\n",
    "\n",
    "if not voc_df.empty:\n",
    "    print(f\"Successfully parsed {len(voc_df)} annotations from {voc_df['image_name'].nunique()} images.\")\n",
    "    print(\"\\nHere are the first 5 entries in the DataFrame:\")\n",
    "    display(voc_df.head())\n",
    "else:\n",
    "    print(\"❌ No annotations were parsed. Check the path configuration above.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def visualize_image(image_name, dataframe):\n",
    "    \"\"\"Draws bounding boxes on a given image.\"\"\"\n",
    "    # Get the full path to the image\n",
    "    img_path = os.path.join(images_dir, image_name)\n",
    "    \n",
    "    if not os.path.exists(img_path):\n",
    "        print(f\"❌ Image not found: {img_path}\")\n",
    "        return\n",
    "    \n",
    "    # Open the image\n",
    "    img = Image.open(img_path).convert(\"RGB\")\n",
    "    draw = ImageDraw.Draw(img)\n",
    "    \n",
    "    # Get all annotations for this specific image\n",
    "    image_annotations = dataframe[dataframe['image_name'] == image_name]\n",
    "    \n",
    "    # Draw a rectangle and label for each object\n",
    "    colors = ['red', 'blue', 'green', 'yellow', 'purple', 'orange', 'cyan', 'magenta']\n",
    "    for i, (_, row) in enumerate(image_annotations.iterrows()):\n",
    "        color = colors[i % len(colors)]\n",
    "        box = [row['xmin'], row['ymin'], row['xmax'], row['ymax']]\n",
    "        draw.rectangle(box, outline=color, width=3)\n",
    "        \n",
    "        # Add text label (adjust position to avoid overlap)\n",
    "        label_text = row['label']\n",
    "        text_y = max(0, row['ymin']-25)\n",
    "        draw.text((row['xmin'], text_y), label_text, fill=color)\n",
    "\n",
    "    print(f\"Displaying image '{image_name}' with {len(image_annotations)} bounding boxes:\")\n",
    "    display(img)\n",
    "    \n",
    "    # Show object details\n",
    "    print(\"Objects detected:\")\n",
    "    for _, row in image_annotations.iterrows():\n",
    "        print(f\"  - {row['label']}: ({row['xmin']}, {row['ymin']}) to ({row['xmax']}, {row['ymax']})\")\n",
    "\n",
    "# Visualize a random image from the dataset\n",
    "if 'voc_df' in locals() and not voc_df.empty:\n",
    "    sample_image = voc_df['image_name'].sample(1).iloc[0]\n",
    "    visualize_image(sample_image, voc_df)\n",
    "    \n",
    "    print(f\"\\n\" + \"=\"*60)\n",
    "    print(\"DATASET STATISTICS\")\n",
    "    print(\"=\"*60)\n",
    "    print(f\"Total annotations: {len(voc_df):,}\")\n",
    "    print(f\"Unique images: {voc_df['image_name'].nunique():,}\")\n",
    "    print(f\"Average annotations per image: {len(voc_df)/voc_df['image_name'].nunique():.1f}\")\n",
    "    \n",
    "    print(f\"\\nTop 10 object classes:\")\n",
    "    class_counts = voc_df['label'].value_counts().head(10)\n",
    "    for class_name, count in class_counts.items():\n",
    "        percentage = (count / len(voc_df)) * 100\n",
    "        print(f\"  {class_name:12}: {count:5,} annotations ({percentage:4.1f}%)\")\n",
    "    \n",
    "    print(f\"\\nImages with most annotations:\")\n",
    "    image_counts = voc_df['image_name'].value_counts().head(5)\n",
    "    for image_name, count in image_counts.items():\n",
    "        print(f\"  {image_name}: {count} objects\")\n",
    "else:\n",
    "    print(\"⚠️ No dataset loaded. Please run the previous cells to load the data.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Interactive Analysis - Explore specific classes or images\n",
    "if 'voc_df' in locals() and not voc_df.empty:\n",
    "    print(\"=== INTERACTIVE ANALYSIS ===\")\n",
    "    \n",
    "    # Show available classes\n",
    "    unique_classes = sorted(voc_df['label'].unique())\n",
    "    print(f\"Available object classes ({len(unique_classes)}): {', '.join(unique_classes)}\")\n",
    "    \n",
    "    # Example: Find images with specific objects\n",
    "    target_class = 'person'  # Change this to any class you want to explore\n",
    "    \n",
    "    class_images = voc_df[voc_df['label'] == target_class]['image_name'].value_counts()\n",
    "    print(f\"\\nImages with '{target_class}' (showing top 5):\")\n",
    "    for image, count in class_images.head(5).items():\n",
    "        print(f\"  {image}: {count} {target_class}(s)\")\n",
    "    \n",
    "    # Visualize an image with the target class\n",
    "    if not class_images.empty:\n",
    "        target_image = class_images.index[0]  # Image with most instances\n",
    "        print(f\"\\nVisualizing image with most '{target_class}' instances:\")\n",
    "        visualize_image(target_image, voc_df)\n",
    "    \n",
    "    print(f\"\\n💡 TIP: To explore different classes, change the 'target_class' variable above and re-run this cell.\")\n",
    "    print(f\"Available classes: {', '.join(unique_classes[:10])}...\")\n",
    "else:\n",
    "    print(\"⚠️ No dataset loaded. Please run the previous cells first.\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}