# Computer Vision VOC Dataset Analysis - Updated Path

**Dataset Location**: `/home/hccsadmin1/Documents/VOC2012_train_val`

This notebook contains the complete computer vision analysis, configured to read the VOC dataset from its current location in the Documents folder (the path shown above).

In [None]:
# Import necessary libraries
import os
import pandas as pd
import xml.etree.ElementTree as ET
from PIL import Image, ImageDraw, ImageFont
import matplotlib.pyplot as plt

# ---------------------------------------------------------------------------
# 1. DATASET LOCATIONS
# ---------------------------------------------------------------------------
# Parent folder that holds the dataset directory.
shared_folder_path = '/home/hccsadmin1/Documents'

# Root of the Pascal VOC 2012 dataset, plus the two sub-folders read later on.
voc_root = os.path.join(shared_folder_path, 'VOC2012_train_val')
annotations_dir, images_dir = (
    os.path.join(voc_root, sub_folder)
    for sub_folder in ('Annotations', 'JPEGImages')
)

# Echo the resolved locations so a wrong path is obvious at a glance.
print("\n".join(
    f"{caption}: {location}"
    for caption, location in (
        ("VOC Dataset Path", voc_root),
        ("Annotations", annotations_dir),
        ("Images", images_dir),
    )
))

# Report whether each location is actually present on disk.
print("\nPath validation:")
print("\n".join(
    f"{caption} exists: {os.path.exists(location)}"
    for caption, location in (
        ("Dataset root", voc_root),
        ("Annotations", annotations_dir),
        ("Images", images_dir),
    )
))

In [None]:
# ---------------------------------------------------------------------------
# 2. PARSE XML ANNOTATIONS AND CREATE A DATAFRAME
# ---------------------------------------------------------------------------
def parse_voc_annotations(annotations_dir):
    """Parse every ``.xml`` file in ``annotations_dir`` into one DataFrame.

    Each ``<object>`` element of each file becomes one row.

    Args:
        annotations_dir: Directory containing the XML annotation files.
            Entries whose name does not end in ``.xml`` are ignored.

    Returns:
        pandas.DataFrame with the columns ``image_name``, ``label``, ``xmin``,
        ``ymin``, ``xmax`` and ``ymax``. The columns are present even when no
        annotation was found, so callers can index them without a KeyError.
        Rows follow the sorted order of the file names.

    Raises:
        FileNotFoundError: If ``annotations_dir`` does not exist.
        xml.etree.ElementTree.ParseError: If a file is not well-formed XML.
        AttributeError: If a file lacks ``filename``, or an object lacks
            ``name``, ``bndbox`` or one of the four coordinates.
        ValueError: If a coordinate is not numeric text.
    """
    columns = ['image_name', 'label', 'xmin', 'ymin', 'xmax', 'ymax']
    records = []

    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order identical from run to run.
    for xml_file in sorted(os.listdir(annotations_dir)):
        if not xml_file.endswith('.xml'):
            continue

        root = ET.parse(os.path.join(annotations_dir, xml_file)).getroot()
        image_name = root.find('filename').text

        # One record per annotated object in this image.
        for obj in root.findall('object'):
            bbox = obj.find('bndbox')
            record = {'image_name': image_name, 'label': obj.find('name').text}
            for corner in ('xmin', 'ymin', 'xmax', 'ymax'):
                # Coordinates may be written as floats; parse as float first,
                # then truncate to int.
                record[corner] = int(float(bbox.find(corner).text))
            records.append(record)

    # Passing the columns explicitly gives an empty result the same schema
    # as a populated one.
    return pd.DataFrame(records, columns=columns)

# Parse the annotations and print a short summary of the result.
print(f"Parsing annotations from: {annotations_dir}")
try:
    # Build the DataFrame of bounding-box annotations.
    voc_df = parse_voc_annotations(annotations_dir)

    if voc_df.empty:
        # Without this guard an empty result was reported as a success and
        # the column lookups below could then fail with a confusing KeyError.
        print(f"⚠️  No annotations were found in '{annotations_dir}'.")
    else:
        print("✅ Successfully parsed all annotations.")
        print(f"Total annotations: {len(voc_df)}")
        print(f"Unique images: {voc_df['image_name'].nunique()}")
        print(f"Unique classes: {voc_df['label'].nunique()}")
        print("\nHere are the first 5 entries in the DataFrame:")
        display(voc_df.head())
except FileNotFoundError:
    print(f"❌ ERROR: Directory not found at '{annotations_dir}'.")
    print("Please check that the dataset path is correct.")
except Exception as e:
    # Anything else (malformed XML, missing elements, ...) is reported
    # rather than raised, so the remaining cells can still run.
    print(f"❌ ERROR: {e}")

print("-" * 50)

In [None]:
# ---------------------------------------------------------------------------
# 3. CLASS DISTRIBUTION
# ---------------------------------------------------------------------------
# Needs the annotation DataFrame `voc_df`; bail out early when it is missing
# or has no rows.
if 'voc_df' not in locals() or voc_df.empty:
    print("No data available for analysis")
else:
    print("=== CLASS DISTRIBUTION ===")
    # Number of annotations per object class, most frequent first.
    class_counts = voc_df['label'].value_counts()
    print(class_counts)

    # Bar chart of the same counts.
    plt.figure(figsize=(12, 8))
    class_counts.plot.bar()
    plt.xlabel('Object Class')
    plt.ylabel('Number of Annotations')
    plt.title('VOC2012 Object Class Distribution')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

# Computer Vision - VOC2012 Dataset Parser

This notebook parses Pascal VOC 2012 annotations and visualizes bounding boxes. It supports both direct path access and access via the symbolic link in the `SharedContent` folder.

In [None]:
# Import necessary libraries
import os
import pandas as pd
import xml.etree.ElementTree as ET
from PIL import Image, ImageDraw, ImageFont

## 1. Define Paths with Multiple Access Options

The notebook tries multiple path configurations to handle different access scenarios:
1. Direct access to `/opt/hccs_shared/Share/VOC2012_train_val`
2. Access through SharedContent symbolic link
3. Local development path

In [None]:
# Candidate locations of the dataset, tried in order; the first usable one wins.
possible_paths = [
    # Direct access (works when running Python directly)
    '/opt/hccs_shared/Share/VOC2012_train_val',

    # Through SharedContent symbolic link (for Jupyter notebook access)
    '/home/hccsadmin1/SharedContent/VOC2012_train_val',
    './SharedContent/VOC2012_train_val',
    '../SharedContent/VOC2012_train_val',

    # Local development paths
    './VOC2012_train_val',
    '../VOC2012_train_val'
]


def resolve_voc_root(candidate):
    """Return the directory under ``candidate`` that holds ``Annotations``.

    Two layouts are accepted:

    * nested - ``<candidate>/VOC2012_train_val/Annotations``
    * flat   - ``<candidate>/Annotations``

    The nested layout is checked first so that any setup that resolved
    before still resolves to the same directory.

    Args:
        candidate: Path to test.

    Returns:
        The path of the directory that contains ``Annotations``, or ``None``
        when neither layout is present.
    """
    for root in (os.path.join(candidate, 'VOC2012_train_val'), candidate):
        if os.path.isdir(os.path.join(root, 'Annotations')):
            return root
    return None


# Find the correct path.
# Every candidate above already ends in 'VOC2012_train_val', so appending that
# folder name again unconditionally pointed at a directory that does not exist
# in the flat layout. Resolve the real root instead.
shared_folder_path = None
for path in possible_paths:
    resolved_root = resolve_voc_root(path)
    if resolved_root is not None:
        shared_folder_path = path
        print(f"✅ Found dataset at: {shared_folder_path}")
        break

if shared_folder_path is None:
    print("❌ Dataset not found at any of the expected locations:")
    for path in possible_paths:
        print(f"  - {path}")
    print("\nPlease check your dataset location and update the paths above.")
else:
    # Dataset root and the two sub-folders used by the following cells.
    voc_root = resolved_root
    annotations_dir = os.path.join(voc_root, 'Annotations')
    images_dir = os.path.join(voc_root, 'JPEGImages')

    print(f"📁 Annotations directory: {annotations_dir}")
    print(f"🖼️  Images directory: {images_dir}")
    print(f"📊 Directory exists: {os.path.exists(annotations_dir)}")

## 2. Parse XML Annotations and Create DataFrame

This function parses all XML files in the Annotations directory and extracts bounding box information.

In [None]:
def parse_voc_annotations(annotations_dir):
    """Parse every ``.xml`` file in ``annotations_dir`` into one DataFrame.

    Each ``<object>`` element becomes one row. A file that cannot be parsed is
    reported and skipped; rows already collected from it before the failure
    are kept.

    Args:
        annotations_dir: Directory containing the XML annotation files.

    Returns:
        pandas.DataFrame with the columns ``image_name``, ``label``, ``xmin``,
        ``ymin``, ``xmax`` and ``ymax``. The frame is empty, but keeps those
        columns, when the directory is missing, is not a directory, or yields
        no annotations. Rows follow the sorted order of the file names.
    """
    columns = ['image_name', 'label', 'xmin', 'ymin', 'xmax', 'ymax']

    # isdir (not exists): a path that points at a regular file would pass an
    # existence check and then make os.listdir raise.
    if not os.path.isdir(annotations_dir):
        print(f"❌ Annotations directory not found: {annotations_dir}")
        return pd.DataFrame(columns=columns)

    xml_data = []
    # Sorted so the row order does not depend on the arbitrary order in which
    # os.listdir returns entries.
    xml_files = sorted(f for f in os.listdir(annotations_dir) if f.endswith('.xml'))

    print(f"📝 Found {len(xml_files)} XML annotation files")

    for xml_file in xml_files:
        try:
            root = ET.parse(os.path.join(annotations_dir, xml_file)).getroot()
            image_name = root.find('filename').text

            # One record per annotated object in this image.
            for obj in root.findall('object'):
                bbox = obj.find('bndbox')
                record = {'image_name': image_name, 'label': obj.find('name').text}
                for corner in ('xmin', 'ymin', 'xmax', 'ymax'):
                    # Coordinates may be written as floats; parse as float
                    # first, then truncate to int.
                    record[corner] = int(float(bbox.find(corner).text))
                xml_data.append(record)
        except Exception as e:
            # Best effort: one bad file must not abort the whole run.
            print(f"⚠️  Error parsing {xml_file}: {e}")
            continue

    print(f"✅ Successfully parsed {len(xml_data)} annotations")
    # Explicit columns give an empty result the same schema as a populated one.
    return pd.DataFrame(xml_data, columns=columns)

In [None]:
# Run the parser only when the path lookup succeeded and the annotations
# folder is present; otherwise explain why nothing happens.
if not (shared_folder_path and os.path.exists(annotations_dir)):
    print("❌ Cannot proceed without valid dataset path")
else:
    print(f"🔍 Parsing annotations from: {annotations_dir}")
    voc_df = parse_voc_annotations(annotations_dir)

    if voc_df.empty:
        print("❌ No annotations were parsed successfully")
    else:
        # Preview of the parsed rows.
        print("✅ Successfully parsed all annotations.")
        print("📊 Here are the first 5 entries in the DataFrame:")
        display(voc_df.head())

        # Headline counts for the whole dataset.
        print("\n📈 Dataset Statistics:")
        print(f"  - Total annotations: {len(voc_df)}")
        print(f"  - Unique images: {voc_df['image_name'].nunique()}")
        print(f"  - Unique labels: {voc_df['label'].nunique()}")
        print(f"  - Labels: {sorted(voc_df['label'].unique())}")

## 3. Visualize Sample Image with Bounding Boxes

This function draws bounding boxes on a sample image to visualize the annotations.

In [None]:
def visualize_image(image_name, dataframe, images_dir):
    """Show one image with all of its annotated bounding boxes drawn on it.

    Args:
        image_name: File name of the image, as stored in the ``image_name``
            column of ``dataframe``.
        dataframe: Annotation table with the columns ``image_name``,
            ``label``, ``xmin``, ``ymin``, ``xmax`` and ``ymax``.
        images_dir: Directory that contains the image file.

    Returns:
        None. The annotated image is shown with ``display``. A missing image
        file, or any error while drawing, is reported with a printed message
        instead of being raised.
    """
    palette = ['red', 'blue', 'green', 'yellow', 'orange', 'purple', 'cyan']

    image_file = os.path.join(images_dir, image_name)
    if not os.path.exists(image_file):
        print(f"❌ Image not found: {image_file}")
        return

    try:
        canvas = Image.open(image_file).convert("RGB")
        pen = ImageDraw.Draw(canvas)

        # Only the rows that belong to this image.
        boxes = dataframe[dataframe['image_name'] == image_name]

        print(f"🖼️  Image: {image_name}")
        print(f"📦 Found {len(boxes)} bounding boxes")

        for position, (_, record) in enumerate(boxes.iterrows()):
            left, top = record['xmin'], record['ymin']
            right, bottom = record['xmax'], record['ymax']

            # Cycle through the palette so neighbouring boxes differ in colour.
            shade = palette[position % len(palette)]
            pen.rectangle([left, top, right, bottom], outline=shade, width=3)

            # Label in white on a filled patch of the box colour, anchored at
            # the box's top-left corner.
            caption = record['label']
            anchor = (left, top)
            pen.rectangle(pen.textbbox(anchor, caption), fill=shade)
            pen.text(anchor, caption, fill="white")

            print(f"  - {caption}: ({left}, {top}) to ({right}, {bottom})")

        print(f"🎨 Displaying image '{image_name}' with bounding boxes:")
        display(canvas)

    except Exception as e:
        print(f"❌ Error visualizing image {image_name}: {e}")

In [None]:
# Draw the boxes for one randomly chosen annotated image, provided the
# annotations were parsed and a dataset path was found.
if not ('voc_df' in locals() and not voc_df.empty and shared_folder_path):
    print("⚠️  No data available for visualization")
else:
    try:
        # One random row's image name; every box of that image is drawn.
        sample_image = voc_df['image_name'].sample(n=1).iloc[0]
        visualize_image(sample_image, voc_df, images_dir)
    except Exception as e:
        print(f"❌ Error during visualization: {e}")

## 4. Dataset Analysis

Let's analyze the dataset to understand the distribution of object classes and annotations.

In [None]:
# Summary statistics over the parsed annotations.
if 'voc_df' not in locals() or voc_df.empty:
    print("⚠️  No data available for analysis")
else:
    print("📊 Dataset Analysis:")
    print("=" * 50)

    # Ten most frequent object classes.
    class_counts = voc_df['label'].value_counts()
    print("\n🏷️  Object Class Distribution:")
    for label, count in class_counts.head(10).items():
        print(f"  {label}: {count} annotations")

    # Five images carrying the most annotated objects.
    image_counts = voc_df['image_name'].value_counts()
    print("\n🖼️  Images with Most Annotations:")
    for image, count in image_counts.head(5).items():
        print(f"  {image}: {count} objects")

    # Box dimensions are stored as new columns on voc_df itself.
    voc_df['width'] = voc_df['xmax'].sub(voc_df['xmin'])
    voc_df['height'] = voc_df['ymax'].sub(voc_df['ymin'])
    voc_df['area'] = voc_df['width'].mul(voc_df['height'])

    print("\n📏 Bounding Box Statistics:")
    print("\n".join([
        f"  Average width: {voc_df['width'].mean():.1f} pixels",
        f"  Average height: {voc_df['height'].mean():.1f} pixels",
        f"  Average area: {voc_df['area'].mean():.1f} square pixels",
    ]))

    display(voc_df[['label', 'width', 'height', 'area']].describe())

## 5. Troubleshooting

If you're having issues accessing the dataset, try these steps:

1. **Check if the symbolic link works:**
   ```bash
   ls -la ~/SharedContent/
   ```

2. **Check if you can access the dataset directly:**
   ```bash
   ls /opt/hccs_shared/Share/VOC2012_train_val/
   ```

3. **Check Jupyter permissions:**
   ```bash
   whoami
   groups
   ```

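4. **Alternative: Copy the dataset to an accessible location:**
   ```bash
   cp -r /opt/hccs_shared/Share/VOC2012_train_val ~/VOC2012_train_val
   ```
   After copying, add the copy's full path to `possible_paths` in section 1. The list does not include `~/VOC2012_train_val`, and the relative entry `./VOC2012_train_val` only matches when the notebook's working directory is your home directory.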