# Day 1: Pandas for Vision Datasets
## CV Bootcamp 2024

Pandas is essential for managing and organizing image datasets, labels, and metadata.

## Why Pandas for Computer Vision?

- **Dataset Management:** Organize thousands of image files
- **Metadata Handling:** Track labels, paths, splits, dimensions
- **Data Analysis:** Explore class distributions, image properties
- **Data Cleaning:** Handle missing data, duplicates
- **Train/Val/Test Splits:** Stratified splitting, maintaining balance

In [None]:
import pandas as pd
import numpy as np

# Report the installed pandas version so results can be matched to a release.
print(f"Pandas version: {pd.__version__}")

## 1. Creating DataFrames for Vision Datasets

In [None]:
# Sample dataset, written one image per row:
# (filename, label, width, height, split)
_records = [
    ('img_001.jpg', 'cat', 640, 480, 'train'),
    ('img_002.jpg', 'dog', 480, 480, 'train'),
    ('img_003.jpg', 'cat', 800, 600, 'val'),
    ('img_004.jpg', 'bird', 1024, 768, 'test'),
    ('img_005.jpg', 'dog', 640, 480, 'train'),
]
_fields = ('filename', 'label', 'width', 'height', 'split')

# Transpose the rows into the column-oriented dict that pd.DataFrame expects.
data = {field: list(column) for field, column in zip(_fields, zip(*_records))}

df = pd.DataFrame(data)
print("Dataset DataFrame:", df, sep="\n")

## 2. Basic DataFrame Operations

In [None]:
# head() and tail() both return 5 rows unless told otherwise.
head_rows = df.head()
tail_rows = df.tail()

print("First 5 rows:", head_rows, sep="\n")
print("\nLast 5 rows:", tail_rows, sep="\n")

# shape is a (rows, columns) tuple
print(f"\nShape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")

In [None]:
# Show the dtype pandas inferred for each column.
print("Data types:")
print(df.dtypes)

print("\nDataset info:")
# DataFrame.info() writes its report straight to stdout and returns None,
# so it must not be wrapped in print() -- that would add a stray "None" line.
df.info()

In [None]:
# describe() summarises the numeric columns (width and height in this table).
numeric_summary = df.describe()
print("Statistical summary:", numeric_summary, sep="\n")

## 3. Selecting Data

In [None]:
# Picking one column by label yields a one-dimensional Series.
labels = df.loc[:, 'label']
print("Labels:", labels, sep="\n")
print(f"Type: {type(labels)}")

In [None]:
# Indexing with a list of column names keeps the result two-dimensional (a DataFrame).
selected_columns = ['filename', 'label', 'width', 'height']
subset = df[selected_columns]
print("Subset:", subset, sep="\n")
print(f"Type: {type(subset)}")

## 4. Filtering Rows

In [None]:
# Build a larger synthetic dataset; the fixed seed makes it reproducible.
np.random.seed(42)
n_samples = 100
class_names = ['cat', 'dog', 'bird']

# The random draws happen in the order label -> width -> height.
# Keeping that order keeps the seeded values unchanged.
random_labels = np.random.choice(class_names, n_samples)
random_widths = np.random.randint(400, 1200, n_samples)   # upper bound is exclusive
random_heights = np.random.randint(300, 900, n_samples)   # upper bound is exclusive

large_df = pd.DataFrame({
    'filename': [f'img_{i:04d}.jpg' for i in range(n_samples)],
    'label': random_labels,
    'width': random_widths,
    'height': random_heights,
})

print(f"Created dataset with {len(large_df)} images")
print(large_df.head())

In [None]:
# Boolean mask: True for every row labelled 'cat'.
is_cat = large_df['label'].eq('cat')
cats = large_df[is_cat]
print(f"Found {len(cats)} cat images")
print(cats.head())

In [None]:
# Keep only rows whose width is strictly greater than 800.
is_wide = large_df['width'].gt(800)
large_images = large_df[is_wide]
print(f"\nFound {len(large_images)} large images (width > 800)")
print(large_images.head())

In [None]:
# Combine two masks element-wise with & (Python's `and` does not work on Series).
is_dog = large_df['label'] == 'dog'
is_wide_image = large_df['width'] > 800
large_dogs = large_df[is_dog & is_wide_image]
print(f"\nFound {len(large_dogs)} large dog images")
print(large_dogs.head())

In [None]:
# isin() tests membership in a collection of allowed values.
wanted_labels = ['dog', 'cat']
has_wanted_label = large_df['label'].isin(wanted_labels)
dogs_or_cats = large_df[has_wanted_label]
print(f"\nFound {len(dogs_or_cats)} dog or cat images")

## 5. Groupby and Aggregation

In [None]:
# value_counts() tallies how often each label occurs, most frequent first.
label_column = large_df['label']
class_counts = label_column.value_counts()
print("Class distribution:", class_counts, sep="\n")

In [None]:
# Per-class summary of the image dimensions.
size_columns = ['width', 'height']
summary_functions = ['mean', 'std', 'min', 'max']
stats = large_df.groupby('label')[size_columns].agg(summary_functions)
print("\nImage size statistics by class:")
print(stats)

In [None]:
# size() returns the number of rows in each group.
images_by_label = large_df.groupby('label')
counts = images_by_label.size()
print("\nImages per class:")
print(counts)

## 6. Data Cleaning

In [None]:
# Toy table with one gap in each column (None marks a missing entry).
columns_with_gaps = {
    'filename': ['img1.jpg', 'img2.jpg', None, 'img4.jpg', 'img5.jpg'],
    'label': ['cat', None, 'dog', 'bird', 'cat'],
    'width': [640, 480, 800, None, 640],
}
df_with_missing = pd.DataFrame(columns_with_gaps)

print("Dataset with missing values:", df_with_missing, sep="\n")

# isna() flags each missing cell; summing the flags counts them per column.
missing_per_column = df_with_missing.isna().sum()
print("\nMissing values per column:")
print(missing_per_column)

In [None]:
# Discard every row that has at least one missing value.
df_clean = df_with_missing.dropna(axis='index', how='any')
print("After dropping rows with missing values:", df_clean, sep="\n")

In [None]:
# Only a missing filename disqualifies a row; gaps in other columns are kept.
required_columns = ['filename']
df_clean_filename = df_with_missing.dropna(axis='index', subset=required_columns)
print("After dropping rows with missing filename:", df_clean_filename, sep="\n")

In [None]:
# Per-column replacement values; columns not listed here are left untouched.
fill_defaults = {'label': 'unknown', 'width': 640}
df_filled = df_with_missing.fillna(value=fill_defaults)
print("After filling missing values:", df_filled, sep="\n")

In [None]:
# Toy table in which 'img1.jpg' is listed twice.
rows_with_repeat = {
    'filename': ['img1.jpg', 'img2.jpg', 'img1.jpg', 'img3.jpg'],
    'label': ['cat', 'dog', 'cat', 'bird'],
}
df_with_dupes = pd.DataFrame(rows_with_repeat)

print("With duplicates:", df_with_dupes, sep="\n")

# Rows count as duplicates when their filename matches; the first occurrence is kept.
df_unique = df_with_dupes.drop_duplicates(subset=['filename'], keep='first')
print("\nAfter removing duplicates:")
print(df_unique)

## 7. Dataset Splitting (Train/Val/Test)

In [None]:
from sklearn.model_selection import train_test_split

# Step 1: hold out 20% of all images as the test set.
# stratify keeps the class proportions the same on both sides of the split.
trainval_df, test_df = train_test_split(
    large_df,
    test_size=0.2,
    random_state=42,  # fixed seed -> reproducible split
    stratify=large_df['label'],
)

# Step 2: carve the validation set out of the remainder.
# 20% of the remaining 80% is 16% of the full dataset, leaving 64% for training.
train_df, val_df = train_test_split(
    trainval_df,
    test_size=0.2,
    random_state=42,
    stratify=trainval_df['label'],
)

total_images = len(large_df)
print(f"Train: {len(train_df)} ({len(train_df)/total_images*100:.1f}%)")
print(f"Val: {len(val_df)} ({len(val_df)/total_images*100:.1f}%)")
print(f"Test: {len(test_df)} ({len(test_df)/total_images*100:.1f}%)")

In [None]:
# The split frames keep the original row index, so index membership
# identifies which split each row of large_df belongs to.
in_val = large_df.index.isin(val_df.index)
in_test = large_df.index.isin(test_df.index)

# Start with everything as 'train', then overwrite the val and test rows.
large_df['split'] = 'train'
large_df.loc[in_val, 'split'] = 'val'
large_df.loc[in_test, 'split'] = 'test'

print("Dataset with split column:", large_df.head(10), sep="\n")

In [None]:
# Check that stratification worked: one row per split, one column per label.
print("Class distribution per split:")
pair_counts = large_df.groupby(['split', 'label']).size()
split_dist = pair_counts.unstack(fill_value=0)
print(split_dist)

# Divide each row by its own total so every split sums to 100%.
images_per_split = split_dist.sum(axis=1)
percentage_dist = split_dist.div(images_per_split, axis=0) * 100
print("\nPercentage distribution:")
print(percentage_dist)

## 8. Merging DataFrames

In [None]:
# Two tables that describe the same four images and share the 'image_id' key.
image_table = {
    'image_id': [1, 2, 3, 4],
    'filepath': ['img1.jpg', 'img2.jpg', 'img3.jpg', 'img4.jpg'],
}
label_table = {
    'image_id': [1, 2, 3, 4],
    'label': ['cat', 'dog', 'cat', 'bird'],
}

df_images = pd.DataFrame(image_table)
df_labels = pd.DataFrame(label_table)

print("Images DataFrame:", df_images, sep="\n")
print("\nLabels DataFrame:", df_labels, sep="\n")

In [None]:
# Inner join on the shared key, like a SQL JOIN: only ids present in both tables survive.
df_merged = df_images.merge(df_labels, how='inner', on='image_id')
print("\nMerged DataFrame:")
print(df_merged)

## 9. Saving and Loading

In [None]:
# The I/O calls in this cell are intentionally left commented out so that
# running it does not create files; uncomment the ones you need.

# Save to CSV (most common); index=False leaves the row index out of the file
# large_df.to_csv('dataset.csv', index=False)
print("Dataframe ready to save with: df.to_csv('dataset.csv', index=False)")

# Load the CSV back into a DataFrame
# loaded_df = pd.read_csv('dataset.csv')

# Save to JSON; orient='records' writes a list with one object per row
# large_df.to_json('dataset.json', orient='records')

# Save to Excel
# NOTE(review): to_excel presumably needs an Excel writer package such as
# openpyxl to be installed -- confirm in the target environment.
# large_df.to_excel('dataset.xlsx', index=False)

print("DataFrame ready for export!")

## Summary

You've learned:
- ✓ Creating DataFrames for vision datasets
- ✓ Selecting and filtering data
- ✓ Groupby and aggregation for dataset analysis
- ✓ Handling missing data and duplicates
- ✓ Stratified train/val/test splitting
- ✓ Merging datasets
- ✓ Saving and loading datasets

**Key Takeaway:** Pandas makes dataset management organized and reproducible!