# Visualization

This notebook visualizes the PatchCamelyon (PCam) dataset [1], retrieved from the [Kaggle Histopathologic Cancer Detection competition](https://www.kaggle.com/c/histopathologic-cancer-detection/).

**References:**

[1] B. S. Veeling, J. Linmans, J. Winkens, T. Cohen, M. Welling. "Rotation Equivariant CNNs for Digital Pathology". arXiv:1806.03962

In [1]:
import os

from google.colab import drive
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image
import seaborn as sns
import torch
from torch.utils import data
from torch.utils.data.dataset import Dataset
from torchvision import datasets, transforms

In [2]:
# Root folder of the PCam data on the mounted Google Drive. Later cells chdir
# into it and build the image-directory and label-CSV paths from it.
DATASET_PATH = '/content/gdrive/My Drive/CS184A/Dataset/'

In [3]:
# Mount Google Drive in the Colab runtime. DATASET_PATH lives under this mount
# point, so the mount has to happen before the chdir below.
drive.mount('/content/gdrive')
# Make the dataset folder the working directory for the rest of the notebook.
os.chdir(DATASET_PATH)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [4]:
class PCam(Dataset):
    """The Patch Camelyon (PCam) dataset [1].

    Retrieved from https://www.kaggle.com/c/histopathologic-cancer-detection/.

    Each item is an ``(image, label)`` pair. The image is read from
    ``<image_dir>/<id>.tif``, where ``<id>`` is the first column of the labels
    CSV; the label is taken from the second column of the same row.

    [1] B. S. Veeling, J. Linmans, J. Winkens, T. Cohen, M. Welling. "Rotation
        Equivariant CNNs for Digital Pathology". arXiv:1806.03962
    """

    def __init__(self, image_dir, csv_path, transform=None):
        """Load the label table and remember where the images live.

        Args:
            image_dir: Folder with image data in file system.
            csv_path: CSV file with image labels (first column: image id,
                second column: label).
            transform: Optional callable applied to each loaded image before
                it is returned (e.g. a torchvision transform). ``None`` leaves
                the image as a PIL image.
        """
        self.labels_df = pd.read_csv(csv_path)
        self.image_dir = image_dir
        self.transform = transform
        # One-off preview of the label table so the notebook shows what was
        # loaded; this runs once per dataset, not once per item.
        print(self.labels_df.head(5))

    def __len__(self):
        """Get the size of the PCam dataset."""
        return len(self.labels_df)

    def __getitem__(self, idx):
        """Get the (image, label) at a given index in the PCam dataset.

        Args:
            idx: Integer row position in the label table, or a zero-dimensional
                tensor holding that position.

        Returns:
            A tuple ``(image, label)``. ``image`` is the result of
            ``self.transform`` applied to the loaded image, or the PIL image
            itself when no transform was given. ``label`` is an integer NumPy
            array of shape ``(1,)``.
        """
        if torch.is_tensor(idx):
            # Tensors expose ``tolist()``; ``to_list()`` does not exist and
            # raised AttributeError for every tensor index.
            idx = idx.tolist()

        image_id = self.labels_df.iloc[idx, 0]
        image_path = os.path.join(self.image_dir, image_id + '.tif')
        # Copy the pixel data while the file is open so the file handle is
        # released as soon as the item has been read.
        with Image.open(image_path) as image_file:
            image = image_file.copy()

        label = np.array([self.labels_df.iloc[idx, 1]]).astype('int')

        # The transform belongs to the image, not the label: image transforms
        # such as ToTensor cannot process a 1-D label array, and the image
        # would otherwise be returned untransformed.
        if self.transform is not None:
            image = self.transform(image)
        return (image, label)

In [5]:
# Locations of the training images and their label table under the dataset root.
image_dir = os.path.join(DATASET_PATH, 'train')
csv_path = os.path.join(DATASET_PATH, 'train_labels.csv')

# Wrap the training data in a PCam dataset with a ToTensor-only pipeline, then
# serve it in batches of four from the main process (no worker subprocesses).
pcam_dataset = PCam(
    image_dir=image_dir,
    csv_path=csv_path,
    transform=transforms.Compose([transforms.ToTensor()]),
)
train_loader = data.DataLoader(
    dataset=pcam_dataset,
    batch_size=4,
    num_workers=0,
)

                                         id  label
0  f38a6374c348f90b587e046aac6079959adf3835      0
1  c18f2d887b7ae4f6742ee445113fa1aef383ed77      1
2  755db6279dae599ebb4d39a9123cce439965282d      0
3  bc3f0c64fb968ff4a8bd33af6971ecae77c75e08      0
4  068aba587a4950175d04c680d38943fd488d6a9d      0
