<a href="https://colab.research.google.com/github/mgfrantz/CodingNomads-Intro-To-Deep-Learning-Labs/blob/master/Torch_Data_API_workset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
import torch
from torch import nn
import os
from sklearn.preprocessing import StandardScaler
from multiprocessing import cpu_count

In [None]:
# Load the train and test CSV files into memory as DataFrames.
# These datasets come pre-packaged with Colab, so it's best to run this lesson there.
housing = pd.read_csv('sample_data/california_housing_train.csv')
housing_test = pd.read_csv('sample_data/california_housing_test.csv')

In [None]:
housing.head()

In [None]:
housing.agg(['mean','std'])

In [None]:
# Features are every column except the target; the target is the
# `median_house_value` column, extracted as an array of values.
x_train = housing.drop('median_house_value', axis=1)
y_train = housing.median_house_value.values

# Apply the same split to the test file, which this notebook uses for validation.
x_valid = housing_test.drop('median_house_value', axis=1)
y_valid = housing_test.median_house_value.values

In [None]:
# Fit the scaler on the training features only, then reuse the fitted scaler
# to transform the validation features (it is never re-fit on validation data).
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_valid_scaled = scaler.transform(x_valid)

In [None]:
# Sanity-check the scaling of the training features, column by column.
# Comparing against a scalar broadcasts it across every column.
# Each column mean should be close to 0 ...
assert np.allclose(x_train_scaled.mean(axis=0), 0.0)
# ... and each column standard deviation should be close to 1.
assert np.allclose(x_train_scaled.std(axis=0), 1.0)

In [None]:
class HousingDataset(Dataset):
    """Dataset pairing each feature row in `X` with its target in `y`.

    Args:
        X: Indexable, sized collection of feature rows (e.g. a 2-D array).
        y: Indexable, sized collection of targets, one per row of `X`.

    Raises:
        ValueError: If `X` and `y` do not contain the same number of items.
    """

    def __init__(self, X, y):
        super().__init__()
        # Fail fast: a length mismatch would otherwise only surface later as
        # an IndexError part-way through iteration, or as rows of X that are
        # silently never served because __len__ is based on y.
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}"
            )
        self.X = X
        self.y = y

    def __len__(self):
        """Return the number of `(features, target)` pairs."""
        # What are some other ways we could do this?
        return len(self.y)

    def __getitem__(self, idx):
        """Return the `(features, target)` pair stored at position `idx`."""
        return self.X[idx], self.y[idx]

In [None]:
# Wrap the scaled features and their targets in datasets:
# one for training, one for validation.
train_ds = HousingDataset(x_train_scaled, y_train)
valid_ds = HousingDataset(x_valid_scaled, y_valid)

In [None]:
# What are the number of items in each dataset?
len(train_ds), len(valid_ds)

In [None]:
# What is the x and y at a given index?
idx = 8

In [None]:
x, y = train_ds[idx]
x, y

## Exercise 4.1: Perform some sanity checks

Use `assert` to perform the following sanity checks. 
For a random `(x, y)` item from `train_ds` at a given index...

* check that the `x` value is the same as the corresponding value from `x_train_scaled` at the same index
* check that the `y` value is the same as the corresponding value from `y_train` at the same index

In [None]:
# Choose an index in your dataset
idx = ...
# Fetch an item at that index from train_ds
x, y = ...
# check that the x value is the same as the corresponding value from x_train_scaled at the same index
assert ...
# check that the y value is the same as the corresponding value from y_train at the same index
assert ...

In [None]:
# Number of items placed in each batch.
BATCH_SIZE = 64
# Number of data-loading workers: one per CPU reported by `cpu_count()`.
N_WORKERS = cpu_count()
print(f"""
In this example, each batch will contain {BATCH_SIZE} items.
We will use {N_WORKERS} workers to load data more efficiently.
""")

In [None]:
# DataLoader that serves `train_ds` in batches of BATCH_SIZE items.
train_dl = DataLoader(
    train_ds, 
    batch_size=BATCH_SIZE, 
    shuffle=True, # We generally want to shuffle the train dataloader
    num_workers=N_WORKERS # Use N_WORKERS workers to load the data
)

## Exercise 4.2: Create the `valid_dl`

Create the validation dataloader below.
This can be mostly the same as the `train_dl`, but `shuffle` should be set to `False`.
Think critically about how you're going to evaluate your model.
Why is setting `shuffle=False` in the validation dataloader a good idea?

In [None]:
valid_dl = ...

In [None]:
# Pull one batch of data: start iterating and stop immediately, which leaves
# the first batch bound to `batch`.
for batch in train_dl:
    break

In [None]:
# What's the type? The length?
type(batch), len(batch)

In [None]:
# This looks like our X
batch[0].shape

In [None]:
# This looks like our y
batch[1].shape

In [None]:
# Same trick as before, but unpack the first batch directly into its
# two parts (`x_batch` and `y_batch`) before stopping.
for x_batch, y_batch in train_dl:
    break

In [None]:
x_batch.shape, y_batch.shape

In [None]:
x_batch

In [None]:
y_batch

## Exercise 4.3: Exploring the `DataLoader`

Explore the `train_dl` object to answer the following questions.

1. How many batches are there in `train_dl`? How many items?
1. Are all the batches the same size? What's the minimum and maximum batch sizes? If there is a difference here, why might that be occurring?
1. Can we index into `train_dl`? Does this behavior make sense? Why?

In [None]:
# Your code here

---

In [None]:
# We're just using fastai for the datasets for now.
# We'll learn how to use it for modeling later on.
!pip install -Uqq fastai

In [None]:
# Download and extract the data
from fastai.data.all import URLs, untar_data
from fastcore.basics import Path
from PIL import Image
import numpy as np

path = untar_data(URLs.CIFAR)

## Exercise 4.4: Exploring data on disk

The variable `path` refers to where our raw data is.
The directory structure is listed above.
In this exercise, we'll explore what's in that directory.
Using `path`, answer the following questions:

1. What files or directories are in `path`?
1. What is contained in `path/'train'`?
1. Find the paths for 10 images of airplanes from the train dataset.
1. If we had to label the image from its path, how might we do that?

In [None]:
# Your code here

In [None]:
def list_png_files(path):
    """Return every `.png` file under `path`, searched recursively, in sorted order.

    The result is sorted because globbing makes no promise about the order in
    which files are produced. Without a stable order, positional access (for
    example "the item at index 8000") could refer to a different file on
    another machine or filesystem.

    Args:
        path: A `Path`-like object with a `glob` method; the directory to search.

    Returns:
        A sorted list of paths matching `**/*.png` below `path`.
    """
    return sorted(path.glob('**/*.png'))

In [None]:
sample_files = list_png_files(path/'train')[:10]
sample_files

In [None]:
def label_from_path_parent(path: "Path | str") -> str:
    """Return the name of the directory that directly contains `path`.

    For a layout such as `.../train/horse/42500_horse.png` this yields
    `'horse'`, i.e. the class label encoded in the folder name.

    Args:
        path: Location of the file, as a `Path` or a plain string.

    Returns:
        The final component of the file's parent directory.
    """
    # Coerce to Path so that plain string paths are accepted as well.
    return Path(path).parent.name

In [None]:
# Sanity check for label_from_path_parent
assert label_from_path_parent(Path('/root/.fastai/data/cifar10/train/horse/42500_horse.png')) == 'horse'

In [None]:
def load_image_and_label(path):
    """Load the image stored at `path` together with its label.

    Args:
        path: Location of the image file; it is also passed to
            `label_from_path_parent` to derive the label.

    Returns:
        A `(img, label)` tuple: the decoded PIL image and the label returned
        by `label_from_path_parent(path)`.
    """
    # `Image.open` is lazy: it reads only the header and keeps the file open
    # until the pixel data is needed. Decode right away and let the context
    # manager close the file, so that loading many images (e.g. from a
    # dataset) does not accumulate open file handles.
    with Image.open(path) as img:
        img.load()
    label = label_from_path_parent(path)
    return img, label

In [None]:
img, label = load_image_and_label(sample_files[0])
print(label)
img

In [None]:
# Let's resize this image and inspect what it looks like
img.resize((224,224))

## Exercise 4.5: The `CifarDataset`

Use the `list_png_files` and `load_image_and_label` functions to complete the `CifarDataset`.

In [None]:
class CifarDataset(Dataset):
    """Exercise 4.5 skeleton for a dataset of CIFAR images stored on disk.

    The `...` placeholders are intentionally left for the reader to fill in.
    Until they are replaced, the class cannot be indexed or measured. Later
    cells unpack `train_cifar[idx]` as `(img, label)`, so `__getitem__` is
    expected to return such a pair.
    """
    def __init__(self, path):
        # Directory the dataset is built from (e.g. `path/'train'`).
        self.path = path
        # TODO(exercise): collect the image files found under `path`.
        self.files = ...
    
    def __len__(self):
        # TODO(exercise): return the number of items in the dataset.
        return ...

    def __getitem__(self, idx):
        # TODO(exercise): return the `(img, label)` pair for position `idx`.
        return ...

In [None]:
train_cifar = CifarDataset(path/'train')

In [None]:
img, label = train_cifar[8000]
print(label)
img.resize((224, 224))

In [None]:
def img_to_scaled_tensor(img, channels_first=True):
    """Convert an image into a float tensor with every value divided by 255.

    Args:
        img: Anything `np.array` accepts, e.g. a PIL image or a NumPy array.
        channels_first: When True (the default), the three axes are reordered
            with `permute(2, 0, 1)`, moving the last axis to the front
            (height-width-channel becomes channel-height-width). When False,
            the axis order of the input is kept.

    Returns:
        A `torch.float32` tensor holding the scaled values.
    """
    scaled = np.array(img) / 255
    tensor = torch.tensor(scaled).float()
    return tensor.permute(2, 0, 1) if channels_first else tensor

In [None]:
img_t = img_to_scaled_tensor(img)
img_t.shape

With regard to our labels, they're still in string format, which is not something our loss functions understand.
Let's convert these to label indices.

In [None]:
# Map each class name (a sub-directory of `train`) to an integer label index.
# The listing is sorted first so that the name -> index mapping is identical
# on every run and every machine; relying on the raw directory listing order
# could assign different indices to the same class.
classes = {d.name: i for i, d in enumerate(sorted((path/'train').ls()))}
classes

In [None]:
def class_to_idx(class_name):
    """Look up the integer index assigned to `class_name` in `classes`.

    Args:
        class_name: Name of the class, as used for the keys of `classes`.

    Returns:
        The index stored for `class_name`, or `None` when the name is not a
        key of `classes` (no exception is raised for unknown names).
    """
    try:
        return classes[class_name]
    except KeyError:
        return None

In [None]:
def collate_fn(batch):
    """Turn a sequence of `(image, class_name)` items into a pair of batch tensors.

    Args:
        batch: Sequence of `(image, class_name)` pairs.

    Returns:
        A `(xs, ys)` tuple. `xs` holds the images, each converted with
        `img_to_scaled_tensor` and joined along a new leading batch axis.
        `ys` is a tensor of the label indices given by `class_to_idx`.
    """
    # Transpose the list of pairs into one tuple of images and one of labels.
    images, labels = zip(*batch)
    # Stacking adds the leading batch axis; all images must share one shape.
    xs = torch.stack([img_to_scaled_tensor(image) for image in images])
    ys = torch.tensor([class_to_idx(label) for label in labels])
    return xs, ys

In [None]:
# Test the collate function
items = (train_cifar[0], train_cifar[1])
items

In [None]:
x_b, y_b = collate_fn(items)

In [None]:
x_b.shape, y_b.shape

In [None]:
# DataLoader over the CIFAR training dataset. Its items are (image, label-name)
# pairs rather than tensors, so `collate_fn` is supplied to convert each list
# of items into a pair of batch tensors.
train_cifar_dl = DataLoader(
    train_cifar, 
    batch_size=BATCH_SIZE, 
    num_workers=N_WORKERS,
    shuffle=True, 
    collate_fn=collate_fn
)

In [None]:
for x_b, y_b in train_cifar_dl:
    break

In [None]:
x_b.shape

In [None]:
y_b.shape