<a href="https://colab.research.google.com/github/xprotobeast2/Humpback_Whale_Identification/blob/master/Humpback_Whale.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Retrieve and Unpack Data

In [9]:
import os
from PIL import Image
import pandas as pd
import numpy as np
import json
from time import time

import torch
from torchvision import transforms
from torch import nn
import torch.utils.data as torchdata



from sklearn.preprocessing import LabelEncoder


np.random.seed(111)
torch.cuda.manual_seed_all(111)
torch.manual_seed(111)

<torch._C.Generator at 0x7f44a71b5510>

In [5]:
a = torch.Tensor([1]).cuda()
print(a)

torch.cuda.is_available()

tensor([1.], device='cuda:0')


True

In [7]:
!pip install kaggle
!mkdir .kaggle
!mkdir data



token = {"username":"ramg95","key":"f713f68e67eed775abc23149ec728820"}

with open('/content/.kaggle/kaggle.json', 'w') as file:
    json.dump(token, file)
    
!cp /content/.kaggle/kaggle.json ~/.kaggle/kaggle.json
!chmod 600 /root/.kaggle/kaggle.json

!kaggle config set -n path -v "/content/data"
!kaggle competitions download -c humpback-whale-identification

!unzip -q data/competitions/humpback-whale-identification/test.zip -d data/test/
!unzip -q data/competitions/humpback-whale-identification/train.zip -d data/train/
!mv data/competitions/humpback-whale-identification/*.csv data/
!rm -rf data/competitions

mkdir: cannot create directory ‘.kaggle’: File exists
mkdir: cannot create directory ‘data’: File exists
- path is now set to: /content/data
Downloading sample_submission.csv to /content/data/competitions/humpback-whale-identification
  0% 0.00/498k [00:00<?, ?B/s]
100% 498k/498k [00:00<00:00, 69.1MB/s]
Downloading train.csv to /content/data/competitions/humpback-whale-identification
  0% 0.00/594k [00:00<?, ?B/s]
100% 594k/594k [00:00<00:00, 84.4MB/s]
Downloading test.zip to /content/data/competitions/humpback-whale-identification
 99% 1.34G/1.35G [00:25<00:00, 58.5MB/s]
100% 1.35G/1.35G [00:25<00:00, 56.5MB/s]
Downloading train.zip to /content/data/competitions/humpback-whale-identification
100% 4.15G/4.16G [01:03<00:00, 70.6MB/s]
100% 4.16G/4.16G [01:03<00:00, 69.8MB/s]


In [0]:
class Humpback_Whale_Dataset(torchdata.Dataset):
    def __init__(self, data_root , fold='train', known_only=False, validation_fraction=0.1, transforms=None):
        
        self.root = data_root
        fold = fold.lower()

        self.train = False
        self.test = False
        self.val = False

        if fold == "train":
            self.train = True
        elif fold == "test":
            self.test = True
        elif fold == "val":
            self.val = True
        else:
            raise RuntimeError("Not train-val-test")
        
        # Load data based on fold name
        if self.train or self.val:
            data_info = pd.read_csv(data_root+'train.csv', header=0)
            
            if known_only:
                data_info = data_info[data_info.Id != 'new_whale']
            
            
            self.images = data_info.Image.values
            self.labels = data_info.Id.values

            # Now make a mask for every nth image (training and validation)
            step = int(1/validation_fraction)
            p = np.arange(0, len(self.images), step)
            val_mask = np.zeros(len(self.images), dtype=bool)
            train_mask = np.ones(len(self.images), dtype=bool)
            train_mask[p] = False
            val_mask[p] = True
            
            # set data and label values
            self.data = np.array(self.images[train_mask]) if self.train else np.array(self.images[val_mask])
            self.labels = np.array(self.labels[train_mask]) if self.train else np.array(self.labels[val_mask])
            self.data_path = data_root+'train/'
            
        elif self.test:
            
            # Directly load names from os.listdir
            self.data = np.array(os.listdir(data_root+'test/'))
            self.labels = np.zeros(len(self.data))
            self.data_path = data_root+'test/'
        
        self.length = len(self.data)
        self.transforms = transforms

    def __getitem__(self, index):
        
        img_file = os.path.join(self.data_path, self.data[index])
        image = Image.open(img_file)
        label = self.labels[index]
        
        if self.transforms is not None:
            image = self.transforms(image)

        return (image, label)
    
    def __len__(self):
        return self.length
    
    def __repr__(self):
        fmt_str = 'Dataset ' + self.__class__.__name__ + '\n'
        fmt_str += '    Number of datapoints: {}\n'.format(self.__len__())
        tmp = 'train' if self.train is True else 'test'
        fmt_str += '    Split: {}\n'.format(tmp)
        fmt_str += '    Root Location: {}\n'.format(self.root)
        tmp = '    Transforms (if any): '
        fmt_str += '{0}{1}\n'.format(tmp, self.transforms.__repr__().replace('\n', '\n' + ' ' * len(tmp)))       
        return fmt_str
    

In [0]:
DATA_ROOT = "data/"
TRAIN_BS = 16
TEST_BS = 128
IMAGE_RESIZED_DIM = 100

transform = transforms.Compose([
        transforms.Grayscale(1),
        transforms.Resize((IMAGE_RESIZED_DIM,IMAGE_RESIZED_DIM)),
        transforms.ToTensor()
    ])

train_set = Humpback_Whale_Dataset(data_root=DATA_ROOT, 
                                   fold='train', 
                                   known_only=True,
                                   validation_fraction = 0.2,
                                   transforms=transform)
val_set = Humpback_Whale_Dataset(data_root=DATA_ROOT, 
                                   fold='val', 
                                   validation_fraction = 0.2,
                                   known_only=True,
                                   transforms=transform)
test_set = Humpback_Whale_Dataset(data_root=DATA_ROOT, 
                                   fold='test',
                                   known_only=True,
                                   transforms=transform)

trainloader = torchdata.DataLoader(train_set, batch_size=TRAIN_BS,
                                          shuffle=True, num_workers=2)
valloader = torchdata.DataLoader(val_set, batch_size=TEST_BS,
                                         shuffle=False, num_workers=2)
testloader = torchdata.DataLoader(test_set, batch_size=TEST_BS,
                                         shuffle=False, num_workers=2)

In [0]:
trainloader.dataset

Dataset Humpback_Whale_Dataset
    Number of datapoints: 12557
    Split: train
    Root Location: data/
    Transforms (if any): Compose(
                             Grayscale(num_output_channels=1)
                             Resize(size=(100, 100), interpolation=PIL.Image.BILINEAR)
                             ToTensor()
                         )

In [0]:
valloader.dataset

Dataset Humpback_Whale_Dataset
    Number of datapoints: 3140
    Split: test
    Root Location: data/
    Transforms (if any): Compose(
                             Grayscale(num_output_channels=1)
                             Resize(size=(100, 100), interpolation=PIL.Image.BILINEAR)
                             ToTensor()
                         )

In [0]:
testloader.dataset

Dataset Humpback_Whale_Dataset
    Number of datapoints: 7960
    Split: test
    Root Location: data/
    Transforms (if any): Compose(
                             Grayscale(num_output_channels=1)
                             Resize(size=(100, 100), interpolation=PIL.Image.BILINEAR)
                             ToTensor()
                         )

In [0]:
fig, ax = plt.subplots(10,1)
fig.set_size_inches(10,30)
for i in range(10):
    ax[i].imshow(plt.imread(DATA_ROOT+'train/'+known_whales.Image.iloc[i]))

# Model Pipelines

In [0]:
def validate(model, valloader, has_gpu):
    
    correct = total = 0.0
    class_correct = list(0. for i in range(TOTAL_CLASSES))
    class_total = list(0. for i in range(TOTAL_CLASSES))

    model.eval()

    with torch.no_grad():
        for images, labels in valloader:
            if is_gpu:
                images = images.cuda()
                labels = labels.cuda()

            # Predict and compute accuracy    
            outputs = model(images)
            predicted = torch.argmax(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum()


            c = (predicted == labels).squeeze()
            for i in range(len(labels)):
                label = labels[i]
                class_correct[label] += c[i]
                class_total[label] += 1

    class_accuracy = 100 * np.divide(class_correct, class_total)
    return 100*correct/total, class_accuracy

def train(model, optimizer, loss_function, trainloader, valloader,  has_gpu=True, num_epochs):
    
    model.train()
    for epoch in range(num_epochs):
        correct = total = running_loss = 0.0
        for images, labels in trainloader:
            # Train here
            if has_gpu:
                images = images.cuda()
                labels = labels.cuda()
            
            # Forward, and loss compute
            optimizer.zero_grad()
            outputs = model(images)
            loss = loss_function(outputs, labels)
            
            
            # Calculate gradients, update parameters
            loss.backward()
            optimizer.step()
            
            # Update train accuracy values
            correct += (torch.argmax(outputs, 1) == labels).sum().float()
            total += labels.size(0)
            running_loss += loss.item()
            
            
            
        print("Took %f seconds"%(time() - a))
        print('Accuracy of the network on the train images: %f %%' % ((correct/total)*100))
        print('Accuracy of the network on the val images: %d %%' % (val_accuracy))
    

In [0]:
gnb.score(X_train[:1000], Y_train[:1000])

In [0]:
X_train.shape
Y_train.shape

Techniques to apply to data:

Problem: Data is not the same size
1. Vector Quantization
    

*   Pick a 2-D patch_size, and chop image up into patches with/without overlap. 
*   Make a 


2. Downsampling
3. Upsampling


Problem : High dimension , Solution: Dimensionality Reduction
1. PCA
2. Autoencoders?

Problem: Data imbalance Solution: Augmentation
1. Data imbalance is when we have lots of data for one class, but only like 1 example for others. The way we handle this is either to use Machine learning models with data weighting. One common way is oversampling/augmentation.

we can make copies with modifications of the examples in the classes with lower representation. 

# TODO



1.   Train SVM to recognize a unique whale. Train num_classes SVMs.
2.   Random Forest Classifiers
3.   CNN based architectures


*   Model from scratch
*   Pretrained/Transfer Learning


MOST PROMISING PIPELINE:

https://medium.com/@ageitgey/machine-learning-is-fun-part-4-modern-face-recognition-with-deep-learning-c3cffc121d78




