## Set project directory

In [1]:
# Mount Google Drive inside the Colab VM so the project files stored under
# /content/drive are reachable from this notebook.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%cd drive/My\ Drive/CV_incubator/IncubatorCVProject

/content/drive/My Drive/CV_incubator/IncubatorCVProject


In [3]:
# Sanity check: list the project's src/ modules that the import cell relies on.
!ls src

Basic_CNN_Architecture.py   dataset.py		    __pycache__
BasicCNN_withfeat_64x64.py  feature_engineering.py  train_valid.py
data_loader.py		    main_two.py		    utils.py


## Import libraries and load data

In [4]:
import pandas as pd
from torchvision.transforms import ToTensor,Resize,Compose
from src.data_loader import SquarePadding
from src.dataset import generate_label,DogDataset
import numpy as np
from torch.utils.data.sampler import SubsetRandomSampler
import torch
import pickle
import time
from src.BasicCNN_withfeat_64x64 import BasicCNN_w_features_64x64 
from src.train_valid import train, validation
import torch.nn as nn
import torch.optim as optim

In [5]:
# Path to the folder where all images are stored
data_path = '../dog-breed-identification'
# Path to the labels.csv file, derived from data_path so the dataset
# location is defined in exactly one place
label_path = data_path + '/labels.csv'

#### Label

In [6]:
# Raw breed labels indexed by image id. Reuse label_path from the config cell
# instead of repeating the literal path a second time.
label = pd.read_csv(label_path, index_col='id')
label.head()

Unnamed: 0_level_0,breed
id,Unnamed: 1_level_1
000bec180eb18c7604dcecc8fe0dba07,boston_bull
001513dfcb2ffafc82cccf4d8bbaba97,dingo
001cdf01b096e06d78e9e5112d419397,pekinese
00214f311d5d2247d5dfe4fe24b2303d,bluetick
0021f9ceb3235effd7fcde7f7538ed62,golden_retriever


In [7]:
# Build the table of image paths with integer class indices (label_csv) plus the
# two lookup objects; lab2idx is used later to count the number of classes.
# NOTE(review): exact return types come from src.dataset.generate_label — confirm there.
label_csv, lab2idx, idx2lab = generate_label(data_path,label_path)

In [8]:
# Preview the first five rows of the generated path / label-index table.
label_csv.head(5)

Unnamed: 0,path,label_idx
0,train/dd1d181a7224fa5a1a7c1fae05eec93d.jpg,42
1,train/e4f5d391d0eab2c83493f2110a743da3.jpg,112
2,train/e49f8aaa63a2ad36d11ff50fd53e25cf.jpg,85
3,train/e1e8cefa88b84062d11722537ec61214.jpg,18
4,train/deaba13cbf116d0dda2868a55c697d0b.jpg,50


#### PyTorch dataset

In [9]:
# Wrap the labelled image list in a dataset. Every image passes through the
# transform pipeline: SquarePadding -> Resize to 64x64 -> ToTensor.
dog_dataset = DogDataset(
    label_csv,
    data_path,
    Compose([SquarePadding(), Resize((64, 64)), ToTensor()]),
)

In [10]:
# Variables for splitting the dataset into train/validation/test
validation_split = .1
test_split = .1
batch_size = 16
shuffle_dataset = True
random_seed = 42

# Seed torch's global RNG too: the NumPy seed below only fixes the index
# shuffle, whereas SubsetRandomSampler draws its per-epoch ordering from
# torch's generator (and, presumably, so does the model's weight init).
torch.manual_seed(random_seed)

# Split: the first `test_split` share of the (optionally shuffled) indices is
# held out for testing, the next `validation_split` share is used for
# validation, and everything after that is used for training.
dataset_size = len(dog_dataset)
indices = list(range(dataset_size))
split_idx1 = int(np.floor((validation_split+test_split) * dataset_size))
split_idx2 = int(np.floor(test_split * dataset_size))
if shuffle_dataset:
    np.random.seed(random_seed)
    np.random.shuffle(indices)

test_indices = indices[:split_idx2]
val_indices = indices[split_idx2:split_idx1]
train_indices = indices[split_idx1:]

# Creating PT data samplers and loaders.
# NOTE: test_indices is held out here but no loader is built for it in this cell.
train_sampler = SubsetRandomSampler(train_indices)
valid_sampler = SubsetRandomSampler(val_indices)

train_loader = torch.utils.data.DataLoader(dog_dataset, batch_size=batch_size,
                                           sampler=train_sampler)
valid_loader = torch.utils.data.DataLoader(dog_dataset, batch_size=batch_size,
                                           sampler=valid_sampler)

#### Engineered features (eigenvectors)


In [11]:
# Load the pre-computed engineered features from disk. The context manager
# guarantees the file handle is closed even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code — only load files from a trusted source.
with open('features.p', 'rb') as file:
    features = pickle.load(file)
print(features.shape)
# The same feature object is handed to both the training and the validation pass.
train_feat = valid_feat = features

torch.Size([10222, 300])


## Training

In [12]:
# Pick the compute device once: the GPU when CUDA is usable, otherwise the CPU.
gpu_available = torch.cuda.is_available()
device = torch.device("cuda" if gpu_available else "cpu")

# Report which device was selected.
if gpu_available:
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: Tesla P4


In [None]:
# One output class per entry in the label -> index mapping.
num_classes = len(lab2idx)

# training process
# to be finished later
model = BasicCNN_w_features_64x64(num_classes=num_classes)
model.to(device)
# Separate criterion objects for the training and the validation pass.
train_loss = nn.CrossEntropyLoss()
train_loss.to(device)
valid_loss = nn.CrossEntropyLoss()
valid_loss.to(device)
def train_valid(optimizer = None, epochs = 20, model = model,
                train_criterion = train_loss, train_loader = train_loader,
                valid_criterion = valid_loss, valid_loader = valid_loader,
                device = device, train_feat = train_feat, valid_feat = valid_feat):
    """Train `model` for `epochs` epochs, validating and checkpointing after each one.

    Args:
        optimizer: Optimizer passed to `train` / `validation`. When None (the
            default) a fresh ``optim.Adam(model.parameters())`` is created at
            call time, so it always targets the model actually passed in and
            does not carry optimizer state over from a previous call.
        epochs: Number of epochs to run; epochs are numbered from 1.
        model: Network to train; its state_dict is saved after every epoch.
        train_criterion: Loss object forwarded to `train`.
        train_loader: Data loader forwarded to `train`.
        valid_criterion: Loss object forwarded to `validation`.
        valid_loader: Data loader forwarded to `validation`.
        device: Device forwarded to `train` and `validation`.
        train_feat: Engineered features forwarded to `train`.
        valid_feat: Engineered features forwarded to `validation`.

    Returns:
        dict: per-epoch lists under the keys "train_loss", "train_acc",
        "valid_loss", "valid_acc", "valid_preds_list",
        "valid_truelabels_list", "valid_probas_list" and "valid_auc_score".

    Side effects:
        Writes "best_models/best_dry_run1.pth" whenever the validation loss
        improves, "best_models/dry_run1<epoch>.pth" after every epoch, and
        rewrites "history.pkl" after every epoch.
    """
    # Function-scope import: only needed to make sure the checkpoint folder exists.
    import os

    # Build the default optimizer here rather than in the signature: a default
    # argument is evaluated once, when the function is defined.
    if optimizer is None:
        optimizer = optim.Adam(model.parameters())

    # torch.save does not create missing directories.
    os.makedirs("best_models", exist_ok=True)

    start_epoch = 1
    #or: best_val_acc = 0
    best_val_loss = np.inf

    history = {"train_loss":[], "train_acc":[],
                "valid_loss":[], "valid_acc":[], "valid_preds_list":[],
                "valid_truelabels_list":[], "valid_probas_list":[], "valid_auc_score":[]}

    start_time = time.time()

    for epoch in range(start_epoch, epochs + 1):

        # Local names are prefixed with `epoch_` so they are not confused with
        # the module-level `train_loss` / `valid_loss` criterion objects.
        epoch_train_loss, epoch_train_acc = train(epoch, model, optimizer, train_criterion,
                                                  train_loader, device, train_feat)
        history["train_loss"].append(epoch_train_loss)
        history["train_acc"].append(epoch_train_acc)

        print('epoch: ', epoch)
        print('{}: loss: {:.4f} acc: {:.4f}'.format('training', epoch_train_loss, epoch_train_acc))

        (epoch_valid_loss, epoch_valid_acc, valid_preds_list, valid_truelabels_list,
         valid_probas_list, valid_auc_score) = validation(epoch, model, optimizer,
                                                          valid_criterion, valid_loader,
                                                          device, valid_feat)
        history["valid_loss"].append(epoch_valid_loss)
        history["valid_acc"].append(epoch_valid_acc)
        history["valid_preds_list"].append(valid_preds_list)
        history["valid_truelabels_list"].append(valid_truelabels_list)
        history["valid_probas_list"].append(valid_probas_list)
        history["valid_auc_score"].append(valid_auc_score)

        print('{}: loss: {:.4f} acc: {:.4f} auc: {:.4f}'.format('validation', epoch_valid_loss, epoch_valid_acc, valid_auc_score))
        print()

        # save models(use valid loss as best model criterion, please change
        # criterion here if needed(eg. valid acc)
        # Only update the running best on a genuine improvement, so that a NaN
        # validation loss cannot overwrite it and block all later checkpoints.
        if epoch_valid_loss < best_val_loss:
            best_val_loss = epoch_valid_loss
            # please change model file path here
            best_model_file = "best_models/best_dry_run1.pth"
            torch.save(model.state_dict(), best_model_file)

        # save model from every training epoch
        # can be deleted if do not need this one, or adapt it to save 5th, 10th, 15th ...models
        model_file = "best_models/dry_run1" + str(epoch) + ".pth"

        torch.save(model.state_dict(), model_file)

        # save training/validation results (rewritten each epoch so partial
        # results survive an interrupted run)
        with open("history.pkl", "wb") as fout:
            pickle.dump(history, fout)

    print('time elapsed:', time.time() - start_time)

    return history

# Run the training/validation loop with its default arguments and keep the returned per-epoch history.
results = train_valid()