### Set access to google drive


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
%cd drive/My\ Drive/CV_incubator/IncubatorCVProject

/content/drive/My Drive/CV_incubator/IncubatorCVProject


### Import libraries and setup paths


In [5]:
from src.dataloader import SquarePadding
import matplotlib.pyplot as plt
from torchvision.datasets import ImageFolder
from torchvision.transforms import ToTensor,Resize,Compose
from torch.utils.data.sampler import SubsetRandomSampler
import torch
import inspect
import numpy as np
import cv2
import os
import glob
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
from PIL import Image
from src.feature_engineering import prepare_spectral_clustering_features,prepare_eigen_component_features

In [None]:
# Path to image folder and label.csv
data_path = '../dog-breed-identification'
label_path = '../dog-breed-identification/labels.csv'

In [None]:
label = pd.read_csv("../dog-breed-identification/labels.csv",index_col = 'id')
label.head()

Unnamed: 0_level_0,breed
id,Unnamed: 1_level_1
000bec180eb18c7604dcecc8fe0dba07,boston_bull
001513dfcb2ffafc82cccf4d8bbaba97,dingo
001cdf01b096e06d78e9e5112d419397,pekinese
00214f311d5d2247d5dfe4fe24b2303d,bluetick
0021f9ceb3235effd7fcde7f7538ed62,golden_retriever


### Data loader




Map image path to label and label index

In [None]:
def generate_label(image_dir,label_path, label_col = 0):
  '''
  label_path(string): Path to the csv file with labels.
  image_dir (string): Directory with all the images.
  label_col: column of image id, default to 0th column in the label.csv
  ''' 
  label_csv = pd.read_csv(label_path,index_col = label_col)
  # create dictionary mapping label to label index
  lab2idx = {lab:idx for idx,lab in enumerate(list(set(label_csv['breed'])))}
  # create dictionary mapping label index back to label
  idx2lab = {idx:lab for idx,lab in enumerate(list(set(label_csv['breed'])))}
  # list of image pathes relative to root directory. Eg. format: subdirectory/image.jpg
  path = ['/'.join(img_fullname.split('/')[-2:]) for root, dirs, files in os.walk(image_dir)
                       for dir in dirs
                       for img_fullname in glob.glob(os.path.join(root,dir,'*'))]
  label_idx = [lab2idx[label_csv.loc[im.split('/')[-1].split('.')[0]][0]] for im in path]
  
  label = pd.DataFrame(columns = ['path','label_idx'])

  label['path'] = path
  label['label_idx'] = label_idx
          
  return label, lab2idx, idx2lab

In [None]:
label_csv, lab2idx, idx2lab = generate_label(data_path,label_path)

In [None]:
label_csv.head(5)

Unnamed: 0,path,label_idx
0,train/dd1d181a7224fa5a1a7c1fae05eec93d.jpg,64
1,train/e4f5d391d0eab2c83493f2110a743da3.jpg,108
2,train/e49f8aaa63a2ad36d11ff50fd53e25cf.jpg,106
3,train/e1e8cefa88b84062d11722537ec61214.jpg,21
4,train/deaba13cbf116d0dda2868a55c697d0b.jpg,6


In [None]:
for lab in lab2idx:
  print(lab,lab2idx[lab])
  break

border_collie 0


In [None]:
for idx in idx2lab:
  print(idx,idx2lab[idx])
  break

0 border_collie


In [None]:

class DogDataset(Dataset):
    """Face Landmarks dataset."""

    def __init__(self, label_csv, root_dir, transform=None):
        """
        Args:
            csv_file (file): Modified csv file with image path and label indexes. 
                             Refer to function 'generate_label' 
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.label_csv = label_csv
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        return len(self.label_csv)

    def __getitem__(self, idx):
        # if torch.is_tensor(idx):
        #     idx = idx.tolist()

        img_path = os.path.join(self.root_dir,
                                self.label_csv.iloc[idx, 0])
        image = Image.open(img_path)
        label = self.label_csv.iloc[idx, 1]
       
        if self.transform:
            image = self.transform(image)
        

        return image,label

In [None]:
dog_dataset = DogDataset(label_csv,data_path,Compose([
    SquarePadding(),
    Resize((128,128)),
    ToTensor()]))

In [None]:

#Variables for splitting the dataset into train/test
validation_split = .1
test_split = .1
batch_size = 16
shuffle_dataset = True
random_seed = 42

# Split 
dataset_size = len(dog_dataset)
indices = list(range(dataset_size))
split_idx1 = int(np.floor((validation_split+test_split) * dataset_size))
split_idx2 = int(np.floor(test_split * dataset_size))
if shuffle_dataset:
    np.random.seed(random_seed)
    np.random.shuffle(indices)
    
test_indices, val_indices, train_indices = indices[:split_idx2], indices[split_idx2:split_idx1], indices[split_idx1:]

# Creating PT data samplers and loaders:
train_sampler = SubsetRandomSampler(train_indices)
valid_sampler = SubsetRandomSampler(val_indices)

In [None]:
train_loader = torch.utils.data.DataLoader(dog_dataset, batch_size=batch_size, 
                                           sampler=train_sampler)
validation_loader = torch.utils.data.DataLoader(dog_dataset, batch_size=batch_size,
                                                sampler=valid_sampler)
# test_loader =  torch.utils.data.DataLoader(indoor_dataset, batch_size=batch_size,
#                                                 sampler=test_sampler)

In [None]:
print(dog_dataset.__getitem__(0)[-1])

64


In [None]:
for x,y in train_loader:
  print(x.shape, '\n',type(y))
  break

torch.Size([16, 3, 128, 128]) 
 <class 'torch.Tensor'>


### Combining with engineered features

In [34]:
from src.dataloader import load_dog_data
from sklearn.decomposition import PCA
from src.utils import PCA_images_list, unravel_image, ravel_image_vec, plot_image_grid
import numpy as np
import pickle


In [37]:
(image_list,label_list,label_dict) = load_dog_data(data_path,
              image_shape=(64,64),
              sample_rate=1, 
              simple=False)

In [39]:
(eig_vals,eig_image_list) = PCA_images_list(image_list)

In [None]:
pickle.dump( (eig_vals,eig_image_list), open( "eigvalsvecs.p", "wb" ) )

In [18]:
import torch
import pickle
file = open('eigvalsvecs.p', 'rb')
eigen = pickle.load(file)
eigvec = torch.Tensor(eigen[1])