# Pet Finder Models

This notebook is part of training models that predict how long it will take for a pet to be adopted on the Pet Finder platform. The training data is sourced from a Kaggle competition.


## Set-up

### Import libraries

In [None]:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

import pandas as pd
from zipfile import ZipFile
import numpy as np
import os
import glob
from PIL import Image
import matplotlib.pyplot as plt
import copy
import cv2
import random
import json

import torch
from torchvision import models, transforms
from torch.utils.data import DataLoader, Dataset
from torch.utils.data.sampler import SubsetRandomSampler

### Set Pandas display options

In [None]:
# Lift pandas' display limits so wide frames and long cell values are shown in full.
# Fully-qualified option names are used throughout: abbreviated names are resolved by
# pattern matching and raise OptionError as soon as more than one option matches.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_seq_items', None)

### Check GPU is enabled

In [None]:
# pick the first CUDA device when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print('Running on device: {}'.format(device))

Running on device: cuda:0


## Load images

In [None]:
# create connection to google drive
# Authenticate the Colab user first, then give PyDrive the resulting
# application-default credentials; `drive` is the client the following
# cells use to look up and download files.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [None]:
# import train_images zip file
# Search Drive for the archive by title and use the first match returned.
matches = drive.ListFile({'q':"title='train_images.zip'"}).GetList()
if not matches:
    # fail with a clear message instead of an IndexError on an empty result
    raise FileNotFoundError("No file titled 'train_images.zip' was found in Google Drive")
fid = matches[0]['id']
# download the file contents into the current working directory
f = drive.CreateFile({'id': fid})
f.GetContentFile('train_images.zip')

In [None]:
# unzip images
# the archive is opened read-only (ZipFile's default mode) and closed by the
# context manager once every member has been extracted under /content
with ZipFile('train_images.zip') as archive:
    archive.extractall(path='/content')

In [None]:
# change directory to train images folder
# The next cell calls os.listdir() with no argument and stores bare file names,
# and the dataset later opens those names as relative paths, so both rely on
# this folder being the working directory.
os.chdir('/content/train_images')

In [None]:
%%time
# create df with filepath to main image for each pet listing
# File names are parsed as '<PetID>-<image number>.<extension>'.
df = pd.DataFrame(data=os.listdir(),columns=['img_filepath'])
df['PetID'] = df['img_filepath'].str.split('-').str[0]
# Entries without a '-' have no image number: return None for them rather than
# raising IndexError, so they are coerced to NaN and dropped by the filter below.
df['img_num'] = pd.to_numeric(
    df['img_filepath'].apply(lambda x: x.split('-')[1].split('.')[0] if '-' in x else None),
    errors='coerce',
)
# keep only the first image of each listing and renumber the rows from 0
df = df[df['img_num']==1].reset_index(drop=True)
df.head()

CPU times: user 101 ms, sys: 19.8 ms, total: 121 ms
Wall time: 123 ms


In [None]:
# check how many images there are
# df holds one row per file whose parsed image number is 1, so the row count
# is the number of images that will be embedded
print(df.shape)

(14652, 3)


## Extract image embeddings

In [None]:
%%capture
# load pre-trained densenet121 on the selected device and keep every child
# module except the last one (the output layer), so a forward pass returns
# feature maps to build embeddings from rather than class scores
model = torch.nn.Sequential(
    *list(models.densenet121(pretrained=True).to(device).children())[:-1]
)
# switch to inference mode
model.eval()

In [None]:
# create custom dataset
class CustomDataset(Dataset):
    """Dataset that loads images from a list of file paths as 3-channel RGB.

    Args:
        data: sequence of image file paths, indexed by position.
        transform: optional callable applied to each loaded PIL image.
    """

    def __init__(self, data, transform=None):
        self.data = data
        self.transform = transform

    def __len__(self):
        """Return the number of image paths held by the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the image at position ``idx``, transformed if a transform was given."""
        img_name = self.data[idx]
        # convert('RGB') yields 3 channels for every input mode (greyscale,
        # palette, RGBA, ...), not only for 2-D greyscale arrays; the context
        # manager closes the underlying file once the pixels have been read.
        with Image.open(img_name) as img:
            image = img.convert('RGB')
        if self.transform:
            return self.transform(image)
        return image

In [None]:
# define transform to convert images to tensor (ready for model)
# steps: resize to 224x224, convert to a tensor, then normalise each channel
# (the mean/std values are presumably the ImageNet statistics the pre-trained
# weights expect - TODO confirm)
transform = transforms.Compose([
    transforms.Resize(size=(224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
])

In [None]:
# wrap the image file names in a dataset that applies the transform on access
images = CustomDataset(data=df['img_filepath'].tolist(), transform=transform)

# serve the dataset in batches of 128 using 4 worker processes; shuffling is
# not requested, and the later index-based merge with df relies on the
# embeddings coming out in the same order as the rows of df
data_loader = DataLoader(dataset=images, num_workers=4, batch_size=128)

In [None]:
%%time
# extract image embeddings
avgpool1d = torch.nn.AvgPool1d(4)
batch_embeddings = []
# gradients are never needed here, so the whole loop runs without tracking them
with torch.no_grad():
    for batch in data_loader:
        # forward pass through the truncated network
        feature_maps = model(batch.to(device))
        torch.cuda.empty_cache()
        # global average pooling over the two trailing dimensions ...
        pooled = feature_maps.mean([2, 3])
        # ... then average every 4 neighbouring features; AvgPool1d pools the
        # last dimension, so a leading axis is added for the call and removed after
        pooled = avgpool1d(pooled.unsqueeze(0)).squeeze(0)
        batch_embeddings.append(pooled)

# stack the per-batch results and move them to a single numpy array
img_output = torch.cat(batch_embeddings).detach().cpu().numpy()
del batch_embeddings

CPU times: user 14.1 s, sys: 8.87 s, total: 23 s
Wall time: 1min 18s


In [None]:
# check shape of image embeddings output
# rows come from concatenating the per-batch outputs (one row per image);
# columns are the pooled embedding features
img_output.shape

(14652, 256)

## Save image embeddings

In [None]:
# Mount Google Drive so the embeddings can be written to it.
# The module is imported under an alias so that it does not rebind `drive`,
# which already holds the PyDrive client created in the "Load images" section;
# overwriting it would break the download cell if it were run again.
from google.colab import drive as colab_drive
colab_drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [None]:
# Switch to the project data folder on the mounted Drive; the relative
# 'data_preprocessed' paths used below resolve against it.
# NOTE: the image file names stored in df are relative to the previous working
# directory, so the embeddings must have been extracted before this point.
os.chdir('/content/drive/My Drive/MSc Data Science/Research Project/data/petfinder-adoption-prediction')

In [None]:
# create folder for saving pre-processed data
# exist_ok=True makes this safe to re-run and avoids the separate existence check
os.makedirs('data_preprocessed', exist_ok=True)

In [None]:
# pair each PetID with its embedding row (rows are matched on the index of
# both frames), write the result to CSV without the index, and report its shape
df_img_output = df[['PetID']].merge(
    pd.DataFrame(img_output),
    left_index=True,
    right_index=True,
)
df_img_output.to_csv('data_preprocessed/img_output.csv', index=False)
print(df_img_output.shape)

(14652, 257)
