In [3]:
import petpy
import pandas as pd
import urllib.request
from urllib.error import HTTPError
import time
from datetime import date
import os
from tqdm import tqdm_notebook
from ast import literal_eval

def downloader(filename, image_url):
    """Fetch the resource at `image_url` and store it on disk as `<filename>.jpg`.

    `filename` is the destination path without an extension; the '.jpg'
    suffix is always appended. Errors raised by `urlretrieve` propagate
    to the caller.
    """
    destination = ''.join((filename, '.jpg'))
    urllib.request.urlretrieve(image_url, destination)

# Petfinder credentials are read from the PETFINDER_KEY / PETFINDER_SECRET
# environment variables so real secrets never have to be pasted into the
# notebook; the 'xx' placeholders are used when a variable is not set.
pf = petpy.Petfinder(key=os.environ.get('PETFINDER_KEY', 'xx'),
                     secret=os.environ.get('PETFINDER_SECRET', 'xx'))

# One csv per calendar day; it doubles as a cache, so the API is only queried
# when today's file does not exist yet.
csv_file = 'cats_{}.csv'.format(date.today().strftime("%b-%d-%Y"))

if os.path.exists(csv_file):
    print('Found a csv file for today already, skipping download..')
    pure_cats_w_photos = pd.read_csv(csv_file)
    # The csv stores the list of urls as its string repr; parse it back into a list.
    pure_cats_w_photos['med_photos'] = pure_cats_w_photos.med_photos.apply(literal_eval)
else:
    print('Creating today\'s csv file...')
    cats = pf.animals(results_per_page=100, pages=10, return_df=True, animal_type='cat')

    # Pure cats only: not mixed, not unknown, a primary breed and no secondary breed.
    is_pure_breed = (
        ~cats['breeds.mixed']
        & ~cats['breeds.unknown']
        & cats['breeds.secondary'].isna()
        & cats['breeds.primary'].notna()
    )
    # Explicit "has at least one photo" mask instead of relying on pandas
    # coercing a column of lists to booleans; missing values count as no photos.
    has_photos = cats['photos'].map(
        lambda photos: isinstance(photos, (list, tuple)) and len(photos) > 0
    )

    kept_columns = ['id', 'url', 'type', 'age', 'gender',
                    'size', 'coat', 'breeds.primary', 'photos']
    # .loc plus a non-inplace rename yields an independent frame, so the column
    # assignments below do not write into a slice of `cats`.
    pure_cats_w_photos = (
        cats.loc[is_pure_breed & has_photos, kept_columns]
        .rename(columns={'breeds.primary': 'breed'})
    )

    # Keep only the medium-sized url of every photo.
    pure_cats_w_photos['med_photos'] = pure_cats_w_photos['photos'].apply(
        lambda photos: [photo['medium'] for photo in photos]
    )
    # '/' would be read as a path separator when the breed is used as a folder
    # name; regex=False makes the literal replacement explicit.
    pure_cats_w_photos['breed'] = pure_cats_w_photos['breed'].str.replace('/', '-', regex=False)
    pure_cats_w_photos.to_csv(csv_file, index=False)

pure_cats_w_photos.head()

Creating today's csv file...


Unnamed: 0,id,url,type,age,gender,size,coat,breed,photos,med_photos
3,46698867,https://www.petfinder.com/cat/pirate-46698867/...,Cat,Baby,Male,Medium,Short,Tabby,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,[https://dl5zpyw5k3jeb.cloudfront.net/photos/p...
4,46698864,https://www.petfinder.com/cat/9348-46698864/tx...,Cat,Baby,Male,Medium,Medium,Domestic Medium Hair,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,[https://dl5zpyw5k3jeb.cloudfront.net/photos/p...
5,46698861,https://www.petfinder.com/cat/bailey-and-archi...,Cat,Baby,Female,Medium,Short,Domestic Short Hair,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,[https://dl5zpyw5k3jeb.cloudfront.net/photos/p...
6,46698856,https://www.petfinder.com/cat/9346-46698856/tx...,Cat,Adult,Male,Medium,Short,Domestic Short Hair,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,[https://dl5zpyw5k3jeb.cloudfront.net/photos/p...
7,46698852,https://www.petfinder.com/cat/rolo-46698852/me...,Cat,Baby,Male,Medium,Short,Domestic Short Hair,[{'small': 'https://dl5zpyw5k3jeb.cloudfront.n...,[https://dl5zpyw5k3jeb.cloudfront.net/photos/p...


In [4]:
# One folder per breed; exist_ok makes re-running the cell safe.
for my_folder in pure_cats_w_photos.breed.unique():
    os.makedirs(my_folder, exist_ok=True)

# Collect every .jpg below the working directory, plus the set of cat ids they
# belong to. Image files are named '<id>_<n>.jpg', so the id is the part of the
# base name before the first underscore.
downloaded_cats = []
downloaded_cat_ids = set()
for dirname, _, filenames in os.walk('.'):
    for filename in filenames:
        if filename.endswith('.jpg'):
            downloaded_cats.append(filename)
            # Files written by the earlier backslash-based naming on a non-Windows
            # system carry a 'breed\' prefix inside the file name; drop it first.
            base_name = os.path.splitext(filename)[0].rsplit('\\', 1)[-1]
            downloaded_cat_ids.add(base_name.split('_', 1)[0])

for _, row in tqdm_notebook(pure_cats_w_photos.iterrows(), total=pure_cats_w_photos.shape[0]):
    # Exact id lookup: a substring test against the stringified file list would
    # also match longer ids that merely contain this one.
    if str(row.id) in downloaded_cat_ids:
        continue
    # Number the photos per cat so that several photos of the same cat do not
    # overwrite each other on disk.
    for photo_no, photo_url in enumerate(row.med_photos):
        # os.path.join keeps the path valid on every operating system.
        target = os.path.join(row.breed, '{id}_{photo_no}'.format(id=row.id, photo_no=photo_no))
        try:
            downloader(target, photo_url)
        except HTTPError:
            # Best effort: pause briefly, then move on to the next photo.
            time.sleep(1)
            continue

HBox(children=(IntProgress(value=0, max=627), HTML(value='')))




![Images organized in folders per cat breed](breeds.png "Breeds")