In this notebook, we resize all training and test images for GLDV2 to `512x512` without maintaining the aspect ratio. We use `Joblib` to parallelize the resize operation across 36 cores.

In [1]:
import os
import glob
from tqdm import tqdm
from PIL import Image
from joblib import Parallel, delayed
import argparse
import albumentations
from pathlib import Path

Based on the variables in `Args`, we resize the images found in `input_folder` and write the resized copies to `output_folder`.

In [2]:
class Args:
    """Settings for the resize job, kept as plain class attributes."""

    # Folder that is searched (recursively) for the source images.
    input_folder: str = '/home/ubuntu/repos/kaggle/google-landmark-retrieval-2020/data/train/'
    # Folder that receives the resized copies.
    output_folder: str = '/home/ubuntu/repos/kaggle/google-landmark-retrieval-2020/data/train_512x512/'
    # Edge length, in pixels, of the square output images.
    sz: int = 512


args = Args()
# Echo the active configuration as the cell output.
(args.input_folder, args.output_folder, args.sz)

('/home/ubuntu/repos/kaggle/google-landmark-retrieval-2020/data/train/',
 '/home/ubuntu/repos/kaggle/google-landmark-retrieval-2020/data/train_512x512/',
 512)

In [3]:
def save_file(img, input_path, args=None, fn=None):
    """Save ``img`` under ``args.output_folder``, mirroring the source layout.

    The destination is ``args.output_folder`` joined with the location of
    ``input_path`` relative to ``args.input_folder``. Missing parent
    directories are created.

    Parameters
    ----------
    img : object
        Anything exposing ``save(path)``; called with the destination as ``str``.
    input_path : str or Path
        Path of the source image. Must be located inside ``args.input_folder``.
    args : object
        Must provide ``input_folder`` and ``output_folder`` attributes.
        Required in practice even though the parameter defaults to ``None``.
    fn : str, optional
        Unused; kept so existing callers that pass it keep working.

    Raises
    ------
    ValueError
        If ``args`` is ``None`` or ``input_path`` is not inside
        ``args.input_folder``.
    """
    if args is None:
        raise ValueError(
            "save_file() requires `args` with `input_folder` and `output_folder`")

    # Compare path components instead of doing a textual str.replace():
    # a replace that finds no match (e.g. input_folder spelled './data/train/')
    # would leave the path untouched and overwrite the source image, and it
    # would also rewrite every later occurrence of the folder string.
    rel_path = Path(input_path).relative_to(Path(args.input_folder))
    out_path = Path(args.output_folder) / rel_path

    # exist_ok=True already tolerates an existing directory (and concurrent
    # workers creating it), so no separate existence check is needed.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    img.save(str(out_path))

In [4]:
def resize_image(path, sz: tuple, args):
    """Resize the image at ``path`` to ``sz`` and store it via ``save_file``.

    Parameters
    ----------
    path : str or Path
        Location of the source image.
    sz : tuple
        Target size passed straight to ``Image.resize``; the aspect ratio is
        not preserved.
    args : object
        Configuration object forwarded to ``save_file``.
    """
    fn = os.path.basename(path)
    # Open inside a context manager so the underlying file is closed
    # deterministically, even when decoding/resizing raises (e.g. on a
    # corrupt file). resize() returns a new image, so it stays usable after
    # the source has been closed.
    with Image.open(path) as img:
        resized = img.resize(sz, resample=Image.BILINEAR)
    save_file(resized, path, args=args, fn=fn)

In [5]:
if args.sz:
    # Normalise to int *before* reporting, so the message always shows the
    # size that is actually used for the resize.
    args.sz = int(args.sz)
    print("images will be resized to ({}, {})".format(args.sz, args.sz))

images will be resized to (512, 512)


In [6]:
# Recursively collect every '*.jpg' file below the input folder.
images = [*Path(args.input_folder).rglob('*.jpg')]
# Number of images found (shown as the cell output).
len(images)

1580470

In [7]:
# Make sure the output root exists; exist_ok=True turns this into a no-op
# when the folder is already there, without a separate check-then-create step.
os.makedirs(args.output_folder, exist_ok=True)
# uncomment below to run
# Parallel(n_jobs=64)(
#     delayed(resize_image)(i, (args.sz, args.sz), args) for i in tqdm(images))