# get_dataset.ipynb

This Jupyter notebook performs these steps:
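* a GitHub repository of original sticky trap images, **REPO**, is cloned into the **repo** subfolder of a specified folder, **DATADIR** (any existing **DATADIR** is deleted first).
* the **.git** subfolder of the clone (**{DATADIR}/repo/.git**) is deleted to save room.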
* a list of images, **imagelist**, is created
* if a file named **blacklist.csv** is found in the repo, items in this list are removed from **imagelist**
* a subfolder is created for each image in **imagelist** and is populated with a symlink to the original image (symlinks are used to save storage space)
* **imagelist** is saved in **{DATADIR}/imagelist.csv**.

In [8]:
import glob
import os
import shutil
import subprocess

import pandas as pd

In [9]:
# This cell is tagged as "parameters" to facilitate running using papermill
# REPO: URL of the GitHub repository that holds the original sticky trap images.
REPO = 'https://github.com/aubreymoore/Tinian-sticky-traps-2022-02-26'
# DATADIR: working folder; the repository is cloned into {DATADIR}/repo.
# WARNING: the MAIN cell deletes DATADIR recursively before cloning, so never
# point it at a folder that holds anything you want to keep.
DATADIR = '/home/aubrey/Desktop/test'

In [10]:
def download_original_images():
    """
    Clone the image repository REPO into {DATADIR}/repo.

    Reads the notebook-level parameters REPO and DATADIR.

    Raises:
        subprocess.CalledProcessError: if 'git clone' exits with a non-zero status.
        FileNotFoundError: if the git executable cannot be found.
    """
    # Pass an argument list instead of a shell command string so that spaces or
    # shell metacharacters in REPO or DATADIR are not interpreted by a shell.
    # check=True makes a failed clone stop the notebook here instead of
    # surfacing later as an empty image list.
    subprocess.run(['git', 'clone', REPO, f'{DATADIR}/repo'], check=True)

    # Same effect as 'rm -rf {DATADIR}/.git': silently does nothing when the
    # folder is absent.
    # NOTE(review): this targets {DATADIR}/.git, not the clone's own
    # {DATADIR}/repo/.git; the MAIN cell removes the latter at the end of the run.
    shutil.rmtree(f'{DATADIR}/.git', ignore_errors=True)

In [11]:
def remove_blacklisted_filenames_from_imagelist(imagelist):
    """
    Remove black-listed images from imagelist.

    If a file named 'blacklist.csv' is found in the repo folder
    ({DATADIR}/repo), every file named in its 'filename' column is removed
    from imagelist. Black-listed names that do not match any entry of
    imagelist are ignored. If there is no blacklist.csv, a warning is printed
    and imagelist is returned unchanged.

    Here's an example blacklist.csv:

    filename
    scale.png
    40.png

    Args:
        imagelist: list of image paths of the form '{DATADIR}/repo/<filename>'.
            The list is modified in place.

    Returns:
        The same list object, with the black-listed paths removed.
    """
    bl = f'{DATADIR}/repo/blacklist.csv'
    if not os.path.exists(bl):
        print('   WARNING: blacklist.csv not found')
        return imagelist

    print('   blacklist.csv found')
    # dtype=str keeps purely numeric names (e.g. '40') from being parsed as numbers.
    names = pd.read_csv(bl, dtype=str)['filename'].dropna()
    blacklisted = {f'{DATADIR}/repo/{name}' for name in names}

    # Filter instead of calling list.remove(): remove() raises ValueError when a
    # black-listed file is not in imagelist (e.g. it is not a .png image).
    # Slice assignment keeps the in-place update that callers may rely on.
    imagelist[:] = [path for path in imagelist if path not in blacklisted]
    return imagelist

In [12]:
def create_folder_for_each_image(paths=None):
    """
    Create one subfolder of DATADIR per image, holding a symlink to the image.

    For an image '<dir>/<sn>.png' the folder '{DATADIR}/<sn>' is created and
    populated with a symlink named 'original.png' that points to the image.
    Symlinks are used to save storage space. Re-running is safe: existing
    folders are reused and an existing 'original.png' link is replaced.

    Args:
        paths: iterable of image file paths. Defaults to the notebook-level
            variable imagelist when omitted.
    """
    if paths is None:
        # Backward-compatible default: fall back to the notebook-level list.
        paths = imagelist

    for filepath in paths:
        filename = os.path.basename(filepath)
        # splitext strips only the trailing extension; str.replace('.png', '')
        # would also delete '.png' occurring in the middle of a file name.
        sn = os.path.splitext(filename)[0]
        os.makedirs(f'{DATADIR}/{sn}', exist_ok=True)
        symlink = f'{DATADIR}/{sn}/original.png'
        # Drop a link left over from a previous run; lexists also detects
        # dangling links, which os.path.exists would report as missing.
        if os.path.lexists(symlink):
            os.remove(symlink)
        # A relative link target is resolved relative to the link's own folder,
        # not the working directory, so always store an absolute target.
        os.symlink(os.path.abspath(filepath), symlink)

In [13]:
def save_imagelist_as_csv(imagelist):
    """
    Write imagelist to {DATADIR}/imagelist.csv.

    The CSV has a single column named 'filename', one row per entry of
    imagelist, and no index column. The destination path is printed first.

    Args:
        imagelist: list of image paths to save.
    """
    csv_path = f'{DATADIR}/imagelist.csv'
    print(f'   Saving imagelist to {csv_path}')
    pd.DataFrame({'filename': imagelist}).to_csv(csv_path, index=False)

# save_imagelist_as_csv(imagelist)

In [14]:
# MAIN
# Rebuilds DATADIR from scratch: delete it, clone REPO into {DATADIR}/repo,
# build and save the list of images, create one folder per image, and finally
# drop the clone's .git folder.
print(f'Starting from scratch: deleting {DATADIR}')
# On a first run DATADIR does not exist yet; rmtree would raise
# FileNotFoundError, so only delete the folder when it is there.
if os.path.exists(DATADIR):
    shutil.rmtree(DATADIR)

print('Downloading original images.')
print(f'   Cloning {REPO} to {DATADIR}/repo')
download_original_images()
# Stop with a clear message if the clone did not produce the repo folder,
# instead of continuing with an empty image list.
if not os.path.isdir(f'{DATADIR}/repo'):
    raise RuntimeError(f'Cloning {REPO} failed: {DATADIR}/repo was not created')

print('Creating imagelist and removing blacklisted items.')
# glob returns files in arbitrary order; sort so imagelist.csv is reproducible.
imagelist = sorted(glob.glob(f'{DATADIR}/repo/*.png'))
imagelist = remove_blacklisted_filenames_from_imagelist(imagelist)

print('Saving imagelist')
save_imagelist_as_csv(imagelist)

print('Creating a folder for each image and adding a symlink to original image.')
create_folder_for_each_image()

print('Deleting .git to save room.')
shutil.rmtree(f'{DATADIR}/repo/.git')

print('Finished.')

Starting from scratch: deleting /home/aubrey/Desktop/test
Downloading original images.
   Cloning https://github.com/aubreymoore/Tinian-sticky-traps-2022-02-26 to /home/aubrey/Desktop/test/repo
Creating imagelist and removing blacklisted items.
   blacklist.csv found
Saving imagelist
   Saving imagelist to /home/aubrey/Desktop/test/imagelist.csv
Creating a folder for each image and adding a symlink to original image.
Deleting .git to save room.
Finished.
