## Helper code to do test/train/val split

Images are originally stored in S3 under `s3://s3-avalanche-guard/data/cv/uibk/ResNetClassify/train`.

After the code below has run, a percentage of those images will have been moved to `s3://s3-avalanche-guard/data/cv/uibk/ResNetClassify/val`.


To access S3 or any other AWS service from Python, we need the AWS SDK.
The SDK is composed of two key Python packages: Botocore (the library providing the low-level functionality shared between the Python SDK and the AWS CLI) and Boto3 (the package implementing the Python SDK itself).

In [2]:
import boto3
import random
import shutil

In [10]:

# move an S3 object to another object
# example:
#move_s3_object('my_bucket', old_key='tmp/test.txt', new_key='tmp/tmp2/test.txt')
def move_s3_object(bucket: str, old_key: str, new_key: str) -> None:
    """Move an object inside one bucket: copy it to ``new_key``, then delete ``old_key``.

    Args:
        bucket: Name of the bucket holding both the source and the destination.
        old_key: Key of the object to move.
        new_key: Key the object is copied to.

    The delete is issued only after the copy call has returned, so an
    exception raised by the copy leaves the source object untouched.
    """
    copy_source = f'{bucket}/{old_key}'
    destination = boto3.resource('s3').Object(bucket, new_key)
    destination.copy_from(CopySource=copy_source)

    s3_client = boto3.client('s3')
    s3_client.delete_object(Bucket=bucket, Key=old_key)

# move an S3 object to another folder
# example:
#s3_move_to_another_folder('my_bucket', old_folder='tmp/', new_folder='tmp/tmp2/', object_name='test.txt')
def s3_move_to_another_folder(bucket: str, old_folder: str, new_folder: str, object_name:str, deletesource:bool=True) -> None:
    """Copy ``object_name`` from ``old_folder`` to ``new_folder`` and optionally delete the original.

    Args:
        bucket: Name of the bucket holding both folders.
        old_folder: Key prefix of the source; it is concatenated directly with
            ``object_name``, so it must already end with any separator needed.
        new_folder: Key prefix of the destination, concatenated the same way.
        object_name: Part of the key that follows the folder prefix.
        deletesource: When True (the default) the source object is deleted
            after the copy, making this a move; when False it is a plain copy.
    """
    source_key = old_folder + object_name
    target_key = new_folder + object_name

    target = boto3.resource('s3').Object(bucket, target_key)
    target.copy_from(CopySource=f'{bucket}/{source_key}')

    # Copy-only mode: nothing left to do.
    if not deletesource:
        return
    boto3.client('s3').delete_object(Bucket=bucket, Key=source_key)

    

#print bucket contents
def print_all_s3_objects(bucket_name:str, prefix:str, replaceLine : bool = True):
    """Print a running count, key and size for every object listed under ``prefix``.

    Args:
        bucket_name: Bucket to list.
        prefix: Key prefix passed to the ``list_objects_v2`` paginator.
        replaceLine: When True each line ends with a carriage return, so the
            next line overwrites it in the terminal; when False every object
            is printed on its own line.

    A page that has no 'Contents' entry prints a "Bucket is empty or does not
    exist" notice instead of object lines.
    """
    line_end = "\r" if replaceLine else "\n"
    pages = boto3.client('s3').get_paginator('list_objects_v2').paginate(Bucket=bucket_name,Prefix=prefix)

    count = 0
    for page in pages:
        if 'Contents' not in page:
            print("Bucket is empty or does not exist")
            continue
        for entry in page['Contents']:
            count += 1
            print(f'**** Count={count}, Name: {entry["Key"]} |                                                    Size: {entry["Size"]}', end=line_end)


# Get bucket contents as a list
def list_all_s3_objects(bucket_name:str, prefix:str)-> [str]:
    """Return the keys listed under ``prefix``, each with the leading prefix removed.

    Args:
        bucket_name: Bucket to list.
        prefix: Key prefix passed to the ``list_objects_v2`` paginator.

    Returns:
        A list of strings, one per listed object, in listing order. Each is
        the object's key with ``prefix`` stripped from its start. An empty
        list is returned when no page has a 'Contents' entry.
    """
    s3 = boto3.client('s3')
    paginator = s3.get_paginator('list_objects_v2')
    page_iterator = paginator.paginate(Bucket=bucket_name,Prefix=prefix)

    prefix_len = len(prefix)
    objlist = []
    for page in page_iterator:
        # Pages without a 'Contents' entry contribute nothing.
        for obj in page.get('Contents', []):
            key = obj["Key"]
            # Strip only the leading prefix. str.replace() would also remove
            # any later occurrence of the prefix text inside the key and
            # return a name that does not exist under the folder.
            if key.startswith(prefix):
                key = key[prefix_len:]
            objlist.append(key)

    return objlist
            
            
def split_train_val(s3_bucket_name:str, source_folder_name:str, train_val_testfolders:[str],
                    train_val_test_pct:[float],test_mode:bool=False):
    """Shuffle the objects under a source folder and move them into train/val/test folders.

    Args:
        s3_bucket_name: Bucket holding the source and destination folders.
        source_folder_name: Key prefix of the folder to split.
        train_val_testfolders: One to three destination prefixes, in the
            order train, val, test.
        train_val_test_pct: One to three fractions, in the order train, val,
            test. Only the train and val fractions size a split: the test
            folder receives every image that remains after those two, so the
            third fraction is accepted but not used for sizing.
        test_mode: When True, only print the planned moves; nothing is moved.

    Raises:
        ValueError: If either list does not contain 1 to 3 entries.

    Images that fall into a split with no destination folder (for example
    the remainder when only a train folder is given) are left in the source
    folder and reported, rather than being moved.

    The shuffle reseeds the global ``random`` module with 42 so repeated runs
    over the same listing produce the same assignment.
    """
    # Validate before touching S3 so bad arguments fail without a listing call.
    train_pct, val_pct, _ = _pad_to_three(train_val_test_pct, 0, "train_val_test_pct")
    train_folder, val_folder, test_folder = _pad_to_three(
        train_val_testfolders, None, "train_val_testfolders")

    # Names of the objects under the source folder, relative to it
    imgs_list = list_all_s3_objects(s3_bucket_name,source_folder_name)

    # Fixed seed keeps the split reproducible
    random.seed(42)
    random.shuffle(imgs_list)

    # Number of images for the train and val sets; the rest is the test set
    train_size = int(len(imgs_list) * train_pct)
    val_size = int(len(imgs_list) * val_pct)

    for i, f in enumerate(imgs_list):
        if i < train_size:
            dest_folder = train_folder
        elif i < train_size + val_size:
            dest_folder = val_folder
        else:
            dest_folder = test_folder

        if dest_folder is None:
            # No folder was supplied for this split: leave the object where it
            # is instead of moving it under a placeholder key.
            print("leaving {0} in {1}: no destination folder configured for this split".format(f, source_folder_name))
            continue

        if test_mode:
            print("planning to move {0} from {1} to {2}".format(f,source_folder_name, dest_folder))
        else:
            print("moving {0} from {1}   to    {2}          ".format(f,source_folder_name, dest_folder) , end = "\r"  )
            s3_move_to_another_folder(s3_bucket_name, old_folder=source_folder_name,
                                  new_folder=dest_folder,
                                  object_name=f)
            print("done")


def _pad_to_three(values, filler, label):
    """Return ``values`` as a 3-item list, padding missing trailing entries with ``filler``."""
    count = len(values)
    if not 1 <= count <= 3:
        raise ValueError(f"{label} must contain 1 to 3 entries, got {count}")
    return list(values) + [filler] * (3 - count)
        



In [8]:
## Prepare data for Experiment 2 - Avalanche or Not

# Experiment 2 input: copy every image under the "none" folder into the
# experiment's negative pool, leaving the originals in place.
s3_bucket_name = "s3-avalanche-guard"
src_folder = src_prefix = "data/cv/uibk/images/none/"
new_folder = "data/experiments/exp02-binary-avalanche/orig/negative/"

# Object names come back relative to src_prefix.
img_list = list_all_s3_objects(bucket_name=s3_bucket_name, prefix=src_prefix)

for idx, object_name in enumerate(img_list):
    print(f"now moving {idx}   image",end='\r')
    # deletesource=False turns the move into a copy.
    s3_move_to_another_folder(
        bucket=s3_bucket_name,
        old_folder=src_folder,
        new_folder=new_folder,
        object_name=object_name,
        deletesource=False,
    )

now moving 1070   image

In [11]:

s3_bucket_name = "s3-avalanche-guard"

# Settings and listings from earlier runs, kept for reference:
##source_folder = "data/cv/uibk/ResNetClassify/train/"
##dest_folder = "data/cv/uibk/ResNetClassify/val/"
###print_all_s3_objects(s3_bucket_name,"data/experiments/exp01-terrain-binary/positive/")
###print_all_s3_objects(s3_bucket_name,"data/experiments/exp01-terrain-binary/negative/")
print("")
print("")

# Set SKIP to True to re-run this cell without moving anything.
SKIP = False
print("")
print(SKIP)
if not SKIP:
    # Split the experiment-2 negative pool 70/20/10 into train/val/test.
    split_train_val(
        s3_bucket_name=s3_bucket_name,
        source_folder_name="data/experiments/exp02-binary-avalanche/orig/negative/",
        train_val_testfolders=[
            "data/experiments/exp02-binary-avalanche/train/negative/",
            "data/experiments/exp02-binary-avalanche/val/negative/",
            "data/experiments/exp02-binary-avalanche/test/negative/",
        ],
        train_val_test_pct=[0.70, 0.20, 0.10],
    )
else:
    print('Skipping the train/val split step')




False
doneng 2015-03-31 karnischer kamm (3).jpg from data/experiments/exp02-binary-avalanche/orig/negative/   to    data/experiments/exp02-binary-avalanche/train/negative/          
doneng 2018-01-22 hohe geige (9).jpg from data/experiments/exp02-binary-avalanche/orig/negative/   to    data/experiments/exp02-binary-avalanche/train/negative/          
doneng 2020-01-30 hochgurgl (14).jpg from data/experiments/exp02-binary-avalanche/orig/negative/   to    data/experiments/exp02-binary-avalanche/train/negative/          
doneng 2017-02-23 jamtal (12).jpg from data/experiments/exp02-binary-avalanche/orig/negative/   to    data/experiments/exp02-binary-avalanche/train/negative/          
doneng 2015-03-06 rietzer grieskogel (3).jpg from data/experiments/exp02-binary-avalanche/orig/negative/   to    data/experiments/exp02-binary-avalanche/train/negative/          
doneng 2019-03-19 jamtalhuette (10).jpg from data/experiments/exp02-binary-avalanche/orig/negative/   to    data/experiments/e