In [2]:
import numpy as np, pandas as pd
import os, shutil
from distutils.dir_util import copy_tree

In [3]:
from IPython.utils import io

In [25]:
# Root folder of the input datasets; copy_dataset_files() reads
# <src_path>/<dataset>/processed/{train,test} from it.
src_path = './source_data/'
# Local folder used as the shared volume: (re)built by initiate_shared_vol()
# and handed to score_algos.sh through its -v option.
shared_vol_name = 'ml_vol'
# NOTE(review): not referenced by any visible cell -- presumably the mount
# point of the shared volume inside the container; confirm before relying on it.
dest_dir_path = '/ml'
# Folder inside the shared volume that receives the dataset files.
dest_data_path = os.path.join(shared_vol_name, 'data')

# Root folder under which save_results() stores <algo_name>/<dataset>/{model,output}.
final_results_dir = 'all_results'

In [26]:
def clear_directory(folder):
    """Remove everything inside ``folder`` while keeping ``folder`` itself.

    Regular files and symlinks are unlinked; sub-directories are deleted
    recursively. Nothing happens when ``folder`` does not exist. A failure on
    one entry is printed and the remaining entries are still processed.
    """
    if os.path.exists(folder):
        for entry in os.listdir(folder):
            target = os.path.join(folder, entry)
            try:
                # Links are checked together with files so a symlink pointing
                # at a directory is unlinked rather than followed.
                if os.path.islink(target) or os.path.isfile(target):
                    remover = os.unlink
                elif os.path.isdir(target):
                    remover = shutil.rmtree
                else:
                    continue
                remover(target)
            except Exception as err:
                print('Failed to delete %s. Reason: %s' % (target, err))

In [27]:
def initiate_shared_vol():
    """Reset the shared volume folder to an empty, freshly laid-out state.

    The folder named by ``shared_vol_name`` is created if missing, otherwise
    emptied with ``clear_directory``. The fixed sub-folder layout is then
    created: ``data``, ``logs``, ``model``, ``output``, ``data/train`` and
    ``data/test``.
    """
    # Run this as-is: the folder names below must not be changed.
    if not os.path.exists(shared_vol_name):
        os.mkdir(shared_vol_name)
    else:
        clear_directory(shared_vol_name)

    # Parents are listed before their children because os.mkdir does not
    # create intermediate directories.
    layout = (
        ('data',),
        ('logs',),
        ('model',),
        ('output',),
        ('data', 'train'),
        ('data', 'test'),
    )
    for parts in layout:
        os.mkdir(os.path.join(shared_vol_name, *parts))

In [28]:
def copy_dataset_files(dataset, src_root=None, dest_root=None):
    """Copy a dataset's processed ``train``/``test`` folders into the shared volume.

    For each split, the tree at ``<src_root>/<dataset>/processed/<split>`` is
    merged into ``<dest_root>/<split>``. A split whose source path does not
    exist is skipped.

    Args:
        dataset: Name of the dataset folder under ``src_root``.
        src_root: Root folder of the source datasets. Defaults to the
            module-level ``src_path``.
        dest_root: Folder receiving the ``train``/``test`` sub-folders.
            Defaults to the module-level ``dest_data_path``.
    """
    # Defaults are resolved at call time so later changes to the notebook's
    # configuration cell are honoured.
    if src_root is None:
        src_root = src_path
    if dest_root is None:
        dest_root = dest_data_path

    src_dataset_path = os.path.join(src_root, dataset, 'processed')
    for input_type in ('train', 'test'):
        full_src = os.path.join(src_dataset_path, input_type)
        full_dest = os.path.join(dest_root, input_type)

        if os.path.exists(full_src):
            # shutil.copytree (Python 3.8+ for dirs_exist_ok) replaces
            # distutils' copy_tree: distutils is gone from the stdlib in
            # Python 3.12, and copy_tree remembers directories it created, so
            # it can fail after the destination has been wiped and rebuilt.
            shutil.copytree(full_src, full_dest, dirs_exist_ok=True)

In [29]:
def save_results(algo_name, dataset, results_root=None, vol_root=None):
    """Copy the ``model`` and ``output`` folders of the shared volume into the results tree.

    Results land in ``<results_root>/<algo_name>/<dataset>/{model,output}``.
    Missing parent folders are created; an existing target folder is emptied
    first so results of an earlier run do not mix with the new ones.

    Args:
        algo_name: Name of the algorithm; first level under ``results_root``.
        dataset: Name of the dataset; second level under ``results_root``.
        results_root: Root folder of the results tree. Defaults to the
            module-level ``final_results_dir``.
        vol_root: Shared volume folder holding ``model`` and ``output``.
            Defaults to the module-level ``shared_vol_name``.

    Raises:
        OSError: If a source folder is missing or a copy fails.
    """
    # Defaults are resolved at call time so later changes to the notebook's
    # configuration cell are honoured.
    if results_root is None:
        results_root = final_results_dir
    if vol_root is None:
        vol_root = shared_vol_name

    # makedirs also creates the results root and the algorithm folder, so a
    # fresh checkout without the results root no longer fails after the run.
    dataset_dir = os.path.join(results_root, algo_name, dataset)
    os.makedirs(dataset_dir, exist_ok=True)

    for subdir in ('model', 'output'):
        from_path = os.path.join(vol_root, subdir)
        to_path = os.path.join(dataset_dir, subdir)
        if os.path.exists(to_path):
            clear_directory(to_path)
        os.makedirs(to_path, exist_ok=True)

        # shutil.copytree (Python 3.8+ for dirs_exist_ok) replaces distutils'
        # copy_tree, which is gone from the stdlib in Python 3.12 and can fail
        # on a target that was cleared after an earlier copy.
        shutil.copytree(from_path, to_path, dirs_exist_ok=True)

# save_results("rec_base_mf", "jester")

In [30]:
# Dataset folder names under src_path to run; commented-out entries are
# currently disabled.
datasets = [
        "jester", 
    #     "anime", 
    #     "book-crossing", 
    #     "movielens-10m", 
    #     "movielens-20m"
    ]

# Algorithms to score. Each entry provides:
#   "algo_name": passed to score_algos.sh via -a and used as the results folder name
#   "image":     passed to score_algos.sh via -i
algos = [
#    {
#        "algo_name": "rec_base_mf",
#        "image":  "abudesai/rec_base_mf:latest",
#    },
    {
       "algo_name": "rec_base_mf_res",
       "image":  "abudesai/rec_base_mf_res:latest",
   },
    {
       "algo_name": "rec_base_autorec",
       "image":  "abudesai/rec_base_autorec:latest",
   },    
    
]

In [31]:
%%time
for algo in algos: 
    for dataset in datasets:
        
        algo_name = algo["algo_name"]
        image = algo["image"]
        
        print("Running....")
        print('dataset:', dataset); print('image:', image); print('algo_name:', algo_name)
        
        initiate_shared_vol()
        copy_dataset_files(dataset)        
        
        with io.capture_output() as captured:
            !bash score_algos.sh  -i $image -a $algo_name -v $shared_vol_name
        
        save_results(algo_name, dataset)
        print("Finished running on dataset...")
        print("-"*80)
print("all done")

Running....
dataset: jester
image: abudesai/rec_base_mf_res:latest
algo_name: rec_base_mf_res
--------------------------------------------------------------------------------
Running....
dataset: jester
image: abudesai/rec_base_autorec:latest
algo_name: rec_base_autorec
--------------------------------------------------------------------------------
all done
Wall time: 17min 5s
