In [1]:
import numpy as np, pandas as pd
import os, shutil
from distutils.dir_util import copy_tree

In [2]:
from IPython.utils import io

In [3]:
# --- Host-side input / output locations -------------------------------------
# Root folder that holds one sub-folder per dataset (read by copy_dataset_files).
src_path = './source_data/'
# Root folder under which save_results stores the per-algorithm results.
final_results_dir = 'all_results'
# Prefix used by save_results when naming each algorithm's results folder.
model_category = 'rec_base'

# --- Shared volume ----------------------------------------------------------
# Local folder that initiate_shared_vol (re)creates and that is handed to
# score_algos.sh through its -v option.
shared_vol_name = 'ml_vol'
# Where the train/test inputs are copied inside the shared volume.
dest_data_path = os.path.join(shared_vol_name, 'data')
# NOTE(review): not referenced by any cell visible here — presumably the mount
# point inside the container; confirm before relying on it.
dest_dir_path = '/ml'

In [4]:
def clear_directory(folder):
    """Remove everything inside ``folder`` while keeping ``folder`` itself.

    Does nothing when ``folder`` does not exist. Deletion is best-effort: an
    entry that cannot be removed is reported with ``print`` and the remaining
    entries are still processed.
    """
    if not os.path.exists(folder):
        return

    for entry_name in os.listdir(folder):
        entry_path = os.path.join(folder, entry_name)
        # Symlinks are unlinked (never followed), even when they point at a directory.
        is_plain_entry = os.path.isfile(entry_path) or os.path.islink(entry_path)
        try:
            if is_plain_entry:
                os.unlink(entry_path)
            elif os.path.isdir(entry_path):
                shutil.rmtree(entry_path)
        except Exception as e:
            print('Failed to delete %s. Reason: %s' % (entry_path, e))

In [5]:
def initiate_shared_vol():
    """Reset the shared volume folder to an empty, freshly laid-out state.

    Creates ``shared_vol_name`` if it is missing, otherwise empties it with
    ``clear_directory``; then creates the ``data``, ``logs``, ``model`` and
    ``output`` sub-folders plus ``data/train`` and ``data/test``.
    """
    # Run this as-is. Do not change any folder names!!!
    if not os.path.exists(shared_vol_name):
        os.mkdir(shared_vol_name)
    else:
        clear_directory(shared_vol_name)

    # Top-level layout of the volume.
    for top_level in ('data', 'logs', 'model', 'output'):
        os.mkdir(os.path.join(shared_vol_name, top_level))

    # Input splits live under data/.
    for split in ('train', 'test'):
        os.mkdir(os.path.join(shared_vol_name, 'data', split))

In [6]:
def _copy_dir_contents(src_dir, dest_dir):
    """Recursively copy the contents of ``src_dir`` into ``dest_dir``, merging with existing content.

    Replaces ``distutils.dir_util.copy_tree``, which remembers every directory
    it has created in a process-wide cache and therefore skips re-creating a
    sub-folder that was deleted in the meantime (as happens when the shared
    volume is wiped between runs), making the following file copy fail.

    Raises:
        NotADirectoryError: if ``src_dir`` is not a directory.
    """
    if not os.path.isdir(src_dir):
        raise NotADirectoryError(f"cannot copy tree '{src_dir}': not a directory")

    # followlinks=True keeps copy_tree's default of descending into symlinked directories.
    for current_dir, _subdirs, filenames in os.walk(src_dir, followlinks=True):
        rel_dir = os.path.relpath(current_dir, src_dir)
        target_dir = os.path.normpath(os.path.join(dest_dir, rel_dir))
        # Always checked against the real filesystem, so empty and previously
        # deleted folders are (re)created.
        os.makedirs(target_dir, exist_ok=True)
        for filename in filenames:
            # copy2 preserves mode and timestamps, like copy_tree's defaults.
            shutil.copy2(os.path.join(current_dir, filename),
                         os.path.join(target_dir, filename))


def copy_dataset_files(dataset):
    """Copy a dataset's processed train/test files into the shared volume.

    Reads from ``<src_path>/<dataset>/processed/{train,test}`` and writes to
    ``<dest_data_path>/{train,test}``.

    Args:
        dataset: name of the dataset folder under ``src_path``.
    """
    src_dataset_path = os.path.join(src_path, dataset, 'processed')
    for input_type in ['train', 'test']:
        full_src = os.path.join(src_dataset_path, input_type)
        full_dest = os.path.join(dest_data_path, input_type)

        # A split that is missing from the source is skipped silently.
        if os.path.exists(full_src):
            _copy_dir_contents(full_src, full_dest)

In [7]:
def save_results(algo_name, dataset):
    """Copy the ``model`` and ``output`` folders of the shared volume into the results tree.

    Results land in
    ``<final_results_dir>/<model_category>_<algo_name>/<dataset>/{model,output}``.
    Any results already stored there for the same algorithm and dataset are
    replaced.

    Args:
        algo_name: algorithm name used to build the results folder name.
        dataset: dataset name used as the sub-folder inside the algorithm folder.
    """
    # NOTE(review): with the values used in this notebook the folder becomes
    # 'rec_base_rec_base_mf' (the prefix appears twice) — confirm that is intended.
    algo_dir = os.path.join(final_results_dir, f'{model_category}_{algo_name}')

    # dataset sub dir inside of the algo directory; makedirs also creates
    # final_results_dir and algo_dir when they do not exist yet
    dataset_dir = os.path.join(algo_dir, dataset)
    os.makedirs(dataset_dir, exist_ok=True)

    for subdir in ['model', 'output']:
        from_path = os.path.join(shared_vol_name, subdir)
        to_path = os.path.join(dataset_dir, subdir)

        # Drop stale results first: shutil.copytree needs a non-existing target,
        # and unlike distutils' copy_tree it keeps no cache of directories it
        # created earlier, so re-running for the same algo/dataset works.
        if os.path.exists(to_path):
            shutil.rmtree(to_path)
        shutil.copytree(from_path, to_path)

# save_results("rec_base_mf", "jester")

In [8]:
# Datasets to run; each name is a folder under src_path.
# Uncomment an entry to include it in the run.
datasets = [
    # "jester",
    # "anime",
    # "book-crossing",
    # "movielens-10m",
    # "movielens-20m",
    "celal_data",
]

# Algorithms to run: the name passed to the scripts with -a and the image passed with -i.
# Uncomment an entry to include it in the run.
algos = [
    {"algo_name": "rec_base_mf", "image": "abudesai/rec_base_mf:latest"},
    # {"algo_name": "rec_base_mf_res", "image": "abudesai/rec_base_mf_res:latest"},
    # {"algo_name": "rec_base_autorec", "image": "abudesai/rec_base_autorec:latest"},
]

In [10]:
%%time
for algo in algos: 
    for dataset in datasets:
        
        algo_name = algo["algo_name"]
        image = algo["image"]
        
        print("Running....")
        print('dataset:', dataset); print('image:', image); print('algo_name:', algo_name)
        
        initiate_shared_vol()
        copy_dataset_files(dataset)        
        
#         with io.capture_output() as captured:
        !bash score_algos.sh  -i $image -a $algo_name -v $shared_vol_name
        
        save_results(algo_name, dataset)
        print("Finished running on dataset...")
        print("-"*80)
print("all done")

Running....
dataset: celal_data
image: abudesai/rec_base_mf:latest
algo_name: rec_base_mf


Error response from daemon: No such container: rec_base_mf
Error: No such container: rec_base_mf
2022-01-25 17:26:22.545371: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-01-25 17:26:22.545411: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-01-25 17:26:23.782302: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-01-25 17:26:23.782339: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-01-25 17:26:23.782366: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (315e885791f7): /proc/driv

host_vol_path: /mnt/c/Users/abbus/Documents/GitRepos/ready_tensor/Recommenders/recommender_images/recommender_base/ml_vol
container_vol_path: /app/ml_vol
315e885791f79124106fd356a0b9ad8478fa3b4f8ad58acc050f2656689f0b37
Starting the training process...
orig_train_data shape: (698901, 4)
Training Matrix_Factorizer ...
Pre-processing data...
Preprocessing train_data ...
Training model ...
Epoch 1/30

   1/4915 [..............................] - ETA: 27:30 - loss: 0.8882 - mae: 0.9300
   8/4915 [..............................] - ETA: 38s - loss: 0.8615 - mae: 0.9076  
  17/4915 [..............................] - ETA: 33s - loss: 0.8669 - mae: 0.9117
  27/4915 [..............................] - ETA: 30s - loss: 0.8599 - mae: 0.9077

Epoch 12/30

   1/4915 [..............................] - ETA: 23s - loss: 0.0355 - mae: 0.1404
  14/4915 [..............................] - ETA: 19s - loss: 0.0420 - mae: 0.1472
  27/4915 [..............................] - ETA: 19s - loss: 0.0425 - mae: 0.1473
  40/4915 [..............................] - ETA: 19s - loss: 0.0423 - mae: 0.1469
  53/4915 [..............................] - ETA: 19s - loss: 0.0428 - mae: 0.1475
  66/4915 [..............................] - ETA: 19s - loss: 0.0440 - mae: 0.1489

Epoch 22/30

   1/4915 [..............................] - ETA: 19s - loss: 0.0312 - mae: 0.1310
  14/4915 [..............................] - ETA: 20s - loss: 0.0300 - mae: 0.1185
  27/4915 [..............................] - ETA: 20s - loss: 0.0320 - mae: 0.1214
  40/4915 [..............................] - ETA: 20s - loss: 0.0325 - mae: 0.1214
  53/4915 [..............................] - ETA: 20s - loss: 0.0319 - mae: 0.1205
  66/4915 [..............................] - ETA: 20s - loss: 0.0321 - mae: 0.1212

In [None]:
%%time
for algo in algos: 
    algo_name = algo["algo_name"]
    image = algo["image"]    
    !bash remove_container_and_image.shcontainer_and_image.sh  -i $image -a $algo_name 