In [237]:
import boto3
import numpy as np
import concurrent.futures as fs
import io 
import linalg
import random
import string
import hashlib
import time
import os

In [445]:
def upload_data(total_mb, mb_per_file, bucket, worker):
    client = boto3.client('s3')
    hashes = {}
    num_files = int(np.ceil(total_mb/mb_per_file))
    np.random.seed(worker)
    rng_time = 0
    for n in range(num_files):
        key = "s3_benchmark_{0}_{1}".format(worker, n)
        s = time.time()
        data = np.random.randn(int((mb_per_file * 1e6)/8))
        hash_key = md5_hash_array(data)
        key = str(hash_key) + key 
        hashes[key] = hash_key
        e = time.time()
        client.put_object(Key=key, Bucket=bucket, Body=data.tostring())
        rng_time += e - s
    return hashes, rng_time

def md5_hash_array(x):
     return hashlib.md5(x.tostring()).hexdigest()
    
def download_keys(keys, bucket, worker, num_workers, total_mb, hashes, mb_per_process):
    client = boto3.client('s3')
    m = np.memmap("/dev/shm/matrix", dtype='float64', mode='r+', shape=(int((total_mb * 1e6)/8)))

    doubles_per_process = int((mb_per_process * 1e6)/8)
    mbc = mb_per_process/len(keys)
    idx = (worker*doubles_per_process)
    for i,key in enumerate(keys):
        obj = client.get_object(Key=key, Bucket=bucket)
        array = np.fromstring(obj['Body'].read())
        this_hash = md5_hash_array(array)
        assert this_hash == hashes[key]
        assert (not np.all(array == 0))
        m[idx:idx+len(array)] = array
        idx += len(array)
    m.flush()
    return 0

def clean_up(keys, client, bucket):
    for key in keys:
        client.delete_object(Bucket=bucket, Key=key)

In [204]:
upload_data(640,64,bucket,0)

[]

In [239]:
buffer = X
idx = 0

In [193]:
%time generate_random(64,10)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 58.4 µs


[array([-0.29180369,  1.26869724, -0.7464766 , ..., -0.06643731,
        -0.32220012,  0.06772099]),
 array([ 0.65249473,  0.62574683,  1.32867326, ...,  0.16590717,
        -1.09062257,  2.24514385]),
 array([-1.32026079, -0.47659805,  0.35059912, ...,  1.84824013,
        -0.26889658,  0.35529808]),
 array([ 0.69446594,  0.7202985 ,  0.72367283, ..., -2.08949591,
         0.47908934, -1.54728588]),
 array([ 0.48580424,  0.61557938,  0.46349266, ..., -1.144108  ,
         0.50400425, -1.05399785]),
 array([-1.57574107, -1.48795276, -0.12902127, ..., -0.06062828,
        -0.94697029,  0.0964624 ]),
 array([-1.39338792,  0.20887897, -0.43614628, ..., -0.32307805,
         0.38768403, -0.82191389]),
 array([-1.82783571, -1.00325585, -0.83595186, ..., -0.80306042,
         0.53820541,  1.29139682]),
 array([-1.13231237,  0.12396602, -0.2952104 , ..., -0.18426858,
        -0.07783604,  0.62999944]),
 array([-0.08132674,  1.79362581,  0.69062937, ...,  0.03914539,
        -0.38425125, -0.63

In [192]:
upload_data(data, bucket)

{'b6935bee1b1bfd2f4dbd9d66aba5171f': 'b6935bee1b1bfd2f4dbd9d66aba5171f'}

In [196]:
client = boto3.client('s3')
bucket = "vaishaalpywrenlinalg"
max_workers = 2
mb_per_file = 64
total_mb = 1024*2
mb_per_process = int(np.ceil(total_mb/max_workers))
print(mb_per_process)

1024


In [417]:
def benchmark_upload(max_workers=max_workers, total_mb=total_mb, mb_per_file=mb_per_file):
    mb_per_process = int(np.ceil(total_mb/max_workers))
    with fs.ProcessPoolExecutor(max_workers=max_workers) as executor:
        key_futures = {}
        all_futures = []
        start = time.time() 
        num_files = int(np.ceil(mb_per_process/mb_per_file))
        print(num_files, "per worker")
        for worker in range(max_workers):
            future = executor.submit(upload_data, mb_per_process, mb_per_file, bucket, worker)
            key_futures[worker] = future
            all_futures.append(future)
        
        print(len(all_futures))
        fs.wait(all_futures)
        print(all_futures[0].result())
        worker_hash_dicts, times = zip(*map(lambda x: x.result(), all_futures))
        print(list(worker_hash_dicts))
        hash_dict = {}
        [hash_dict.update(d) for d in worker_hash_dicts]
        avg_rng_time = sum(times)/max_workers
        end = time.time()
        total_upload_MBPS = total_mb/((end - start) - avg_rng_time)
        print("total upload MBPS was {0}".format(total_upload_MBPS))
        print("Time to upload {0} MB was {1}".format(total_mb, end-start))
        return hash_dict
    

In [425]:
def benchmark_download(keys, max_workers, hashes, mb_per_file=12):
    total_mb = len(keys)*mb_per_file
    mb_per_process = int(np.ceil(total_mb/max_workers))
    chunk_size = int(np.ceil(len(keys)/max_workers))
    chunks = list(linalg.chunk(keys, chunk_size))
    print(len(chunks))
    print(total_mb)
    with fs.ProcessPoolExecutor(max_workers=max_workers) as executor:
        M = np.memmap("/dev/shm/matrix", dtype='float64', mode='w+', shape=(int((total_mb * 1e6)/8)))
        all_object_futures = []
        start = time.time()
        for worker, chunk_keys in enumerate(chunks):  
            object_futures = executor.submit(download_keys, chunk_keys, bucket, worker, max_workers, total_mb, hashes, mb_per_process)
            all_object_futures.append(object_futures)
        fs.wait(all_object_futures)
        end = time.time()
        total_download_MBPS = total_mb/(end - start)
        print("total download MBPS was {0}".format(total_download_MBPS))
        print("Time to download {0} MB was {1}".format(total_mb, end-start))
        M = np.memmap("/dev/shm/matrix", dtype='float64', mode='r+', shape=(int((total_mb * 1e6)/8)))
        return M

In [467]:
%time hashes = benchmark_upload(max_workers=128, total_mb=1024*20, mb_per_file=128)

2 per worker
128
({'c8e54e4e469838198e9303f7d15155efs3_benchmark_0_0': 'c8e54e4e469838198e9303f7d15155ef', 'a6b63d16fee28aa28d84e122027c1e06s3_benchmark_0_1': 'a6b63d16fee28aa28d84e122027c1e06'}, 2.945990800857544)
[{'c8e54e4e469838198e9303f7d15155efs3_benchmark_0_0': 'c8e54e4e469838198e9303f7d15155ef', 'a6b63d16fee28aa28d84e122027c1e06s3_benchmark_0_1': 'a6b63d16fee28aa28d84e122027c1e06'}, {'59ba9198005fe560cd8db3bb3d7481bbs3_benchmark_1_0': '59ba9198005fe560cd8db3bb3d7481bb', 'c2a49db8faa1062aeae3f3d2df7ae588s3_benchmark_1_1': 'c2a49db8faa1062aeae3f3d2df7ae588'}, {'87c4739b3349ada3e0550539181d1e4ds3_benchmark_2_0': '87c4739b3349ada3e0550539181d1e4d', '82443b74685cff5fc48bba4857b84b4bs3_benchmark_2_1': '82443b74685cff5fc48bba4857b84b4b'}, {'674c38166bc7c906d608cbdad78b522as3_benchmark_3_0': '674c38166bc7c906d608cbdad78b522a', 'b04a916bed2be2f3b1f26c5e6639a286s3_benchmark_3_1': 'b04a916bed2be2f3b1f26c5e6639a286'}, {'d70b58052ecb348ed8494f9661f8f8ffs3_benchmark_4_0': 'd70b58052ecb348ed8

In [468]:
keys = list(map(lambda x: x['Key'], client.list_objects(Bucket="vaishaalpywrenlinalg")['Contents']))

In [470]:
M = benchmark_download(keys[:31], max_workers=32, hashes=hashes, mb_per_file=128)

32
32768
total download MBPS was 588.7694334354203
Time to download 32768 MB was 55.65506315231323
