In [2]:
import pywren
import numpy as np
import scipy.linalg as linalg
import boto3
import io
import itertools
from sklearn.datasets import fetch_mldata
from numba import jit
import math
import time
from sklearn.kernel_approximation import RBFSampler


In [3]:
pwex = pywren.default_executor()

In [234]:
BUCKET = "imagenet-raw"
BLOCK_PATH_ROOT = "mnist-blocked/"
MODEL_PATH_ROOT = "mnist-kernel-model/"
KERNEL_BLOCK_PATH_ROOT = "mnist-kernel-blocked/"
KEY_TEMPLATE = BLOCK_PATH_ROOT + "mnist_blocked_{0}_{1}_replicate_{2}.npy"
LABEL_KEY_TEMPLATE = BLOCK_PATH_ROOT + "mnist_labels_blocked_{0}_{1}_replicate_{2}.npy"
MODEL_KEY_TEMPLATE = MODEL_PATH_ROOT + "x_{0}_{1}_replicate_{2}.npy"
OUT_KEY_TEMPLATE = KERNEL_BLOCK_PATH_ROOT + "mnist_kernel_block_size_{0}_gamma_{1}_block_{2}_{3}_{4}_{5}"


GAMMA=1e-3
BLOCK_SIZE = 4096
NUM_EXAMPLES = 60000
REPLICATION_FACTOR = 1
NUM_BLOCKS = int(math.ceil(60000.0/BLOCK_SIZE))
NUM_CLASSES = 10

In [269]:
def make_fast_exp():
    from numba import jit
    @jit
    def fast_exp(K):
        for i in range(K.shape[0]):
            for j in range(K.shape[1]):
                K[i,j] = math.exp(K[i,j])
        return K
    return fast_exp

def chunk(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i + n]


def computeDistanceMatrix(XTest, XTrain):
    XTrain = XTrain.reshape(XTrain.shape[0], -1)
    XTest = XTest.reshape(XTest.shape[0], -1)
    XTrain_norms = (np.linalg.norm(XTrain, axis=1) ** 2)[:, np.newaxis]
    XTest_norms = (np.linalg.norm(XTest, axis=1) ** 2)[:, np.newaxis]
    K = XTest.dot(XTrain.T)
    K *= -2
    K += XTrain_norms.T
    K += XTest_norms
    return K

def computeRBFGramMatrix(XTest, XTrain, gamma=1):
    gamma = -1.0 * gamma
    fast_exp = make_fast_exp()
    return fast_exp(gamma*computeDistanceMatrix(XTest, XTrain))

def compute_kernel_blocks(block_pairs, bucket=BUCKET, block_size=BLOCK_SIZE, gamma=GAMMA, num_examples=NUM_EXAMPLES, replication_factor=REPLICATION_FACTOR):
    times = np.zeros(4)
    for bp in block_pairs:
        time, _ = compute_kernel_block(bp, bucket, block_size, gamma, num_examples, replication_factor)
        times += time
        
    times /= float(len(block_pairs))
    return times

def compute_kernel_block(block_nums, bucket=BUCKET, block_size=4096, 
                         gamma=1e-3, 
                         num_examples=NUM_EXAMPLES, 
                         replication_factor=REPLICATION_FACTOR, 
                         verbose_return=False):
    ''' Compute a kernel block when design matrix is sharded on s3 '''
    start = time.time()
    block_num_1, block_num_2 = block_nums[0], block_nums[1]
    # blocks are symmetric so only generate lower half
    block_num_1, block_num_2 = min(block_num_1, block_num_2), max(block_num_1, block_num_2)
    block_1_key = block_num_to_shard_key(block_num_1, block_size, num_examples, replication_factor)
    block_2_key = block_num_to_shard_key(block_num_2, block_size, num_examples, replication_factor)
    meta_time = time.time() - start
    start = time.time()
    block1 = np.load(s3_key_to_byte_io(bucket, block_1_key))
    block2 = np.load(s3_key_to_byte_io(bucket, block_2_key))
    download_time = time.time() - start
    start = time.time() 
    K = computeRBFGramMatrix(block1, block2, gamma=gamma)
    kernel_time  = time.time() - start
    
    start = time.time()
    out_key = block_num_to_output_shard_key(block_num_1, block_num_2, block_size, gamma, num_examples)
    save_matrix_to_s3(K, bucket, out_key)
    upload_time = time.time() - start
    if verbose_return:
        out = K
    else:
        out = None
   
    return (np.array([meta_time, download_time, kernel_time, upload_time]), out)
    
def s3_key_to_byte_io(bucket, key):
     client = boto3.client('s3')
     print(key)
     return io.BytesIO(client.get_object(Bucket=bucket, Key=key)['Body'].read())

def block_num_to_shard_key(block_num, block_size, num_examples, replication_factor=1,key_template=KEY_TEMPLATE):
    block_start = block_size*block_num
    block_end = min(block_size*(block_num+1), num_examples)
    replicate = np.random.choice(replication_factor, 1)[0]
    key = key_template.format(block_start, block_end, replicate)
    return key 

def block_num_to_output_shard_key(block_num_1, block_num_2, block_size, gamma, num_examples, key_template=OUT_KEY_TEMPLATE):
    block_start_idx_1 = block_size*block_num_1
    block_end_idx_1 = min(block_size*(block_num_1+1), num_examples)
    block_start_idx_2 = block_size*block_num_2
    block_end_idx_2 = min(block_size*(block_num_2+1), num_examples)
    return OUT_KEY_TEMPLATE.format(block_size, 
                                   gamma, 
                                   block_start_idx_1, 
                                   block_end_idx_1, 
                                   block_start_idx_2, 
                                   block_end_idx_2)

def save_matrix_to_s3(K, bucket, out_key):
     client = boto3.client('s3')
     outb = io.BytesIO()
     np.save(outb, K)
     response = client.put_object(Key=out_key, Bucket=bucket, Body=outb.getvalue())
     return response
    
def generate_chunked_block_pairs(num_blocks, inner_chunk_size=10, outer_chunk_size=1000):
    all_pairs = list(itertools.product(range(NUM_BLOCKS), range(NUM_BLOCKS)))
    sorted_pairs = map(lambda x: tuple(sorted(x)), all_pairs)
    dedup_sorted_pairs = list(set(sorted_pairs))
    print len(dedup_sorted_pairs)
    return list(chunk(list(chunk(dedup_sorted_pairs, inner_chunk_size)), outer_chunk_size))

def get_kernel_block(block_x, block_y, 
                     block_size=BLOCK_SIZE, 
                     gamma=GAMMA, 
                     num_examples=NUM_EXAMPLES, 
                     key_template=OUT_KEY_TEMPLATE, 
                     bucket=BUCKET):
    
    '''Get BLOCK_SIZE x BLOCK_SIZE submatrix from kernel matrix'''
  
    transpose = False
    if (block_y < block_x):
        transpose = True
        block_x, block_y = block_y, block_x
    
    block_key = block_num_to_output_shard_key(block_x, block_y, block_size, gamma, num_examples, key_template)
    K_block = np.load(s3_key_to_byte_io(bucket, block_key))
    if (transpose):
        K_block = K_block.T
    return K_block




def get_matrix_block(block,
                     block_size=BLOCK_SIZE,
                     gamma=GAMMA,
                     num_examples=NUM_EXAMPLES,
                     key_template=OUT_KEY_TEMPLATE,
                     replication_factor=REPLICATION_FACTOR,
                     bucket=BUCKET):
    '''Get BLOCK_SIZE x d submatrix from kernel matrix'''
    block_key = block_num_to_shard_key(block, block_size, num_examples, replication_factor, key_template)
    matrix_block = np.load(s3_key_to_byte_io(bucket, block_key))
    return matrix_block
    
    
    


def shard_matrix(X, block_size, key_template, bucket=BUCKET, replication_factor=3):
    N = X.shape[0]
    responses = []
    client = boto3.client('s3')
    for i in range(0, X.shape[0], block_size):
        print(i)
        X_block = X[i:min(i+block_size, N)]
        outb = io.BytesIO()
        np.save(outb, X_block)
        for j in range(replication_factor):
            r = client.put_object(Key=key_template.format(i, min(i+block_size, N), j), Bucket=bucket, Body=outb.getvalue())
        responses.append(r)
    return responses

        

In [19]:
mndata = MNIST('/data/vaishaal/mnist')
train_data = mndata.load_training()
test_data = mndata.load_testing()
X_train = np.array(train_data[0]).astype('float32')/255.0
y_train = np.array(train_data[1]).astype('float32')
X_test = np.array(test_data[0]).astype('float32')/255.0
y_test = np.array(test_data[1]).astype('float32')
y_train_one_hot = np.eye(10)[y_train.astype('int')]
y_test_one_hot = np.eye(10)[y_test.astype('int')]

In [146]:
shard_matrix(X_train, BLOCK_SIZE, KEY_TEMPLATE, replication_factor=REPLICATION_FACTOR)

0
4096
8192
12288
16384
20480
24576
28672
32768
36864
40960
45056
49152
53248
57344


[{u'ETag': '"2966c58856e5bb06cb755dd48940be50"',
  'ResponseMetadata': {'HTTPHeaders': {'content-length': '0',
    'date': 'Wed, 08 Mar 2017 16:52:49 GMT',
    'etag': '"2966c58856e5bb06cb755dd48940be50"',
    'server': 'AmazonS3',
    'x-amz-id-2': 'XXRZpPaGbEsIRf6sWCgCfBaDPUHXAqYhXMGdJCS2gwiKOcH1258ZzqUml5KoMWwDfBp8eIYyBq4=',
    'x-amz-request-id': '265662B704AB12C0'},
   'HTTPStatusCode': 200,
   'HostId': 'XXRZpPaGbEsIRf6sWCgCfBaDPUHXAqYhXMGdJCS2gwiKOcH1258ZzqUml5KoMWwDfBp8eIYyBq4=',
   'RequestId': '265662B704AB12C0',
   'RetryAttempts': 0}},
 {u'ETag': '"9802d053002125b1fc4774b9d81624a2"',
  'ResponseMetadata': {'HTTPHeaders': {'content-length': '0',
    'date': 'Wed, 08 Mar 2017 16:52:51 GMT',
    'etag': '"9802d053002125b1fc4774b9d81624a2"',
    'server': 'AmazonS3',
    'x-amz-id-2': 'bgNiXH/ch7ulufcON4DcqjZ/VmAmLBYB4pacBwQ9tk0hipAofVxpCBGc/ho9ZzYWXrxKZl1I9XQ=',
    'x-amz-request-id': '133CAE0C66F71AC2'},
   'HTTPStatusCode': 200,
   'HostId': 'bgNiXH/ch7ulufcON4DcqjZ/VmAmLB

In [245]:
shard_matrix(y_train, BLOCK_SIZE, LABEL_KEY_TEMPLATE)

0
4096
8192
12288
16384
20480
24576
28672
32768
36864
40960
45056
49152
53248
57344


[{u'ETag': '"ee088d1cac3adff4aaccc308e699633a"',
  'ResponseMetadata': {'HTTPHeaders': {'content-length': '0',
    'date': 'Wed, 08 Mar 2017 19:13:27 GMT',
    'etag': '"ee088d1cac3adff4aaccc308e699633a"',
    'server': 'AmazonS3',
    'x-amz-id-2': 'EZTDVBD1F/lkB0PH4wP/sQKfQrT3NhhAmzUKjvr0UBRiqEGpBKYED1zXoTJqh3suB0JYT1xc7wo=',
    'x-amz-request-id': '017777604F549566'},
   'HTTPStatusCode': 200,
   'HostId': 'EZTDVBD1F/lkB0PH4wP/sQKfQrT3NhhAmzUKjvr0UBRiqEGpBKYED1zXoTJqh3suB0JYT1xc7wo=',
   'RequestId': '017777604F549566',
   'RetryAttempts': 0}},
 {u'ETag': '"441d9805b27fd86dbca5dab8ad6828fd"',
  'ResponseMetadata': {'HTTPHeaders': {'content-length': '0',
    'date': 'Wed, 08 Mar 2017 19:13:27 GMT',
    'etag': '"441d9805b27fd86dbca5dab8ad6828fd"',
    'server': 'AmazonS3',
    'x-amz-id-2': 'FDM2FP1C2ruGOv3YX2e49m60XZw/Uqzd97h/xtQwX7d/XzGoU3KXDdKdjQabH78ZAkgilLpmaFM=',
    'x-amz-request-id': '00B4B6347B065CCF'},
   'HTTPStatusCode': 200,
   'HostId': 'FDM2FP1C2ruGOv3YX2e49m60XZw/Uq

In [237]:
chunked_blocks = generate_chunked_block_pairs(NUM_BLOCKS, 10, 1500)

120


In [238]:
all_futures = [] 
all_times = []
gamma = 1e-3
t = time.time()
for c in chunked_blocks:
    %time kernel_futures = pwex.map(lambda x: compute_kernel_blocks(x, block_size=BLOCK_SIZE, gamma=gamma, replication_factor=3), c)
    %time pywren.wait(kernel_futures)
    all_futures.extend(kernel_futures)
    all_times.append(time.time() - t)

     

CPU times: user 1.21 s, sys: 120 ms, total: 1.33 s
Wall time: 2.28 s
CPU times: user 1.75 s, sys: 384 ms, total: 2.14 s
Wall time: 58.7 s


In [290]:
def block_matrix_multiply(block_x, block_y, key_template=OUT_KEY_TEMPLATE, x_key_template=MODEL_KEY_TEMPLATE, gamma=GAMMA, block_size=BLOCK_SIZE):
    K_block = get_kernel_block(block_x, block_y, key_template=key_template)
    x_block = get_matrix_block(block_y, key_template=x_key_template, replication_factor=1)
    print K_block.shape
    print x_block.shape
    return K_block.dot(x_block)


def blocks_matrix_multiply(block_x, blocks_y, block_size=BLOCK_SIZE, key_template=OUT_KEY_TEMPLATE, x_key_template=MODEL_KEY_TEMPLATE, gamma=GAMMA, num_examples=NUM_EXAMPLES, num_classes=10):
    if (block_x*block_size + block_size > num_examples):
        x_size = num_examples - block_x*block_size
    else:
        x_size = block_size
    x = np.zeros((x_size,num_classes))
    for block_y in blocks_y:
        x += block_matrix_multiply(block_x, block_y, key_template, x_key_template, gamma, block_size)
    return x

In [165]:
compute_kernel_block((0,0), verbose_return=True, gamma=1e-3)

mnist-blocked/mnist_blocked_0_4096_replicate_2.npy
mnist-blocked/mnist_blocked_0_4096_replicate_4.npy


(array([  2.55823135e-04,   4.12059307e+00,   1.44077897e+00,
          1.65737009e+00]),
 array([[ 1.        ,  0.91609752,  0.88845819, ...,  0.904172  ,
          0.9165563 ,  0.90353596],
        [ 0.91609752,  1.        ,  0.87876332, ...,  0.90516526,
          0.90381181,  0.89736366],
        [ 0.88845819,  0.87876332,  1.        , ...,  0.90027785,
          0.88014066,  0.90891707],
        ..., 
        [ 0.904172  ,  0.90516526,  0.90027785, ...,  1.        ,
          0.89309585,  0.95199209],
        [ 0.9165563 ,  0.90381181,  0.88014066, ...,  0.89309585,
          1.        ,  0.91428226],
        [ 0.90353596,  0.89736366,  0.90891707, ...,  0.95199209,
          0.91428226,  1.        ]], dtype=float32))

In [132]:
get_kernel_block(0,0)

mnist-kernel-blocked/mnist_kernel_block_size_4096_gamma_0.001_block_0_4096_0_4096


array([[ 1.        ,  0.91609752,  0.88845819, ...,  0.904172  ,
         0.9165563 ,  0.90353596],
       [ 0.91609752,  1.        ,  0.87876332, ...,  0.90516526,
         0.90381181,  0.89736366],
       [ 0.88845819,  0.87876332,  1.        , ...,  0.90027785,
         0.88014066,  0.90891707],
       ..., 
       [ 0.904172  ,  0.90516526,  0.90027785, ...,  1.        ,
         0.89309585,  0.95199209],
       [ 0.9165563 ,  0.90381181,  0.88014066, ...,  0.89309585,
         1.        ,  0.91428226],
       [ 0.90353596,  0.89736366,  0.90891707, ...,  0.95199209,
         0.91428226,  1.        ]], dtype=float32)

In [133]:
%time K = computeRBFGramMatrix(X_train, X_train, gamma=GAMMA)

CPU times: user 3min 23s, sys: 41.9 s, total: 4min 5s
Wall time: 1min 17s


In [149]:
get_matrix_block(0, key_template=KEY_TEMPLATE)

mnist-blocked/mnist_blocked_0_4096_replicate_0.npy


array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]], dtype=float32)

In [291]:
%time block_matrix_multiply(14,0)

mnist-kernel-blocked/mnist_kernel_block_size_4096_gamma_0.001_block_0_4096_57344_60000
mnist-kernel-model/x_0_4096_replicate_0.npy
(2656, 4096)
(4096, 10)
CPU times: user 1.14 s, sys: 168 ms, total: 1.3 s
Wall time: 1.91 s


array([[-128.28378071,   46.75721334, -128.84601117, ..., -101.87003691,
         -45.20726051,    2.69887605],
       [-125.72319509,   46.26822994, -127.59930875, ..., -101.86083542,
         -46.83079916,    2.94831494],
       [-128.97963348,   45.45131767, -130.11050096, ..., -102.71055678,
         -44.64585475,    3.12577386],
       ..., 
       [-128.03554733,   45.82619925, -128.09757918, ..., -102.0359487 ,
         -45.62099551,    2.32139735],
       [-126.85063097,   46.15789753, -129.21182259, ..., -103.39954227,
         -45.9747167 ,    2.31670984],
       [-127.22006186,   46.11199791, -128.51631563, ..., -102.58263828,
         -47.25640348,    0.74326638]])

In [344]:
%time x = np.random.randn(NUM_EXAMPLES, NUM_CLASSES)

CPU times: user 60 ms, sys: 0 ns, total: 60 ms
Wall time: 56.1 ms


In [345]:
%time y_hat = K.dot(x)

CPU times: user 1min 12s, sys: 9.37 s, total: 1min 22s
Wall time: 12.1 s


In [347]:
%time shard_matrix(x, BLOCK_SIZE, key_template=MODEL_KEY_TEMPLATE)

0
4096
8192
12288
16384
20480
24576
28672
32768
36864
40960
45056
49152
53248
57344
CPU times: user 404 ms, sys: 32 ms, total: 436 ms
Wall time: 8.18 s


[{u'ETag': '"12985460e5fd28b9bfd02748fe5b90ce"',
  'ResponseMetadata': {'HTTPHeaders': {'content-length': '0',
    'date': 'Wed, 08 Mar 2017 20:28:02 GMT',
    'etag': '"12985460e5fd28b9bfd02748fe5b90ce"',
    'server': 'AmazonS3',
    'x-amz-id-2': 'jIYdxXF105CAmPu94fetaFGJ8dRFjOUp0eNvqMRzRIdUSyQ+h1lnUGmV0zediHS8+yf9C+TmLU8=',
    'x-amz-request-id': '93777648962E86BB'},
   'HTTPStatusCode': 200,
   'HostId': 'jIYdxXF105CAmPu94fetaFGJ8dRFjOUp0eNvqMRzRIdUSyQ+h1lnUGmV0zediHS8+yf9C+TmLU8=',
   'RequestId': '93777648962E86BB',
   'RetryAttempts': 0}},
 {u'ETag': '"ee201263606982feb424d42a92ef5bd8"',
  'ResponseMetadata': {'HTTPHeaders': {'content-length': '0',
    'date': 'Wed, 08 Mar 2017 20:28:02 GMT',
    'etag': '"ee201263606982feb424d42a92ef5bd8"',
    'server': 'AmazonS3',
    'x-amz-id-2': '7IOjQ+REiwVoKcVIMwMYVfwzHAidLasS193acWbev9tYiZ8dVzn9vrihEYbnq62WGGL/+6f4rFw=',
    'x-amz-request-id': '19A9F8AFBEA16D4B'},
   'HTTPStatusCode': 200,
   'HostId': '7IOjQ+REiwVoKcVIMwMYVfwzHAidLa

In [348]:
all_futures = [] 
all_times = []
gamma = 1e-3
t = time.time()
for c in chunked_blocks:
    %time matmul_futures = pwex.map(lambda x: blocks_matrix_multiply(x, range(NUM_BLOCKS)), range(NUM_BLOCKS))
    %time pywren.wait(matmul_futures)

CPU times: user 1.21 s, sys: 96 ms, total: 1.3 s
Wall time: 2.37 s
CPU times: user 1.57 s, sys: 476 ms, total: 2.05 s
Wall time: 55.6 s


In [349]:
y_pywren = np.vstack(map(lambda x: x.result(), matmul_futures))

In [352]:
np.max(y_pywren - y_hat)

8.1240711153895973e-06

array([[-186.00949731,  124.43041149,  497.65620407, ...,  -10.91797642,
         188.52965082, -171.28709987],
       [-171.77387533,  119.43956622,  492.88423478, ...,  -11.70699885,
         182.36139324, -165.06179244],
       [-177.53636246,  117.7787255 ,  494.54356989, ...,  -13.07114439,
         189.44762783, -169.94315537],
       ..., 
       [-188.85554876,  132.83066674,  495.44710393, ...,   -9.61378749,
         190.65508852, -169.69689099],
       [-176.94234491,  124.26633807,  495.964517  , ...,  -14.94719051,
         187.09091449, -177.67136713],
       [-180.90586938,  122.5344383 ,  500.22194193, ...,  -11.83839985,
         188.99764754, -172.9376709 ]])

In [235]:
NUM_BLOCKS

15

In [317]:
y_hat.astype('float32')

array([[ 227.23910522,   67.11650848, -220.43588257, ..., -153.90719604,
          73.61194611, -161.79536438],
       [ 233.42433167,   62.26885223, -208.6574707 , ..., -151.76164246,
          71.20610046, -155.42860413],
       [ 221.47065735,   71.08818817, -226.14672852, ..., -154.83169556,
          80.12179565, -160.51142883],
       ..., 
       [ 220.43322754,   66.28712463, -219.16998291, ..., -153.2624054 ,
          76.82614136, -165.19265747],
       [ 236.87371826,   70.62846375, -218.40971375, ..., -158.09988403,
          78.82926941, -163.04502869],
       [ 229.33178711,   71.03556824, -221.09628296, ..., -155.94696045,
          83.37482452, -167.30075073]], dtype=float32)

In [314]:
y_pywren.astype('float32')

array([[ 227.23910522,   67.11650848, -220.43588257, ..., -153.90719604,
          73.61194611, -161.79536438],
       [ 233.42433167,   62.26885223, -208.6574707 , ..., -151.76164246,
          71.20610046, -155.42860413],
       [ 221.47065735,   71.08818817, -226.14672852, ..., -154.83169556,
          80.12179565, -160.51142883],
       ..., 
       [ 220.43322754,   66.28713226, -219.16998291, ..., -153.2624054 ,
          76.82614136, -165.19265747],
       [ 236.87371826,   70.62846375, -218.40971375, ..., -158.09989929,
          78.82926941, -163.04502869],
       [ 229.33178711,   71.03556824, -221.09628296, ..., -155.94696045,
          83.37482452, -167.30076599]], dtype=float32)

In [342]:
y_pywren.dtype

dtype('float64')

In [346]:
y_hat.dtype

dtype('float64')

In [1]:
import cloudpickle

In [4]:
linalg = cloudpickle.loads('\x80\x02ccloudpickle.cloudpickle\nsubimport\nq\x00U\x06linalgq\x01\x85q\x02Rq\x03.')

NameError: name 'X' is not defined