# Storage (EFS, EBS) performance and cost - taxi dataset

In [14]:
import pandas as pd
import plotly.express as plx
import matplotlib.pyplot as plt

In [15]:
# Load the benchmark report CSV into a DataFrame. Later cells read its
# `vol`, `type` and `time` columns.
ops = pd.read_csv("../reports/m5ad.large.ap-southeast-2-taxi-sqlite.csv")

In [16]:
# Keep only the last path component of each volume entry (the text after the
# final '/'), so volumes are identified by a short name.
ops['vol'] = ops['vol'].map(lambda path: path.rsplit('/', 1)[-1])

### EFS prices
* Standard Storage (GB-Month) USD0.30
* Infrequent Access Storage (GB-Month) USD0.025
* Infrequent Access Requests (per GB transferred) USD0.01
* Provisioned Throughput (MB/s-Month) USD6.00


### EBS prices
note: current assumption for graphs is no provisioned IOPS
* General Purpose SSD (gp2) Volumes	USD0.10 per GB-month of provisioned storage
* Provisioned IOPS SSD (io2) Volumes	USD0.125 per GB-month of provisioned storage AND USD0.065 per provisioned IOPS-month
* Provisioned IOPS SSD (io1) Volumes	USD0.125 per GB-month of provisioned storage AND USD0.065 per provisioned IOPS-month
* Throughput Optimized HDD (st1) Volumes	USD0.045 per GB-month of provisioned storage
* Cold HDD (sc1) Volumes	USD0.025 per GB-month of provisioned storage

### NVMe and Ramdisk prices
These prices are not used in the analysis and are only noted for reference.
* NVMe USD1.2 per GB-month
* Ramdisk USD11.7 per GB-month

In [17]:
def get_cost_per_byte(row):
    """Return the storage price per byte for the volume named in ``row['vol']``.

    The volume name is matched by substring against a fixed list of storage
    kinds, in order; the first kind found decides the price. Prices are
    written per GB and scaled by 10**-9 to get a per-byte figure.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a ``'vol'`` entry
            that supports the ``in`` substring test.

    Returns:
        The per-byte price as a float. A volume name that matches none of the
        known kinds gets the fallback price of ``1 * 10**-9``.
    """
    volume = row['vol']

    # Order matters: it reproduces the precedence of the substring checks.
    price_per_gb = (
        ('io1', 0.125),
        ('io2', 0.125),
        ('standard', 0.05),
        ('st1', 0.045),
        ('sc1', 0.025),
        ('efs-gp-burst', 0.3),
        ('efs-maxio-burst', 0.3),
        ('gp2', 0.1),
        ('nvme', 1.2),
        ('ramdisk', 11.7),
    )

    for kind, price in price_per_gb:
        if kind in volume:
            return price * 10**-9

    # Unrecognised volume: fall back to a price of 1 per GB.
    return 1 * 10**-9
    
def get_cost_per_iops_month(row):
    """Return the price per provisioned IOPS-month for ``row['vol']``.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a ``'vol'`` entry
            that supports the ``in`` substring test.

    Returns:
        ``0.065`` when the volume name contains ``'io1'`` or ``'io2'``,
        otherwise ``0``.
    """
    volume = row['vol']
    has_provisioned_iops = any(kind in volume for kind in ('io1', 'io2'))
    return 0.065 if has_provisioned_iops else 0

In [18]:
# Attach the per-row price columns; each function is called once per row
# (axis=1) and looks at that row's `vol` value.
ops['cost_per_byte'] = ops.apply(get_cost_per_byte, axis=1)
ops['cost_per_iops_month'] = ops.apply(get_cost_per_iops_month, axis=1)

In [19]:
# Cost-weighted time: the measured time multiplied by the per-byte price.
ops['costtime'] = ops['time'].mul(ops['cost_per_byte'])

In [20]:
# Volume x operation-type tables for the measured time and the cost-weighted
# time, each followed by a per-column min-max rescaling to the [0, 1] range.
ops_pv = ops.pivot_table(values='time', index='vol', columns='type')
normalized_ops_pv = (ops_pv - ops_pv.min()).div(ops_pv.max() - ops_pv.min())

ops_cost = ops.pivot_table(values='costtime', index='vol', columns='type')
normalized_ops_cost = (ops_cost - ops_cost.min()).div(ops_cost.max() - ops_cost.min())

In [21]:
# Heatmap of the pivoted time measurements; the 'nvme' and 'ramdisk' rows are
# left out of the figure.
# NOTE(review): the title says "seconds per GB" while the colour-bar label
# says "ms" — confirm which unit the `time` column actually uses.
plot_data = ops_pv.drop(labels=['nvme', 'ramdisk'], axis='index')
fig = plx.imshow(
    plot_data,
    title='raw time measurements in seconds per GB',
    labels={
        'x': 'type of operation',
        'y': 'mounted file system',
        'color': 'execution time in ms',
    },
    aspect='auto',
)
fig.write_html('../datavis/exp2_raw_time.html')
fig.show()

In [22]:
# Heatmap of the min-max normalised times (one scale per operation type),
# again without the 'nvme' and 'ramdisk' rows.
plot_data = normalized_ops_pv.drop(labels=['nvme', 'ramdisk'], axis='index')
fig = plx.imshow(
    plot_data,
    title='time normalised per operation',
    labels={
        'x': 'type of operation',
        'y': 'mounted file system',
        'color': 'normalised time',
    },
    aspect='auto',
)
fig.write_html('../datavis/exp2_storage_speed_normalised.html')
fig.show()

In [23]:
# Cost-weighted time table without the 'nvme' and 'ramdisk' rows, ready for
# plotting in the next cell.
plot_data = ops_cost.drop(labels=['nvme', 'ramdisk'], axis='index')

In [24]:
# Heatmap of whatever `plot_data` currently holds (the preceding cell sets it
# to the un-normalised cost-weighted times).
# NOTE(review): the title unit "dollars/(GB/s)" and the colour-bar unit
# "dollars/GB/ms" disagree, and the output file name says "normalised"
# although the title says "raw" — confirm the intended unit and file name.
fig = plx.imshow(
    plot_data,
    title='raw cost per speed (dollars/(GB/s))',
    labels={
        'x': 'type of operation',
        'y': 'mounted file system',
        'color': 'dollars/GB/ms',
    },
    aspect='auto',
)
fig.write_html('../datavis/exp2_storage_speed_normalised_by_cost.html')
fig.show()

In [25]:
# Min-max normalised cost-weighted times without the 'nvme' and 'ramdisk'
# rows, ready for plotting in the next cell.
plot_data = normalized_ops_cost.drop(labels=['nvme', 'ramdisk'], axis='index')

In [26]:
# Heatmap of whatever `plot_data` currently holds (the preceding cell sets it
# to the normalised cost-weighted times).
fig = plx.imshow(
    plot_data,
    title='cost per speed normalised per operation',
    labels={
        'x': 'type of operation',
        'y': 'mounted file system',
        'color': 'cost/speed/op',
    },
    aspect='auto',
)
fig.write_html('../datavis/exp2_cost_div_speed_norm_ops.html')
fig.show()