In [1]:
import pandas as pd
from scipy.stats import norm

In [None]:
def read(csv):
    file = pd.read_csv(csv)
    data = {
        'IO_SIZE(Bytes)': file["IO_SIZE(Bytes)"].tolist(),  
        'Trial 1 Throughput(MB/s)': file["Trial 1 Throughput(MB/s)"].tolist(),  
        'Trial 2 Throughput(MB/s)': file["Trial 2 Throughput(MB/s)"].tolist(),
        'Trial 3 Throughput(MB/s)': file["Trial 3 Throughput(MB/s)"].tolist(),  
        'Trial 4 Throughput(MB/s)': file["Trial 4 Throughput(MB/s)"].tolist(),  
        'Trial 5 Throughput(MB/s)': file["Trial 5 Throughput(MB/s)"].tolist()   
    }
    return data

def read_stride(csv):
    file = pd.read_csv(csv)
    size_list = file["IO_SIZE(Bytes)"].tolist()
    stride_list = file["Stride(Bytes)"].tolist()
    result = []
    # Choosing to concat size and stride, will fix/separate in post
    for a, b in zip(size_list, stride_list):
        concatenated = str(a) + "." + str(b)
        result.append(float(concatenated))
    data = {
        'IO_SIZE(Bytes)': result,
        'Trial 1 Throughput(MB/s)': file["Trial 1 Throughput(MB/s)"].tolist(), 
        'Trial 2 Throughput(MB/s)': file["Trial 2 Throughput(MB/s)"].tolist(), 
        'Trial 3 Throughput(MB/s)': file["Trial 3 Throughput(MB/s)"].tolist(), 
        'Trial 4 Throughput(MB/s)': file["Trial 4 Throughput(MB/s)"].tolist(), 
        'Trial 5 Throughput(MB/s)': file["Trial 5 Throughput(MB/s)"].tolist()  
    }
    return data

size_read = read('challenge_data/size_read.csv')
size_write = read('challenge_data/size_write.csv')
random_read = read('challenge_data/random_read.csv')
random_write = read('challenge_data/random_write.csv')
stride_read = read_stride('challenge_data/stride_read.csv')
stride_write = read_stride('challenge_data/stride_write.csv')

In [None]:
def calculate_sample_size(data):
    df = pd.DataFrame(data)

    # calculate mean and std
    means = df.iloc[:, 1:].mean(axis=1)
    stds = df.iloc[:, 1:].std(axis=1)

    df['Mean Throughput (MB/s)'] = means
    df['Standard Deviation (MB/s)'] = stds

    results = df[['IO_SIZE(Bytes)', 'Mean Throughput (MB/s)', 'Standard Deviation (MB/s)']]

    # cohen's d definition
    def cohens_d(mean, population_mean, std_dev):
        return (mean - population_mean) / std_dev

    # create list of hypothetical means
    hypothetical_means = []
    for item in means:
        hypothetical_means.append(0.90*item)

    cohens_d_values = []
    for index, row in df.iterrows():
        d_value = cohens_d(row['Mean Throughput (MB/s)'], hypothetical_means[index], row['Standard Deviation (MB/s)'])
        cohens_d_values.append(round(d_value,2))

    df['Cohen\'s d'] = cohens_d_values

    alpha = 0.05
    power = 0.8

    z_alpha = norm.ppf(1 - alpha / 2) # z score for a two tailed test
    z_beta = norm.ppf(power)            # power score

    # use the formula for n
    def calculate_n(std_dev, cohen_d):
        return ((z_alpha + z_beta) ** 2 * (std_dev ** 2)) / (cohen_d ** 2)

    sample_sizes = []
    for index, row in df.iterrows():
        n = calculate_n(row['Standard Deviation (MB/s)'], row['Cohen\'s d'])
        sample_sizes.append(round(n,2))

    df['Sample Size (n)'] = sample_sizes

    print(df[['IO_SIZE(Bytes)', 'Cohen\'s d', 'Sample Size (n)']])
    return df[['IO_SIZE(Bytes)', 'Cohen\'s d', 'Sample Size (n)']]


In [None]:
all_data = [size_read, size_write, random_read, random_write, stride_read, stride_write]

data_labels = ['Size Read', 'Size Write', 'Random Read', 'Random Write', 'Stride Read', 'Stride Write']

results = []

for label, item in zip(data_labels, all_data):
    result_df = calculate_sample_size(item)
    result_df['Dataset'] = label  
    results.append(result_df)

final_results = pd.concat(results, ignore_index=True)

# send results to csv file
final_results.to_csv('challenge_results.csv', index=False)

    IO_SIZE(Bytes)  Cohen's d  Sample Size (n)
0             4096       5.62             2.29
1             8192       2.34           296.95
2            16384       1.37          9945.40
3            32768       0.97        121128.19
4            65536       0.76        971735.57
5           131072       0.82       2010632.15
6           262144       0.78       2334885.79
7           524288       0.47      16262891.59
8          1048576       0.55       9035544.29
9          2097152       0.47      17451606.41
10         4194304       0.74       3106626.08
11         8388608       0.80       1692891.84
12        16777216       1.08        412682.40
13        33554432       1.21        268545.52
14        67108864       0.73       1737146.67
15       100663296       0.35      26656702.99
    IO_SIZE(Bytes)  Cohen's d  Sample Size (n)
0             4096       1.03          1589.93
1             8192       0.56         52117.95
2            16384       2.06          1287.64
3            