In [15]:
import numpy as np
import pandas as pd
import plotly.express as px

In [16]:
def average_eight_streaks(strides):
    streak_lengths = []
    current_streak = 0

    for num in strides:
        if num == 8:
            current_streak += 1
        else:
            if current_streak > 0:
                streak_lengths.append(current_streak)
            current_streak = 0

    if current_streak > 0:
        streak_lengths.append(current_streak)

    if streak_lengths:
        average_streak = sum(streak_lengths) / len(streak_lengths)
        return average_streak
    else:
        return 0

In [17]:
def get_average_streak_length(strides):
    if not strides:
        return 0
    
    streak_lengths = []
    current_streak = 1

    for i in range(1, len(strides)):
        if strides[i] == strides[i - 1]:
            current_streak += 1
        else:
            streak_lengths.append(current_streak)
            current_streak = 1

    streak_lengths.append(current_streak)

    if streak_lengths:
        average_streak = sum(streak_lengths) / len(streak_lengths)
        return average_streak
    else:
        return 0

In [18]:
def calculate_strides(group):
    strides = group['vpn'].diff().dropna()
    return strides

def average_streak_length(strides):
    if strides.empty:
        return 0

    streak_lengths = []
    current_streak = 1

    for i in range(1, len(strides)):
        if strides.iloc[i] == strides.iloc[i - 1]:
            current_streak += 1
        else:
            streak_lengths.append(current_streak)
            current_streak = 1

    streak_lengths.append(current_streak)

    average_streak = sum(streak_lengths) / len(streak_lengths)
    return average_streak

def process_dataframe(df):
    grouped = df.groupby('eip').apply(calculate_strides).reset_index(level=0, drop=True)
    average_streaks = grouped.groupby(df['eip']).apply(average_streak_length)
    return average_streaks

In [19]:
def total_strides_per_eip(df):
    total_entries = df.groupby('eip').size()
    total_strides = total_entries - 1
    return total_strides

In [20]:
def analyse_workload(name):

    print("---------------------------------------------")
    print(f"Analyzing {name} workload...")
    print("---------------------------------------------")
    
    df = pd.read_csv('./data/'+name+'_full.csv')
    num_unique_eip = df['eip'].nunique()

    print(f"The number of unique 'eip' values is: {num_unique_eip}")

    my_strides = []
    my_list = df['vpn'].to_list()

    my_eips = df['eip'].to_list()

    for i in range(len(my_list)-1):
        stride = my_list[i+1] - my_list[i]
        my_strides.append(stride)

    strides = pd.Series(my_strides)

    # Stride frequency analysis
    stride_counts = strides.value_counts()
    total_strides = len(strides.to_list())
    print("Stride Frequency:")
    i = 0
    sum = 0
    for stride, count in stride_counts.items():
        if i > 10:
            break
        print(f"Stride: {stride}, Fraction: {count/total_strides}")
        sum += count/total_strides
        i += 1

    print("Sum of fractions of top 10 strides:", sum)

    # # Plot the stride-fraction distribution
    # fig = px.bar(x=stride_counts.index, y=stride_counts.values/total_strides, labels={'x':'Stride', 'y':'Fraction'})
    
    # # Save it in 'plots' directory
    # fig.write_image(f'./plots/{name}_stride_frequency.png')

    # Total stride count
    print("Total Strides:", total_strides)

    # Number of unique strides
    print("Number of Unique Strides:", len(stride_counts))

    # Number of strides for which sum of fractions is greater than 0.5
    sum_fraction = 0
    i = 0
    for stride, count in stride_counts.items():
        sum_fraction += count/total_strides
        i += 1
        if sum_fraction > 0.5:
            break
    print("Number of strides for which sum of fractions is greater than 0.5:", i)

    average_streak = get_average_streak_length(my_strides)
    print(f"The average length of any streak is: {average_streak}")

    # Process the DataFrame and get average streak lengths by 'eip'
    average_streaks_by_eip = process_dataframe(df)
    total_strides_by_eip = total_strides_per_eip(df)

    total_strides_by_eip_df = pd.DataFrame({'eip': total_strides_by_eip.index, 'total_strides': total_strides_by_eip.values})
    average_streaks_by_eip_df = pd.DataFrame({'eip': average_streaks_by_eip.index, 'average_streak': average_streaks_by_eip.values})

    merged = pd.merge(total_strides_by_eip_df, average_streaks_by_eip_df, on='eip')

    sorted_df = merged.sort_values('average_streak', ascending=False)
    print("Average Streaks by 'eip':")
    print(sorted_df)
    print("")

In [21]:
analyse_workload("bc")
analyse_workload("bfs")
analyse_workload("cc")
analyse_workload("dlrm")
analyse_workload("gc")
analyse_workload("gen")
analyse_workload("pr")
analyse_workload("rnd")
analyse_workload("sssp")
analyse_workload("tc")
analyse_workload("xs")

---------------------------------------------
Analyzing bc workload...
---------------------------------------------
The number of unique 'eip' values is: 49
Stride Frequency:
Stride: 34, Fraction: 0.003129446477618355
Stride: 35, Fraction: 0.0029751722205854283
Stride: 11, Fraction: 0.002943771265614125
Stride: 33, Fraction: 0.0026324922337246804
Stride: 10, Fraction: 0.002376096610089427
Stride: 36, Fraction: 0.002360532658494955
Stride: 12, Fraction: 0.002326947289264778
Stride: 14, Fraction: 0.0022903583504286502
Stride: 13, Fraction: 0.0021186087793247377
Stride: 105, Fraction: 0.0021172435204129415
Stride: 106, Fraction: 0.0020934880153476946
Sum of fractions of top 10 strides: 0.02736415742090577
Total Strides: 3662309
Number of Unique Strides: 1008865
Number of strides for which sum of fractions is greater than 0.5: 2526
The average length of any streak is: 1.0013564473492815
Average Streaks by 'eip':
                eip  total_strides  average_streak
18  140461382775704       