In [99]:
import pandas as pd
import statistics
from sys import argv
import matplotlib.pyplot as plt
import numpy as np
import json

In [37]:
### Get durations in between consecutive timestamps rather than the 
def diff_list(l):
    dl = []
    prev = l[0]
    for item in l[1:]:
        dl.append(item-prev)
        prev = item
    return dl

In [38]:
def cumulative_sum(l):
    sl = []
    sum_ = 0
    for duration in l:
        sum_ += duration
        sl.append(sum_)
    return sl

In [42]:
def filter_for_outliers(unfiltered, **kwargs):
    allowed_kws = {"lt", "gt", "ge", "le", "eq"}
    kws_provided = set(kwargs.keys())
    diff = kws_provided - allowed_kws
    if diff:
        raise ValueError(f"Invalid keyword(s) provided: {diff}")
    filtered = []
    outliers = []
    for val in unfiltered:
        append_val = True
        if "lt" in kws_provided:
            append_val &= (val < kwargs["lt"])
        if "gt" in kws_provided:
            append_val &= (val > kwargs["gt"])
        if "ge" in kws_provided:
            append_val &= (val >= kwargs["ge"])
        if "le" in kws_provided:
            append_val &= (val <= kwargs["le"])
        if "eq" in kws_provided:
            append_val &= (val == kwargs["eq"])
        if append_val: filtered.append(val)
        else: outliers.append(val)
    return filtered, outliers

In [40]:
df = pd.read_csv('jolteon-samples.csv')
df = df.loc[:, (df != 0).any(axis=0)]
print('read the csv')
header = list(df.head())
del header[-1] # remove timestamp
timestamps = df['timestamp']
header

read the csv


['dram', 'pkg', 'dram.1', 'pkg.1']

In [91]:
result_dfs = dict()

for powerDomain in header:
    energies = df[powerDomain]
    lastDifferent = energies[0]
    change_ts = [ timestamps[0] ] # timestamps where there was an energy update
    ener_diffs = [] # differences between consecutive non-equal readings
    n_samples_bw = [] # number of samples between consecutive non-equal readings
    non_zero_reading_nums = [1]
    bw = 0
    for i in range(len(energies)):
        if energies[i] != lastDifferent:
            non_zero_reading_nums.append(i+1)
            ener_diffs.append(energies[i] - lastDifferent)
            change_ts.append(timestamps[i])
            n_samples_bw.append(bw)
            lastDifferent = energies[i]
            bw = 0
        else:
            bw += 1
    # Get the difference between consecutive timestamps
    ts_diffs = diff_list(change_ts)
    # Get the cumulative sum of the unfiltered duractions
    cum_sums_unfiltered = cumulative_sum(ts_diffs) # get durations instead of raw timestamps
    # Get the filtered durations <=5000 and outliers >5000
    filtered, outliers = filter_for_outliers(ts_diffs, le=5000)
    # Get the cumulative sum of the filtered timestamps
    cum_sums_filtered = cumulative_sum(filtered)
    
    result = filtered
    
    result_dfs[powerDomain] = {
                               "change_timestamps": np.array(change_ts),
                               "reading_change_num": np.array(non_zero_reading_nums),
                               "energy_differences": np.array(ener_diffs),
                               "readings_bw_change": np.array(n_samples_bw),
                               "filtered": np.array(filtered, dtype=np.double),
                               "outliers": np.array(outliers),
                               "cumulative_sum_unfiltered": np.array(cum_sums_unfiltered),
                               "cumulative_sum_filtered": np.array(cum_sums_filtered)
                              }  
    
    print(f'collected result dataframe for {powerDomain}. len(outliers)={len(outliers)},len(result)={len(result)}')
    
print('all results stored in \'result_dfs\', currently a dictionary')

collected result dataframe for dram. len(outliers)=100,len(result)=28266
collected result dataframe for pkg. len(outliers)=10,len(result)=49688
collected result dataframe for dram.1. len(outliers)=10,len(result)=49229
collected result dataframe for pkg.1. len(outliers)=10,len(result)=49689
all results stored in 'result_dfs', currently a dictionary


In [46]:
for key in result_dfs.keys():
    try:
        plot = results_dfs[key].plot()
        plot.set_title(f'time to update the MSR for {key.upper()}')
        plot.get_figure().savefig(f'{key}_msr-update-time.png')
        # plot.get_figure().clf()
        with open(key+'_summary-statistics.tex','w') as fh: fh.write(results_dfs[key].describe().to_latex())
    except:
        print(f"error plotting results_dfs[{key}]")

error plotting results_dfs[dram]
error plotting results_dfs[pkg]
error plotting results_dfs[dram.1]
error plotting results_dfs[pkg.1]


In [111]:
my_dpi = 1000
fname = "trial1_"
for domain in result_dfs.keys():
    change_ts = result_dfs[domain]["change_timestamps"]
    non_zero_readings = result_dfs[domain]["reading_change_num"]
    ener_diffs = result_dfs[domain]["energy_differences"]
    n_samples_bw = result_dfs[domain]["readings_bw_change"]
    filtered = result_dfs[domain]["filtered"]
    outliers = result_dfs[domain]["outliers"]
    cum_sums_unfiltered = result_dfs[domain]["cumulative_sum_unfiltered"]
    cum_sums_filtered = result_dfs[domain]["cumulative_sum_filtered"]
    domain = domain.split('.')
    if len(domain) == 2:
        domain = f"{domain[0]} Socket 1"
    else:
        domain = f"{domain[0]} Socket 0"
    domain_fname = f"rutvik_results/{'_'.join(domain.split())}"
    stats_d = {}
    
    ### Calculating correlation coeffiecients for all pairs of reading_value, readings_bw_change, num_unique_readings
    corr_coeffs = stats_d["CorrCoeffs"] = {}
    
    cc_readings_samples_bw = np.corrcoef(ener_diffs, n_samples_bw)[0][1]
    corr_coeffs["ener_diff-vs-num_samples_bw_nonequal"] = cc_readings_samples_bw
    
    cc_n_samples_bw = np.corrcoef(range(len(ener_diffs)), n_samples_bw)[0][1]
    corr_coeffs['unique_reading_num-vs-num_samples_bw_nonequal'] = cc_n_samples_bw
    
    cc_n_readings = np.corrcoef(range(len(ener_diffs)), ener_diffs)[0][1]
    corr_coeffs['unique_reading_num-vs-ener_diff'] = cc_n_readings
    
    ### Stats like mean and sd
    stats_d['num_readings'] = len(cum_sums_unfiltered) + 1
    
    stats_d["avg_readings_bw_2_nonequal"] = np.mean(n_samples_bw)
    stats_d["s-d_readings_bw_2_nonequal"] = np.std(n_samples_bw)
    
    stats_d['avg_energy_reading'] = np.mean(ener_diffs)
    stats_d['s-d_energy_reading'] = np.std(ener_diffs)
    
    
    
    ###Plots
    
    # Not sure - autocorelation for the filtered values
    # plt.acorr(filtered, usevlines=True, normed=True, maxlags=10, lw=2)
    # plt.xlabel("idk")
    # plt.ylabel("idk")
    # plt.title(f"Autocorrelation filtered times- {domain}")
    # plt.savefig()
    # plt.close(plt.gcf())
    
    # scatter plot time for value x vs time for value x+1  - filtered
    plt.scatter(filtered[:-1], filtered[1:], marker='.', s=3)
    plt.xlabel("Time for reading i")
    plt.ylabel("Time for reading i+1")
    plt.title(f"Consecutive reading times - {domain}")
    plt.savefig(f"{domain_fname}-i_n-v-i_n1",  bbox_inches = 'tight', dpi=my_dpi)
    plt.cla()
    plt.clf()
    
    # filtered cumulative sum line plot
    plt.plot(cum_sums_filtered)
    plt.xlabel("Reading number")
    plt.ylabel("Cumulative time")
    plt.title(f"Cumulative Summed Line - {domain}")
    plt.savefig(f"{domain_fname}-cumulative-summed",  bbox_inches = 'tight', dpi=my_dpi)
    plt.cla()
    plt.clf()
    
    json.dump(stats_d, open(f"{domain_fname}_stats.json", 'w'))
    
    

<Figure size 432x288 with 0 Axes>

In [102]:
    stats_d

{'CorrCoeffs': {'ener_diff-vs-num_samples_bw_nonequal': -0.03648222165532877,
  'unique_reading_num-vs-num_samples_bw_nonequal': 0.3133872125639214,
  'unique_reading_num-vs-ener_diff': 0.0022845401948618956},
 'num_readings': 49700,
 'avg_readings_bw_2_nonequal': 46.73502484959456,
 's-d_readings_bw_2_nonequal': 4.585179884126291,
 'avg_energy_reading': 0.029396382220970442,
 's-d_energy_reading': 0.5899954497038302}