## Averaging Correlation Matrices of Per-Core Logs
#### Assuming that all cores of a running experiment do similar work, we can assume that their correlation matrices are also very similar and hence can be averaged out into 1 matrix.

In [48]:
import os

# get all itrs in a logs directory
def list_itrs(dirname):
    itrs = []
    for file in os.listdir(dirname):
        tags = file.split('_')
        itr = tags[2]
        itrs.append(int(itr))
    itrs = list(set(itrs))
    return itrs

### Test: qps = [200k, 400k, 600k], dvfs = [0xd00, 0x1d00], itr = [all_vals]

In [233]:
import eigen_analysis

cols = eigen_analysis.LINUX_COLS
time_unit = eigen_analysis.TIME_CONVERSION_khz
joules_unit = eigen_analysis.JOULE_CONVERSION

In [80]:
qpss = ['200k', '400k', '600k']
dvfss = ['0xd00', '0x1d00']
run = '0'
rapl = '135'

In [179]:
import pandas as pd
import numpy as np

for qps in qpss:

    qps_dir = qps + '_qps/'
    print(qps_dir)

    for dvfs in dvfss:
        dvfs_dir = qps_dir + 'linux_mcd_dmesg_' + run + '_' + dvfs + '_' + rapl + '_' + qps + '/'
        rdtsc_dir = qps_dir + 'linux_mcd_rdtsc_' + run + '_' + dvfs + '_' + rapl + '_' + qps + '/'
        print(dvfs_dir)
        print(rdtsc_dir)

        # get all itrs tested with this dvfs
        itrs = list_itrs(dvfs_dir)
        for itr in itrs:
            # for syncing log file with recorded experiment time
            rdtsc_file = rdtsc_dir + 'linux.mcd.rdtsc.' + run + '_' + str(itr) + '_' + dvfs + '_' + rapl + '_' + qps[0:-1] + '000'
            start, end = eigen_analysis.get_rdtsc(rdtsc_file)
            
            # to store per-core dfs for subsequent averaging
            corr_list_no_diffs = []
            corr_list_diffs = []
            
            for core in range(0,16):
                log_file = dvfs_dir + 'linux.mcd.dmesg.' + run + '_' + str(core) + '_' + str(itr) + '_' + dvfs + '_' + rapl + '_' + qps[0:-1] + '000'
                df = pd.read_csv(log_file, sep= ' ', names = cols, index_col='i')

                df_no_diffs = df[['rx_bytes', 'rx_desc', 'tx_bytes', 'tx_desc']].copy()
                
                df = df[['instructions', 'cycles', 'ref_cycles', 'llc_miss', 'joules', 'timestamp']]
                df = df[(df['timestamp'] >= start) & (df['timestamp'] <= end)]
                df['timestamp'] = df['timestamp'] - df['timestamp'].min()
                df['timestamp'] = df['timestamp'] * time_unit
                df['joules'] = df['joules'] * joules_unit
                
                # removing empty/zero log-entries
                df_copy = df[(df['joules'] > 0) & (df['instructions'] > 0) & (df['cycles'] > 0) \
                             & (df['ref_cycles'] > 0) & (df['llc_miss'] > 0)].copy()
                tmp = df_copy.diff().copy()
                tmp.columns = [f'{c}_diff' for c in tmp.columns]
                
                # ignoring all log files that produce negative diffs due to overflow
                df_diffs_neg = tmp[(tmp['joules_diff'] < 0) | (tmp['instructions_diff'] < 0) | (tmp['cycles_diff'] < 0) \
                                   | (tmp['ref_cycles_diff'] < 0) | (tmp['llc_miss_diff'] < 0) | (tmp['timestamp_diff'] < 0)]
                tmp = tmp[(tmp['joules_diff'] >= 0) | (tmp['instructions_diff'] >= 0) | (tmp['cycles_diff'] >= 0) \
                                   | (tmp['ref_cycles_diff'] >= 0) | (tmp['llc_miss_diff'] >= 0) | (tmp['timestamp_diff'] >= 0)]
                if df_diffs_neg.shape[0] > 0:
                    print('NEGATIVE DIFFS IN LOG FILE ', log_file)
                    print(df_diffs_neg)
                    break

                df_diffs = tmp.copy()

                # add dfs to list of <= 16 dfs, 1 per core
                corr_list_no_diffs.append(df_no_diffs.corr())
                corr_list_diffs.append(df_diffs.corr())
                
            print()
            print('---------------------------------------- PARSED 16 LOGS ----------------')
            print()
            no_diffs_list_len = len(corr_list_no_diffs)
            diffs_list_len = len(corr_list_diffs)
            no_diffs_sum = np.zeros((corr_list_no_diffs[0].shape[0], corr_list_no_diffs[0].shape[1]), dtype=np.float64)
            diffs_sum = np.zeros((corr_list_diffs[0].shape[0], corr_list_diffs[0].shape[1]), dtype=np.float64)
            for c1 in corr_list_no_diffs:
                corr_no_diffs_sum += c1.to_numpy()
            for c2 in corr_list_diffs:
                corr_diffs_sum += c2.to_numpy()
            corr_no_diffs_avg = corr_no_diffs_sum / corr_no_diffs_len
            corr_diffs_avg = corr_diffs_sum / corr_diffs_len
            
            print(corr_no_diffs_avg)
            print(corr_diffs_avg)
            break
        print()
        print('---------------------------------------- PARSED 12 ITRS ----------------')
        print()
        break
    print()
    print('---------------------------------------- PARSED 2 DVFSS ----------------')
    print()
    break
    

200k_qps/
200k_qps/linux_mcd_dmesg_0_0xd00_135_200k/
200k_qps/linux_mcd_rdtsc_0_0xd00_135_200k/

---------------------------------------- PARSED 16 LOGS ----------------

[[ 2.125       1.23096522 -0.25186319 -0.55828859]
 [ 1.23096522  2.125      -0.49316421 -1.09677356]
 [-0.25186319 -0.49316421  2.125       0.97282386]
 [-0.55828859 -1.09677356  0.97282386  2.125     ]]
[[ 2.125       1.84417674  1.84413281  1.64835021  0.07375969  0.16036363]
 [ 1.84417674  2.125       2.12499987  1.74685209  0.04352085 -0.01514847]
 [ 1.84413281  2.12499987  2.125       1.74683578  0.04351749 -0.01515857]
 [ 1.64835021  1.74685209  1.74683578  2.125       0.04350651  0.05199408]
 [ 0.07375969  0.04352085  0.04351749  0.04350651  2.125       0.38183241]
 [ 0.16036363 -0.01514847 -0.01515857  0.05199408  0.38183241  2.125     ]]

---------------------------------------- PARSED 12 ITRS ----------------


---------------------------------------- PARSED 2 DVFSS ----------------



In [308]:
file1 = '200k_qps/linux_mcd_dmesg_0_0xd00_135_200k/linux.mcd.dmesg.0_0_100_0xd00_135_200000'
file2 = '200k_qps/linux_mcd_dmesg_0_0xd00_135_200k/linux.mcd.dmesg.0_1_100_0xd00_135_200000'
rdtsc = '200k_qps/linux_mcd_rdtsc_0_0xd00_135_200k/linux.mcd.rdtsc.0_100_0xd00_135_200000'
df1 = pd.read_csv(file1, sep = ' ', names = cols, index_col='i').drop(['c1', 'c1e', 'c3', 'c6', 'c7', 'rx_bytes', 'rx_desc', 'tx_bytes', 'tx_desc'], axis=1)
df2 = pd.read_csv(file2, sep = ' ', names = cols, index_col='i').drop(['c1', 'c1e', 'c3', 'c6', 'c7', 'rx_bytes', 'rx_desc', 'tx_bytes', 'tx_desc'], axis=1)

In [309]:
df1

Unnamed: 0_level_0,instructions,cycles,ref_cycles,llc_miss,joules,timestamp
i,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0,0,0,0,663493062,93414342458307
1,0,0,0,0,0,93414342840375
2,0,0,0,0,0,93414343089925
3,0,0,0,0,0,93414343384457
4,7547970794547,13194822714544,18845536420253,25697293146,663495697,93414347185473
...,...,...,...,...,...,...
189144,0,0,0,0,0,93488381915293
189145,0,0,0,0,0,93488382212029
189146,0,0,0,0,0,93488382512377
189147,0,0,0,0,0,93488382808313


In [310]:
start, end = eigen_analysis.get_rdtsc(rdtsc)
df1 = df1[(df1['timestamp'] >= start) & (df1['timestamp'] <= end)]
df2 = df2[(df2['timestamp'] >= start) & (df2['timestamp'] <= end)]
df1['timestamp'] = df1['timestamp'] - df1['timestamp'].min()
df1['timestamp'] = df1['timestamp'] * time_unit
df1['joules'] = df1['joules'] * joules_unit
df2['timestamp'] = df2['timestamp'] - df2['timestamp'].min()
df2['timestamp'] = df2['timestamp'] * time_unit
df2['joules'] = df2['joules'] * joules_unit

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['timestamp'] = df1['timestamp'] - df1['timestamp'].min()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['timestamp'] = df1['timestamp'] * time_unit
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['joules'] = df1['joules'] * joules_unit


In [311]:
# removing empty/zero log-entries
df1_copy = df1[(df1['joules'] > 0) & (df1['instructions'] > 0) & (df1['cycles'] > 0) \
             & (df1['ref_cycles'] > 0) & (df1['llc_miss'] > 0)].copy()
df2_copy = df2[(df2['joules'] > 0) & (df2['instructions'] > 0) & (df2['cycles'] > 0) \
             & (df2['ref_cycles'] > 0) & (df2['llc_miss'] > 0)].copy()

# computing diffs
tmp1 = df1_copy.diff().dropna()
tmp1.columns = [f'{c}_diff' for c in tmp1.columns]
tmp2 = df2_copy.diff().dropna()
tmp2.columns = [f'{c}_diff' for c in tmp2.columns]

In [312]:
tmp1

Unnamed: 0_level_0,instructions_diff,cycles_diff,ref_cycles_diff,llc_miss_diff,joules_diff,timestamp_diff
i,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
87,494620.0,1156267.0,2579347.0,2857.0,0.107299,0.001098
97,480904.0,910190.0,2030377.0,2506.0,0.115168,0.001023
107,453412.0,904490.0,2017791.0,2402.0,0.234057,0.001025
117,419248.0,884458.0,1973044.0,2205.0,0.000000,0.001024
127,332785.0,668349.0,1491035.0,1564.0,0.235704,0.001024
...,...,...,...,...,...,...
189104,306646.0,614471.0,1370801.0,740.0,0.108397,0.001052
189114,328868.0,657008.0,1465747.0,841.0,0.107909,0.001179
189122,372074.0,813075.0,1813776.0,1486.0,0.103212,0.001005
189132,648734.0,1232771.0,2750041.0,2401.0,0.120170,0.001040


In [313]:
# ignoring all log files that produce negative diffs due to overflow
df1_diffs_neg = tmp1[(tmp1['joules_diff'] < 0) | (tmp1['instructions_diff'] < 0) | (tmp1['cycles_diff'] < 0) \
                   | (tmp1['ref_cycles_diff'] < 0) | (tmp1['llc_miss_diff'] < 0) | (tmp1['timestamp_diff'] < 0)]
df1_diffs = tmp1[(tmp1['joules_diff'] >= 0) | (tmp1['instructions_diff'] >= 0) | (tmp1['cycles_diff'] >= 0) \
                   | (tmp1['ref_cycles_diff'] >= 0) | (tmp1['llc_miss_diff'] >= 0) | (tmp1['timestamp_diff'] >= 0)]
if df1_diffs_neg.shape[0] > 0:
    print('NEGATIVE DIFFS')
    print(df1_diffs_neg)

df2_diffs_neg = tmp2[(tmp2['joules_diff'] < 0) | (tmp2['instructions_diff'] < 0) | (tmp2['cycles_diff'] < 0) \
                   | (tmp2['ref_cycles_diff'] < 0) | (tmp2['llc_miss_diff'] < 0) | (tmp2['timestamp_diff'] < 0)]
df2_diffs = tmp2[(tmp2['joules_diff'] >= 0) | (tmp2['instructions_diff'] >= 0) | (tmp2['cycles_diff'] >= 0) \
                   | (tmp2['ref_cycles_diff'] >= 0) | (tmp2['llc_miss_diff'] >= 0) | (tmp2['timestamp_diff'] >= 0)]
if df2_diffs_neg.shape[0] > 0:
    print('NEGATIVE DIFFS')
    print(df2_diffs_neg)
    
# dropping timestamp_diff column
df1_diffs = df1_diffs.drop(['timestamp_diff'], axis=1).reset_index()
df2_diffs = df2_diffs.drop(['timestamp_diff'], axis=1).reset_index()

In [314]:
df2_diffs

Unnamed: 0,i,instructions_diff,cycles_diff,ref_cycles_diff,llc_miss_diff,joules_diff
0,143,547085.0,1166322.0,2612987.0,3968.0,0.112057
1,153,460793.0,1079236.0,2407551.0,2795.0,0.115290
2,163,417253.0,827774.0,1846546.0,2268.0,0.120109
3,173,297719.0,781661.0,1743741.0,1653.0,0.112789
4,183,577270.0,1037151.0,2313649.0,2954.0,0.124440
...,...,...,...,...,...,...
19229,188485,292730.0,589911.0,1325010.0,825.0,0.218258
19230,188493,198113.0,459273.0,1024976.0,627.0,0.104432
19231,188503,609253.0,1214345.0,2708919.0,2733.0,0.115717
19232,188513,629149.0,1327935.0,2962321.0,2341.0,0.131028


In [315]:
df1_diffs

Unnamed: 0,i,instructions_diff,cycles_diff,ref_cycles_diff,llc_miss_diff,joules_diff
0,87,494620.0,1156267.0,2579347.0,2857.0,0.107299
1,97,480904.0,910190.0,2030377.0,2506.0,0.115168
2,107,453412.0,904490.0,2017791.0,2402.0,0.234057
3,117,419248.0,884458.0,1973044.0,2205.0,0.000000
4,127,332785.0,668349.0,1491035.0,1564.0,0.235704
...,...,...,...,...,...,...
19244,189104,306646.0,614471.0,1370801.0,740.0,0.108397
19245,189114,328868.0,657008.0,1465747.0,841.0,0.107909
19246,189122,372074.0,813075.0,1813776.0,1486.0,0.103212
19247,189132,648734.0,1232771.0,2750041.0,2401.0,0.120170


In [316]:
#df11 = df1_diffs.head(10)
#df22 = df2_diffs.head(10)

In [317]:
df11 = df1_diffs.copy()
df22 = df2_diffs.copy()

In [318]:
df22

Unnamed: 0,i,instructions_diff,cycles_diff,ref_cycles_diff,llc_miss_diff,joules_diff
0,143,547085.0,1166322.0,2612987.0,3968.0,0.112057
1,153,460793.0,1079236.0,2407551.0,2795.0,0.115290
2,163,417253.0,827774.0,1846546.0,2268.0,0.120109
3,173,297719.0,781661.0,1743741.0,1653.0,0.112789
4,183,577270.0,1037151.0,2313649.0,2954.0,0.124440
...,...,...,...,...,...,...
19229,188485,292730.0,589911.0,1325010.0,825.0,0.218258
19230,188493,198113.0,459273.0,1024976.0,627.0,0.104432
19231,188503,609253.0,1214345.0,2708919.0,2733.0,0.115717
19232,188513,629149.0,1327935.0,2962321.0,2341.0,0.131028


In [319]:
df_full = df11.merge(df22, left_index=True, right_index=True, how='outer', sort=True).fillna(0).copy()

In [320]:
df_full

Unnamed: 0,i_x,instructions_diff_x,cycles_diff_x,ref_cycles_diff_x,llc_miss_diff_x,joules_diff_x,i_y,instructions_diff_y,cycles_diff_y,ref_cycles_diff_y,llc_miss_diff_y,joules_diff_y
0,87,494620.0,1156267.0,2579347.0,2857.0,0.107299,143.0,547085.0,1166322.0,2612987.0,3968.0,0.112057
1,97,480904.0,910190.0,2030377.0,2506.0,0.115168,153.0,460793.0,1079236.0,2407551.0,2795.0,0.115290
2,107,453412.0,904490.0,2017791.0,2402.0,0.234057,163.0,417253.0,827774.0,1846546.0,2268.0,0.120109
3,117,419248.0,884458.0,1973044.0,2205.0,0.000000,173.0,297719.0,781661.0,1743741.0,1653.0,0.112789
4,127,332785.0,668349.0,1491035.0,1564.0,0.235704,183.0,577270.0,1037151.0,2313649.0,2954.0,0.124440
...,...,...,...,...,...,...,...,...,...,...,...,...
19244,189104,306646.0,614471.0,1370801.0,740.0,0.108397,0.0,0.0,0.0,0.0,0.0,0.000000
19245,189114,328868.0,657008.0,1465747.0,841.0,0.107909,0.0,0.0,0.0,0.0,0.0,0.000000
19246,189122,372074.0,813075.0,1813776.0,1486.0,0.103212,0.0,0.0,0.0,0.0,0.0,0.000000
19247,189132,648734.0,1232771.0,2750041.0,2401.0,0.120170,0.0,0.0,0.0,0.0,0.0,0.000000


In [321]:
# creating a larger and more sparse dataframe of average diffs across log files
df_final = pd.DataFrame(columns=['instructions_diff', 'cycles_diff', 'ref_cycles_diff', 'llc_miss_diff', 'joules_diff'])
count = 0
for i,j in df_full.iterrows():
    new_row = {}
    for col in df_final.columns:
        sum = 0
        for s in ['x', 'y']:
            sum += j[col+'_'+s]
        new_row[col] = sum / 2
#     for col in list(j.index):
#         col_final = ''
#         for c in col.split('_')[0:-1]:
#             col_final += c + '_'
#         col_final = col_final[0:-1]
#         print(col_final)
    df_final = df_final._append(new_row, ignore_index = True)
    count += 1

# TODO joules_sum averages to per-core joules_sum as a test

In [322]:
df_final

Unnamed: 0,instructions_diff,cycles_diff,ref_cycles_diff,llc_miss_diff,joules_diff
0,520852.5,1161294.5,2596167.0,3412.5,0.109678
1,470848.5,994713.0,2218964.0,2650.5,0.115229
2,435332.5,866132.0,1932168.5,2335.0,0.177083
3,358483.5,833059.5,1858392.5,1929.0,0.056395
4,455027.5,852750.0,1902342.0,2259.0,0.180072
...,...,...,...,...,...
19244,153323.0,307235.5,685400.5,370.0,0.054198
19245,164434.0,328504.0,732873.5,420.5,0.053954
19246,186037.0,406537.5,906888.0,743.0,0.051606
19247,324367.0,616385.5,1375020.5,1200.5,0.060085
