# Data Analysis

## Libraries

In [1]:
# Libraries
import re
import sys
import os
import glob
import webbrowser
import natsort
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from sklearn.preprocessing import MaxAbsScaler
from IPython.display import display, HTML

## Releases of each programming language according to their respective compilers/interpreters

In [2]:
# All releases measured
actual_directory = os.getcwd() + '/'  

python_releaseDates = {
    'Python 3.13.0a0': '2023-06-07',
    'Python 3.12.0b1': '2023-05-22',
    'Python 3.11.3': '2023-04-05',
    'Python 3.10.11': '2023-04-05',
    'Python 3.9.16': '2022-12-06',
    'Python 3.8.16': '2022-12-06',
    'Python 3.7.16': '2022-12-06',
    'Python 3.6.15': '2021-09-04',
    'Python 3.5.10': '2020-09-05',
    'Python 3.4.10': '2019-03-18',
    'Python 3.3.7': '2017-09-19',
    'Python 3.2.6': '2014-10-11',
    'Python 3.1.5': '2012-04-09',
    'Python 3.0.1': '2009-02-13',
    'Python 2.7.18': '2020-04-20',
    'Python 2.6.9': '2013-10-29',
    'Python 2.5.6': '2011-05-26',
}

cplusplus_releaseDates = {
    'g++-4.4 4.4.7': '2012-03-13',
    'g++-4.6 4.6.4': '2013-04-12',
    'g++-4.7 4.7.4': '2014-06-12',
    'g++-4.8 4.8.5': '2015-06-23',
    'g++-4.9 4.9.3': '2015-06-26',
    'g++-5 5.5.0': '2017-10-10',
    'g++-6 6.5.0': '2018-10-26',
    'g++-7 7.5.0': '2019-11-14',
    'g++-8 8.5.0': '2021-05-14',
    'g++-8 ': '2021-05-14',
    'g++-9 9.5.0': '2022-05-27',
    'g++-10 10.4.0': '2022-06-28',
    'g++-10 10.5.0': '2023-07-07',
    'g++-11 11.4.0': '2023-05-29',
    'g++-12 12.3.0': '2023-05-08',
    'g++-13 13.1.0': '2023-04-26',
}

java_releaseDates = {
    '1.8.0_362': '2023-04-18',
    '1.8.0_382': '2023-07-18',
    '9.0.4': '2018-01-16',
    '10.0.2': '2018-07-17',
    '11.0.19': '2020-10-20',
    '11.0.20': '2023-07-18',
    '11.0.20.1': '2023-08-22',
    '12.0.2': '2019-07-16',
    '13.0.2': '2020-01-14',
    '14.0.2': '2020-07-14',
    '15.0.2': '2021-01-19',
    '16.0.2': '2021-07-20',
    '17.0.7': '2023-04-18',
    '17.0.8': '2023-07-18',
    '17.0.8.1': '2023-08-22',
    '18.0.2-ea': '2022-07-19',
    '19.0.2': '2023-01-17',
    '20.0.2': '2023-07-18',
}

js_releaseDates = {
    '20.5.1': '2023-08-09',
    '19.9.0': '2023-04-10',
    '18.17.1': '2023-08-08',
    '17.9.1': '2022-06-01',
    '16.20.2': '2023-08-08',
    '15.14.0': '2021-04-06',
    '14.21.3': '2023-02-16',
    '13.14.0': '2020-04-29',
    '12.22.12': '2022-04-05',
    '11.15.0': '2019-04-30',
    '10.24.1': '2021-04-06',
    '9.11.2': '2018-06-12',
    '8.17.0': '2019-12-17',
    '7.10.1': '2017-07-11',
    '6.17.1': '2019-04-03',
    '5.12.0': '2016-06-23',
    '4.9.1': '2018-03-29',
    '3.3.1': '2015-09-15',
    '2.5.0': '2015-07-28',
    '1.8.4': '2015-07-09',
    '0.12.18': '2017-02-22',
    '0.10.48': '2016-10-18',
    '0.8.28': '2014-07-31'
}

## Functions for extracting information

In [3]:
# Function to get release date for a given Python version
def get_release_date(version):
    if language == 'python':
        return python_releaseDates.get(version, 'Unknown')
    elif language == 'c++':
        return cplusplus_releaseDates.get(version, 'Unknown')
    elif language == 'java':
        return java_releaseDates.get(version, 'Unknown')
    elif language == 'js':
        return js_releaseDates.get(version, 'Unknown')

# Function to convert "g", "m" or KiB to "byte"
def convert_g_to_byte(value):
    value_str = str(value)
    if value_str[:-1] == "":
        Byte_value = 0
    elif value_str[-1].lower() == 'g':
        g_value = float(value_str[:-1])
        Byte_value = g_value * 1024 * 1024 * 1024  # 1 giga = 1,000,000,000
    elif value_str[-1].lower() == 'm':
        m_value = float(value_str[:-1])
        Byte_value = m_value * 1024 * 1024 # 1 giga = 1,000,000
    elif value_str[-1].lower() == 'k':
        k_value = float(value_str[:-1])
        Byte_value = k_value * 1024 # 1 kilo = 1,000
    else:
        k_value = float(value_str)
        Byte_value = k_value * 1024 # 1 kilo = 1,000
    return int(Byte_value)

def convert_toUnit(column):
    column = column.apply(convert_g_to_byte)
    column = pd.to_numeric(column, errors='coerce')
    return column

# Define a function to remove units from a string
def remove_units(cell_value):
    # Use regular expressions to remove units
    cell_value = re.sub(r'[A-Za-z]+', '', str(cell_value))
    return cell_value
    
def Data_normalized(df, tool):
    df_data = df[['version', 'release_date', 'path', 'appplication']]
    if tool == "turbostat": 
        df_metric = df.loc[:, ~df.columns.isin(['version', 'release_date', 'path', 'appplication'])]
    elif tool == "top":
        df_metric = df[['virt', 'res', 'shr', 'percent_cpu', 'percent_mem',
                    'nTH', 'P', 'SWAP', 'CODE', 'DATA', 'nMaj',
                    'nDRT', 'USED']]

    transformer = MaxAbsScaler().fit(df_metric)
    scaled = transformer.transform(df_metric)

    df_norm = pd.DataFrame(scaled, columns=df_metric.columns)
    df = pd.concat([df_data,df_norm.reindex(df_data.index)], axis=1)
    return df

# Function to extract information in every file
def from_CSVfile(file, directory, tool):
     # Read CSV file
    df = pd.read_csv(file)
    if language == 'js': df['version'] = df['version'].str.replace('v', '')
    
    # New column 'release_date' as the second 
    df['release_date'] = df['version'].apply(get_release_date)
    df.insert(1, 'release_date', df.pop('release_date'))

    # Convert date into datetime
    df['release_date'] = pd.to_datetime(df['release_date'])

    # New column 'path' as the third
    df['path'] = directory
    df.insert(2, 'path', df.pop('path'))

    # Clean and remove the unnecessary rows
    df.replace(to_replace='-', value=0, inplace=True)

    # Apply the conversion function to the DataFrame column
    if tool == "top":
        df = df.dropna(subset=['command'])
        df['virt'] = convert_toUnit(df['virt'])
        df['res'] = convert_toUnit(df['res'])
        df['shr'] = convert_toUnit(df['shr'])
        df['CODE'] = convert_toUnit(df['CODE'])
        df['DATA'] = convert_toUnit(df['DATA'])
        df['SWAP'] = convert_toUnit(df['SWAP'])
        df['USED'] = convert_toUnit(df['USED'])
        df['nMin'] = convert_toUnit(df['nMin'])
        df['nMaj'] = convert_toUnit(df['nMaj'])

    # Changes in the 'version' column
    if language == 'python': df['version'] = df['version'].str.replace('Python ', '')
    if language == 'c++': df['version'] = df['version'].str.split().str[0]

    # with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    #     print(df)

    return df


# Extract all information from ALL files
def from_CSVfiles(tool, norm):

    all_df = pd.DataFrame()

    list_directories = next(os.walk(language))[1]
    list_directories.sort()

    for directory_name in list_directories:
        if directory_name != "waiting" and directory_name != "older" and directory_name != "test" and directory_name != "general_plots" and directory_name != "general_plots_v1":
            
            path=language + '/' + directory_name + '/' + tool + '/'
            
            if tool == "top":

                df = pd.DataFrame()
                list_files = os.listdir(path)

                # Get list of all files only in the given directory
                list_files = natsort.natsorted(list_files)

                for file_name in list_files:
                    if file_name.startswith("temp_top_data_") and file_name.endswith('.csv'):
                        df_Top = from_CSVfile(path + file_name, directory_name, tool)
                        df = pd.concat([df, df_Top])

                # Split the your_column into minutes, seconds, and hundredths
                df[['minutes', 'seconds_hundredths']] = df['time'].str.split(':', expand=True)

                # Convert minutes, seconds, and hundredths to seconds
                df['time'] = pd.to_numeric(df['minutes']) * 60 + pd.to_numeric(df['seconds_hundredths'])

                df.to_csv(path + "top_data_allVersions.csv", index=False)
            elif tool == "turbostat":
                df = pd.DataFrame()
                list_files = os.listdir(path)

                # Get list of all files only in the given directory
                list_files = natsort.natsorted(list_files)

                for file_name in list_files:
                    if file_name.startswith("turbostat_performance_data") and file_name.endswith('.csv') and file_name != "turbostat_performance_data_allVersions.csv":
                        df_Turbo = from_CSVfile(path + file_name, directory_name, tool)
                        df = pd.concat([df, df_Turbo])
                df['Pkg+RAM_J'] = df['Pkg_J'] + df['RAM_J']
                df['Pkg_Watt'] = df['Pkg_J'] / df['time_elapsed']
                df['RAM_Watt'] = df['RAM_J'] / df['time_elapsed']
                df['Pkg+RAM_Watts'] = df['Pkg_Watt'] + df['RAM_Watt']
            elif tool == "perf": 
                # path = path + tool
                df = pd.DataFrame()
                list_files = os.listdir(path)

                # Get list of all files only in the given directory
                list_files = natsort.natsorted(list_files)
            
                for file_name in list_files:
                    if file_name.startswith("perf_performance_data") and file_name.endswith('.csv'):
                        df_Perf= from_CSVfile(path + file_name, directory_name, tool)
                        df = pd.concat([df, df_Perf])

                # Extract the units contained in the every measurement, for instance, the column 'time_elapsed'
                # contains values in msec; so this convert into secs and remove the unit "msec" to add it in the header
                parameters = ["time_elapsed","freq_cycles","cpu_clock","freq_cpu_cycles","task_clock","cpu_thermal_margin"]

                for parameter in parameters:
                    colname = df[parameter].str.extract(r'\s(.+)$').iloc[0, 0]
                    if parameter == "time_elapsed":
                        df[parameter] = df[parameter].str.replace(colname, '').astype(int)
                        df['time_elapsed'] = df['time_elapsed'] / 1000000000
                        df.rename(columns={parameter: f'{parameter}_sec'}, inplace=True)
                    else:
                        df[parameter] = df[parameter].str.replace(colname, '').astype(float)
                        df.rename(columns={parameter: f'{parameter}_{colname}'}, inplace=True)

                df = df.rename(columns={"IPC": "IPC_perf"})

                df.to_csv(path + "perf_data_allVersions_10times.csv", index=False)
            else:
                # df = from_CSVfile(path + '_data_allVersions.csv', directory_name, tool)
                # if norm: df = Data_normalized(df, tool)
                print("Error selecting the tool. Please check an appropriate too.")
            all_df = pd.concat([all_df, df])

    return all_df

## Functions for plotting

In [4]:
def plot_Type(df, filename_plot, x_data, y_data, color_data, type):

    if type == "corrTurbo":

        corrs = df[['time_elapsed','Pkg_J','Cor_J','GFX_J','RAM_J', 'Avg_MHz', 'Busy%', 'IPC', 'IRQ', 'POLL', 'C1%','C1E%','C3%','C6%','C7s%','C8%','C9%','C10%','CPU%c1','CPU%c3','CPU%c6','CPU%c7','CoreTmp','PkgTmp','GFX%rc6','Totl%C0','Any%C0','GFX%C0','CPUGFX%']].corr()

        np.fill_diagonal(corrs.values, np.nan)

        corrs = corrs.head(5)

        heat = go.Heatmap(z=corrs.values.round(2),
                        x=list(corrs.columns),
                        y=list(corrs.index),
                        xgap=1, ygap=1,
                        texttemplate="%{z}",
                        showscale=True,
                        colorbar_thickness=20,
                        colorbar_ticklen=3,
                        # zmax=1, zmin=0.1
                        )
        layout = go.Layout(title_x=0.5, 
                        xaxis_showgrid=False,
                        yaxis_showgrid=False,
                        yaxis_autorange='reversed')
        fig=go.Figure(data=[heat], layout=layout)

    elif type == "corrPerf":

        corrs = df[['time_elapsed','Pkg_J','Cor_J','GFX_J','RAM_J', 'Avg_MHz', 'Busy%', 'IPC', 'IRQ', 'POLL', 'C1%','C1E%','C3%','C6%','C7s%','C8%','C9%','C10%','CPU%c1','CPU%c3','CPU%c6','CPU%c7','CoreTmp','PkgTmp','GFX%rc6','Totl%C0','Any%C0','GFX%C0','CPUGFX%']].corr()

        np.fill_diagonal(corrs.values, np.nan)

        corrs = corrs.head(5)

        heat = go.Heatmap(z=corrs.values.round(2),
                        x=list(corrs.columns),
                        y=list(corrs.index),
                        xgap=1, ygap=1,
                        texttemplate="%{z}",
                        showscale=True,
                        colorbar_thickness=20,
                        colorbar_ticklen=3,
                        # zmax=1, zmin=0.1
                        )
        layout = go.Layout(title_x=0.5, 
                        xaxis_showgrid=False,
                        yaxis_showgrid=False,
                        yaxis_autorange='reversed')
        fig=go.Figure(data=[heat], layout=layout)

    elif type == "scattermatrixTurbo":

        # fig = ff.create_scatterplotmatrix(df[['time_elapsed', 'Pkg_J', 'RAM_J', 'version']], 
        #                           height=1000,
        #                           width=1000,
        #                           diag='histogram',
        #                           text=df['version'],
        #                           index='version')
        
        fig = px.scatter(df, x='time_elapsed', y='Pkg+RAM_J', color='path',
                 animation_frame="version",
                 range_x=[df['time_elapsed'].min(),df['time_elapsed'].max()], range_y=[df['Pkg_J'].min(),df['Pkg_J'].max()]
                 )
    elif type == "scatterTurbo":

        fig = px.scatter(df, x='time_elapsed', y='Pkg+RAM_J', color="version")

    else:

        if type == "line" or type == "lineTop" or type == "lineTurbo":
            
            if type == "lineTop": df = df.groupby([x_data,color_data], sort=False)[[y_data]].median().reset_index()
            if type == "lineTurbo": df = df.groupby([x_data,color_data], sort=False)[[y_data]].median().reset_index()

            fig = px.line(df,
                    x = x_data,
                    y = y_data,
                    color = color_data,
                    title=language + ' - ' + filename_plot)
            fig.update_traces(textposition="bottom right")
            buttons=list([
                        dict(
                            args=[{"type": "line",}],
                            label="Line Chart",
                            method="restyle"
                            ),
                        dict(
                            args=[{"type": "bar"}],
                            label="Bar Chart",
                            method="restyle"
                        )
                    ])

        elif type == "bar" or type == "barTop":

            if type == "barTop": df = df.groupby([x_data,color_data], sort=False)[[y_data]].median().reset_index()
            if type == "barTurbo": df = df.groupby([x_data,color_data], sort=False)[[y_data]].median().reset_index()

            fig = px.bar(df,
                        x = x_data,
                        y = y_data,
                        color = color_data,
                        title=language + ' - ' + filename_plot)
            buttons=list([
                        dict(
                            args=[{"type": "bar"}],
                            label="Bar Chart",
                            method="restyle"
                        ),dict(
                            args=[{"type": "line",}],
                            label="Line Chart",
                            method="restyle"
                        )
                        ])

        elif type == "box":
            fig = px.box(df,
                            x = x_data,
                            y = y_data,
                            color = color_data,
                            title=language + ' - ' + filename_plot)
            # fig.update_traces(textposition="bottom right")

            buttons=list([
                        dict(
                            args=[{"type": "box"}],
                            label="Box Plot",
                            method="restyle"
                        )])

        
        updatemenus = list([
                dict(
                    type="dropdown",
                    direction="down",
                    x=0.12,
                    y=1.12,
                    xanchor="left",
                    yanchor="top",
                    pad={"r": 10, "t": 10},
                    buttons=buttons
                ),
                dict(
                    type="dropdown",
                    direction="down",
                    x=0.44,
                    y=1.12,
                    xanchor="left",
                    yanchor="top",
                    pad={"r": 10, "t": 10},
                    buttons=list([
                        dict(
                            args=[{"yaxis.type": "linear"}],
                            label="Linear Scale",
                            method="relayout"
                        ),
                        dict(
                            args=[{"yaxis.type": "log"}],
                            label="Log Scale",
                            method="relayout"
                        )
                    ])
                ),
            ])

        annotations=[
                dict(text="Plot type:", x=-0.01, xref="paper", y=1.08, yref="paper",
                                    align="left", showarrow=False),
                dict(text="Scale:", x=0.4, xref="paper", y=1.08,
                                    yref="paper", showarrow=False),
        ]

        fig.update_layout(updatemenus=updatemenus, annotations=annotations, hovermode="x unified")
        if language == 'js': fig.update_xaxes(categoryorder='array', categoryarray= ['0.8.28', '0.10.48', '0.12.18', '1.8.4', '2.5.0', '3.3.1', '4.9.1', '5.12.0', '6.17.1', '7.10.1', '8.17.0', '9.11.2', '10.24.1', '11.15.0', '12.22.12', '13.14.0', '14.21.3', '15.14.0', '16.20.2', '17.9.1', '18.17.1', '19.9.0', '20.5.1'])


    # Check if the directory exists
    directory = language + '/general_plots/'
    if not os.path.exists(directory):
        # If it doesn't exist, create it
        os.makedirs(directory)

    filename_plot_wDir = directory + filename_plot
    fig.show()

## Functions for calling the specific plots

In [5]:
def two_plots(df, title, filename_plot, x_data, y_data, color_data, type):

    fig1 = plot_Type(df, filename_plot, x_data, y_data, color_data, type)
    # fig2 = plot_Compare(df, filename_plot + "_Comparison_Diff", x_data, y_data, color_data, type, diff_flag=True)

## Results

### Extract all dataset

In [15]:
language = "java"

df_turbostat = from_CSVfiles("turbostat", norm=False)
df_top = from_CSVfiles("top", norm=False)
df_perf = from_CSVfiles("perf", norm=False)

df1 = df_turbostat.groupby(["path","version","release_date"], sort=False)[['time_elapsed',
                        'Pkg_J','Cor_J','GFX_J','RAM_J', 'Avg_MHz', 'Busy%', 'IPC', 'IRQ', 'POLL', 'C1%','C1E%','C3%','C6%','C7s%','C8%',
                        'C9%','C10%','CPU%c1','CPU%c3','CPU%c6','CPU%c7','CoreTmp','PkgTmp','GFX%rc6','Totl%C0','Any%C0','GFX%C0','CPUGFX%']].median().reset_index()
df2 = df_perf.groupby(["path","version","release_date"], sort=False)[['time_elapsed_sec','CPU_Utilization','Retiring','Frontend_Bound','Bad_Speculation','Backend_Bound',
                        'CPI','ILP','IPC','cycles','freq_cycles_GHz','instructions',
                        'Kernel_Utilization', 'L1D_Cache_Fill_BW', 'Turbo_Utilization', 'cycles',
                        'instructions', 'insn_per_cycle', 'cpu_clock_msec', 'no_cpus', 'cpu_cycles', 'freq_cpu_cycles_GHz',
                        'cpu_migrations','ref_cycles','bus_cycles','task_clock_msec','no_cpus_task_clock',
                        'cpu_thermal_margin_C','branches','branch_misses','mem_loads','mem_stores','page_faults','minor_faults','major_faults',
                        'cache_references','cache_misses','percent_cache_misses','L1_dcache_loads','L1_dcache_load_misses',
                        'LLC_loads','LLC_load_misses','L1_icache_load_misses','dTLB_loads',
                        'dTLB_load_misses','iTLB_loads','iTLB_load_misses']].median().reset_index()
df3 = df_top.groupby(["path","version","release_date"], sort=False)[['time','virt','res','shr','percent_cpu','percent_mem','nTH','P',
                    'SWAP','CODE','DATA','nMaj','nDRT','USED']].median().reset_index()

df = df1.merge(df2,how ='left').merge(df3,how ='left')
df

In [13]:
df1 = df_turbostat.groupby(["path","version","release_date"], sort=False)[['time_elapsed',
                        'Pkg_J','Cor_J','GFX_J','RAM_J', 'Avg_MHz', 'Busy%', 'IPC', 'IRQ', 'POLL', 'C1%','C1E%','C3%','C6%','C7s%','C8%',
                        'C9%','C10%','CPU%c1','CPU%c3','CPU%c6','CPU%c7','CoreTmp','PkgTmp','GFX%rc6','Totl%C0','Any%C0','GFX%C0','CPUGFX%']].median().reset_index()
df2 = df_perf.groupby(["path","version","release_date"], sort=False)[['time_elapsed_sec','CPU_Utilization','Retiring','Frontend_Bound','Bad_Speculation','Backend_Bound',
                            'CPI','ILP','IPC_perf','cycles','freq_cycles_GHz','instructions',
                            'Kernel_Utilization', 'L1D_Cache_Fill_BW', 'Turbo_Utilization', 'cycles',
                            'instructions', 'insn_per_cycle', 'cpu_clock_msec', 'no_cpus', 'cpu_cycles', 'freq_cpu_cycles_GHz',
                            'cpu_migrations','ref_cycles','bus_cycles','task_clock_msec','no_cpus_task_clock',
                            'cpu_thermal_margin_C','branches','branch_misses','mem_loads','mem_stores','page_faults','minor_faults','major_faults',
                            'cache_references','cache_misses','percent_cache_misses','L1_dcache_loads','L1_dcache_load_misses',
                            'LLC_loads','LLC_load_misses','L1_icache_load_misses','dTLB_loads',
                            'dTLB_load_misses','iTLB_loads','iTLB_load_misses']].median().reset_index()
df3 = df_top.groupby(["path","version","release_date"], sort=False)[['time','virt','res','shr','percent_cpu','percent_mem','nTH','P',
                        'SWAP','CODE','DATA','nMaj','nDRT','USED']].median().reset_index()

df = df1.merge(df2,how ='left').merge(df3,how ='left')
len(df['path'])


39

In [None]:
corrs = df_perf[["time_elapsed","CPU_Utilization", "CPI"]].corr()

np.fill_diagonal(corrs.values, np.nan)

corrs = corrs.head(5)

heat = go.Heatmap(z=corrs.values.round(2),
                        x=list(corrs.columns),
                        y=list(corrs.index),
                        xgap=1, ygap=1,
                        texttemplate="%{z}",
                        showscale=True,
                        colorbar_thickness=20,
                        colorbar_ticklen=3,
                        # zmax=1, zmin=0.1
                        )
layout = go.Layout(title_x=0.5, 
                        xaxis_showgrid=False,
                        yaxis_showgrid=False,
                        yaxis_autorange='reversed')
fig=go.Figure(data=[heat], layout=layout)
fig.show()

KeyError: "['time_elapsed'] not in index"

In [None]:
df1 = df_turbostat.groupby(["path","version","release_date"], sort=False)[['time_elapsed',
                        'Pkg_J','Cor_J','GFX_J','RAM_J', 'Avg_MHz', 'Busy%', 'IPC', 'IRQ', 'POLL', 'C1%','C1E%','C3%','C6%','C7s%','C8%',
                        'C9%','C10%','CPU%c1','CPU%c3','CPU%c6','CPU%c7','CoreTmp','PkgTmp','GFX%rc6','Totl%C0','Any%C0','GFX%C0','CPUGFX%']].median().reset_index()
df2 = df_perf.groupby(["path","version","release_date"], sort=False)[['time_elapsed_sec','CPU_Utilization','Retiring','Frontend_Bound','Bad_Speculation','Backend_Bound',
                        'CPI','ILP','IPC','cycles','freq_cycles_GHz','instructions',
                        'Kernel_Utilization', 'L1D_Cache_Fill_BW', 'Turbo_Utilization', 'cycles',
                        'instructions', 'insn_per_cycle', 'cpu_clock_msec', 'no_cpus', 'cpu_cycles', 'freq_cpu_cycles_GHz',
                        'cpu_migrations','ref_cycles','bus_cycles','task_clock_msec','no_cpus_task_clock',
                        'cpu_thermal_margin_C','branches','branch_misses','mem_loads','mem_stores','page_faults','minor_faults','major_faults',
                        'cache_references','cache_misses','percent_cache_misses','L1_dcache_loads','L1_dcache_load_misses',
                        'LLC_loads','LLC_load_misses','L1_icache_load_misses','dTLB_loads',
                        'dTLB_load_misses','iTLB_loads','iTLB_load_misses']].median().reset_index()
df3 = df_top.groupby(["path","version","release_date"], sort=False)[['time','virt','res','shr','percent_cpu','percent_mem','nTH','P',
                    'SWAP','CODE','DATA','nMaj','nDRT','USED']].median().reset_index()

df = df1.merge(df2,how ='left').merge(df3,how ='left')
df

Unnamed: 0,path,version,release_date,time_elapsed,Pkg_J,Cor_J,GFX_J,RAM_J,Avg_MHz,Busy%,...,percent_cpu,percent_mem,nTH,P,SWAP,CODE,DATA,nMaj,nDRT,USED
0,binaryTrees_21,2.5.6,2011-05-26,186.595177,2541.515,2055.225,0.235,134.605,440.0,12.625,...,100.0,0.4,1.0,3.0,0.0,0.0,62140416.0,0.0,0.0,65695744.0
1,binaryTrees_21,2.7.18,2020-04-20,186.452696,2234.465,1839.435,0.030,77.085,427.0,12.630,...,100.0,0.4,1.0,3.0,0.0,4096.0,68526080.0,0.0,0.0,72269824.0
2,binaryTrees_21,3.0.1,2009-02-13,221.321456,2757.755,2293.860,0.020,86.765,431.5,12.620,...,100.0,0.1,1.0,2.0,0.0,0.0,10850304.0,0.0,0.0,14966784.0
3,binaryTrees_21,3.4.10,2019-03-18,214.759298,2568.645,2028.655,0.210,147.925,418.0,12.630,...,100.0,0.1,1.0,2.0,0.0,4096.0,11943936.0,0.0,0.0,16801792.0
4,binaryTrees_21,3.5.10,2020-09-05,229.826346,2755.885,2170.560,0.240,160.810,418.5,12.635,...,100.0,0.1,1.0,1.0,0.0,4096.0,13385728.0,0.0,0.0,18358272.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63,nbody_50000000_OOflag,3.9.16,2022-12-06,520.064931,6902.065,5821.935,0.240,145.820,456.0,12.620,...,100.0,0.1,1.0,3.0,0.0,4096.0,3145728.0,0.0,0.0,8671232.0
64,nbody_50000000_OOflag,3.10.11,2023-04-05,533.623978,7085.675,5978.175,0.245,149.540,456.5,12.610,...,100.0,0.1,1.0,4.0,0.0,4096.0,3678208.0,0.0,0.0,8683520.0
65,nbody_50000000_OOflag,3.11.3,2023-04-05,270.116912,3558.390,2994.555,0.230,76.015,450.5,12.620,...,100.0,0.1,1.0,3.0,0.0,4096.0,5255168.0,0.0,0.0,10518528.0
66,nbody_50000000_OOflag,3.12.0b1,2023-05-22,297.942662,3853.470,3234.520,0.030,83.295,447.0,12.610,...,100.0,0.1,1.0,5.0,0.0,4096.0,5111808.0,0.0,0.0,10321920.0


In [None]:
corrs = df[['time_elapsed','Pkg_J','Cor_J','GFX_J','RAM_J',
                    'Avg_MHz', 'Busy%', 'IPC', 'IRQ', 'POLL', 
                    'C1%','C1E%','C3%','C6%','C7s%','C8%','C9%','C10%',
                    'CPU%c1','CPU%c3','CPU%c6','CPU%c7','CoreTmp','PkgTmp',
                    'GFX%rc6','Totl%C0','Any%C0','GFX%C0','CPUGFX%',
                    'time','virt','res','shr','percent_cpu','percent_mem','nTH','P',
                    'SWAP','CODE','DATA','nMaj','nDRT','USED',
                    'time_elapsed_sec','CPU_Utilization','Retiring','Frontend_Bound','Bad_Speculation','Backend_Bound',
                        'CPI','ILP','IPC','cycles','freq_cycles_GHz','instructions',
                        'Kernel_Utilization', 'L1D_Cache_Fill_BW', 'Turbo_Utilization', 'cycles',
                        'instructions', 'insn_per_cycle', 'cpu_clock_msec', 'no_cpus', 'cpu_cycles', 'freq_cpu_cycles_GHz',
                        'cpu_migrations','ref_cycles','bus_cycles','task_clock_msec','no_cpus_task_clock',
                        'cpu_thermal_margin_C','branches','branch_misses','mem_loads','mem_stores','page_faults','minor_faults','major_faults',
                        'cache_references','cache_misses','percent_cache_misses','L1_dcache_loads','L1_dcache_load_misses',
                        'LLC_loads','LLC_load_misses','L1_icache_load_misses','dTLB_loads',
                        'dTLB_load_misses','iTLB_loads','iTLB_load_misses'
                        ]].corr()

np.fill_diagonal(corrs.values, np.nan)

corrs = corrs.T.head(5).T
corrs
# heat = go.Heatmap(z=corrs.values.round(2),
#                         x=list(corrs.columns),
#                         y=list(corrs.index),
#                         xgap=1, ygap=1,
#                         texttemplate="%{z}",
#                         showscale=True,
#                         colorbar_thickness=20,
#                         colorbar_ticklen=3,
#                         # zmax=1, zmin=0.1
#                         )
# layout = go.Layout(title_x=0.5, 
#                         xaxis_showgrid=False,
#                         yaxis_showgrid=False,
#                         yaxis_autorange='reversed')
# fig=go.Figure(data=[heat], layout=layout)
# fig.show()

Unnamed: 0,time_elapsed,Pkg_J,Cor_J,GFX_J,RAM_J
time_elapsed,,0.994605,0.991893,-0.093720,0.751369
Pkg_J,0.994605,,0.999470,-0.091703,0.739318
Cor_J,0.991893,0.999470,,-0.092633,0.719593
GFX_J,-0.093720,-0.091703,-0.092633,,-0.029239
RAM_J,0.751369,0.739318,0.719593,-0.029239,
...,...,...,...,...,...
L1_icache_load_misses,0.787554,0.788417,0.771801,-0.014068,0.744539
dTLB_loads,0.988158,0.988172,0.987236,0.093688,0.785487
dTLB_load_misses,-0.001478,-0.001974,-0.019336,-0.468455,0.129494
iTLB_loads,-0.844505,-0.846139,-0.834533,-0.100632,-0.744735


In [61]:
language = "c++"

df_turbostat = from_CSVfiles("turbostat", norm=False)
df_top = from_CSVfiles("top", norm=False)
df_perf = from_CSVfiles("perf", norm=False)

df1 = df_turbostat.groupby(["path","version","release_date"], sort=False)[['time_elapsed',
                        'Pkg_J','Cor_J','GFX_J','RAM_J', 'Avg_MHz', 'Busy%', 'IPC', 'IRQ', 'POLL', 'C1%','C1E%','C3%','C6%','C7s%','C8%',
                        'C9%','C10%','CPU%c1','CPU%c3','CPU%c6','CPU%c7','CoreTmp','PkgTmp','GFX%rc6','Totl%C0','Any%C0','GFX%C0','CPUGFX%']].median().reset_index()
df2 = df_perf.groupby(["path","version","release_date"], sort=False)[['time_elapsed_sec','CPU_Utilization','Retiring','Frontend_Bound','Bad_Speculation','Backend_Bound',
                        'CPI','ILP','IPC_perf','cycles','freq_cycles_GHz','instructions',
                        'Kernel_Utilization', 'L1D_Cache_Fill_BW', 'Turbo_Utilization', 'cycles',
                        'instructions', 'insn_per_cycle', 'cpu_clock_msec', 'no_cpus', 'cpu_cycles', 'freq_cpu_cycles_GHz',
                        'cpu_migrations','ref_cycles','bus_cycles','task_clock_msec','no_cpus_task_clock',
                        'cpu_thermal_margin_C','branches','branch_misses','mem_loads','mem_stores','page_faults','minor_faults','major_faults',
                        'cache_references','cache_misses','percent_cache_misses','L1_dcache_loads','L1_dcache_load_misses',
                        'LLC_loads','LLC_load_misses','L1_icache_load_misses','dTLB_loads',
                        'dTLB_load_misses','iTLB_loads','iTLB_load_misses']].median().reset_index()
df3 = df_top.groupby(["path","version","release_date"], sort=False)[['time','virt','res','shr','percent_cpu','percent_mem','nTH','P',
                    'SWAP','CODE','DATA','nMaj','nDRT','USED']].median().reset_index()

df = df1.merge(df2,how ='left').merge(df3,how ='left')
df

Unnamed: 0,path,version,release_date,time_elapsed,Pkg_J,Cor_J,GFX_J,RAM_J,Avg_MHz,Busy%,...,percent_cpu,percent_mem,nTH,P,SWAP,CODE,DATA,nMaj,nDRT,USED
0,binaryTrees_v2_21_original,g++-4.4,2012-03-13,17.725146,317.865,271.990,0.210,14.005,519.0,12.690,...,100.0,1.6,1.0,2.0,0.0,4096.0,268849152.0,0.0,0.0,271605760.0
1,binaryTrees_v2_21_original,g++-4.6,2013-04-12,17.340939,304.520,259.790,0.210,13.745,508.5,12.695,...,100.0,1.6,1.0,2.0,0.0,4096.0,268849152.0,0.0,0.0,271532032.0
2,binaryTrees_v2_21_original,g++-4.7,2014-06-12,17.621673,305.915,260.895,0.210,13.950,510.0,12.670,...,100.0,1.6,1.0,3.0,0.0,0.0,268849152.0,0.0,0.0,271548416.0
3,binaryTrees_v2_21_original,g++-4.8,2015-06-23,17.278999,301.325,256.520,0.210,13.740,512.0,12.695,...,100.0,1.6,1.0,3.0,0.0,4096.0,268849152.0,0.0,0.0,271544320.0
4,binaryTrees_v2_21_original,g++-4.9,2015-06-26,17.258049,297.190,253.060,0.210,13.705,508.5,12.700,...,100.0,1.6,1.0,4.0,0.0,4096.0,268849152.0,0.0,0.0,271556608.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,nbody_50000000_original_O3flag,g++-9,2022-05-27,2.788220,46.775,39.800,0.025,1.680,520.0,12.695,...,100.0,0.0,1.0,3.5,0.0,0.0,606208.0,0.0,0.0,1957888.0
96,nbody_50000000_original_O3flag,g++-10,2023-07-07,2.945812,52.450,45.000,0.030,1.770,519.0,12.680,...,100.0,0.0,1.0,5.0,0.0,0.0,606208.0,0.0,0.0,1996800.0
97,nbody_50000000_original_O3flag,g++-11,2023-05-29,2.934154,50.490,43.135,0.030,1.765,519.5,12.680,...,100.0,0.0,1.0,4.0,0.0,0.0,606208.0,0.0,0.0,2023424.0
98,nbody_50000000_original_O3flag,g++-12,2023-05-08,2.862272,49.640,42.280,0.030,1.720,519.5,12.690,...,100.0,0.0,1.0,3.0,0.0,0.0,606208.0,0.0,0.0,1992704.0


In [63]:
 # Per versions
if language == "python":
    filter_1 = 'version == "3.11.3" or version == "3.12.0b1" or version == "3.13.0a0"'
    filter_2 = 'version != "3.11.3" and version != "3.12.0b1" and version != "3.13.0a0"'
elif language == "c++":
        text1 = "Binary Trees Version 2 of B using -O3"
        text2 = "Binary Trees Version 6 of B using -O3"
        filter_1 = 'path == "binaryTrees_v2_21_original_O3flag"'
        filter_2 = 'path == "binaryTrees_v6_21_original_O3flag"'
elif language == 'java':
    filter_1 = 'version == "1.8.0_382" or version == "9.0.4" or version == "10.0.2"'
    filter_2 = 'version != "1.8.0_382" and version != "9.0.4" and version != "10.0.22"'
elif language == 'js':
    filter_1 = 'path == "nbody_50000000_original" and (version == "6.17.1" or version == "7.10.1")'
    filter_2 = 'path == "nbody_50000000_original" and (version != "6.17.1" and version != "6.17.1")'

df_filtered_1 = df.query(filter_1)
df_filtered_2 = df.query(filter_2)

df_turbostat_filtered_1 = df_turbostat.query(filter_1)
df_turbostat_filtered_2 = df_turbostat.query(filter_2)

df_top_filtered_1 = df_top.query(filter_1)
df_top_filtered_2 = df_top.query(filter_2)

df_perf_filtered_1 = df_perf.query(filter_1)
df_perf_filtered_2 = df_perf.query(filter_2)

In [40]:




corrs = df[['time_elapsed','Pkg_J','Cor_J','GFX_J','RAM_J',
                    'Avg_MHz', 'Busy%', 'IPC', 'IRQ', 'POLL', 
                    'C1%','C1E%','C3%','C6%','C7s%','C8%','C9%','C10%',
                    'CPU%c1','CPU%c3','CPU%c6','CPU%c7','CoreTmp','PkgTmp',
                    'GFX%rc6','Totl%C0','Any%C0','GFX%C0','CPUGFX%',
                    'time','virt','res','shr','percent_cpu','percent_mem','nTH','P',
                    'SWAP','CODE','DATA','nMaj','nDRT','USED',
                    'time_elapsed_sec','CPU_Utilization','Retiring','Frontend_Bound','Bad_Speculation','Backend_Bound',
                        'CPI','ILP','IPC','cycles','freq_cycles_GHz','instructions',
                        'Kernel_Utilization', 'L1D_Cache_Fill_BW', 'Turbo_Utilization', 'cycles',
                        'instructions', 'insn_per_cycle', 'cpu_clock_msec', 'no_cpus', 'cpu_cycles', 'freq_cpu_cycles_GHz',
                        'cpu_migrations','ref_cycles','bus_cycles','task_clock_msec','no_cpus_task_clock',
                        'cpu_thermal_margin_C','branches','branch_misses','mem_loads','mem_stores','page_faults','minor_faults','major_faults',
                        'cache_references','cache_misses','percent_cache_misses','L1_dcache_loads','L1_dcache_load_misses',
                        'LLC_loads','LLC_load_misses','L1_icache_load_misses','dTLB_loads',
                        'dTLB_load_misses','iTLB_loads','iTLB_load_misses'
                        ]].corr()

np.fill_diagonal(corrs.values, np.nan)

corrs = corrs.T.head(5).T
corrs = corrs[(corrs > 0.7) | (corrs < -0.7)]
corrs = corrs.dropna(how='all')
corrs = corrs.sort_values(by=['time_elapsed', 'Pkg_J','Cor_J','RAM_J','GFX_J'], ascending=False)

positive_rows = (corrs['time_elapsed'] > 0) | (corrs['Pkg_J'] > 0) | (corrs['Cor_J'] > 0) | (corrs['RAM_J'] > 0) | (corrs['GFX_J'] > 0)
negative_rows = (corrs['time_elapsed'] < 0) | (corrs['Pkg_J'] < 0) | (corrs['Cor_J'] < 0) | (corrs['RAM_J'] < 0) | (corrs['GFX_J'] < 0)
df_negative = corrs.loc[negative_rows]
df_negative = df_negative.sort_values(by=['time_elapsed'], ascending=True)
df_positive = corrs.loc[positive_rows]

df_positive = df_positive.drop_duplicates(subset=['time_elapsed', 'Pkg_J','Cor_J','RAM_J','GFX_J'], keep='first')
df_negative = df_negative.drop_duplicates(subset=['time_elapsed', 'Pkg_J','Cor_J','RAM_J','GFX_J'], keep='first')

heat = go.Heatmap(z=corrs.values.round(2),
                        x=list(corrs.columns),
                        y=list(corrs.index),
                        xgap=1, ygap=1,
                        texttemplate="%{z}",
                        showscale=True,
                        colorbar_thickness=20,
                        colorbar_ticklen=3,
                        # zmax=1, zmin=0.1
                        )
layout = go.Layout(title_x=0.5, 
                        xaxis_showgrid=False,
                        yaxis_showgrid=False,
                        yaxis_autorange='reversed')
fig=go.Figure(data=[heat], layout=layout)
fig['layout']['xaxis']['side'] = 'top'
fig.show()

# print("\nPOSITIVE Rows:")
# print(corrs.loc[positive_rows])

# print("\nNEGATIVE Rows:")
# print(corrs.loc[negative_rows])
df_negative['position'] = range(1,len(df_negative)+1)
first_column = df_negative.pop('position') 
df_negative.insert(0, 'position', first_column) 

print(df_negative)
print(df_positive)

                     position  time_elapsed     Pkg_J     Cor_J     GFX_J  \
Avg_MHz                     1     -0.895195 -0.861044 -0.849143       NaN   
Turbo_Utilization           2     -0.867248 -0.830337 -0.817069       NaN   
freq_cpu_cycles_GHz         3     -0.843787 -0.811575 -0.798261       NaN   
freq_cycles_GHz             4     -0.840296 -0.805938 -0.792340       NaN   
GFX%C0                      5     -0.812323 -0.826370 -0.826797       NaN   
CPUGFX%                     6     -0.807455 -0.822411 -0.822985       NaN   
C8%                         7     -0.731052 -0.744442 -0.747812       NaN   
CPU%c3                      8           NaN       NaN       NaN -0.708291   

                        RAM_J  
Avg_MHz             -0.826195  
Turbo_Utilization   -0.815142  
freq_cpu_cycles_GHz -0.819100  
freq_cycles_GHz     -0.812074  
GFX%C0                    NaN  
CPUGFX%                   NaN  
C8%                       NaN  
CPU%c3                    NaN  
                  

In [None]:
type = "scattermatrixTurbo"

if type != "scattermatrixTurbo" and type != "scatterTurbo":
    print("Hello")
else:
    print("world!")

world!


### Matrix correlation

### Energy Consumption

In [None]:
two_plots(df_turbostat, "Energy Consumption (Pkg + RAM)",
                    filename_plot="turbostat_Pkg+RAM_J", 
                    x_data="version", 
                    y_data="Pkg+RAM_J", 
                    color_data="path",
                    type="lineTurbo")

In [None]:
two_plots(df_turbostat, "Energy Consumption (Pkg + RAM)",
                    filename_plot="turbostat_Pkg+RAM_J", 
                    x_data="version", 
                    y_data="Pkg+RAM_Watts", 
                    color_data="path",
                    type="lineTurbo")

In [None]:
x_data = "version"
y_data = "Pkg+RAM_Watts"
color_data = "path"


df = df_turbostat.groupby([x_data,color_data], sort=False)[[y_data]].median().reset_index()
df = df[df['version'].str.contains("2.5.6|2.7.18|3.0.1|3.4.10|3.5.10") == False]

if language == "python": color_list = ["DodgerBlue", "DeepSkyBlue", "OrangeRed", "Salmon", "MediumSeaGreen", "LightGreen", "SlateBlue", "Plum", "Gray", "LightGray"]
if language == "c++": color_list = ["DodgerBlue", "DeepSkyBlue", "OrangeRed", "Salmon", "MediumSeaGreen", "LightGreen", "SlateBlue", "Plum", "Gray", "LightGray"]

fig = px.bar(df,
              x = x_data,
              y = y_data,
              color = color_data,
              title="Total Energy Consumption (Pkg + RAM) in " + language,
              barmode="group",
              color_discrete_sequence=color_list)
# fig.update_traces(textposition="bottom right")
fig.show()


In [None]:
x_data = "version"
y_data = "Pkg+RAM_Watts"
color_data = "path"


df = df_turbostat.groupby([x_data,color_data], sort=False)[[y_data]].median().reset_index()
df = df[df['version'].str.contains("2.5.6|2.7.18|3.0.1|3.4.10|3.5.10") == False]

fig = px.line(df,
              x = x_data,
              y = y_data,
              color = color_data,
              title="Total Energy Consumption (Pkg + RAM) of " + language)
fig.update_traces(textposition="bottom right")
fig.show()


In [None]:
df_perf["path"].unique()[1]

'binaryTrees_21_OOflag'

### Top Level - Performance

#### Python

In [None]:
language = "python"
df_perf = from_CSVfiles("perf", norm=False)

x_data = "version"
categories = ["Frontend_Bound","Backend_Bound","Bad_Speculation","Retiring"]
programs = df_perf["path"].unique()


for program in programs:
    print(program)
    df = df_perf.groupby([x_data,"path"], sort=False)[categories].median().reset_index()
    # df = df[df['version'].str.contains("2.5.6|2.7.18|3.0.1|3.4.10|3.5.10") == False]
    df = df.query('path == "'+ program +'"')
    df = df.drop('path', axis=1)
    
    if language == "python": color_list = ["DodgerBlue", "DeepSkyBlue", "OrangeRed", "Salmon", "MediumSeaGreen", "LightGreen", "SlateBlue", "Plum", "Gray", "LightGray"]
    if language == "c++": color_list = ["DodgerBlue", "DeepSkyBlue", "OrangeRed", "Salmon", "MediumSeaGreen", "LightGreen", "SlateBlue", "Plum", "Gray", "LightGray"]

    df_melted = pd.melt(df, id_vars=['version'], var_name='Perf_parameters', value_name='Value')
    fig = px.bar(df_melted,
              x = "version",
              y = "Value",
              color = "Perf_parameters",
              title="Performance of " + program + " in " + language)
    # fig.update_traces(textposition="bottom right")
    fig.show()


binaryTrees_21


binaryTrees_21_OOflag


binaryTrees_21_original


binaryTrees_21_original_OOflag


nbody_50000000


nbody_50000000_OOflag


#### C++

In [None]:
language = "c++"
df_perf = from_CSVfiles("perf", norm=False)

x_data = "version"
categories = ["Frontend_Bound","Backend_Bound","Bad_Speculation","Retiring"]
programs = df_perf["path"].unique()


for program in programs:
    df = df_perf.groupby([x_data,"path"], sort=False)[categories].median().reset_index()
    # df = df[df['version'].str.contains("2.5.6|2.7.18|3.0.1|3.4.10|3.5.10") == False]
    df = df.query('path == "'+ program +'"')
    df = df.drop('path', axis=1)
    
    if language == "python": color_list = ["DodgerBlue", "DeepSkyBlue", "OrangeRed", "Salmon", "MediumSeaGreen", "LightGreen", "SlateBlue", "Plum", "Gray", "LightGray"]
    if language == "c++": color_list = ["DodgerBlue", "DeepSkyBlue", "OrangeRed", "Salmon", "MediumSeaGreen", "LightGreen", "SlateBlue", "Plum", "Gray", "LightGray"]

    df_melted = pd.melt(df, id_vars=['version'], var_name='Perf_parameters', value_name='Value')
    fig = px.bar(df_melted,
              x = "version",
              y = "Value",
              color = "Perf_parameters",
              title="Performance of " + program + " in " + language)
    # fig.update_traces(textposition="bottom right")
    fig.show()

#### Java

In [None]:
language = "java"
df_perf = from_CSVfiles("perf", norm=False)

x_data = "version"
categories = ["Frontend_Bound","Backend_Bound","Bad_Speculation","Retiring"]
programs = df_perf["path"].unique()


for program in programs:
    df = df_perf.groupby([x_data,"path"], sort=False)[categories].median().reset_index()
    # df = df[df['version'].str.contains("2.5.6|2.7.18|3.0.1|3.4.10|3.5.10") == False]
    df = df.query('path == "'+ program +'"')
    df = df.drop('path', axis=1)
    
    if language == "python": color_list = ["DodgerBlue", "DeepSkyBlue", "OrangeRed", "Salmon", "MediumSeaGreen", "LightGreen", "SlateBlue", "Plum", "Gray", "LightGray"]
    if language == "c++": color_list = ["DodgerBlue", "DeepSkyBlue", "OrangeRed", "Salmon", "MediumSeaGreen", "LightGreen", "SlateBlue", "Plum", "Gray", "LightGray"]

    df_melted = pd.melt(df, id_vars=['version'], var_name='Perf_parameters', value_name='Value')
    fig = px.bar(df_melted,
              x = "version",
              y = "Value",
              color = "Perf_parameters",
              title="Performance of " + program + " in " + language)
    # fig.update_traces(textposition="bottom right")
    fig.show()

#### JavaScript

In [None]:
language = "js"
df_perf = from_CSVfiles("perf", norm=False)

x_data = "version"
categories = ["Frontend_Bound","Backend_Bound","Bad_Speculation","Retiring"]
programs = df_perf["path"].unique()


for program in programs:
    df = df_perf.groupby([x_data,"path"], sort=False)[categories].median().reset_index()
    # df = df[df['version'].str.contains("2.5.6|2.7.18|3.0.1|3.4.10|3.5.10") == False]
    df = df.query('path == "'+ program +'"')
    df = df.drop('path', axis=1)
    
    if language == "python": color_list = ["DodgerBlue", "DeepSkyBlue", "OrangeRed", "Salmon", "MediumSeaGreen", "LightGreen", "SlateBlue", "Plum", "Gray", "LightGray"]
    if language == "c++": color_list = ["DodgerBlue", "DeepSkyBlue", "OrangeRed", "Salmon", "MediumSeaGreen", "LightGreen", "SlateBlue", "Plum", "Gray", "LightGray"]

    df_melted = pd.melt(df, id_vars=['version'], var_name='Perf_parameters', value_name='Value')
    fig = px.bar(df_melted,
              x = "version",
              y = "Value",
              color = "Perf_parameters",
              title="Performance of " + program + " in " + language)
    # fig.update_traces(textposition="bottom right")
    fig.show()

In [None]:


df = df_perf.groupby([x_data,"path"], sort=False)[categories].median().reset_index()
# df = df[df['version'].str.contains("2.5.6|2.7.18|3.0.1|3.4.10|3.5.10") == False]
df = df.query('path == "'+ programs[0] +'"')
df = df.drop('path', axis=1)

if language == "python": color_list = ["DodgerBlue", "DeepSkyBlue", "OrangeRed", "Salmon", "MediumSeaGreen", "LightGreen", "SlateBlue", "Plum", "Gray", "LightGray"]
if language == "c++": color_list = ["DodgerBlue", "DeepSkyBlue", "OrangeRed", "Salmon", "MediumSeaGreen", "LightGreen", "SlateBlue", "Plum", "Gray", "LightGray"]

df_melted = pd.melt(df, id_vars=['version'], var_name='Perf_parameters', value_name='Value')

fig = px.bar(df_melted,
              x = "version",
              y = "Value",
              color = "Perf_parameters",
              title="Total Energy Consumption (Pkg + RAM) in " + language)
# fig.update_traces(textposition="bottom right")
fig.show()

In [None]:
language = "c++"
df_perf = from_CSVfiles("perf", norm=False)

x_data = "version"
categories = ["Frontend_Bound","Backend_Bound","Bad_Speculation","Retiring"]
programs = df_perf["path"].unique()


df = df_perf.groupby([x_data,"path"], sort=False)[categories].median().reset_index()
# df = df[df['version'].str.contains("2.5.6|2.7.18|3.0.1|3.4.10|3.5.10") == False]
df = df.query('path == "'+ programs[0] +'"')
df = df.drop('path', axis=1)

if language == "python": color_list = ["DodgerBlue", "DeepSkyBlue", "OrangeRed", "Salmon", "MediumSeaGreen", "LightGreen", "SlateBlue", "Plum", "Gray", "LightGray"]
if language == "c++": color_list = ["DodgerBlue", "DeepSkyBlue", "OrangeRed", "Salmon", "MediumSeaGreen", "LightGreen", "SlateBlue", "Plum", "Gray", "LightGray"]

df_melted = pd.melt(df, id_vars=['version'], var_name='Perf_parameters', value_name='Value')

fig = px.bar(df_melted,
              x = "version",
              y = "Value",
              color = "Perf_parameters",
              title="Total Energy Consumption (Pkg + RAM) in " + language)
# fig.update_traces(textposition="bottom right")
fig.show()

In [None]:
language = "java"
df_perf = from_CSVfiles("perf", norm=False)

x_data = "version"
categories = ["Frontend_Bound","Backend_Bound","Bad_Speculation","Retiring"]
programs = df_perf["path"].unique()


df = df_perf.groupby([x_data,"path"], sort=False)[categories].median().reset_index()
# df = df[df['version'].str.contains("2.5.6|2.7.18|3.0.1|3.4.10|3.5.10") == False]
df = df.query('path == "'+ programs[0] +'"')
df = df.drop('path', axis=1)

if language == "python": color_list = ["DodgerBlue", "DeepSkyBlue", "OrangeRed", "Salmon", "MediumSeaGreen", "LightGreen", "SlateBlue", "Plum", "Gray", "LightGray"]
if language == "c++": color_list = ["DodgerBlue", "DeepSkyBlue", "OrangeRed", "Salmon", "MediumSeaGreen", "LightGreen", "SlateBlue", "Plum", "Gray", "LightGray"]

df_melted = pd.melt(df, id_vars=['version'], var_name='Perf_parameters', value_name='Value')

fig = px.bar(df_melted,
              x = "version",
              y = "Value",
              color = "Perf_parameters",
              title="Total Energy Consumption (Pkg + RAM) in " + language)
# fig.update_traces(textposition="bottom right")
fig.show()

In [None]:
language = "js"
df_perf = from_CSVfiles("perf", norm=False)

x_data = "version"
categories = ["Frontend_Bound","Backend_Bound","Bad_Speculation","Retiring"]
programs = df_perf["path"].unique()


df = df_perf.groupby([x_data,"path"], sort=False)[categories].median().reset_index()
# df = df[df['version'].str.contains("2.5.6|2.7.18|3.0.1|3.4.10|3.5.10") == False]
df = df.query('path == "'+ programs[0] +'"')
df = df.drop('path', axis=1)

if language == "python": color_list = ["DodgerBlue", "DeepSkyBlue", "OrangeRed", "Salmon", "MediumSeaGreen", "LightGreen", "SlateBlue", "Plum", "Gray", "LightGray"]
if language == "c++": color_list = ["DodgerBlue", "DeepSkyBlue", "OrangeRed", "Salmon", "MediumSeaGreen", "LightGreen", "SlateBlue", "Plum", "Gray", "LightGray"]

df_melted = pd.melt(df, id_vars=['version'], var_name='Perf_parameters', value_name='Value')

fig = px.bar(df_melted,
              x = "version",
              y = "Value",
              color = "Perf_parameters",
              title="Total Energy Consumption (Pkg + RAM) in " + language)
# fig.update_traces(textposition="bottom right")
fig.show()