# Data Analysis

## Libraries

In [3]:
# Libraries
import re
import sys
import os
import glob
import webbrowser
import natsort
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from sklearn.preprocessing import MaxAbsScaler
from IPython.display import display, HTML
from plots_general import *

## Releases of each programming language according to their respective compilers/interpreters

In [4]:
# Directory
actual_directory = os.getcwd() + '/'  

python_releaseDates = {
    'Python 3.13.0a0': '2023-06-07',
    'Python 3.12.0b1': '2023-05-22',
    'Python 3.11.3': '2023-04-05',
    'Python 3.10.11': '2023-04-05',
    'Python 3.9.16': '2022-12-06',
    'Python 3.8.16': '2022-12-06',
    'Python 3.7.16': '2022-12-06',
    'Python 3.6.15': '2021-09-04',
    'Python 3.5.10': '2020-09-05',
    'Python 3.4.10': '2019-03-18',
    'Python 3.3.7': '2017-09-19',
    'Python 3.2.6': '2014-10-11',
    'Python 3.1.5': '2012-04-09',
    'Python 3.0.1': '2009-02-13',
    'Python 2.7.18': '2020-04-20',
    'Python 2.6.9': '2013-10-29',
    'Python 2.5.6': '2011-05-26',
}

cplusplus_releaseDates = {
    'g++-4.4 4.4.7': '2012-03-13',
    'g++-4.6 4.6.4': '2013-04-12',
    'g++-4.7 4.7.4': '2014-06-12',
    'g++-4.8 4.8.5': '2015-06-23',
    'g++-4.9 4.9.3': '2015-06-26',
    'g++-5 5.5.0': '2017-10-10',
    'g++-6 6.5.0': '2018-10-26',
    'g++-7 7.5.0': '2019-11-14',
    'g++-8 8.5.0': '2021-05-14',
    'g++-8 ': '2021-05-14',
    'g++-9 9.5.0': '2022-05-27',
    'g++-10 10.4.0': '2022-06-28',
    'g++-10 10.5.0': '2023-07-07',
    'g++-11 11.4.0': '2023-05-29',
    'g++-12 12.3.0': '2023-05-08',
    'g++-13 13.1.0': '2023-04-26',
}

java_releaseDates = {
    '1.8.0_362': '2023-04-18',
    '1.8.0_382': '2023-07-18',
    '9.0.4': '2018-01-16',
    '10.0.2': '2018-07-17',
    '11.0.19': '2020-10-20',
    '11.0.20': '2023-07-18',
    '11.0.20.1': '2023-08-22',
    '12.0.2': '2019-07-16',
    '13.0.2': '2020-01-14',
    '14.0.2': '2020-07-14',
    '15.0.2': '2021-01-19',
    '16.0.2': '2021-07-20',
    '17.0.7': '2023-04-18',
    '17.0.8': '2023-07-18',
    '17.0.8.1': '2023-08-22',
    '18.0.2-ea': '2022-07-19',
    '19.0.2': '2023-01-17',
    '20.0.2': '2023-07-18',
}

js_releaseDates = {
    '20.5.1': '2023-08-09',
    '19.9.0': '2023-04-10',
    '18.17.1': '2023-08-08',
    '17.9.1': '2022-06-01',
    '16.20.2': '2023-08-08',
    '15.14.0': '2021-04-06',
    '14.21.3': '2023-02-16',
    '13.14.0': '2020-04-29',
    '12.22.12': '2022-04-05',
    '11.15.0': '2019-04-30',
    '10.24.1': '2021-04-06',
    '9.11.2': '2018-06-12',
    '8.17.0': '2019-12-17',
    '7.10.1': '2017-07-11',
    '6.17.1': '2019-04-03',
    '5.12.0': '2016-06-23',
    '4.9.1': '2018-03-29',
    '3.3.1': '2015-09-15',
    '2.5.0': '2015-07-28',
    '1.8.4': '2015-07-09',
    '0.12.18': '2017-02-22',
    '0.10.48': '2016-10-18',
    '0.8.28': '2014-07-31'
}

## Functions for extracting information

In [5]:
# Function to get release date for a given Python version
def get_release_date(version):
    if language == 'python':
        return python_releaseDates.get(version, 'Unknown')
    elif language == 'c++':
        return cplusplus_releaseDates.get(version, 'Unknown')
    elif language == 'java':
        return java_releaseDates.get(version, 'Unknown')
    elif language == 'js':
        return js_releaseDates.get(version, 'Unknown')

# Function to convert "g", "m" or KiB to "byte"
def convert_g_to_byte(value):
    value_str = str(value)
    if value_str[:-1] == "":
        Byte_value = 0
    elif value_str[-1].lower() == 'g':
        g_value = float(value_str[:-1])
        Byte_value = g_value * 1024 * 1024 * 1024  # 1 giga = 1,000,000,000
    elif value_str[-1].lower() == 'm':
        m_value = float(value_str[:-1])
        Byte_value = m_value * 1024 * 1024 # 1 giga = 1,000,000
    elif value_str[-1].lower() == 'k':
        k_value = float(value_str[:-1])
        Byte_value = k_value * 1024 # 1 kilo = 1,000
    else:
        k_value = float(value_str)
        Byte_value = k_value * 1024 # 1 kilo = 1,000
    return int(Byte_value)

def convert_toUnit(column):
    column = column.apply(convert_g_to_byte)
    column = pd.to_numeric(column, errors='coerce')
    return column

# Define a function to remove units from a string
def remove_units(cell_value):
    # Use regular expressions to remove units
    cell_value = re.sub(r'[A-Za-z]+', '', str(cell_value))
    return cell_value
    
def Data_normalized(df, tool):
    df_data = df[['version', 'release_date', 'path', 'appplication']]
    if tool == "turbostat": 
        df_metric = df.loc[:, ~df.columns.isin(['version', 'release_date', 'path', 'appplication'])]
    elif tool == "top":
        df_metric = df[['virt', 'res', 'shr', 'percent_cpu', 'percent_mem',
                    'nTH', 'P', 'SWAP', 'CODE', 'DATA', 'nMaj',
                    'nDRT', 'USED']]

    transformer = MaxAbsScaler().fit(df_metric)
    scaled = transformer.transform(df_metric)

    df_norm = pd.DataFrame(scaled, columns=df_metric.columns)
    df = pd.concat([df_data,df_norm.reindex(df_data.index)], axis=1)
    return df

# Function to extract information in every file
def from_CSVfile(file, directory, tool):
     # Read CSV file
    df = pd.read_csv(file)
    if language == 'js': df['version'] = df['version'].str.replace('v', '')
    
    # New column 'release_date' as the second 
    df['release_date'] = df['version'].apply(get_release_date)
    df.insert(1, 'release_date', df.pop('release_date'))

    # Convert date into datetime
    df['release_date'] = pd.to_datetime(df['release_date'])

    # New column 'path' as the third
    df['path'] = directory
    df.insert(2, 'path', df.pop('path'))

    # Clean and remove the unnecessary rows
    df.replace(to_replace='-', value=0, inplace=True)

    # Apply the conversion function to the DataFrame column
    if tool == "top":
        df = df.dropna(subset=['command'])
        df['virt'] = convert_toUnit(df['virt'])
        df['res'] = convert_toUnit(df['res'])
        df['shr'] = convert_toUnit(df['shr'])
        df['CODE'] = convert_toUnit(df['CODE'])
        df['DATA'] = convert_toUnit(df['DATA'])
        df['SWAP'] = convert_toUnit(df['SWAP'])
        df['USED'] = convert_toUnit(df['USED'])
        df['nMin'] = convert_toUnit(df['nMin'])
        df['nMaj'] = convert_toUnit(df['nMaj'])

    # Changes in the 'version' column
    if language == 'python': df['version'] = df['version'].str.replace('Python ', '')
    if language == 'c++': df['version'] = df['version'].str.split().str[0]

    # with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    #     print(df)

    return df

# Extract all information from ALL files
def from_CSVfiles(tool, norm):

    all_df = pd.DataFrame()

    list_directories = next(os.walk(language))[1]
    list_directories.sort()

    for directory_name in list_directories:
        if directory_name != "waiting" and directory_name != "older" and directory_name != "test" and directory_name != "general_plots" and directory_name != "general_plots_v1" and directory_name != "mainPrograms_plots":
            
            path=language + '/' + directory_name + '/' + tool + '/'
            
            if tool == "top":

                df = pd.DataFrame()
                list_files = os.listdir(path)

                # Get list of all files only in the given directory
                list_files = natsort.natsorted(list_files)

                for file_name in list_files:
                    if file_name.startswith("temp_top_data_") and file_name.endswith('.csv'):
                        df_Top = from_CSVfile(path + file_name, directory_name, tool)
                        df = pd.concat([df, df_Top])

                # Split the your_column into minutes, seconds, and hundredths
                df[['minutes', 'seconds_hundredths']] = df['time'].str.split(':', expand=True)

                # Convert minutes, seconds, and hundredths to seconds
                df['time'] = pd.to_numeric(df['minutes']) * 60 + pd.to_numeric(df['seconds_hundredths'])

                df.to_csv(path + "top_data_allVersions.csv", index=False)
            elif tool == "turbostat":
                df = pd.DataFrame()
                list_files = os.listdir(path)

                # Get list of all files only in the given directory
                list_files = natsort.natsorted(list_files)

                for file_name in list_files:
                    if file_name.startswith("turbostat_performance_data") and file_name.endswith('.csv') and file_name != "turbostat_performance_data_allVersions.csv":
                        df_Turbo = from_CSVfile(path + file_name, directory_name, tool)
                        df = pd.concat([df, df_Turbo])
                df['Pkg+RAM_J'] = df['Pkg_J'] + df['RAM_J']
                df['Pkg_Watt'] = df['Pkg_J'] / df['time_elapsed']
                df['RAM_Watt'] = df['RAM_J'] / df['time_elapsed']
                df['Pkg+RAM_Watts'] = df['Pkg_Watt'] + df['RAM_Watt']
            elif tool == "perf": 
                # path = path + tool
                df = pd.DataFrame()
                list_files = os.listdir(path)

                # Get list of all files only in the given directory
                list_files = natsort.natsorted(list_files)
            
                for file_name in list_files:
                    if file_name.startswith("perf_performance_data") and file_name.endswith('.csv'):
                        df_Perf= from_CSVfile(path + file_name, directory_name, tool)
                        df = pd.concat([df, df_Perf])

                # Extract the units contained in the every measurement, for instance, the column 'time_elapsed'
                # contains values in msec; so this convert into secs and remove the unit "msec" to add it in the header
                parameters = ["time_elapsed","freq_cycles","cpu_clock","freq_cpu_cycles","task_clock","cpu_thermal_margin"]

                for parameter in parameters:
                    colname = df[parameter].str.extract(r'\s(.+)$').iloc[0, 0]
                    if parameter == "time_elapsed":
                        df[parameter] = df[parameter].str.replace(colname, '').astype(int)
                        df['time_elapsed'] = df['time_elapsed'] / 1000000000
                        df.rename(columns={parameter: f'{parameter}_sec'}, inplace=True)
                    else:
                        df[parameter] = df[parameter].str.replace(colname, '').astype(float)
                        df.rename(columns={parameter: f'{parameter}_{colname}'}, inplace=True)
                df = df.rename(columns={"IPC": "IPC_perf"})

                # df['TotalSlots'] = df['freq_cycles_GHz'] * 4 * 1000000000
                df['TotalSlots_Giga'] = df['cycles'] * 4
                df['FetchBubbles_Giga'] = df['TotalSlots_Giga'] * df['Frontend_Bound'] 
                df['SlotsRetired_Giga'] = df['TotalSlots_Giga'] * df['Retiring'] 
                # df['SlotsIssued+RecoveryBubbles_Giga'] = (df['TotalSlots_Giga'] * df['Bad_Speculation']) + df['SlotsRetired_Giga']
                df['SlotsIssued+RecoveryBubbles_Giga'] = df['TotalSlots_Giga'] * df['Bad_Speculation'] 
                df['SlotsBackend_Giga'] = df['TotalSlots_Giga'] * df['Backend_Bound'] 
                df.to_csv(path + "perf_data_allVersions_10times.csv", index=False)
            else:
                # df = from_CSVfile(path + '_data_allVersions.csv', directory_name, tool)
                # if norm: df = Data_normalized(df, tool)
                print("Error selecting the tool. Please check an appropriate too.")
            all_df = pd.concat([all_df, df])

    return all_df

# CREATING A GENERAL DF WITH ALL IMPORTANT PARAMETERS
def general_df(df_turbostat, df_perf, df_top):
    df1 = df_turbostat.groupby(["path","version","release_date"], sort=False)[['time_elapsed',
                        'Pkg_J','Cor_J','RAM_J','GFX_J', 'Avg_MHz', 'Busy%', 'IPC', 'IRQ', 'POLL', 'C1%','C1E%','C3%','C6%','C7s%','C8%',
                        'C9%','C10%','CPU%c1','CPU%c3','CPU%c6','CPU%c7','CoreTmp','PkgTmp','GFX%rc6','Totl%C0','Any%C0','GFX%C0','CPUGFX%']].median().reset_index()
    df2 = df_perf.groupby(["path","version","release_date"], sort=False)[['time_elapsed_sec','CPU_Utilization','Retiring','Frontend_Bound','Bad_Speculation','Backend_Bound',
                            'TotalSlots_Giga','FetchBubbles_Giga','SlotsRetired_Giga','SlotsIssued+RecoveryBubbles_Giga','SlotsBackend_Giga',
                            'CPI','ILP','IPC_perf','cycles','freq_cycles_GHz','instructions',
                            'Kernel_Utilization', 'L1D_Cache_Fill_BW', 'Turbo_Utilization', 'cycles',
                            'instructions', 'insn_per_cycle', 'cpu_clock_msec', 'no_cpus', 'cpu_cycles', 'freq_cpu_cycles_GHz',
                            'cpu_migrations','ref_cycles','bus_cycles','task_clock_msec','no_cpus_task_clock',
                            'cpu_thermal_margin_C','branches','branch_misses','mem_loads','mem_stores','page_faults','minor_faults','major_faults',
                            'cache_references','cache_misses','percent_cache_misses','L1_dcache_loads','L1_dcache_load_misses',
                            'LLC_loads','LLC_load_misses','L1_icache_load_misses','dTLB_loads',
                            'dTLB_load_misses','iTLB_loads','iTLB_load_misses']].median().reset_index()
    df3 = df_top.groupby(["path","version","release_date"], sort=False)[['time','virt','res','shr','percent_cpu','percent_mem','nTH','P',
                        'SWAP','CODE','DATA','nMaj','nDRT','USED']].median().reset_index()

    df_merged = pd.merge(df1, df2, on=["path","version","release_date"], how='left')
    df = pd.merge(df_merged, df3, on=["path","version","release_date"], how='left')


    path=language + '/'
    df.to_csv(path + "dataframe_General_medianValues.csv", index=False)

    return df


## Results

### Extract all dataset

In [64]:
language = "java"

df_turbostat = from_CSVfiles("turbostat", norm=False)
df_top = from_CSVfiles("top", norm=False)
df_perf = from_CSVfiles("perf", norm=False)
df = general_df(df_turbostat, df_perf, df_top)

In [5]:
corrs = df_perf[["time_elapsed_sec","CPU_Utilization", "CPI"]].corr()

np.fill_diagonal(corrs.values, np.nan)

corrs = corrs.head(5)

heat = go.Heatmap(z=corrs.values.round(2),
                        x=list(corrs.columns),
                        y=list(corrs.index),
                        xgap=1, ygap=1,
                        texttemplate="%{z}",
                        showscale=True,
                        colorbar_thickness=20,
                        colorbar_ticklen=3,
                        # zmax=1, zmin=0.1
                        )
layout = go.Layout(title_x=0.5, 
                        xaxis_showgrid=False,
                        yaxis_showgrid=False,
                        yaxis_autorange='reversed')
fig=go.Figure(data=[heat], layout=layout)
fig.show()

In [6]:
all_df = pd.DataFrame()

list_directories = next(os.walk(language))[1]
list_directories.sort()

if language == "python":
    nbody = "nbody_50000000_OOflag"
    binarytrees = "binaryTrees_21_original_OOflag"
elif language == "c++":
    nbody = "nbody_50000000_original_O3flag"
    binarytrees = "binaryTrees_v6_21_original_O3flag"
elif language == "java":
    nbody = "nbody_50000000"
    binarytrees = "binaryTrees_21_with_Multithreading"
elif language == "js":
    nbody = "nbody_50000000_original"
    binarytrees = "binaryTrees_original_v1"

df_filtered_nbody = df.query('path == "' + nbody + '"')
df_filtered_binarytrees = df.query('path == "' + binarytrees +'"')
df_filtered_binarytrees

Unnamed: 0,path,version,release_date,time_elapsed,Pkg_J,Cor_J,RAM_J,GFX_J,Avg_MHz,Busy%,...,percent_cpu,percent_mem,nTH,P,SWAP,CODE,DATA,nMaj,nDRT,USED


In [7]:
import pandas as pd

# Create two sample dataframes
data1 = {'ID': [1, 1, 3, 4],
         'Value': ['A', 'A', 'C', 'D']}
df1 = pd.DataFrame(data1)

data2 = {'ID': [5, 1, 2, 6],
         'Value': ['C', 'D', 'E', 'F']}
df2 = pd.DataFrame(data2)

# Find common values in the 'ID' column
common_values = pd.merge(df1, df2, on='ID', how='inner')

# Find values that do not have in common and show from which dataframe belongs
non_common_values = pd.merge(df1, df2, on='ID', how='outer', indicator=True).query('_merge != "both"').drop('_merge', axis=1)

# Add a column to indicate the source dataframe
non_common_values['Source'] = non_common_values.apply(lambda row: 'df1' if pd.isna(row['Value_y']) else 'df2', axis=1)

# Rename columns for clarity
non_common_values = non_common_values.rename(columns={'ID': 'Non_Common_ID', 'Value_x': 'Value_df1', 'Value_y': 'Value_df2'})

# Display the results
print("Non-Common Values:")
print(non_common_values)

print(common_values)


Non-Common Values:
   Non_Common_ID Value_df1 Value_df2 Source
2              3         C       NaN    df1
3              4         D       NaN    df1
4              5       NaN         C    df2
5              2       NaN         E    df2
6              6       NaN         F    df2
   ID Value_x Value_y
0   1       A       D
1   1       A       D


In [8]:
import pandas as pd

# Create two sample dataframes
data1 = {'time_elapsed': [1, 1, 3, 4],
         'Pkg_J': [10, 10, 30, 40],
         'Cor_J': [5, 5, 25, 35],
         'RAM_J': [50, 50, 70, 80],
         'GFX_J': [90, 90, 110, 120]}
df1 = pd.DataFrame(data1, index=['latency', 'latency', 'power', 'voltage'])
print(df1)

data2 = {'time_elapsed': [5, 1, 2, 6],
         'Pkg_J': [30, 40, 50, 60],
         'Cor_J': [25, 35, 45, 55],
         'RAM_J': [70, 80, 90, 100],
         'GFX_J': [110, 120, 130, 140]}
df2 = pd.DataFrame(data2, index=['current', 'speed', 'other', 'latency'])
print(df2)

# Find common values in the index
common_values = pd.merge(df1, df2, left_index=True, right_index=True, how='inner')
common_values = common_values.drop_duplicates()

# Find values that do not have in common and show from which dataframe belongs
non_common_values = pd.merge(df1, df2, left_index=True, right_index=True, how='outer', indicator=True).query('_merge != "both"').drop('_merge', axis=1)

# Add a column to indicate the source dataframe
non_common_values['Source'] = non_common_values.apply(lambda row: 'df1' if pd.isna(row['time_elapsed_y']) else 'df2', axis=1)

# Rename columns for clarity
non_common_values = non_common_values.rename(columns={
    'time_elapsed_x': 'time_elapsed_df1',
    'Pkg_J_x': 'Pkg_J_df1',
    'Cor_J_x': 'Cor_J_df1',
    'RAM_J_x': 'RAM_J_df1',
    'GFX_J_x': 'GFX_J_df1',
    'time_elapsed_y': 'time_elapsed_df2',
    'Pkg_J_y': 'Pkg_J_df2',
    'Cor_J_y': 'Cor_J_df2',
    'RAM_J_y': 'RAM_J_df2',
    'GFX_J_y': 'GFX_J_df2'
})

# Display the results
print("Common Values:")
print(common_values)

print("\nNon-Common Values:")
print(non_common_values)


         time_elapsed  Pkg_J  Cor_J  RAM_J  GFX_J
latency             1     10      5     50     90
latency             1     10      5     50     90
power               3     30     25     70    110
voltage             4     40     35     80    120
         time_elapsed  Pkg_J  Cor_J  RAM_J  GFX_J
current             5     30     25     70    110
speed               1     40     35     80    120
other               2     50     45     90    130
latency             6     60     55    100    140
Common Values:
         time_elapsed_x  Pkg_J_x  Cor_J_x  RAM_J_x  GFX_J_x  time_elapsed_y  \
latency               1       10        5       50       90               6   

         Pkg_J_y  Cor_J_y  RAM_J_y  GFX_J_y  
latency       60       55      100      140  

Non-Common Values:
         time_elapsed_df1  Pkg_J_df1  Cor_J_df1  RAM_J_df1  GFX_J_df1  \
current               NaN        NaN        NaN        NaN        NaN   
other                 NaN        NaN        NaN        NaN        Na

In [9]:
from IPython.display import display, HTML

def get_source(row):
    if row['_indicator_python_c++'] == 'both':
        if row['_indicator_java'] == 'both':
            if row['_indicator_js'] == 'both':
                return 'common'
            else:
                return 'python, c++, java'
        else:
            if row['_indicator_js'] == 'both':
                return 'python, c++, js'
            else:
                return 'python, c++'
    elif row['_indicator_python_c++'] == 'left_only':
        if row['_indicator_java'] == 'left_only':
            if row['_indicator_js'] == 'both':
                return 'python, js'
            else:
                return 'python'
        else: 
            if row['_indicator_js'] == 'both':
                return 'python, java, js'
            else:
                return 'python, java'
    elif row['_indicator_python_c++'] == 'right_only':
        if row['_indicator_java'] == 'left_only':
            if row['_indicator_js'] == 'both':
                return 'c++, js'
            else:
                return 'c++'
        else: 
            if row['_indicator_js'] == 'both':
                return 'c++, java, js'
            else:
                return 'c++, java'
    elif row['_indicator_java'] == 'both':
        return 'common'
    elif row['_indicator_js'] == 'both':
        return 'common'
    elif row['_indicator_python_c++'] == 'left_only':
        return 'python'
    elif row['_indicator_java'] == 'left_only':
        return 'c++'
    elif row['_indicator_js'] == 'left_only':
        return 'java'
    else:
        return 'js'

def df_common_and_noncommon_Parameters(df_1, df_2, df_3, df_4, correlation_type):
    df_common_values = pd.merge(pd.merge(pd.merge(df_1, df_2, left_index=True, right_index=True, how='inner', suffixes=('_python', '_c++')),
                                         df_3, left_index=True, right_index=True, how='inner', suffixes=('', '_java')),
                                df_4, left_index=True, right_index=True, how='inner', suffixes=('', '_js'))
    # df_common_values = pd.merge(df_common_values, df_3, left_index=True, right_index=True, how='inner')
    # df_common_values = df_common_values.drop_duplicates()
    # df_common_values = pd.merge(df_common_values, df_4, left_index=True, right_index=True, how='inner')
    df_common_values = df_common_values.drop_duplicates()

    # df_non_common_values = pd.merge(df_1, df_2, left_index=True, right_index=True, how='outer', indicator=True).query('_merge != "both"').drop('_merge', axis=1)
    # df_non_common_values = pd.merge(df_non_common_values, df_4, left_index=True, right_index=True, how='outer', indicator=True).query('_merge != "both"').drop('_merge', axis=1)
    df_non_common_values = pd.merge(pd.merge(pd.merge(df_1, df_2, left_index=True, right_index=True, how='outer', indicator='_indicator_python_c++', suffixes=('_python', '_c++')),
                                     df_3, left_index=True, right_index=True, how='outer', indicator='_indicator_java', suffixes=('', '_java')),
                            df_4, left_index=True, right_index=True, how='outer', indicator='_indicator_js', suffixes=('', '_js'))
    df_non_common_values = df_non_common_values.drop_duplicates()

    df_non_common_values['Source'] = df_non_common_values.apply(lambda row: get_source(row), axis=1)
    second_column = df_non_common_values.pop('Source') 
    df_non_common_values.insert(0, 'Source', second_column)
    df_non_common_values = df_non_common_values.sort_values(by=['Source'])

    df_common_values['position'] = range(1,len(df_common_values)+1)
    first_column = df_common_values.pop('position') 
    df_common_values.insert(0, 'position', first_column)

    df_non_common_values['position'] = range(1,len(df_non_common_values)+1)
    first_column = df_non_common_values.pop('position') 
    df_non_common_values.insert(0, 'position', first_column)

    return df_common_values, df_non_common_values

def correlation_ProgrammingLanguages(correlation_type):
    if correlation_type == "general":
        df_python = pd.read_csv("python" + "/correlation_general_medianValues.csv", index_col=0)
        df_cplusplus = pd.read_csv("c++" + "/correlation_general_medianValues.csv", index_col=0)
        df_java = pd.read_csv("java" + "/correlation_general_medianValues.csv", index_col=0)
        df_js = pd.read_csv("js" + "/correlation_general_medianValues.csv", index_col=0)
    elif correlation_type == "turbostat":
        df_python = pd.read_csv("python" + "/correlation_turbostat_allData.csv", index_col=0)
        df_cplusplus = pd.read_csv("c++" + "/correlation_turbostat_allData.csv", index_col=0)
        df_java = pd.read_csv("java" + "/correlation_turbostat_allData.csv", index_col=0)
        df_js = pd.read_csv("js" + "/correlation_turbostat_allData.csv", index_col=0)
    elif correlation_type == "perf":
        df_python = pd.read_csv("python" + "/correlation_perf_allData.csv", index_col=0)
        df_cplusplus = pd.read_csv("c++" + "/correlation_perf_allData.csv", index_col=0)
        df_java = pd.read_csv("java" + "/correlation_perf_allData.csv", index_col=0)
        df_js = pd.read_csv("js" + "/correlation_perf_allData.csv", index_col=0)
    elif correlation_type == "top":
        df_python = pd.read_csv("python" + "/correlation_top_allData.csv", index_col=0)
        df_cplusplus = pd.read_csv("c++" + "/correlation_top_allData.csv", index_col=0)
        df_java = pd.read_csv("java" + "/correlation_top_allData.csv", index_col=0)
        df_js = pd.read_csv("js" + "/correlation_top_allData.csv", index_col=0)

    df_python.set_index(df_python.columns[0])
    df_cplusplus.set_index(df_cplusplus.columns[0])
    df_java.set_index(df_java.columns[0])
    df_js.set_index(df_js.columns[0])

    df_common_values, df_non_common_values = df_common_and_noncommon_Parameters(df_python, df_cplusplus, df_java, df_js, correlation_type)

    return df_common_values, df_non_common_values

def correlations():
    list = ['general', 'turbostat', 'perf', 'top']

    for correlation_type in list:
        df_common_values, df_non_common_values = correlation_ProgrammingLanguages(correlation_type)
        display(HTML(df_common_values.to_html()))
        display(HTML(df_non_common_values.to_html()))
        

In [114]:
language = "js"

df_turbostat = from_CSVfiles("turbostat", norm=False)
df_top = from_CSVfiles("top", norm=False)
df_perf = from_CSVfiles("perf", norm=False)
df = general_df(df_turbostat, df_perf, df_top)
df[["freq_cycles_GHz","Retiring","Bad_Speculation","Frontend_Bound","Backend_Bound",'SlotsRetired_Giga','SlotsIssued+RecoveryBubbles_Giga','FetchBubbles_Giga','SlotsBackend_Giga','TotalSlots_Giga']]

Unnamed: 0,freq_cycles_GHz,Retiring,Bad_Speculation,Frontend_Bound,Backend_Bound,SlotsRetired_Giga,SlotsIssued+RecoveryBubbles_Giga,FetchBubbles_Giga,SlotsBackend_Giga,TotalSlots_Giga
0,3.9270,0.55,0.02,0.145,0.285,1.861826e+11,6.847942e+09,4.870742e+10,9.766773e+10,3.423971e+11
1,3.3870,0.54,0.03,0.120,0.310,2.004406e+11,1.116069e+10,4.472353e+10,1.155192e+11,3.720230e+11
2,3.3875,0.58,0.04,0.120,0.260,2.034262e+11,1.402939e+10,4.199462e+10,9.049966e+10,3.507348e+11
3,3.3300,0.57,0.04,0.140,0.250,1.779845e+11,1.249014e+10,4.352705e+10,7.839655e+10,3.122534e+11
4,3.3710,0.54,0.03,0.130,0.290,1.919394e+11,1.067929e+10,4.720758e+10,1.039677e+11,3.559764e+11
...,...,...,...,...,...,...,...,...,...,...
81,4.0725,0.53,0.00,0.035,0.430,4.469410e+10,0.000000e+00,2.931843e+09,3.592601e+10,8.389957e+10
82,4.0730,0.53,0.00,0.040,0.425,4.433534e+10,0.000000e+00,3.346063e+09,3.560565e+10,8.365158e+10
83,4.0690,0.58,0.00,0.050,0.370,4.846412e+10,0.000000e+00,4.174781e+09,3.091677e+10,8.358267e+10
84,4.0700,0.58,0.00,0.050,0.370,4.844753e+10,0.000000e+00,4.186151e+09,3.090618e+10,8.372301e+10


In [86]:
x_data = "version"
categories = ["Retiring","Bad_Speculation","Frontend_Bound","Backend_Bound"]
color_list = ["MediumSeaGreen", "OrangeRed", "SlateBlue", "DodgerBlue"]
programs = df_perf["path"].unique()

flag = True

for program in programs:
    df = df_perf.groupby([x_data,"path"], sort=False)[categories].median().reset_index()
    df_IPC = df_perf.groupby([x_data,"path"], sort=False)['IPC_perf'].median().reset_index()
    df_IPC = df_IPC.query('path == "'+ program +'"')
    df_IPC = df_IPC.drop('path', axis=1)
    
    df = df[df['version'].str.contains("2.5.6|2.7.18|3.0.1|3.4.10|3.5.10") == False]
    df = df.query('path == "'+ program +'"')
    df = df.drop('path', axis=1)

    df_melted = pd.melt(df, id_vars=['version'], var_name='Perf_parameters', value_name='Value')
    df_melted_2 = pd.melt(df_IPC, id_vars=['version'], var_name='IPC', value_name='Value')

    fig = px.bar(df_melted,
                x = "version",
                y = "Value",
                color = "Perf_parameters",
                color_discrete_sequence=color_list,
                title="Performance of " + program + " in " + language)
    fig.add_trace(
        go.Scatter(
            x=df_melted_2["version"],
            y=df_melted_2["Value"],
            mode="lines",
            yaxis="y2",
            marker=dict(color="black"),
            name="IPC"
        )
    )
    fig.update_layout(
        legend=dict(orientation="h"),
        yaxis=dict(
            title=dict(text="Total number of diners"),
            side="left",
            range=[0, 1],
        ),
        yaxis2=dict(
            title=dict(text="Total bill amount"),
            side="right",
            range=[0, 4],
            overlaying="y",
            tickmode="sync",
        ),
    )

    fig.show()

In [118]:
list=['instructions', 'SlotsRetired', 'branches', 'dTLB_loads', 'L1_dcache_loads', 'mem_stores', 'branch_misses', 'SlotsIssued+RecoveryBubbles', 'FetchBubles', 'TotalSlots', 'cpu_cycles', 'GFX%C0', 'CPUGFX%']

div_html = ''

for parameter in list:

    if parameter == 'GFX%C0' or parameter == 'CPUGFX%':
        print(parameter)
    else:
        print("-----" + parameter)

-----instructions
-----SlotsRetired
-----branches
-----dTLB_loads
-----L1_dcache_loads
-----mem_stores
-----branch_misses
-----SlotsIssued+RecoveryBubbles
-----FetchBubles
-----TotalSlots
-----cpu_cycles
GFX%C0
CPUGFX%


In [111]:
language = "js"

df_turbostat = from_CSVfiles("turbostat", norm=False)
df_top = from_CSVfiles("top", norm=False)
df_perf = from_CSVfiles("perf", norm=False)
df = general_df(df_turbostat, df_perf, df_top)

In [115]:
x_data = "version"
categories = ['SlotsRetired_Giga','SlotsIssued+RecoveryBubbles_Giga','FetchBubbles_Giga','SlotsBackend_Giga']
color_list = ["MediumSeaGreen", "OrangeRed", "SlateBlue", "DodgerBlue"]
programs = df_perf["path"].unique()

flag = True

for program in programs:
    df_plot = df_perf.groupby([x_data,"path"], sort=False)[categories].median().reset_index()
    df_TotalSlots = df_turbostat.groupby([x_data,"path"], sort=False)[['Pkg+RAM_J']].median().reset_index()
    df_TotalSlots = df_TotalSlots.query('path == "'+ program +'"')
    df_TotalSlots = df_TotalSlots.drop('path', axis=1)
    
    # df_plot = df_plot[df_plot['version'].str.contains("2.5.6|2.7.18|3.0.1|3.4.10|3.5.10") == False]
    df_plot = df_plot.query('path == "'+ program +'"')
    df_plot = df_plot.drop('path', axis=1)

    df_melted = pd.melt(df_plot, id_vars=['version'], var_name='Perf_parameters', value_name='Value')
    df_melted_2 = pd.melt(df_TotalSlots, id_vars=['version'], var_name='Pkg+RAM_J', value_name='Value')

    fig = px.bar(df_melted,
                x = "version",
                y = "Value",
                color = "Perf_parameters",
                color_discrete_sequence=color_list,
                title="Performance of " + program + " in " + language)
    fig.add_trace(
        go.Scatter(
            x=df_melted_2["version"],
            y=df_melted_2["Value"],
            mode="lines",
            yaxis="y2",
            marker=dict(color="black"),
            name="Total Slots "
        )
    )
    fig.update_layout(
        legend=dict(orientation="h"),
        yaxis=dict(
            title=dict(text="Issue-pipeline slots (10^9)"),
            side="left",
            range=[0, (df_plot['SlotsRetired_Giga'] + df_plot['SlotsIssued+RecoveryBubbles_Giga'] + df_plot['FetchBubbles_Giga'] + df_plot['SlotsBackend_Giga']).max()],
        ),
        yaxis2=dict(
            title=dict(text="Energy Consumed (Pkg + RAM) in Joules"),
            side="right",
            range=[0, df_TotalSlots['Pkg+RAM_J'].max()],
            overlaying="y",
            tickmode="sync",
        ),
    )

    fig.show()

In [None]:
 # Per versions
if language == "python":
    filter_1 = 'version == "3.11.3" or version == "3.12.0b1" or version == "3.13.0a0"'
    filter_2 = 'version != "3.11.3" and version != "3.12.0b1" and version != "3.13.0a0"'
elif language == "c++":
        text1 = "Binary Trees Version 2 of B using -O3"
        text2 = "Binary Trees Version 6 of B using -O3"
        filter_1 = 'path == "binaryTrees_v2_21_original_O3flag"'
        filter_2 = 'path == "binaryTrees_v6_21_original_O3flag"'
elif language == 'java':
    filter_1 = 'version == "1.8.0_382" or version == "9.0.4" or version == "10.0.2"'
    filter_2 = 'version != "1.8.0_382" and version != "9.0.4" and version != "10.0.22"'
elif language == 'js':
    filter_1 = 'path == "nbody_50000000_original" and (version == "6.17.1" or version == "7.10.1")'
    filter_2 = 'path == "nbody_50000000_original" and (version != "6.17.1" and version != "6.17.1")'

df_filtered_1 = df.query(filter_1)
df_filtered_2 = df.query(filter_2)

df_turbostat_filtered_1 = df_turbostat.query(filter_1)
df_turbostat_filtered_2 = df_turbostat.query(filter_2)

df_top_filtered_1 = df_top.query(filter_1)
df_top_filtered_2 = df_top.query(filter_2)

df_perf_filtered_1 = df_perf.query(filter_1)
df_perf_filtered_2 = df_perf.query(filter_2)

In [None]:




corrs = df[['time_elapsed','Pkg_J','Cor_J','GFX_J','RAM_J',
                    'Avg_MHz', 'Busy%', 'IPC', 'IRQ', 'POLL', 
                    'C1%','C1E%','C3%','C6%','C7s%','C8%','C9%','C10%',
                    'CPU%c1','CPU%c3','CPU%c6','CPU%c7','CoreTmp','PkgTmp',
                    'GFX%rc6','Totl%C0','Any%C0','GFX%C0','CPUGFX%',
                    'time','virt','res','shr','percent_cpu','percent_mem','nTH','P',
                    'SWAP','CODE','DATA','nMaj','nDRT','USED',
                    'time_elapsed_sec','CPU_Utilization','Retiring','Frontend_Bound','Bad_Speculation','Backend_Bound',
                        'CPI','ILP','IPC','cycles','freq_cycles_GHz','instructions',
                        'Kernel_Utilization', 'L1D_Cache_Fill_BW', 'Turbo_Utilization', 'cycles',
                        'instructions', 'insn_per_cycle', 'cpu_clock_msec', 'no_cpus', 'cpu_cycles', 'freq_cpu_cycles_GHz',
                        'cpu_migrations','ref_cycles','bus_cycles','task_clock_msec','no_cpus_task_clock',
                        'cpu_thermal_margin_C','branches','branch_misses','mem_loads','mem_stores','page_faults','minor_faults','major_faults',
                        'cache_references','cache_misses','percent_cache_misses','L1_dcache_loads','L1_dcache_load_misses',
                        'LLC_loads','LLC_load_misses','L1_icache_load_misses','dTLB_loads',
                        'dTLB_load_misses','iTLB_loads','iTLB_load_misses'
                        ]].corr()

np.fill_diagonal(corrs.values, np.nan)

corrs = corrs.T.head(5).T
corrs = corrs[(corrs > 0.7) | (corrs < -0.7)]
corrs = corrs.dropna(how='all')
corrs = corrs.sort_values(by=['time_elapsed', 'Pkg_J','Cor_J','RAM_J','GFX_J'], ascending=False)

positive_rows = (corrs['time_elapsed'] > 0) | (corrs['Pkg_J'] > 0) | (corrs['Cor_J'] > 0) | (corrs['RAM_J'] > 0) | (corrs['GFX_J'] > 0)
negative_rows = (corrs['time_elapsed'] < 0) | (corrs['Pkg_J'] < 0) | (corrs['Cor_J'] < 0) | (corrs['RAM_J'] < 0) | (corrs['GFX_J'] < 0)
df_negative = corrs.loc[negative_rows]
df_negative = df_negative.sort_values(by=['time_elapsed'], ascending=True)
df_positive = corrs.loc[positive_rows]

df_positive = df_positive.drop_duplicates(subset=['time_elapsed', 'Pkg_J','Cor_J','RAM_J','GFX_J'], keep='first')
df_negative = df_negative.drop_duplicates(subset=['time_elapsed', 'Pkg_J','Cor_J','RAM_J','GFX_J'], keep='first')

heat = go.Heatmap(z=corrs.values.round(2),
                        x=list(corrs.columns),
                        y=list(corrs.index),
                        xgap=1, ygap=1,
                        texttemplate="%{z}",
                        showscale=True,
                        colorbar_thickness=20,
                        colorbar_ticklen=3,
                        # zmax=1, zmin=0.1
                        )
layout = go.Layout(title_x=0.5, 
                        xaxis_showgrid=False,
                        yaxis_showgrid=False,
                        yaxis_autorange='reversed')
fig=go.Figure(data=[heat], layout=layout)
fig['layout']['xaxis']['side'] = 'top'
fig.show()

# print("\nPOSITIVE Rows:")
# print(corrs.loc[positive_rows])

# print("\nNEGATIVE Rows:")
# print(corrs.loc[negative_rows])
df_negative['position'] = range(1,len(df_negative)+1)
first_column = df_negative.pop('position') 
df_negative.insert(0, 'position', first_column) 

print(df_negative)
print(df_positive)

                     position  time_elapsed     Pkg_J     Cor_J     GFX_J  \
Avg_MHz                     1     -0.895195 -0.861044 -0.849143       NaN   
Turbo_Utilization           2     -0.867248 -0.830337 -0.817069       NaN   
freq_cpu_cycles_GHz         3     -0.843787 -0.811575 -0.798261       NaN   
freq_cycles_GHz             4     -0.840296 -0.805938 -0.792340       NaN   
GFX%C0                      5     -0.812323 -0.826370 -0.826797       NaN   
CPUGFX%                     6     -0.807455 -0.822411 -0.822985       NaN   
C8%                         7     -0.731052 -0.744442 -0.747812       NaN   
CPU%c3                      8           NaN       NaN       NaN -0.708291   

                        RAM_J  
Avg_MHz             -0.826195  
Turbo_Utilization   -0.815142  
freq_cpu_cycles_GHz -0.819100  
freq_cycles_GHz     -0.812074  
GFX%C0                    NaN  
CPUGFX%                   NaN  
C8%                       NaN  
CPU%c3                    NaN  
                  

In [None]:
type = "scattermatrixTurbo"

if type != "scattermatrixTurbo" and type != "scatterTurbo":
    print("Hello")
else:
    print("world!")

world!


### Matrix correlation

### Energy Consumption

In [7]:
language = "java"

df_turbostat = from_CSVfiles("turbostat", norm=False)
df_top = from_CSVfiles("top", norm=False)
df_perf = from_CSVfiles("perf", norm=False)
df = general_df(df_turbostat, df_perf, df_top)
df[["freq_cycles_GHz","Retiring","Bad_Speculation","Frontend_Bound","Backend_Bound",'SlotsRetired_Giga','SlotsIssued+RecoveryBubbles_Giga','FetchBubbles_Giga','SlotsBackend_Giga','TotalSlots_Giga']]

Unnamed: 0,freq_cycles_GHz,Retiring,Bad_Speculation,Frontend_Bound,Backend_Bound,SlotsRetired_Giga,SlotsIssued+RecoveryBubbles_Giga,FetchBubbles_Giga,SlotsBackend_Giga,TotalSlots_Giga
0,3.621,0.35,0.01,0.1,0.57,64129100000.0,1855511000.0,18555110000.0,105630000000.0,185551100000.0
1,3.3985,0.66,0.02,0.11,0.25,54701310000.0,1665627000.0,9160950000.0,20742970000.0,83281360000.0
2,3.461,0.66,0.02,0.11,0.24,56064720000.0,1698920000.0,9450631000.0,20522460000.0,84946020000.0
3,3.5285,0.62,0.03,0.09,0.3,42184680000.0,2018765000.0,6056294000.0,20238750000.0,67292150000.0
4,3.612,0.65,0.03,0.09,0.28,42965780000.0,2000519000.0,6195403000.0,18812310000.0,66683960000.0
5,3.555,0.66,0.03,0.12,0.27,43246490000.0,1926884000.0,7793541000.0,17560180000.0,65676710000.0
6,3.627,0.65,0.03,0.1,0.275,43329490000.0,1980722000.0,6825936000.0,18189230000.0,66669050000.0
7,3.497,0.64,0.03,0.08,0.29,40465610000.0,1851255000.0,5161240000.0,17930280000.0,62519690000.0
8,3.4345,0.62,0.03,0.14,0.275,39168160000.0,1846863000.0,8618695000.0,17079330000.0,63197750000.0
9,3.5315,0.62,0.03,0.13,0.265,41056860000.0,1976260000.0,8409561000.0,17126250000.0,65875320000.0


In [9]:
two_plots(df_turbostat, "Energy Consumption (Pkg + RAM)",
                    filename_plot="turbostat_Pkg+RAM_J", 
                    x_data="version", 
                    y_data="Pkg+RAM_J", 
                    color_data="path",
                    type="lineTurbo")

TypeError: two_plots() missing 1 required positional argument: 'title'

In [None]:
two_plots(df_turbostat, "Energy Consumption (Pkg + RAM)",
                    filename_plot="turbostat_Pkg+RAM_J", 
                    x_data="version", 
                    y_data="Pkg+RAM_Watts", 
                    color_data="path",
                    type="lineTurbo")

In [10]:
x_data = "version"
y_data = "Pkg+RAM_Watts"
color_data = "path"


df = df_turbostat.groupby([x_data,color_data], sort=False)[[y_data]].median().reset_index()
df = df[df['version'].str.contains("2.5.6|2.7.18|3.0.1|3.4.10|3.5.10") == False]

if language == "python": color_list = ["DodgerBlue", "DeepSkyBlue", "OrangeRed", "Salmon", "MediumSeaGreen", "LightGreen", "SlateBlue", "Plum", "Gray", "LightGray"]
if language == "c++": color_list = ["DodgerBlue", "DeepSkyBlue", "OrangeRed", "Salmon", "MediumSeaGreen", "LightGreen", "SlateBlue", "Plum", "Gray", "LightGray"]

fig = px.bar(df,
              x = x_data,
              y = y_data,
              color = color_data,
              title="Total Energy Consumption (Pkg + RAM) in " + language,
              barmode="group",
              color_discrete_sequence=color_list)
# fig.update_traces(textposition="bottom right")
fig.show()


NameError: name 'color_list' is not defined

In [25]:
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

x_data = "version"
y_data = "Pkg+RAM_Watts"
color_data = "path"


df = df_turbostat.groupby([x_data,color_data], sort=False)[[y_data]].median().reset_index()
df = df[df['version'].str.contains("2.5.6|2.7.18|3.0.1|3.4.10|3.5.10") == False]


# Assuming you have a DataFrame df with columns x_data, y_data, color_data, and language

# Create a subplot with multiple y-axes
fig = make_subplots(specs=[[{"secondary_y": True}] * (len(df['path'].unique()) - 1) + [{}]])

# Loop through unique values in the 'color_data' column
for color_value in df['path'].unique():
    # Filter DataFrame for each color_value
    filtered_df = df[df['path'] == color_value]
    
    # Add trace for the current color_value
    trace = px.line(filtered_df, x=x_data, y=y_data, color=color_data).update_traces(yaxis="y" + str(color_value))
    
    # Add the trace to the subplot
    fig.add_traces(trace)

# Update layout
fig.update_layout(title="Total Energy Consumption (Pkg + RAM) of " + language)

# Show the plot
fig.show()



ValueError: 
The 'specs' argument to make_subplots must be a 2D list of dictionaries with dimensions (1 x 1).
    Received value of type <class 'list'>: [[{'secondary_y': True}, {'secondary_y': True}, {}]]

In [None]:
df_perf["path"].unique()[1]

'binaryTrees_21_OOflag'

### Top Level - Performance

#### Python

In [None]:
language = "python"
df_perf = from_CSVfiles("perf", norm=False)

x_data = "version"
categories = ["Frontend_Bound","Backend_Bound","Bad_Speculation","Retiring"]
programs = df_perf["path"].unique()


for program in programs:
    print(program)
    df = df_perf.groupby([x_data,"path"], sort=False)[categories].median().reset_index()
    # df = df[df['version'].str.contains("2.5.6|2.7.18|3.0.1|3.4.10|3.5.10") == False]
    df = df.query('path == "'+ program +'"')
    df = df.drop('path', axis=1)
    
    if language == "python": color_list = ["DodgerBlue", "DeepSkyBlue", "OrangeRed", "Salmon", "MediumSeaGreen", "LightGreen", "SlateBlue", "Plum", "Gray", "LightGray"]
    if language == "c++": color_list = ["DodgerBlue", "DeepSkyBlue", "OrangeRed", "Salmon", "MediumSeaGreen", "LightGreen", "SlateBlue", "Plum", "Gray", "LightGray"]

    df_melted = pd.melt(df, id_vars=['version'], var_name='Perf_parameters', value_name='Value')
    fig = px.bar(df_melted,
              x = "version",
              y = "Value",
              color = "Perf_parameters",
              title="Performance of " + program + " in " + language)
    # fig.update_traces(textposition="bottom right")
    fig.show()


binaryTrees_21


binaryTrees_21_OOflag


binaryTrees_21_original


binaryTrees_21_original_OOflag


nbody_50000000


nbody_50000000_OOflag


#### C++

In [None]:
language = "c++"
df_perf = from_CSVfiles("perf", norm=False)

x_data = "version"
categories = ["Frontend_Bound","Backend_Bound","Bad_Speculation","Retiring"]
programs = df_perf["path"].unique()


for program in programs:
    df = df_perf.groupby([x_data,"path"], sort=False)[categories].median().reset_index()
    # df = df[df['version'].str.contains("2.5.6|2.7.18|3.0.1|3.4.10|3.5.10") == False]
    df = df.query('path == "'+ program +'"')
    df = df.drop('path', axis=1)
    
    if language == "python": color_list = ["DodgerBlue", "DeepSkyBlue", "OrangeRed", "Salmon", "MediumSeaGreen", "LightGreen", "SlateBlue", "Plum", "Gray", "LightGray"]
    if language == "c++": color_list = ["DodgerBlue", "DeepSkyBlue", "OrangeRed", "Salmon", "MediumSeaGreen", "LightGreen", "SlateBlue", "Plum", "Gray", "LightGray"]

    df_melted = pd.melt(df, id_vars=['version'], var_name='Perf_parameters', value_name='Value')
    fig = px.bar(df_melted,
              x = "version",
              y = "Value",
              color = "Perf_parameters",
              title="Performance of " + program + " in " + language)
    # fig.update_traces(textposition="bottom right")
    fig.show()

#### Java

In [None]:
language = "java"
df_perf = from_CSVfiles("perf", norm=False)

x_data = "version"
categories = ["Frontend_Bound","Backend_Bound","Bad_Speculation","Retiring"]
programs = df_perf["path"].unique()


for program in programs:
    df = df_perf.groupby([x_data,"path"], sort=False)[categories].median().reset_index()
    # df = df[df['version'].str.contains("2.5.6|2.7.18|3.0.1|3.4.10|3.5.10") == False]
    df = df.query('path == "'+ program +'"')
    df = df.drop('path', axis=1)
    
    if language == "python": color_list = ["DodgerBlue", "DeepSkyBlue", "OrangeRed", "Salmon", "MediumSeaGreen", "LightGreen", "SlateBlue", "Plum", "Gray", "LightGray"]
    if language == "c++": color_list = ["DodgerBlue", "DeepSkyBlue", "OrangeRed", "Salmon", "MediumSeaGreen", "LightGreen", "SlateBlue", "Plum", "Gray", "LightGray"]

    df_melted = pd.melt(df, id_vars=['version'], var_name='Perf_parameters', value_name='Value')
    fig = px.bar(df_melted,
              x = "version",
              y = "Value",
              color = "Perf_parameters",
              title="Performance of " + program + " in " + language)
    # fig.update_traces(textposition="bottom right")
    fig.show()

#### JavaScript

In [None]:
language = "js"
df_perf = from_CSVfiles("perf", norm=False)

x_data = "version"
categories = ["Frontend_Bound","Backend_Bound","Bad_Speculation","Retiring"]
programs = df_perf["path"].unique()


for program in programs:
    df = df_perf.groupby([x_data,"path"], sort=False)[categories].median().reset_index()
    # df = df[df['version'].str.contains("2.5.6|2.7.18|3.0.1|3.4.10|3.5.10") == False]
    df = df.query('path == "'+ program +'"')
    df = df.drop('path', axis=1)
    
    if language == "python": color_list = ["DodgerBlue", "DeepSkyBlue", "OrangeRed", "Salmon", "MediumSeaGreen", "LightGreen", "SlateBlue", "Plum", "Gray", "LightGray"]
    if language == "c++": color_list = ["DodgerBlue", "DeepSkyBlue", "OrangeRed", "Salmon", "MediumSeaGreen", "LightGreen", "SlateBlue", "Plum", "Gray", "LightGray"]

    df_melted = pd.melt(df, id_vars=['version'], var_name='Perf_parameters', value_name='Value')
    fig = px.bar(df_melted,
              x = "version",
              y = "Value",
              color = "Perf_parameters",
              title="Performance of " + program + " in " + language)
    # fig.update_traces(textposition="bottom right")
    fig.show()

In [None]:


df = df_perf.groupby([x_data,"path"], sort=False)[categories].median().reset_index()
# df = df[df['version'].str.contains("2.5.6|2.7.18|3.0.1|3.4.10|3.5.10") == False]
df = df.query('path == "'+ programs[0] +'"')
df = df.drop('path', axis=1)

if language == "python": color_list = ["DodgerBlue", "DeepSkyBlue", "OrangeRed", "Salmon", "MediumSeaGreen", "LightGreen", "SlateBlue", "Plum", "Gray", "LightGray"]
if language == "c++": color_list = ["DodgerBlue", "DeepSkyBlue", "OrangeRed", "Salmon", "MediumSeaGreen", "LightGreen", "SlateBlue", "Plum", "Gray", "LightGray"]

df_melted = pd.melt(df, id_vars=['version'], var_name='Perf_parameters', value_name='Value')

fig = px.bar(df_melted,
              x = "version",
              y = "Value",
              color = "Perf_parameters",
              title="Total Energy Consumption (Pkg + RAM) in " + language)
# fig.update_traces(textposition="bottom right")
fig.show()

In [None]:
language = "c++"
df_perf = from_CSVfiles("perf", norm=False)

x_data = "version"
categories = ["Frontend_Bound","Backend_Bound","Bad_Speculation","Retiring"]
programs = df_perf["path"].unique()


df = df_perf.groupby([x_data,"path"], sort=False)[categories].median().reset_index()
# df = df[df['version'].str.contains("2.5.6|2.7.18|3.0.1|3.4.10|3.5.10") == False]
df = df.query('path == "'+ programs[0] +'"')
df = df.drop('path', axis=1)

if language == "python": color_list = ["DodgerBlue", "DeepSkyBlue", "OrangeRed", "Salmon", "MediumSeaGreen", "LightGreen", "SlateBlue", "Plum", "Gray", "LightGray"]
if language == "c++": color_list = ["DodgerBlue", "DeepSkyBlue", "OrangeRed", "Salmon", "MediumSeaGreen", "LightGreen", "SlateBlue", "Plum", "Gray", "LightGray"]

df_melted = pd.melt(df, id_vars=['version'], var_name='Perf_parameters', value_name='Value')

fig = px.bar(df_melted,
              x = "version",
              y = "Value",
              color = "Perf_parameters",
              title="Total Energy Consumption (Pkg + RAM) in " + language)
# fig.update_traces(textposition="bottom right")
fig.show()

In [None]:
language = "java"
df_perf = from_CSVfiles("perf", norm=False)

x_data = "version"
categories = ["Frontend_Bound","Backend_Bound","Bad_Speculation","Retiring"]
programs = df_perf["path"].unique()


df = df_perf.groupby([x_data,"path"], sort=False)[categories].median().reset_index()
# df = df[df['version'].str.contains("2.5.6|2.7.18|3.0.1|3.4.10|3.5.10") == False]
df = df.query('path == "'+ programs[0] +'"')
df = df.drop('path', axis=1)

if language == "python": color_list = ["DodgerBlue", "DeepSkyBlue", "OrangeRed", "Salmon", "MediumSeaGreen", "LightGreen", "SlateBlue", "Plum", "Gray", "LightGray"]
if language == "c++": color_list = ["DodgerBlue", "DeepSkyBlue", "OrangeRed", "Salmon", "MediumSeaGreen", "LightGreen", "SlateBlue", "Plum", "Gray", "LightGray"]

df_melted = pd.melt(df, id_vars=['version'], var_name='Perf_parameters', value_name='Value')

fig = px.bar(df_melted,
              x = "version",
              y = "Value",
              color = "Perf_parameters",
              title="Total Energy Consumption (Pkg + RAM) in " + language)
# fig.update_traces(textposition="bottom right")
fig.show()

In [None]:
language = "js"
df_perf = from_CSVfiles("perf", norm=False)

x_data = "version"
categories = ["Frontend_Bound","Backend_Bound","Bad_Speculation","Retiring"]
programs = df_perf["path"].unique()


df = df_perf.groupby([x_data,"path"], sort=False)[categories].median().reset_index()
# df = df[df['version'].str.contains("2.5.6|2.7.18|3.0.1|3.4.10|3.5.10") == False]
df = df.query('path == "'+ programs[0] +'"')
df = df.drop('path', axis=1)

if language == "python": color_list = ["DodgerBlue", "DeepSkyBlue", "OrangeRed", "Salmon", "MediumSeaGreen", "LightGreen", "SlateBlue", "Plum", "Gray", "LightGray"]
if language == "c++": color_list = ["DodgerBlue", "DeepSkyBlue", "OrangeRed", "Salmon", "MediumSeaGreen", "LightGreen", "SlateBlue", "Plum", "Gray", "LightGray"]

df_melted = pd.melt(df, id_vars=['version'], var_name='Perf_parameters', value_name='Value')

fig = px.bar(df_melted,
              x = "version",
              y = "Value",
              color = "Perf_parameters",
              title="Total Energy Consumption (Pkg + RAM) in " + language)
# fig.update_traces(textposition="bottom right")
fig.show()