# Analysis for Performance and Energy Consumption

## Extract information

In [3]:
# Libraries

import sys
import os
import glob
import webbrowser
import natsort
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import MaxAbsScaler
from IPython.display import display, HTML


In [4]:
# Directory
actual_directory = os.getcwd() + '/'  

python_releaseDates = {
    'Python 3.13.0a0': '2023-06-07',
    'Python 3.12.0b1': '2023-05-22',
    'Python 3.11.3': '2023-04-05',
    'Python 3.10.11': '2023-04-05',
    'Python 3.9.16': '2022-12-06',
    'Python 3.8.16': '2022-12-06',
    'Python 3.7.16': '2022-12-06',
    'Python 3.6.15': '2021-09-04',
    'Python 3.5.10': '2020-09-05',
    'Python 3.4.10': '2019-03-18',
    'Python 3.3.7': '2017-09-19',
    'Python 3.2.6': '2014-10-11',
    'Python 3.1.5': '2012-04-09',
    'Python 3.0.1': '2009-02-13',
    'Python 2.7.18': '2020-04-20',
    'Python 2.6.9': '2013-10-29',
    'Python 2.5.6': '2011-05-26',
}

cplusplus_releaseDates = {
    'g++-4.4 4.4.7': '2012-03-13',
    'g++-4.6 4.6.4': '2013-04-12',
    'g++-4.7 4.7.4': '2014-06-12',
    'g++-4.8 4.8.5': '2015-06-23',
    'g++-4.9 4.9.3': '2015-06-26',
    'g++-5 5.5.0': '2017-10-10',
    'g++-6 6.5.0': '2018-10-26',
    'g++-7 7.5.0': '2019-11-14',
    'g++-8 8.5.0': '2021-05-14',
    'g++-8 ': '2021-05-14',
    'g++-9 9.5.0': '2022-05-27',
    'g++-10 10.4.0': '2022-06-28',
    'g++-10 10.5.0': '2023-07-07',
    'g++-11 11.4.0': '2023-05-29',
    'g++-12 12.3.0': '2023-05-08',
    'g++-13 13.1.0': '2023-04-26',
}

java_releaseDates = {
    '1.8.0_362': '2023-04-18',
    '9.0.4': '2018-01-16',
    '10.0.2': '2018-07-17',
    '11.0.19': '2020-10-20',
    '12.0.2': '2019-07-16',
    '13.0.2': '2020-01-14',
    '14.0.2': '2020-07-14',
    '15.0.2': '2021-01-19',
    '16.0.2': '2021-07-20',
    '17.0.7': '2023-04-18',
    '18.0.2-ea': '2022-07-19',
    '19.0.2': '2023-01-17',
    '20.0.2': '2023-07-18',
}

js_releaseDates = {
    '20.5.1': '2023-08-09',
    '19.9.0': '2023-04-10',
    '18.17.1': '2023-08-08',
    '17.9.1': '2022-06-01',
    '16.20.2': '2023-08-08',
    '15.14.0': '2021-04-06',
    '14.21.3': '2023-02-16',
    '13.14.0': '2020-04-29',
    '12.22.12': '2022-04-05',
    '11.15.0': '2019-04-30',
    '10.24.1': '2021-04-06',
    '9.11.2': '2018-06-12',
    '8.17.0': '2019-12-17',
    '7.10.1': '2017-07-11',
    '6.17.1': '2019-04-03',
    '5.12.0': '2016-06-23',
    '4.9.1': '2018-03-29',
    '3.3.1': '2015-09-15',
    '2.5.0': '2015-07-28',
    '1.8.4': '2015-07-09',
    '0.12.18': '2017-02-22',
    '0.10.48': '2016-10-18',
    '0.8.28': '2014-07-31'
}

# Function to get release date for a given Python version
def get_release_date(version):
    if language == 'python':
        return python_releaseDates.get(version, 'Unknown')
    elif language == 'c++':
        return cplusplus_releaseDates.get(version, 'Unknown')
    elif language == 'java':
        return java_releaseDates.get(version, 'Unknown')
    elif language == 'js':
        return js_releaseDates.get(version, 'Unknown')

# Function to convert "g", "m" or KiB to "byte"
def convert_g_to_byte(value):
    value_str = str(value)
    if value_str[:-1] == "":
        Byte_value = 0
    elif value_str[-1].lower() == 'g':
        g_value = float(value_str[:-1])
        Byte_value = g_value * 1024 * 1024 * 1024  # 1 giga = 1,000,000,000
    elif value_str[-1].lower() == 'm':
        m_value = float(value_str[:-1])
        Byte_value = m_value * 1024 * 1024 # 1 giga = 1,000,000
    else:
        k_value = float(value_str)
        Byte_value = k_value * 1024 # 1 kilo = 1,000
    return int(Byte_value)

def convert_toUnit(column):
    column = column.apply(convert_g_to_byte)
    column = pd.to_numeric(column, errors='coerce')
    return column
    
def Data_normalized(df, tool):
    df_data = df[['version', 'release_date', 'path', 'appplication']]
    if tool == "turbostat": 
        df_metric = df.loc[:, ~df.columns.isin(['version', 'release_date', 'path', 'appplication'])]
    elif tool == "top":
        df_metric = df[['virt', 'res', 'shr', 'percent_cpu', 'percent_mem',
                    'nTH', 'P', 'SWAP', 'CODE', 'DATA', 'nMaj',
                    'nDRT', 'USED']]

    transformer = MaxAbsScaler().fit(df_metric)
    scaled = transformer.transform(df_metric)

    df_norm = pd.DataFrame(scaled, columns=df_metric.columns)
    df = pd.concat([df_data,df_norm.reindex(df_data.index)], axis=1)
    return df

# Function to extract information in every file
def from_CSVfile(file, directory, tool):
     # Read CSV file
    df = pd.read_csv(file)
    if language == 'js': df['version'] = df['version'].str.replace('v', '')
    
    # New column 'release_date' as the second 
    df['release_date'] = df['version'].apply(get_release_date)
    df.insert(1, 'release_date', df.pop('release_date'))

    # Convert date into datetime
    df['release_date'] = pd.to_datetime(df['release_date'])

    # New column 'path' as the third
    df['path'] = directory
    df.insert(2, 'path', df.pop('path'))

    # Clean and remove the unnecessary rows
    df.replace(to_replace='-', value=0, inplace=True)

    # Apply the conversion function to the DataFrame column
    if tool == "top":
        df = df.dropna(subset=['command'])
        df['virt'] = convert_toUnit(df['virt'])
        df['res'] = convert_toUnit(df['res'])
        df['shr'] = convert_toUnit(df['shr'])
        df['CODE'] = convert_toUnit(df['CODE'])
        df['DATA'] = convert_toUnit(df['DATA'])
        df['SWAP'] = convert_toUnit(df['SWAP'])
        df['USED'] = convert_toUnit(df['USED'])

    # Changes in the 'version' column
    if language == 'python': df['version'] = df['version'].str.replace('Python ', '')
    if language == 'c++': df['version'] = df['version'].str.split().str[0]

    # with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    #     print(df)

    return df

# Extract all information from ALL files
def from_CSVfiles(tool, norm):

    all_df = pd.DataFrame()

    extra = ''
    if tool == "turbostat": extra = "_performance"

    list_directories = next(os.walk(language))[1]
    list_directories.sort()

    for directory_name in list_directories:
        if directory_name != "older" and directory_name != "test" and directory_name != "general_plots":
            path=language + '/' + directory_name + '/' + tool + '/'
            if tool != "top": path = path + tool + extra
            if tool == "top":

                df = pd.DataFrame()
                list_files = os.listdir(path)

                # Get list of all files only in the given directory
                list_files = natsort.natsorted(list_files)

                for file_name in list_files:
                    if file_name.startswith("temp_top_data_") and file_name.endswith('.csv'):
                        df_Top = from_CSVfile(path + file_name, directory_name, tool)
                        df = pd.concat([df, df_Top])

                df.to_csv(path + "top_data_allVersions.csv", index=False)
            else:
                df = from_CSVfile(path + '_data_allVersions.csv', directory_name, tool)
                if norm: df = Data_normalized(df, tool)
            all_df = pd.concat([all_df, df])

    return all_df

In [7]:
language = "js"

df = from_CSVfiles("perf", norm=False)
df

Unnamed: 0,version,release_date,path,appplication,CPU_Utilization,CPI,Retiring,Frontend_Bound,Bad_Speculation,Backend_Bound,...,percent_LLC_load_misses,L1_icache_load_misses,dTLB_loads,dTLB_load_misses,percent_dTLB_load_misses,iTLB_loads,iTLB_load_misses,percent_iTLB_load_misses,block_rq_issue,block_rq_complete
0,0.8.28,2014-07-31,nbody_50000000,nbody_6.js 50000000,0.98,0.33,0.65,0.31,0.02,0.02,...,0.54%,76630266,69674451505,899710,0.00%,429894,236485,55.01%,0,0
1,0.10.48,2016-10-18,nbody_50000000,nbody_6.js 50000000,0.99,0.33,0.65,0.3,0.03,0.03,...,1.34%,40139174,68735245598,1475756,0.00%,261148,561246,214.91%,0,0
2,0.12.18,2017-02-22,nbody_50000000,nbody_6.js 50000000,1.0,0.4,0.57,0.05,0.0,0.39,...,35.79%,558921,21030626587,338,0.00%,5252,4914,93.56%,0,0
3,1.8.4,2015-07-09,nbody_50000000,nbody_6.js 50000000,1.0,0.4,0.57,0.04,0.0,0.39,...,24.07%,565886,21118730979,94,0.00%,4224,3291,77.91%,0,0
4,2.5.0,2015-07-28,nbody_50000000,nbody_6.js 50000000,1.0,0.42,0.54,0.03,0.0,0.43,...,24.05%,594613,21084415286,101,0.00%,5738,5504,95.92%,0,0
5,3.3.1,2015-09-15,nbody_50000000,nbody_6.js 50000000,1.0,0.41,0.53,0.06,0.0,0.4,...,21.02%,435292,10963450267,10250,0.00%,54093,20468,37.84%,0,0
6,4.9.1,2018-03-29,nbody_50000000,nbody_6.js 50000000,1.0,0.41,0.53,0.08,0.0,0.4,...,15.55%,417019,10940921311,132,0.00%,2338,3421,146.32%,0,0
7,5.12.0,2016-06-23,nbody_50000000,nbody_6.js 50000000,1.0,0.41,0.53,0.06,0.0,0.4,...,32.15%,1816134,10941592609,39,0.00%,4899,3364,68.67%,0,0
8,6.17.1,2019-04-03,nbody_50000000,nbody_6.js 50000000,1.0,0.41,0.54,0.07,0.0,0.39,...,12.12%,423388,10978811091,23,0.00%,3874,3205,82.73%,0,0
9,7.10.1,2017-07-11,nbody_50000000,nbody_6.js 50000000,1.0,0.41,0.54,0.06,0.01,0.39,...,33.38%,576502,10994349440,16651,0.00%,58936,22113,37.52%,0,0


In [158]:
language = "python"

df = from_CSVfiles("perf", norm=False)
df

Unnamed: 0,version,release_date,path,appplication,CPU_Utilization,CPI,Retiring,Frontend_Bound,Bad_Speculation,Backend_Bound,...,percent_LLC_load_misses,L1_icache_load_misses,dTLB_loads,dTLB_load_misses,percent_dTLB_load_misses,iTLB_loads,iTLB_load_misses,percent_iTLB_load_misses,block_rq_issue,block_rq_complete
0,2.5.6,2011-05-26,binaryTrees_21,binarytrees_compatible_all.py 21,1.0,,,,,,...,(0.00%),<not,<not,<not,(0.00%),<not,<not,(0.00%),0,7
1,2.7.18,2020-04-20,binaryTrees_21,binarytrees_compatible_all.py 21,1.0,,,,,,...,(0.00%),<not,<not,<not,(0.00%),<not,<not,(0.00%),0,0
2,3.0.1,2009-02-13,binaryTrees_21,binarytrees_compatible_all.py 21,1.0,,,,,,...,(0.00%),<not,<not,<not,(0.00%),<not,<not,(0.00%),0,0
3,3.4.10,2019-03-18,binaryTrees_21,binarytrees_compatible_all.py 21,1.0,,,,,,...,(0.00%),<not,<not,<not,(0.00%),<not,<not,(0.00%),0,0
4,3.5.10,2020-09-05,binaryTrees_21,binarytrees_compatible_all.py 21,1.0,,,,,,...,(0.00%),<not,<not,<not,(0.00%),<not,<not,(0.00%),0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8,3.9.16,2022-12-06,nbody_50000000_OOflag,nbody.py 50000000,1.0,0.37,0.69,0.18,0.10,0.03,...,1.49%,384999620,1688075244004,10427123,0.00%,3809743,9326817,244.81%,0,0
9,3.10.11,2023-04-05,nbody_50000000_OOflag,nbody.py 50000000,1.0,0.37,0.68,0.20,0.09,0.03,...,11.52%,162406520,1607564461488,26796,0.00%,1511021,1039678,68.81%,0,0
10,3.11.3,2023-04-05,nbody_50000000_OOflag,nbody.py 50000000,1.0,0.35,0.74,0.07,0.09,0.11,...,1.70%,57304013,1035679690311,3279,0.00%,766279,59731889,7795.06%,0,0
11,3.12.0b1,2023-05-22,nbody_50000000_OOflag,nbody.py 50000000,1.0,0.37,0.66,0.15,0.10,0.09,...,1.58%,58596192,1063319634315,118254,0.00%,34202834,323741378,946.53%,0,0


In [128]:
def custom_hover(program, x, y, mean, diff, percent_diff):
    percent_difffromMean = 0 if mean == 0 else 100 * (y/mean)
    return f'Version: {x}<br>Program: {program}<br><b><i>Percentage Difference:</i> {percent_diff:.2f}%</b><br>Difference: {diff:.2f}<br><b><i>Percentage Difference from Mean:</i> {percent_difffromMean:.2f}%</b><br>Difference From Mean: {y:.2f}<br>Mean: {mean:.2f}<br>'


In [153]:
language = "c++"
x_data = "version"
y_data = "virt"
color_data = "path"
type = "barTop"
diff = True

df = from_CSVfiles("top", norm=False)


if type == "barTop": df = df.groupby([x_data,color_data], sort=False)[[y_data]].mean().reset_index()
if type == "lineTop": df = df.groupby([x_data,color_data], sort=False)[[y_data]].mean().reset_index()

mean_per_program = df.groupby(color_data)[y_data].mean()
max_per_program = df.groupby(color_data)[y_data].idxmax()
max_energy_df = df.loc[max_per_program, [color_data, x_data, y_data]]
display(max_energy_df)
min_per_program = df.groupby(color_data)[y_data].idxmin()
min_energy_df = df.loc[min_per_program, [color_data, x_data, y_data]]
display(min_energy_df)
df['DifferenceFromMean'] = df.apply(lambda row: row[y_data] - mean_per_program[row[color_data]], axis=1)

# Create grouped bar traces for each program
bar_traces = []
for program in df[color_data].unique():
    program_df = df[df[color_data] == program]
    mean = mean_per_program[program]
    program_df['Difference'] = program_df[y_data].diff()
    program_df['Percentage_Difference'] = program_df[y_data].pct_change()*100
    program_df['Percentage_DifferenceFromMean'] = program_df.apply(lambda row: 0 if (mean == 0) else 100 * (row["DifferenceFromMean"] / mean), axis=1)
    hover_texts = [custom_hover(program, x, y, mean, diff, percent_diff) for x, y, diff, percent_diff in zip(program_df[x_data], program_df['DifferenceFromMean'],program_df['Difference'],program_df['Percentage_Difference'])]
    if diff:
        min = program_df[program_df['Difference'] == program_df['Difference'].min()]
        max = program_df[program_df['Difference'] == program_df['Difference'].max()]
        bar_trace = go.Bar(x=program_df[x_data], y=program_df['Difference'],
                            name=f'{program} - Mean: { format(mean_per_program[program], ".2f")}',
                            hovertemplate=hover_texts,
                            text=program_df["Percentage_Difference"])
    else:
        min = program_df[program_df['DifferenceFromMean'] == program_df['DifferenceFromMean'].min()]
        max = program_df[program_df['DifferenceFromMean'] == program_df['DifferenceFromMean'].max()]
        bar_trace = go.Bar(x=program_df[x_data], y=program_df['DifferenceFromMean'],
                           name=f'{program} - Mean: { format(mean_per_program[program], ".2f")}',
                           hovertemplate=hover_texts,
                           text=program_df["Percentage_DifferenceFromMean"])
    
    bar_traces.append(bar_trace)
    # bar_traces.append(median_trace)

layout = go.Layout(title='Comparison of ' + y_data,
                       xaxis=dict(title=x_data),
                       yaxis=dict(title='Difference from Mean of ' + y_data, zeroline=False))
        
fig = go.Figure(data=bar_traces, layout=layout)

# for _, row in max_energy_df.iterrows():
#     fig.add_annotation(
#         text=f"Max: {row[y_data]} ({row[color_data]} {row[x_data]})",
#         x=row[x_data],
#         y=row[y_data],
#     )

fig.show()


Unnamed: 0,path,version,virt
12,binaryTrees_21,g++-12,271328051.2
15,binaryTrees_21_O3flag,g++-4.6,272122880.0
37,nbody_50000000,g++-9,6656000.0
42,nbody_50000000_O3flag,g++-4.4,6647808.0


Unnamed: 0,path,version,virt
4,binaryTrees_21,g++-4.9,270211900.0
23,binaryTrees_21_O3flag,g++-9,270876700.0
28,nbody_50000000,g++-4.4,6651904.0
45,nbody_50000000_O3flag,g++-4.8,6643712.0




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/