# Analysis of Performance and Energy Consumption

## Extract information

In [4]:
# Libraries

import sys
import os
import glob
import webbrowser
import natsort
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import MaxAbsScaler
from IPython.display import display, HTML


In [5]:
# Working directory of the notebook, always terminated by a slash
actual_directory = f"{os.getcwd()}/"

# Release date (YYYY-MM-DD) of each Python build.
# Keys are the full version strings, including the 'Python ' prefix, because
# get_release_date() is applied to the 'version' column before from_CSVfile()
# strips that prefix.
python_releaseDates = {
    'Python 3.13.0a0': '2023-06-07',
    'Python 3.12.0b1': '2023-05-22',
    'Python 3.11.3': '2023-04-05',
    'Python 3.10.11': '2023-04-05',
    'Python 3.9.16': '2022-12-06',
    'Python 3.8.16': '2022-12-06',
    'Python 3.7.16': '2022-12-06',
    'Python 3.6.15': '2021-09-04',
    'Python 3.5.10': '2020-09-05',
    'Python 3.4.10': '2019-03-18',
    'Python 3.3.7': '2017-09-19',
    'Python 3.2.6': '2014-10-11',
    'Python 3.1.5': '2012-04-09',
    'Python 3.0.1': '2009-02-13',
    'Python 2.7.18': '2020-04-20',
    'Python 2.6.9': '2013-10-29',
    'Python 2.5.6': '2011-05-26',
}

# Release date (YYYY-MM-DD) of each g++ build.
# Keys are '<compiler binary> <compiler version>'; from_CSVfile() later keeps
# only the first whitespace-separated token ('g++-N') in the 'version' column.
cplusplus_releaseDates = {
    'g++-4.4 4.4.7': '2012-03-13',
    'g++-4.6 4.6.4': '2013-04-12',
    'g++-4.7 4.7.4': '2014-06-12',
    'g++-4.8 4.8.5': '2015-06-23',
    'g++-4.9 4.9.3': '2015-06-26',
    'g++-5 5.5.0': '2017-10-10',
    'g++-6 6.5.0': '2018-10-26',
    'g++-7 7.5.0': '2019-11-14',
    'g++-8 8.5.0': '2021-05-14',
    # NOTE(review): key with nothing after the space — presumably matches rows
    # where the g++-8 version number was not captured; confirm against the CSVs.
    'g++-8 ': '2021-05-14',
    'g++-9 9.5.0': '2022-05-27',
    'g++-10 10.4.0': '2022-06-28',
    'g++-10 10.5.0': '2023-07-07',
    'g++-11 11.4.0': '2023-05-29',
    'g++-12 12.3.0': '2023-05-08',
    'g++-13 13.1.0': '2023-04-26',
}

# Release date (YYYY-MM-DD) of each Java runtime, keyed by the bare version
# string found in the 'version' column of the measurement CSVs.
java_releaseDates = {
    # NOTE(review): 2023-04-18 is the same date as 17.0.7 below; 8u362 was
    # presumably released earlier (January 2023) — confirm.
    '1.8.0_362': '2023-04-18',
    '9.0.4': '2018-01-16',
    '10.0.2': '2018-07-17',
    # NOTE(review): 2020-10-20 looks like the date of 11.0.9 rather than
    # 11.0.19 (presumably April 2023) — confirm before relying on ordering.
    '11.0.19': '2020-10-20',
    '12.0.2': '2019-07-16',
    '13.0.2': '2020-01-14',
    '14.0.2': '2020-07-14',
    '15.0.2': '2021-01-19',
    '16.0.2': '2021-07-20',
    '17.0.7': '2023-04-18',
    '18.0.2-ea': '2022-07-19',
    '19.0.2': '2023-01-17',
    '20.0.2': '2023-07-18',
}

# Release date (YYYY-MM-DD) of each JavaScript runtime version.
# Keys carry no leading 'v': from_CSVfile() removes 'v' from the 'version'
# column before the lookup when language == 'js'.
js_releaseDates = {
    '20.5.1': '2023-08-09',
    '19.9.0': '2023-04-10',
    '18.17.1': '2023-08-08',
    '17.9.1': '2022-06-01',
    '16.20.2': '2023-08-08',
    '15.14.0': '2021-04-06',
    '14.21.3': '2023-02-16',
    '13.14.0': '2020-04-29',
    '12.22.12': '2022-04-05',
    '11.15.0': '2019-04-30',
    '10.24.1': '2021-04-06',
    '9.11.2': '2018-06-12',
    '8.17.0': '2019-12-17',
    '7.10.1': '2017-07-11',
    '6.17.1': '2019-04-03',
    '5.12.0': '2016-06-23',
    '4.9.1': '2018-03-29',
    '3.3.1': '2015-09-15',
    '2.5.0': '2015-07-28',
    '1.8.4': '2015-07-09',
    '0.12.18': '2017-02-22',
    '0.10.48': '2016-10-18',
    '0.8.28': '2014-07-31'
}

# Function to get the release date of a version for the active language
def get_release_date(version):
    """Look up *version* in the release-date table of the current language.

    The table is selected by the module-level ``language`` variable
    ('python', 'c++', 'java' or 'js').

    Returns the date string stored for *version*, the string 'Unknown' when
    the version is not in the selected table, or None when ``language`` is
    none of the four supported values.
    """
    tables = {
        'python': python_releaseDates,
        'c++': cplusplus_releaseDates,
        'java': java_releaseDates,
        'js': js_releaseDates,
    }
    if language not in tables:
        return None
    return tables[language].get(version, 'Unknown')

# Multiplier to bytes for each recognised unit suffix (binary multiples).
_UNIT_FACTORS = {
    'k': 1024,
    'm': 1024 ** 2,
    'g': 1024 ** 3,
    't': 1024 ** 4,
    'p': 1024 ** 5,
    'e': 1024 ** 6,
}


# Function to convert a size with a unit suffix (or a plain KiB number) to bytes
def convert_g_to_byte(value):
    """Convert a size such as ``'1.5g'``, ``'200m'``, ``'64k'`` or ``'512'`` to bytes.

    Args:
        value: A number or string. A trailing ``k``/``m``/``g``/``t``/``p``/``e``
            (case-insensitive) selects the matching power of 1024; a value
            without a suffix is taken to be in KiB.

    Returns:
        The size in bytes as an ``int``. An empty value or a lone non-numeric
        character (for example ``'-'`` or a bare suffix) yields 0.

    Raises:
        ValueError: If the numeric part cannot be parsed as a float.
    """
    text = str(value).strip()

    # Only a lone *non-numeric* character means "no quantity". A single digit
    # such as '5' is a real value (5 KiB) and must not be collapsed to 0.
    if len(text) <= 1 and not text.isdecimal():
        return 0

    factor = _UNIT_FACTORS.get(text[-1].lower())
    if factor is None:
        # No unit suffix: the number is already expressed in KiB.
        return int(float(text) * 1024)
    return int(float(text[:-1]) * factor)

def convert_toUnit(column):
    """Return *column* with every entry converted to a numeric byte count.

    Each element is passed through ``convert_g_to_byte``; the result is then
    run through ``pd.to_numeric`` with ``errors='coerce'``, so anything that
    is still non-numeric becomes NaN.
    """
    as_bytes = column.apply(convert_g_to_byte)
    return pd.to_numeric(as_bytes, errors='coerce')
    
def Data_normalized(df, tool):
    """Scale the metric columns of *df* with ``MaxAbsScaler`` and keep the id columns.

    Args:
        df: Measurements containing the identifying columns 'version',
            'release_date', 'path' and 'appplication' plus the metric columns.
        tool: ``"turbostat"`` scales every non-identifying column;
            ``"top"`` scales a fixed list of top columns.

    Returns:
        A DataFrame with the identifying columns followed by the scaled
        metric columns, carrying the same row index as *df*.

    Raises:
        ValueError: If *tool* is neither ``"turbostat"`` nor ``"top"``.
    """
    # 'appplication' (sic) is the column name used by the measurement data.
    id_columns = ['version', 'release_date', 'path', 'appplication']

    if tool == "turbostat":
        df_metric = df.loc[:, ~df.columns.isin(id_columns)]
    elif tool == "top":
        df_metric = df[['virt', 'res', 'shr', 'percent_cpu', 'percent_mem',
                        'nTH', 'P', 'SWAP', 'CODE', 'DATA', 'nMaj',
                        'nDRT', 'USED']]
    else:
        raise ValueError(
            f"Data_normalized supports only 'turbostat' and 'top', got {tool!r}"
        )

    df_data = df[id_columns]
    scaled = MaxAbsScaler().fit_transform(df_metric)

    # Build the scaled frame directly on df's index. Creating it on a fresh
    # 0..n-1 index and reindexing afterwards yields NaN or misaligned rows
    # whenever df's index is not exactly 0..n-1 (e.g. after pd.concat).
    df_norm = pd.DataFrame(scaled, columns=df_metric.columns, index=df.index)
    return pd.concat([df_data, df_norm], axis=1)

# Function to extract information in every file
def from_CSVfile(file, directory, tool):
    """Load one measurement CSV and add release-date and path columns.

    The active language is read from the module-level ``language`` variable.

    Args:
        file: Path of the CSV file to read.
        directory: Value stored in the new 'path' column.
        tool: Name of the tool that produced the file; ``"top"`` triggers
            the size-column conversions.

    Returns:
        The cleaned DataFrame with 'release_date' as its second column and
        'path' as its third.
    """
    # Read CSV file
    df = pd.read_csv(file)
    if language == 'js':
        df['version'] = df['version'].str.replace('v', '', regex=False)

    # New column 'release_date' as the second one. The lookup runs on the raw
    # version strings, before they are shortened further down.
    # errors='coerce' turns the 'Unknown' placeholder returned for unlisted
    # versions into NaT instead of aborting the whole load.
    df['release_date'] = pd.to_datetime(
        df['version'].apply(get_release_date), errors='coerce'
    )
    df.insert(1, 'release_date', df.pop('release_date'))

    # New column 'path' as the third
    df['path'] = directory
    df.insert(2, 'path', df.pop('path'))

    # '-' placeholders count as zero
    df.replace(to_replace='-', value=0, inplace=True)

    # Apply the conversion function to the size columns
    if tool == "top":
        # Copy so the assignments below modify an independent frame rather
        # than a possible view of the pre-dropna data.
        df = df.dropna(subset=['command']).copy()
        # NOTE(review): 'nMin' and 'nMaj' go through the same size conversion
        # as the memory columns, so plain numbers are multiplied by 1024.
        # Presumably these are counts, not sizes — confirm this is intended.
        for column_name in ('virt', 'res', 'shr', 'CODE', 'DATA',
                            'SWAP', 'USED', 'nMin', 'nMaj'):
            df[column_name] = convert_toUnit(df[column_name])

    # Shorten the 'version' labels
    if language == 'python':
        df['version'] = df['version'].str.replace('Python ', '', regex=False)
    if language == 'c++':
        # Keep only the first whitespace-separated token
        df['version'] = df['version'].str.split().str[0]

    return df

def _load_matching_csvs(path, directory_name, tool, keep):
    """Parse every file in *path* accepted by *keep*, in natural sort order, as one frame."""
    frames = [
        from_CSVfile(path + file_name, directory_name, tool)
        for file_name in natsort.natsorted(os.listdir(path))
        if keep(file_name)
    ]
    # pd.concat rejects an empty list, so fall back to an empty frame.
    return pd.concat(frames) if frames else pd.DataFrame()


# Extract all information from ALL files
def from_CSVfiles(tool, norm, benchmark="binaryTrees_21_original"):
    """Load the measurements of one benchmark for the active language.

    Directories are looked up under the folder named by the module-level
    ``language`` variable.

    Args:
        tool: ``"top"`` and ``"turbostat"`` read every per-version CSV of the
            benchmark; any other tool reads a single
            ``..._data_allVersions.csv`` file.
        norm: When true, the loaded data is passed through ``Data_normalized``.
        benchmark: Name of the benchmark sub-directory to load.

    Returns:
        A DataFrame with the rows of every loaded file, or an empty DataFrame
        when no directory matches *benchmark*.
    """
    frames = []

    for directory_name in sorted(next(os.walk(language))[1]):
        if directory_name != benchmark:
            continue

        path = language + '/' + directory_name + '/' + tool + '/'
        # `extra` is a module-level suffix that must be defined before loading
        # perf data; it is not set anywhere in this part of the notebook.
        if tool == "perf":
            path = path + tool + extra

        if tool == "top":
            df = _load_matching_csvs(
                path, directory_name, tool,
                lambda name: name.startswith("temp_top_data_")
                and name.endswith('.csv'),
            )
            df.to_csv(path + "top_data_allVersions.csv", index=False)
        elif tool == "turbostat":
            df = _load_matching_csvs(
                path, directory_name, tool,
                lambda name: name.startswith("turbostat_performance_data")
                and name.endswith('.csv')
                and name != "turbostat_performance_data_allVersions.csv",
            )
        else:
            df = from_CSVfile(path + '_data_allVersions.csv', directory_name, tool)

        # Normalisation used to run only in the `else` branch, i.e. never for
        # the two tools Data_normalized supports. Apply it to whatever was
        # loaded. The index is reset first because rows stacked from several
        # files repeat the same labels, and Data_normalized needs one unique
        # label per row to line the scaled values up.
        if norm and not df.empty:
            df = Data_normalized(df.reset_index(drop=True), tool)

        frames.append(df)

    return pd.concat(frames) if frames else pd.DataFrame()

In [6]:
# Language whose measurements are analysed; the loader functions read this global
language = "c++"

# Load every turbostat run for that language, leaving the metrics un-normalised
df = from_CSVfiles(tool="turbostat", norm=False)
df

Unnamed: 0,test,release_date,path,version,appplication,time_elapsed,usec,Time_Of_Day_Seconds,APIC,X2APIC,...,Pkg%pc9,Pk%pc10,CPU%LPI,SYS%LPI,Pkg_J,Cor_J,GFX_J,RAM_J,PKG_%,RAM_%
0,1,2012-03-13,binaryTrees_21_original,g++-4.4,binaryTrees_2.c 21,19.012345,1575,1.692782e+09,0,0,...,0.0,0.0,0.0,0.0,276.73,228.10,0.26,14.03,0.0,0.0
1,2,2012-03-13,binaryTrees_21_original,g++-4.4,binaryTrees_2.c 21,19.068004,1643,1.692782e+09,0,0,...,0.0,0.0,0.0,0.0,283.23,234.26,0.28,14.00,0.0,0.0
2,3,2012-03-13,binaryTrees_21_original,g++-4.4,binaryTrees_2.c 21,19.485950,1584,1.692782e+09,0,0,...,0.0,0.0,0.0,0.0,292.54,242.32,0.27,14.19,0.0,0.0
3,4,2012-03-13,binaryTrees_21_original,g++-4.4,binaryTrees_2.c 21,19.485471,1601,1.692782e+09,0,0,...,0.0,0.0,0.0,0.0,293.64,243.25,0.28,14.27,0.0,0.0
4,5,2012-03-13,binaryTrees_21_original,g++-4.4,binaryTrees_2.c 21,19.006089,1641,1.692782e+09,0,0,...,0.0,0.0,0.0,0.0,284.07,234.74,0.27,13.97,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5,6,2023-04-26,binaryTrees_21_original,g++-13,binaryTrees_2.c 21,20.287041,1659,1.692786e+09,0,0,...,0.0,0.0,0.0,0.0,309.72,256.90,0.28,14.76,0.0,0.0
6,7,2023-04-26,binaryTrees_21_original,g++-13,binaryTrees_2.c 21,19.587371,1647,1.692786e+09,0,0,...,0.0,0.0,0.0,0.0,296.12,245.20,0.29,14.47,0.0,0.0
7,8,2023-04-26,binaryTrees_21_original,g++-13,binaryTrees_2.c 21,20.397967,1678,1.692786e+09,0,0,...,0.0,0.0,0.0,0.0,293.94,240.95,0.28,14.87,0.0,0.0
8,9,2023-04-26,binaryTrees_21_original,g++-13,binaryTrees_2.c 21,19.797566,1581,1.692786e+09,0,0,...,0.0,0.0,0.0,0.0,291.21,239.59,0.28,14.61,0.0,0.0


In [7]:
# Median Pkg_J / Cor_J per (version, path) group, keeping first-seen group order
energy_groups = df.groupby(["version", "path"], sort=False)[["Pkg_J", "Cor_J"]]
df_median = energy_groups.median().reset_index()

In [21]:
# Pkg_J against elapsed time, one colour per version
fig = px.scatter(
    data_frame=df,
    x='time_elapsed',
    y='Pkg_J',
    color='version',
)
fig.show()

In [24]:
import plotly.figure_factory as ff

In [41]:
# Pairwise scatter plots of time and energy columns, grouped by 'path',
# with histograms on the diagonal and the version passed as point text
matrix_frame = df[['time_elapsed', 'Pkg_J', 'RAM_J', 'path']]
fig = ff.create_scatterplotmatrix(
    matrix_frame,
    height=1000,
    width=1000,
    diag='histogram',
    text=df['version'],
    index='path',
)
fig.show()

In [51]:
# Pairwise correlation of the selected turbostat columns
counter_columns = [
    'Avg_MHz', 'Busy%', 'IPC', 'IRQ', 'POLL',
    'C1%', 'C1E%', 'C3%', 'C6%', 'C7s%', 'C8%', 'C9%', 'C10%',
    'CPU%c1', 'CPU%c3', 'CPU%c6', 'CPU%c7',
    'CoreTmp', 'PkgTmp',
    'GFX%rc6', 'GFXMHz', 'GFXAMHz',
    'Totl%C0', 'Any%C0', 'GFX%C0', 'CPUGFX%',
]
df[counter_columns].corr()

Unnamed: 0,Avg_MHz,Busy%,IPC,IRQ,POLL,C1%,C1E%,C3%,C6%,C7s%,...,CPU%c7,CoreTmp,PkgTmp,GFX%rc6,GFXMHz,GFXAMHz,Totl%C0,Any%C0,GFX%C0,CPUGFX%
Avg_MHz,1.0,0.949295,-0.639685,0.127118,0.664125,0.843616,0.896732,0.86304,0.562058,0.297209,...,-0.8545,-0.081253,-0.066351,0.214902,-0.068422,,0.943248,0.135033,-0.004125,0.011713
Busy%,0.949295,1.0,-0.649045,0.275152,0.656393,0.909427,0.923712,0.891452,0.663989,0.27934,...,-0.946757,0.082701,0.101449,0.227728,-0.025784,,0.998492,-0.016913,-0.055614,-0.043858
IPC,-0.639685,-0.649045,1.0,-0.301365,-0.53076,-0.634232,-0.563347,-0.531579,-0.442463,-0.222202,...,0.633362,-0.127857,-0.151094,-0.190811,-0.06936,,-0.645058,0.102184,0.240681,0.251673
IRQ,0.127118,0.275152,-0.301365,1.0,0.182587,0.230824,0.279776,0.359766,0.795369,0.070062,...,-0.555162,0.479205,0.536046,0.100105,-0.027448,,0.291727,-0.943534,-0.093802,-0.147461
POLL,0.664125,0.656393,-0.53076,0.182587,1.0,0.570681,0.681866,0.649342,0.509147,0.4491,...,-0.628899,0.015621,0.049007,0.213706,-0.032209,,0.651839,0.009984,-0.095825,-0.098435
C1%,0.843616,0.909427,-0.634232,0.230824,0.570681,1.0,0.790677,0.717365,0.50929,0.182891,...,-0.858664,0.018379,0.031918,0.306434,-0.027028,,0.909934,-0.019958,-0.170989,-0.154625
C1E%,0.896732,0.923712,-0.563347,0.279776,0.681866,0.790677,1.0,0.933318,0.687771,0.305797,...,-0.895891,0.066967,0.088624,0.236198,-0.025132,,0.921857,-0.025989,-0.047873,-0.041727
C3%,0.86304,0.891452,-0.531579,0.359766,0.649342,0.717365,0.933318,1.0,0.752257,0.317717,...,-0.884374,0.134988,0.163151,0.172945,-0.042543,,0.887375,-0.095814,-0.005547,0.00255
C6%,0.562058,0.663989,-0.442463,0.795369,0.509147,0.50929,0.687771,0.752257,1.0,0.239579,...,-0.841668,0.340964,0.395172,0.136172,-0.073616,,0.671625,-0.629711,0.012703,-0.022963
C7s%,0.297209,0.27934,-0.222202,0.070062,0.4491,0.182891,0.305797,0.317717,0.239579,1.0,...,-0.264949,-0.024045,-0.011224,0.139709,-0.050443,,0.27629,0.015901,-0.042598,-0.041842


In [55]:
# Correlation matrix of the selected turbostat columns plus the four *_J columns
heatmap_columns = [
    'Avg_MHz', 'Busy%', 'IPC', 'IRQ', 'POLL',
    'C1%', 'C1E%', 'C3%', 'C6%', 'C7s%', 'C8%', 'C9%', 'C10%',
    'CPU%c1', 'CPU%c3', 'CPU%c6', 'CPU%c7',
    'CoreTmp', 'PkgTmp',
    'GFX%rc6', 'GFXMHz', 'GFXAMHz',
    'Totl%C0', 'Any%C0', 'GFX%C0', 'CPUGFX%',
    'Pkg_J', 'Cor_J', 'GFX_J', 'RAM_J',
]
corrs = df[heatmap_columns].corr()

# Annotated heatmap of that matrix; each cell is labelled with its value
# rounded to two decimals
figure = ff.create_annotated_heatmap(
    z=corrs.values,
    x=corrs.columns.tolist(),
    y=corrs.index.tolist(),
    annotation_text=corrs.round(2).values,
    showscale=True,
    reversescale=True,
)
figure.show()