# Analysis for Performance and Energy Consumption

## Extract information

In [11]:
# Libraries

import sys
import os
import glob
import webbrowser
import natsort
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import MaxAbsScaler
from IPython.display import display, HTML


In [51]:
# Script-style inputs, currently commented out:
# language = sys.argv[1]
# html_filename = sys.argv[2]
# directory=sys.argv[3]

# Current working directory with a trailing '/' appended
actual_directory = f"{os.getcwd()}/"

# Release dates for the Python interpreters under test.
# Keys are the full version strings that get_release_date() receives (it is
# applied to the CSV 'version' column); values are 'YYYY-MM-DD' strings that
# are later parsed with pd.to_datetime().
python_releaseDates = {
    'Python 3.13.0a0': '2023-06-07',
    'Python 3.12.0b1': '2023-05-22',
    'Python 3.11.3': '2023-04-05',
    'Python 3.10.11': '2023-04-05',
    'Python 3.9.16': '2022-12-06',
    'Python 3.8.16': '2022-12-06',
    'Python 3.7.16': '2022-12-06',
    'Python 3.6.15': '2021-09-04',
    'Python 3.5.10': '2020-09-05',
    'Python 3.4.10': '2019-03-18',
    'Python 3.3.7': '2017-09-19',
    'Python 3.2.6': '2014-10-11',
    'Python 3.1.5': '2012-04-09',
    'Python 3.0.1': '2009-02-13',
    'Python 2.7.18': '2020-04-20',
    'Python 2.6.9': '2013-10-29',
    'Python 2.5.6': '2011-05-26',
}

# Release dates for the g++ compilers under test.
# Keys have the form '<compiler command> <version>' and must match the CSV
# 'version' column exactly; values are 'YYYY-MM-DD' strings.
cplusplus_releaseDates = {
    'g++-4.4 4.4.7': '2012-03-13',
    'g++-4.6 4.6.4': '2013-04-12',
    'g++-4.7 4.7.4': '2014-06-12',
    'g++-4.8 4.8.5': '2015-06-23',
    'g++-4.9 4.9.3': '2015-06-26',
    'g++-5 5.5.0': '2017-10-10',
    'g++-6 6.5.0': '2018-10-26',
    'g++-7 7.5.0': '2019-11-14',
    'g++-8 8.5.0': '2021-05-14',
    # Same date as 'g++-8 8.5.0'; presumably covers rows where the version
    # number after the command name is empty -- TODO confirm.
    'g++-8 ': '2021-05-14',
    'g++-9 9.5.0': '2022-05-27',
    # Two g++-10 point releases are listed, each with its own date.
    'g++-10 10.4.0': '2022-06-28',
    'g++-10 10.5.0': '2023-07-07',
    'g++-11 11.4.0': '2023-05-29',
    'g++-12 12.3.0': '2023-05-08',
    'g++-13 13.1.0': '2023-04-26',
}

# Release dates for the Java runtimes under test.
# Keys are the bare version strings looked up by get_release_date() when
# language == 'java'; values are 'YYYY-MM-DD' strings.
java_releaseDates = {
    # NOTE(review): this is the same date as 17.0.7 below; 8u362 may belong to
    # an earlier update cycle -- confirm against the vendor release notes.
    '1.8.0_362': '2023-04-18',
    '9.0.4': '2018-01-16',
    '10.0.2': '2018-07-17',
    # NOTE(review): this date falls between the 14.0.2 and 15.0.2 entries and
    # looks like it may belong to 11.0.9 rather than 11.0.19 -- confirm.
    '11.0.19': '2020-10-20',
    '12.0.2': '2019-07-16',
    '13.0.2': '2020-01-14',
    '14.0.2': '2020-07-14',
    '15.0.2': '2021-01-19',
    '16.0.2': '2021-07-20',
    '17.0.7': '2023-04-18',
    '18.0.2-ea': '2022-07-19',
    '19.0.2': '2023-01-17',
    '20.0.2': '2023-07-18',
}

# Function to get the release date of a version for the active language
def get_release_date(version):
    """Return the release date recorded for *version*.

    The lookup table is chosen by the module-level ``language`` variable
    ('python', 'c++' or 'java').  A version that is missing from the chosen
    table yields the string 'Unknown'; any other ``language`` value yields
    ``None``.
    """
    if language == 'python':
        release_table = python_releaseDates
    elif language == 'c++':
        release_table = cplusplus_releaseDates
    elif language == 'java':
        release_table = java_releaseDates
    else:
        # No table for this language: same implicit None as falling off the end
        return None
    return release_table.get(version, 'Unknown')

# Function to convert a memory value with an optional unit suffix to bytes
def convert_g_to_byte(value):
    """Convert a memory value to an integer number of bytes.

    Parameters
    ----------
    value : str, int or float
        A number optionally followed by a one-letter unit suffix
        (case-insensitive): 'k', 'm', 'g', 't' or 'p'.  A value without a
        suffix is taken to be in KiB.  All units are binary multiples
        (1 KiB = 1024 bytes, 1 MiB = 1024 ** 2 bytes, ...).

    Returns
    -------
    int
        The size in bytes, truncated towards zero.  Empty strings, missing
        values (None / NaN) and lone non-digit characters such as '-' give 0.

    Raises
    ------
    ValueError
        If the numeric part cannot be parsed as a float.
    """
    multipliers = {
        'k': 1024,
        'm': 1024 ** 2,
        'g': 1024 ** 3,
        't': 1024 ** 4,
        'p': 1024 ** 5,
    }

    text = str(value).strip().lower()

    # Missing data: nothing to convert
    if text in ('', 'nan', 'none'):
        return 0

    # A lone placeholder ('-') or a bare suffix carries no number.  A single
    # digit, however, is a real KiB value and must not be discarded.
    if len(text) == 1 and not text.isdigit():
        return 0

    suffix = text[-1]
    if suffix in multipliers:
        number, factor = text[:-1], multipliers[suffix]
    else:
        # No suffix: the value is already expressed in KiB
        number, factor = text, 1024

    return int(float(number) * factor)

def convert_toUnit(column):
    """Return *column* with every entry converted to a numeric byte count.

    Each element is passed through ``convert_g_to_byte``; the result is then
    run through ``pd.to_numeric`` with ``errors='coerce'`` so anything that is
    still non-numeric becomes NaN.
    """
    in_bytes = column.apply(convert_g_to_byte)
    return pd.to_numeric(in_bytes, errors='coerce')
    
def Data_normalized(df, tool):
    """Return *df* with its metric columns scaled by ``MaxAbsScaler``.

    The identifying columns ('version', 'release_date', 'path',
    'appplication') are kept untouched and placed first; the metric columns
    are each divided by their maximum absolute value.

    Parameters
    ----------
    df : pandas.DataFrame
        Measurements containing the identifying columns plus the metrics.
    tool : str
        "turbostat" scales every non-identifying column; "top" scales a fixed
        list of top metrics.

    Returns
    -------
    pandas.DataFrame
        Identifying columns followed by the scaled metrics, carrying the same
        row index as *df*.

    Raises
    ------
    ValueError
        If *tool* is neither "turbostat" nor "top".
    """
    id_columns = ['version', 'release_date', 'path', 'appplication']
    df_data = df[id_columns]

    if tool == "turbostat":
        df_metric = df.loc[:, ~df.columns.isin(id_columns)]
    elif tool == "top":
        df_metric = df[['virt', 'res', 'shr', 'percent_cpu', 'percent_mem',
                        'nTH', 'P', 'SWAP', 'CODE', 'DATA', 'nMaj',
                        'nDRT', 'USED']]
    else:
        raise ValueError(
            "tool must be 'turbostat' or 'top', got {!r}".format(tool))

    scaled = MaxAbsScaler().fit_transform(df_metric)

    # Build the scaled frame on df's own index.  Scaled rows are positional, so
    # giving them a fresh 0..n-1 index and reindexing by label would misalign
    # rows (and inject NaN) whenever df's index has gaps, e.g. after dropna().
    df_norm = pd.DataFrame(scaled, columns=df_metric.columns, index=df.index)
    return pd.concat([df_data, df_norm], axis=1)

# Function to extract information in every file
def from_CSVfile(file, directory, tool):
    """Load one measurement CSV and return it as a cleaned DataFrame.

    Parameters
    ----------
    file : str
        Path of the CSV file to read.
    directory : str
        Value stored in the new 'path' column (inserted at position 2).
    tool : str
        Measurement tool.  For "turbostat" and "top" a 'release_date' column
        is derived from 'version' and inserted at position 1.

    Returns
    -------
    pandas.DataFrame
        Rows with a non-null 'command', '-' cells replaced by 0, the memory
        columns converted to bytes and the 'version' column shortened
        according to the module-level ``language``.

    Notes
    -----
    Versions without a known release date get ``NaT`` in 'release_date'.
    NOTE(review): the 'command' filter and the memory-column conversion run
    for every tool, so a CSV lacking those columns raises KeyError -- confirm
    that non-top files carry them.
    """
    # Read CSV file
    df = pd.read_csv(file)

    if tool in ("turbostat", "top"):
        # New column 'release_date' as the second
        df['release_date'] = df['version'].apply(get_release_date)
        df.insert(1, 'release_date', df.pop('release_date'))

        # Convert date into datetime.  get_release_date() returns 'Unknown'
        # for unmapped versions; coerce those to NaT instead of raising.
        df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')

    # New column 'path' as the third
    df['path'] = directory
    df.insert(2, 'path', df.pop('path'))

    # Clean and remove the unnecessary rows
    df = df.dropna(subset=['command'])
    df = df.replace(to_replace='-', value=0)

    # Apply the conversion function to the memory columns
    for column in ('virt', 'res', 'shr', 'CODE', 'DATA', 'SWAP', 'USED'):
        df[column] = convert_toUnit(df[column])

    # Changes in the 'version' column
    if language == 'python':
        df['version'] = df['version'].str.replace('Python ', '')
    if language == 'c++':
        df['version'] = df['version'].str.split().str[0]

    return df

# Extract all information from ALL files
def from_CSVfiles(tool, norm):
    """Collect the measurements of *tool* from every benchmark directory.

    The sub-directories of the directory named by the module-level
    ``language`` are visited in sorted order; "older", "test" and
    "general_plots" are skipped.

    * For "top", every ``temp_top_data_*.csv`` file in
      ``<language>/<dir>/top/`` is loaded in natural sort order, the result
      is written to ``top_data_allVersions.csv`` in that directory and added
      to the output.
    * For any other tool, the single file
      ``<language>/<dir>/<tool>/<tool><extra>_data_allVersions.csv`` is
      loaded, where ``<extra>`` is "_performance" for "turbostat" and empty
      otherwise.

    Parameters
    ----------
    tool : str
        Name of the measurement tool.
    norm : bool
        When true, each per-directory frame is passed through
        ``Data_normalized``.  This is only applied for tools other than "top".

    Returns
    -------
    pandas.DataFrame
        All per-directory frames concatenated; empty if nothing was loaded.
    """
    skipped = ("older", "test", "general_plots")
    extra = "_performance" if tool == "turbostat" else ''

    list_directories = sorted(next(os.walk(language))[1])

    # Collect the frames and concatenate once: growing a DataFrame with
    # pd.concat inside the loop copies all accumulated rows on every pass.
    frames = []
    for directory_name in list_directories:
        if directory_name in skipped:
            continue

        path = language + '/' + directory_name + '/' + tool + '/'

        if tool == "top":
            # Natural sort keeps numbered capture files in numeric order
            list_files = natsort.natsorted(os.listdir(path))
            top_frames = [
                from_CSVfile(path + file_name, directory_name, tool)
                for file_name in list_files
                if file_name.startswith("temp_top_data_") and file_name.endswith('.csv')
            ]
            df = pd.concat(top_frames) if top_frames else pd.DataFrame()
            df.to_csv(path + "top_data_allVersions.csv", index=False)
        else:
            df = from_CSVfile(path + tool + extra + '_data_allVersions.csv',
                              directory_name, tool)
            if norm:
                df = Data_normalized(df, tool)

        frames.append(df)

    return pd.concat(frames) if frames else pd.DataFrame()

In [52]:
# Language whose measurement tree is analysed.  It must be set before the call
# below: get_release_date(), from_CSVfile() and from_CSVfiles() all read this
# module-level name.
language = "python"

# Load every `top` capture of every benchmark directory, without normalisation
df_top = from_CSVfiles("top", norm=False)
df_top

Unnamed: 0,version,release_date,path,appplication,no_measurement,timestamp,pid,command,time,virt,...,cpu_st,mem_total,mem_free,mem_used,mem_buff_cache,swap_total,swap_free,swap_used,swap_avail,Unnamed: 43
0,2.5.6,2011-05-26,binaryTrees_21,binarytrees_compatible_all.py 21,0,1691298597,644594,python,0:00.12,98013184,...,0.0,15795.0 MiB,10634.8 MiB,1104.0 MiB,4056.2 MiB,4096.0 MiB,2815.0 MiB,1281.0 MiB,14071.6 MiB,
1,2.5.6,2011-05-26,binaryTrees_21,binarytrees_compatible_all.py 21,1,1691298598,644594,python,0:01.29,98013184,...,0.0,15795.0 MiB,10643.4 MiB,1102.0 MiB,4049.6 MiB,4096.0 MiB,2815.0 MiB,1281.0 MiB,14080.3 MiB,
2,2.5.6,2011-05-26,binaryTrees_21,binarytrees_compatible_all.py 21,2,1691298599,644594,python,0:02.46,98013184,...,0.0,15795.0 MiB,10642.4 MiB,1103.1 MiB,4049.5 MiB,4096.0 MiB,2815.0 MiB,1281.0 MiB,14079.3 MiB,
3,2.5.6,2011-05-26,binaryTrees_21,binarytrees_compatible_all.py 21,3,1691298600,644594,python,0:03.63,98013184,...,0.0,15795.0 MiB,10642.4 MiB,1103.1 MiB,4049.4 MiB,4096.0 MiB,2815.0 MiB,1281.0 MiB,14079.3 MiB,
4,2.5.6,2011-05-26,binaryTrees_21,binarytrees_compatible_all.py 21,4,1691298601,644594,python,0:04.81,98013184,...,0.0,15795.0 MiB,10642.2 MiB,1103.4 MiB,4049.4 MiB,4096.0 MiB,2815.0 MiB,1281.0 MiB,14079.1 MiB,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
263,3.13.0a0,2023-06-07,nbody_50000000_OOflag,nbody.py 50000000,263,1691151531,579851,python,5:08.38,20090880,...,0.0,15795.0 MiB,4004.4 MiB,1972.2 MiB,9818.3 MiB,4096.0 MiB,4092.2 MiB,3.8 MiB,13033.4 MiB,
264,3.13.0a0,2023-06-07,nbody_50000000_OOflag,nbody.py 50000000,264,1691151533,579851,python,5:09.55,20090880,...,0.0,15795.0 MiB,4003.9 MiB,1972.7 MiB,9818.3 MiB,4096.0 MiB,4092.2 MiB,3.8 MiB,13032.9 MiB,
265,3.13.0a0,2023-06-07,nbody_50000000_OOflag,nbody.py 50000000,265,1691151534,579851,python,5:10.72,20090880,...,0.0,15795.0 MiB,4003.7 MiB,1973.0 MiB,9818.4 MiB,4096.0 MiB,4092.2 MiB,3.8 MiB,13032.6 MiB,
266,3.13.0a0,2023-06-07,nbody_50000000_OOflag,nbody.py 50000000,266,1691151535,579851,python,5:11.90,20090880,...,0.0,15795.0 MiB,4003.7 MiB,1973.0 MiB,9818.4 MiB,4096.0 MiB,4092.2 MiB,3.8 MiB,13032.6 MiB,
