# Research questions datasheet generator

In [1]:
import pandas as pd
import warnings
import re
warnings.filterwarnings('ignore')

In [2]:
def transform_to_list(string):
    """Return every substring of ``string`` that is wrapped in single quotes.

    The non-greedy pattern captures the text between each consecutive pair
    of single quotes, so ``"['a', 'b']"`` yields ``['a', 'b']``. A string
    without quoted parts yields an empty list.
    """
    quoted_item = re.compile("'(.*?)'")
    return quoted_item.findall(string)

def extract_mean(value):
    """Pull the number that follows ``mean`` out of a multi-line text summary.

    ``value`` is split on newlines; the first line that starts with ``mean``
    is split on whitespace and its second token is returned as a float.
    Returns ``None`` when ``value`` is not a string or when no line starts
    with ``mean``.
    """
    if not isinstance(value, str):
        return None

    for row in value.split('\n'):
        if row.startswith('mean'):
            return float(row.split()[1])

    return None

## RQ1

In [3]:
import pandas as pd
import numpy as np

# RQ1: build one row per (system, algorithm, feature-selection origin) holding
# the accuracy summary reported for each test size, then rank the rows by a
# weighted mean of the four values.
systems = ['Dune', 'LLVM','x264','BerkeleyDBC','Irzip','Polly','Hipacc','7z']

# Output column -> value of 'test_size' it is read from.
test_size_by_column = {"N=30": 0.3, "N=50": 0.5, "N=70": 0.7, "N=90": 0.9}

system_sizes = {}

# Rows are collected as plain dicts and turned into a frame once:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration.
records = []

for system in systems:
    path = f'results/{system}/{system}-results_random_t3_full_v2.csv'

    df = pd.read_csv(path)

    combined_df = df.copy()

    for algorithm in combined_df['algorithm_name'].unique():
        for sampling in combined_df['origin_ft_selection'].unique():

            temp_df = combined_df[(combined_df['algorithm_name'] == algorithm) &
                                  (combined_df['origin_ft_selection'] == sampling)]

            record = {"System": system,
                      "Algorithm": algorithm,
                      "Feature Selection": sampling,
                      "config_size": 0}
            for column, test_size in test_size_by_column.items():
                # First matching row wins; None when this test size is absent.
                matches = temp_df.loc[temp_df['test_size'] == test_size, 'accuracy']
                record[column] = matches.values[0] if len(matches) > 0 else None
            records.append(record)

new_df = pd.DataFrame(records, columns=["System","Algorithm","Feature Selection","config_size","N=30","N=50","N=70","N=90"])

# config_size is the number of rows of each system's sampled-configurations file.
for system in systems:
    path = system+"/sampledConfigurations_random_t3.csv"

    df = pd.read_csv(path)
    config_size = len(df)

    new_df.loc[new_df['System'] == system, 'config_size'] = config_size

# The raw 'accuracy' cells are text summaries: keep only the 'mean' value and
# round it to two decimals (format -> float round trip, as before).
for column in test_size_by_column:
    new_df[column] = new_df[column].apply(extract_mean).astype(float)
    new_df[column] = new_df[column].apply(lambda x: '{:.2f}'.format(x))
    new_df[column] = new_df[column].astype(float)

# Weights are test_size * config_size. config_size scales all four weights
# equally, so it does not change the average; a config_size of 0 makes
# np.average raise ZeroDivisionError.
new_df['Weighted Mean MAPE'] = new_df.apply(lambda row: np.average([row['N=30'], row['N=50'], row['N=70'], row['N=90']],
                                                                   weights=[0.3*row['config_size'], 0.5*row['config_size'], 0.7*row['config_size'], 0.9*row['config_size']]), axis=1)

# Algorithm of the single row with the lowest weighted mean across all systems.
best_algorithm = new_df.loc[new_df['Weighted Mean MAPE'].idxmin()]['Algorithm']

In [4]:
# Per system, keep the row with the lowest 'Weighted Mean MAPE' and show it
# both as a rich table and as LaTeX source.
new_df_copy = new_df.sort_values('System')  # sort_values returns a new frame

idx = new_df_copy.groupby('System')['Weighted Mean MAPE'].idxmin()

best_algorithms = (
    new_df_copy.loc[idx]
    .reset_index(drop=True)
    .drop(columns=['Weighted Mean MAPE', 'Feature Selection', 'config_size'])
)

display(best_algorithms)

best_algorithms.to_latex()


Unnamed: 0,System,Algorithm,N=30,N=50,N=70,N=90
0,7z,RandomForest,4.53,3.73,3.45,3.35
1,BerkeleyDBC,DecisionTree,5.98,1.01,0.45,0.58
2,Dune,RandomForest,6.73,5.32,4.94,4.61
3,Hipacc,RandomForest,3.59,2.18,1.66,1.54
4,Irzip,DecisionTree,12.73,13.39,11.07,2.79
5,LLVM,LinearRegression,2.65,2.88,3.18,2.88
6,Polly,RandomForest,1.7,1.24,1.17,1.06
7,x264,DecisionTree,2.13,1.4,0.57,0.18


'\\begin{tabular}{lllrrrr}\n\\toprule\n{} &       System &         Algorithm &   N=30 &   N=50 &   N=70 &  N=90 \\\\\n\\midrule\n0 &           7z &      RandomForest &   4.53 &   3.73 &   3.45 &  3.35 \\\\\n1 &  BerkeleyDBC &      DecisionTree &   5.98 &   1.01 &   0.45 &  0.58 \\\\\n2 &         Dune &      RandomForest &   6.73 &   5.32 &   4.94 &  4.61 \\\\\n3 &       Hipacc &      RandomForest &   3.59 &   2.18 &   1.66 &  1.54 \\\\\n4 &        Irzip &      DecisionTree &  12.73 &  13.39 &  11.07 &  2.79 \\\\\n5 &         LLVM &  LinearRegression &   2.65 &   2.88 &   3.18 &  2.88 \\\\\n6 &        Polly &      RandomForest &   1.70 &   1.24 &   1.17 &  1.06 \\\\\n7 &         x264 &      DecisionTree &   2.13 &   1.40 &   0.57 &  0.18 \\\\\n\\bottomrule\n\\end{tabular}\n'

In [5]:
import re
import glob

def find_maior_numero(path):
    """Return the file matching glob ``path`` that has the largest engine number.

    Only paths containing ``engine_<digits>_v3`` are considered; the digits
    are compared as integers. Returns ``None`` when the glob matches nothing
    or no matched path contains that marker.
    """
    engine_by_file = {}
    for candidate in glob.glob(path):
        found = re.search(r"engine_(\d+)_v3", candidate)
        if found:
            engine_by_file[candidate] = int(found.group(1))

    # max() returns the first of several equal maxima, which matches a scan
    # that only replaces the current best on a strictly greater number.
    return max(engine_by_file, key=engine_by_file.get, default=None)


In [6]:
# Build df_best_alg_fs: one row per (system, algorithm, feature-selection
# origin, number of features) with the mean accuracy for each test size,
# read from the highest-numbered "engine_<n>_v3" result file of each system.
systems = ['Dune', 'LLVM','x264','BerkeleyDBC','Irzip','Polly','Hipacc','7z']

required_columns = ['algorithm_name', 'origin_ft_selection', 'test_size', 'accuracy', 'num_features']
# Output column -> value of 'test_size' it is read from.
test_size_by_column = {"N=30": 0.3, "N=50": 0.5, "N=70": 0.7, "N=90": 0.9}

# Rows are collected as dicts and the frame is built once: DataFrame.append
# was removed in pandas 2.0.
records = []

for system in systems:

    path = f'results/{system}/{system}-results_with_feature_importance_random_t3_features_engine_[0-9]*.csv'
    arquivo_maior_numero = find_maior_numero(path)
    if arquivo_maior_numero is None:
        # find_maior_numero returns None when nothing matches; fail with the
        # pattern instead of handing None to read_csv.
        raise FileNotFoundError(f"No 'engine_<n>_v3' result file found for {system}: {path}")

    df_novo = pd.read_csv(arquivo_maior_numero)
    missing_columns = [column for column in required_columns if column not in df_novo.columns]
    if missing_columns:
        # Same exception type pandas would raise, but naming the offending file.
        raise KeyError(f"{arquivo_maior_numero} is missing required column(s): {missing_columns}")

    # .copy() so the assignment below does not write into a slice of the raw frame.
    df_novo = df_novo[required_columns].copy()
    df_novo['accuracy'] = df_novo['accuracy'].apply(extract_mean).astype(float)

    for algorithm in df_novo['algorithm_name'].unique():
        for sampling in df_novo['origin_ft_selection'].unique():
            for features in df_novo['num_features'].unique():
                temp_df = df_novo[(df_novo['algorithm_name'] == algorithm) &
                                  (df_novo['origin_ft_selection'] == sampling) &
                                  (df_novo['num_features'] == features)]

                record = {"System": system,
                          "Algorithm": algorithm,
                          "Feature Selection": sampling,
                          "config_size": 0,
                          "num_features": features}
                for column, test_size in test_size_by_column.items():
                    # First matching row wins; None when this test size is absent.
                    matches = temp_df.loc[temp_df['test_size'] == test_size, 'accuracy']
                    record[column] = matches.values[0] if len(matches) > 0 else None
                records.append(record)

df_best_alg_fs = pd.DataFrame(records, columns=["System","Algorithm","Feature Selection",'config_size', "num_features", "N=30","N=50","N=70","N=90"])


KeyError: "['num_features'] not in index"

In [None]:
# Inspect the per-system / per-algorithm / per-feature-count table assembled in the previous cell.
df_best_alg_fs

In [None]:
def map_percentages(group, percentages=(30, 40, 50, 60, 100)):
    """Label each distinct value of ``group`` with a percentage.

    The distinct values are sorted ascending and paired positionally with
    ``percentages`` (smallest value -> first percentage, and so on); every
    element of ``group`` is then replaced by the percentage of its value.

    Parameters
    ----------
    group : pandas.Series
        Values to label, e.g. one ``groupby`` group passed in by ``transform``.
    percentages : sequence, optional
        Labels to assign in ascending order of value. The default keeps the
        original five-step ladder.

    Returns
    -------
    pandas.Series
        Same index as ``group``. If ``group`` has more distinct values than
        there are ``percentages``, the largest values map to NaN; if it has
        fewer, only the leading percentages are used.
    """
    ordered_values = sorted(group.unique())
    value_to_percentage = dict(zip(ordered_values, percentages))
    return group.map(value_to_percentage)


In [None]:
# df_best_alg_fs_temp = df_best_alg_fs[(df_best_alg_fs['System'] == '7z') & (df_best_alg_fs['Algorithm'] == 'RandomForest') | 
#               (df_best_alg_fs['System'] == 'BerkeleyDBC') & (df_best_alg_fs['Algorithm'] == "DecisionTree") | 
#             (df_best_alg_fs['System'] == 'Dune') & (df_best_alg_fs['Algorithm'] == "RandomForest") |
#             (df_best_alg_fs['System'] == 'Hipacc') & (df_best_alg_fs['Algorithm'] == "RandomForest") |
#             (df_best_alg_fs['System'] == 'Irzip') & (df_best_alg_fs['Algorithm'] == "DecisionTree") |
#             (df_best_alg_fs['System'] == 'LLVM') & (df_best_alg_fs['Algorithm'] == "LinearRegression") |
#             (df_best_alg_fs['System'] == 'Polly') & (df_best_alg_fs['Algorithm'] == "RandomForest") | 
#             (df_best_alg_fs['System'] == 'x264') & (df_best_alg_fs['Algorithm'] == "DecisionTree")]

df_temp_all = df_best_alg_fs.copy()
df_rq1 = df_best_alg_fs[['System', 'Algorithm', 'config_size', 'N=30', 'N=50', 'N=70', 'N=90']]

# df_temp_all.sort_values(by=['System', 'Algorithm', 'num_features'], inplace=True)
# Label every feature count of a (System, Algorithm) group with its percentage
# step and keep only the rows labelled 50.
df_temp_all['Percentage'] = df_temp_all.groupby(['System', 'Algorithm'])['num_features'].transform(map_percentages)
# .copy() so the config_size assignment below writes to an independent frame
# rather than to a filtered view.
df_temp_all = df_temp_all[df_temp_all['Percentage'] == 50].copy()

# df_temp_all = df_temp_all[df_temp_all['System'] != 'Irzip']
# 'Feature Selection' and 'num_features' stay in df_temp_all: the statement
# that used to "drop" them here discarded its result, and the drops further
# down (and in later cells) remove these columns by name.

# config_size is the number of rows of each system's sampled-configurations file.
for system in systems:
    path = system+"/sampledConfigurations_random_t3.csv"

    df = pd.read_csv(path)
    config_size = len(df)

    df_temp_all.loc[df_temp_all['System'] == system, 'config_size'] = config_size

# Weights are test_size * config_size; config_size scales all four equally.
df_temp_all['Weighted Mean MAPE'] = df_temp_all.apply(lambda row: np.average([row['N=30'], row['N=50'], row['N=70'], row['N=90']],
                                                                   weights=[0.3*row['config_size'], 0.5*row['config_size'], 0.7*row['config_size'], 0.9*row['config_size']]), axis=1)

df_temp_all_copy = df_temp_all.sort_values('System')

# Per system, the row with the lowest weighted mean.
idx = df_temp_all_copy.groupby('System')['Weighted Mean MAPE'].idxmin()

best_algorithms_fs = (
    df_temp_all_copy.loc[idx]
    .reset_index(drop=True)
    .drop(columns=['Weighted Mean MAPE', 'Feature Selection', 'config_size', 'Percentage', 'num_features'])
)

display(best_algorithms_fs)

best_algorithms_fs.to_latex()

In [None]:
# Irzip / DecisionTree series: the N=90 results are stored under the column
# name 'N=70' so they can be concatenated and plotted with the other systems.
df_temp_lrzip = (
    df_best_alg_fs[['System', 'Algorithm', 'Feature Selection', 'num_features','N=90']]
    .sort_values(by=['System', 'Algorithm', 'num_features'])
    .rename(columns={'N=90': 'N=70'})
)

# .copy() so the 'Percentage' assignment writes to an independent frame.
df_temp_lrzip = df_temp_lrzip[(df_temp_lrzip['System'] == 'Irzip') & (df_temp_lrzip['Algorithm'] == 'DecisionTree')].copy()

df_temp_lrzip['Percentage'] = df_temp_lrzip.groupby(['System', 'Algorithm'])['num_features'].transform(map_percentages)
df_temp_lrzip = df_temp_lrzip.drop(columns=['Feature Selection', 'num_features'])

# df_rq1 is a column subset of df_best_alg_fs that does not carry
# 'num_features', so sorting or grouping df_rq1 by that column raises
# KeyError. Both frames share the same index, so the row order and the
# percentage labels are computed on df_best_alg_fs and aligned by index.
rq1_order = df_best_alg_fs.sort_values(by=['System', 'Algorithm', 'num_features']).index
df_rq1 = df_rq1.loc[rq1_order].copy()
df_rq1['Percentage'] = df_best_alg_fs.groupby(['System', 'Algorithm'])['num_features'].transform(map_percentages)

In [None]:
# Stack the 50%-feature rows with the Irzip series and drop the columns that
# are no longer needed.
# df_all = df_all[df_all['Percentage'] == 50]
df_all = (
    pd.concat([df_temp_all, df_temp_lrzip], ignore_index=True)
    .drop(columns=['Feature Selection', 'num_features'])
)

# Show each system name followed by the mean config_size of its rows.
for system in systems:
    display(system)
    system_rows = df_all.loc[df_all['System'] == system]
    display(system_rows['config_size'].mean())


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# One line per (System, Algorithm): MAPE (column 'N=70') against feature percentage.
plt.figure(figsize=(10, 8))

grouped = df_all.groupby(['System', 'Algorithm'])

for name, group in grouped:
    # Every group plots the 'N=70' column; only the legend text differs for Irzip.
    legend_text = f'{name} N=90' if name[0] == 'Irzip' else f'{name} N=70'
    plt.plot(group['Percentage'], group['N=70'], marker='o', linestyle='-', label=legend_text)

plt.title('MAPE per Feature Percentage for each System and Algorithm')
plt.xlabel('Feature Percentage (%)')
plt.ylabel('MAPE')
plt.legend()
plt.show()


In [None]:
# Inspect df_rq1 (the commented lines below would filter it to 50% and export LaTeX).
df_rq1 

# df_rq1 = df_rq1[df_rq1['Percentage'] == 50]
# df_rq1.drop(columns=['Percentage', 'num_features'], inplace=True)
# df_rq1.to_latex()

In [None]:
# Second inspection of df_rq1; this repeats the display of the cell before it.
df_rq1

In [None]:
import pandas as pd
import numpy as np

# Same table as the RQ1 cell, but read from each system's feature-importance
# results (highest-numbered "engine_<n>_v3" file).
systems = ['Dune', 'LLVM','x264','BerkeleyDBC','Irzip','Polly','Hipacc','7z']

# Output column -> value of 'test_size' it is read from.
test_size_by_column = {"N=30": 0.3, "N=50": 0.5, "N=70": 0.7, "N=90": 0.9}

system_sizes = {}

# Rows are collected as dicts and the frame is built once: DataFrame.append
# was removed in pandas 2.0.
records = []

for system in systems:
    # Use the loop's own system (the path used to be pinned to systems[2], so
    # every row was labelled with its system but filled with x264 data), and
    # resolve the glob to a real file: read_csv does not expand wildcards.
    path = f'results/{system}/{system}-results_with_feature_importance_random_t3_features_engine_[0-9]*.csv'
    results_file = find_maior_numero(path)
    if results_file is None:
        raise FileNotFoundError(f"No 'engine_<n>_v3' result file found for {system}: {path}")

    df = pd.read_csv(results_file)

    combined_df = df.copy()

    for algorithm in combined_df['algorithm_name'].unique():
        for sampling in combined_df['origin_ft_selection'].unique():

            temp_df = combined_df[(combined_df['algorithm_name'] == algorithm) &
                                  (combined_df['origin_ft_selection'] == sampling)]

            record = {"System": system,
                      "Algorithm": algorithm,
                      "Feature Selection": sampling,
                      "config_size": 0}
            for column, test_size in test_size_by_column.items():
                # First matching row wins; None when this test size is absent.
                matches = temp_df.loc[temp_df['test_size'] == test_size, 'accuracy']
                record[column] = matches.values[0] if len(matches) > 0 else None
            records.append(record)

new_df = pd.DataFrame(records, columns=["System","Algorithm","Feature Selection","config_size","N=30","N=50","N=70","N=90"])

# config_size is the number of rows of each system's sampled-configurations file.
for system in systems:
    path = system+"/sampledConfigurations_random_t3.csv"

    df = pd.read_csv(path)
    config_size = len(df)

    new_df.loc[new_df['System'] == system, 'config_size'] = config_size

# Keep only the 'mean' value of each text summary, rounded to two decimals
# (format -> float round trip, as before).
for column in test_size_by_column:
    new_df[column] = new_df[column].apply(extract_mean).astype(float)
    new_df[column] = new_df[column].apply(lambda x: '{:.2f}'.format(x))
    new_df[column] = new_df[column].astype(float)

# Weights are test_size * config_size; config_size scales all four equally.
new_df['Weighted Mean MAPE'] = new_df.apply(lambda row: np.average([row['N=30'], row['N=50'], row['N=70'], row['N=90']],
                                                                   weights=[0.3*row['config_size'], 0.5*row['config_size'], 0.7*row['config_size'], 0.9*row['config_size']]), axis=1)

# Algorithm of the single row with the lowest weighted mean across all systems.
best_algorithm = new_df.loc[new_df['Weighted Mean MAPE'].idxmin()]['Algorithm']

new_df

In [None]:
# TODO: apply this to the number of features

import matplotlib.pyplot as plt

# systems = ['Dune', 'LLVM','x264','BerkeleyDBC','Irzip','Polly','Hipacc','7z']
systems = ['x264']

# Reshape to long form: one row per (id columns, N=<k> column) with the value
# in 'MAPE'. df_best_alg_fs is built without a 'Weighted Mean MAPE' column and
# melt raises KeyError for id_vars that do not exist, so only the id columns
# actually present are passed.
id_columns = ['System', 'Algorithm', 'Feature Selection', 'config_size', 'Weighted Mean MAPE']
df_melt = df_best_alg_fs.melt(id_vars=[column for column in id_columns if column in df_best_alg_fs.columns],
                  value_vars=['N=30', 'N=50', 'N=70', 'N=90'],
                  var_name='Feature Number',
                  value_name='MAPE')

# 'N=30' -> 30, etc.
df_melt['Feature Number'] = df_melt['Feature Number'].str.replace('N=', '').astype(int)

# Average only the MAPE column: a frame-wide .mean() also visits the string
# id columns, which raises TypeError on pandas >= 2.0.
grouped = df_melt.groupby(['System', 'Feature Number'])['MAPE'].mean().reset_index()

for system in grouped['System'].unique():
    subset = grouped[grouped['System'] == system]
    plt.plot(subset['Feature Number'], subset['MAPE'], marker='o', label=system)

plt.legend()
plt.xlabel('Number of Features')
plt.ylabel('MAPE')
plt.title('MAPE for Different Feature Numbers and Systems')
plt.grid(True)
plt.show()


In [None]:
import pandas as pd
import glob

# Load one result file for a single, hand-picked system and tag its rows.
systems = ['Dune', 'LLVM','x264','BerkeleyDBC','Irzip','Polly','Hipacc','7z']
samples_config = ['distBased', 'divDistBased','henard', 'random', 'solverBased', 'twise']
# Number of 60% of features in order of above systems list: 'Dune', 'LLVM','x264','BerkeleyDBC','Irzip','Polly','Hipacc','7z' 
max_features = [17,7,9,11,12,20,31,25]

system_name = 'Dune'
num_features = '17'
ft = False

combined_df = pd.DataFrame()

# ft selects the feature-importance results for num_features; otherwise the
# full-feature results are read.
file_pattern = (
    "results/"+system_name+"/"+system_name+"-results_with_feature_importance_random_t3_features_engine_"+num_features+"_v2.csv"
    if ft
    else "results/"+system_name+"/"+system_name+"-results_random_t3_full_v2.csv"
)

df = pd.read_csv(file_pattern, sep=',')
display(df)

# The tags are added after the display, so the table shown above is the raw file.
df['system'] = system_name
df['ft_selection_technique'] = 'random'
combined_df = df.copy()

#for sample in samples_config:
    #file_pattern = f'results/{systems[2]}/{systems[2]}-results_with_feature_importance_{sample}_t3_features_engine_[0-9]*.csv'

    #file_pattern = f'results/{systems[0]}/{systems[0]}-results_with_feature_importance_{sample}_t3_features_engine_11.csv'

#     file_pattern = f'results/bkp-with_dummy/{system}/{system}-results_with_feature_importance_{sample}_t3_features_engine_*.csv'
#     file_pattern = f'results/{system}/{system}-results_with_feature_importance_{sample}_t3_features_engine_*.csv'

#    file_list = glob.glob(file_pattern)
#    for file in file_list:
#        df = pd.read_csv(file, sep=',')
#        print(file)
#        display(df)
#        number = file.rsplit('_', 1)[-1].split('.')[0]
#        #print(len(combined_df))
#        combined_df = pd.concat([combined_df, df])
        


#combined_df.reset_index(drop=True, inplace=True)
#combined_df.drop(columns=["Unnamed: 0"], inplace=True)


# RQ2

In [None]:
import pandas as pd
import glob


def _getDataBase(system, sample, ft=False, num_features=0):
    combined_df = pd.DataFrame()
    if ft:
        file_pattern = f"results/{system}/{system}-results_with_feature_importance_{sample}_t3_features_engine_{num_features}_v3.csv"
    else:
        file_pattern = f"results/{system}/{system}-results_{sample}_t3_full_v2.csv"
    
    df = pd.read_csv(file_pattern, sep=',')
    df['technique'] = technique
    df['system'] = system
    df['ft_selection_technique'] = sample
    combined_df = df.copy()
    
    return combined_df

    

In [None]:
def _getNumFeatures(system):
    if system == '7z': 
        num_features = [12,16,20,25]
    elif system == 'BerkeleyDBC':
        num_features = [5,7,9,11]
    elif system == 'Dune':
        num_features = [8,11,14,17]
    elif system == 'Hipacc':
        num_features = [16,21,26,31]
    elif system == 'Irzip':
        num_features = [6,8,10,12]
    elif system == 'LLVM':
        num_features = [4,5,6,7]
    elif system == 'Polly':
        num_features = [10,13,16,20]
    elif system == 'x264':
        num_features = [4,6,8,9]
        
    return num_features

In [None]:
# Quick check of the feature-count lookup for one system.
print(_getNumFeatures('7z'))

In [None]:
# RQ2: for every system, merge the feature-importance and full-feature results
# of all sampling strategies and write them to
# results/<system>/<system>_summary_results_2.csv.
systems = ['Dune', 'LLVM','x264','BerkeleyDBC','Irzip','Polly','Hipacc','7z']
samples_config = ['distBased', 'divDistBased','henard', 'random', 'solverBased', 'twise']

new_df_2 = pd.DataFrame(columns=["System","Algorithm","Feature Selection","Technique","N=30","N=50","N=70","N=90"])

summary_columns = ['system', 'algorithm_name', 'test_size', 'num_features', 'accuracy', 'sample']

for system in systems:
    num_features = _getNumFeatures(system)

    # Loop-invariant per system: the full-feature rows get the column count of
    # the solverBased configuration file minus one (presumably one column is
    # not a feature - TODO confirm). Read it once instead of once per sample.
    df_columns = pd.read_csv(f"{system}/sampledConfigurations_solverBased_t3.csv", sep=";")
    full_num_features = len(df_columns.columns) - 1

    # Collect the frames and concatenate once; concatenating inside the loop
    # copies everything gathered so far on each iteration.
    frames = []

    for sample in samples_config:
        for ft in [True, False]:
            if ft:
                path = f'results/{system}/{system}-results_with_feature_importance_{sample}_t3_features_engine_[0-9]*.csv'
                arquivo_maior_numero = find_maior_numero(path)
                if arquivo_maior_numero is None:
                    # find_maior_numero returns None when nothing matches.
                    raise FileNotFoundError(f"No 'engine_<n>_v3' result file found for {system}/{sample}: {path}")
                df = pd.read_csv(arquivo_maior_numero)
            else:
                path = f"results/{system}/{system}-results_{sample}_t3_full_v2.csv"
                df = pd.read_csv(path)
                df['num_features'] = full_num_features
            df['sample'] = sample
            df['system'] = system
            frames.append(df)

    csv_path = 'results/'+system+'/'+system+'_summary_results_2.csv'
    new_df = pd.concat(frames)[summary_columns]
    # NOTE(review): 'sample' is not part of the duplicate key, so identical
    # rows coming from different sampling strategies collapse into the first
    # one - confirm this is intended.
    new_df = new_df.drop_duplicates(subset=['system', 'algorithm_name', 'test_size', 'num_features', 'accuracy']).reset_index(drop=True)
    new_df['accuracy'] = new_df['accuracy'].apply(extract_mean).astype(float).round(2)
    new_df.to_csv(csv_path)


# new_df.drop(columns=['ft_dummyfication', 'Unnamed: 0.1', 'ft_selection', 'hyperparameters', 'size_target', 'coef_order', 'Unnamed: 0'])
# new_df['Percentage'] = df_temp_all.groupby(['algorithm_name', 'Algorithm', 'Technique'])['num_features'].transform(map_percentages)

# Only the summary of the last system in the loop is still bound to new_df here.
new_df

In [None]:
# Read every per-system summary back, keep the 50% and 100% feature steps at
# test_size 0.7, and lay the result out as one row per
# (algorithm, sample, system, percentage).
# The frames are concatenated once, without seeding from new_df.columns, so
# this cell no longer depends on state left behind by the previous cell, and
# the full frame is no longer displayed on every loop iteration.
summary_frames = []

for system in systems:
    csv_path = 'results/'+system+'/'+system+'_summary_results_2.csv'
    df = pd.read_csv(csv_path)
    summary_frames.append(df)

df_all = pd.concat(summary_frames, ignore_index=True)

df_all['Percentage'] = df_all.groupby(['algorithm_name', 'system', 'sample'])['num_features'].transform(map_percentages)
df_all = df_all[((df_all['Percentage'] == 50) | (df_all['Percentage'] == 100)) & (df_all['test_size'] == 0.7)]
df_all = df_all.drop(columns=['num_features']).reset_index(drop=True)

display(df_all)

# Rows are collected as dicts and the frame is built once: DataFrame.append
# was removed in pandas 2.0.
records = []

for algorithm in df_all['algorithm_name'].unique():
    for sample in df_all['sample'].unique():
        for system in df_all['system'].unique():
            for pct in df_all['Percentage'].unique():
                temp_df = df_all[(df_all['algorithm_name'] == algorithm) &
                                 (df_all['sample'] == sample) &
                                 (df_all['system'] == system) &
                                 (df_all['Percentage'] == pct)]

                # First matching row wins; None (NaN after the cast below)
                # when the combination does not occur.
                matches = temp_df.loc[temp_df['test_size'] == 0.7, 'accuracy']
                accuracy_70 = matches.values[0] if len(matches) > 0 else None

                records.append({"System": system,
                                "Algorithm": algorithm,
                                "Sample": sample,
                                "N=70": accuracy_70,
                                "Percentage": pct})

new_df_all = pd.DataFrame(records, columns=["System","Algorithm","Sample","N=70", "Percentage"])
new_df_all['N=70'] = new_df_all['N=70'].astype(float)
new_df_all

In [None]:
# Use pivot_table() to build the target table: one row per (System, Algorithm)
# and one column per (Sample, Percentage), holding the N=70 value.
df_pivot = pd.pivot_table(new_df_all, values='N=70', index=['System', 'Algorithm'], columns=['Sample', 'Percentage'])


def _format_mape_cell(value):
    """Render one pivot cell: '-' for missing or > 1000, else two decimals."""
    # The threshold has to be checked while the cell is still a number. It
    # used to run after the values had already been formatted to strings, so
    # it never replaced anything.
    if pd.isna(value) or value > 1000:
        return '-'
    return '{:.2f}'.format(value)


# Column-wise Series.map formats every cell without relying on
# DataFrame.applymap, which is deprecated in recent pandas releases.
df_pivot = df_pivot.apply(lambda column: column.map(_format_mape_cell))

df_pivot.to_latex(multirow=True)

In [None]:
# Show the LLVM summary rows whose test_size is 0.1.
df_temp = pd.read_csv("results/LLVM/LLVM_summary_results_2.csv")
df_temp.loc[df_temp['test_size'] == 0.1]

In [None]:
            # NOTE(review): this cell is an indented fragment. It starts at an
            # `if ft:` nested 12 spaces deep with no enclosing statement in the
            # cell, so it cannot run on its own. It reads ft, combined_df, df
            # and new_df from the kernel namespace.
            #
            # Both branches build one row per combination of the unique values
            # of the listed columns, taking the first 'accuracy' found for each
            # test size (None when absent). The `if ft:` branch additionally
            # splits by 'num_features'.
            if ft:
                for algorithm in combined_df['algorithm_name'].unique():
                    for sampling in combined_df['origin_ft_selection'].unique():
                        for system in combined_df['system'].unique(): 
                            for features in combined_df['num_features'].unique():
                                for technique in combined_df['ft_selection_technique'].unique():

                                    temp_df = combined_df[(combined_df['algorithm_name'] == algorithm) & 
                                                        (combined_df['origin_ft_selection'] == sampling) & 
                                                        (combined_df['system'] == system) &
                                                          (combined_df['num_features'] == features) &
                                                        (combined_df['ft_selection_technique'] == technique)]

                                    accuracy_30 = temp_df[temp_df['test_size'] == 0.3]['accuracy'].values[0] if len(temp_df[temp_df['test_size'] == 0.3]) > 0 else None
                                    accuracy_50 = temp_df[temp_df['test_size'] == 0.5]['accuracy'].values[0] if len(temp_df[temp_df['test_size'] == 0.5]) > 0 else None
                                    accuracy_70 = temp_df[temp_df['test_size'] == 0.7]['accuracy'].values[0] if len(temp_df[temp_df['test_size'] == 0.7]) > 0 else None
                                    accuracy_90 = temp_df[temp_df['test_size'] == 0.9]['accuracy'].values[0] if len(temp_df[temp_df['test_size'] == 0.9]) > 0 else None

                                    new_df = new_df.append({"System": system,
                                                            "Algorithm": algorithm, 
                                                            "Feature Selection": sampling, 
                                                            "Technique": technique,
                                                            "num_features": features,
                                                            "N=30": accuracy_30,
                                                            "N=50": accuracy_50, 
                                                            "N=70": accuracy_70, 
                                                            "N=90": accuracy_90}, ignore_index=True)
            else:
                for algorithm in combined_df['algorithm_name'].unique():
                    for sampling in combined_df['origin_ft_selection'].unique():
                        for system in combined_df['system'].unique(): 
                            for technique in combined_df['ft_selection_technique'].unique():
                                temp_df = combined_df[(combined_df['algorithm_name'] == algorithm) & 
                                                    (combined_df['origin_ft_selection'] == sampling) & 
                                                    (combined_df['system'] == system) &
                                                    (combined_df['ft_selection_technique'] == technique)]

                                accuracy_30 = temp_df[temp_df['test_size'] == 0.3]['accuracy'].values[0] if len(temp_df[temp_df['test_size'] == 0.3]) > 0 else None
                                accuracy_50 = temp_df[temp_df['test_size'] == 0.5]['accuracy'].values[0] if len(temp_df[temp_df['test_size'] == 0.5]) > 0 else None
                                accuracy_70 = temp_df[temp_df['test_size'] == 0.7]['accuracy'].values[0] if len(temp_df[temp_df['test_size'] == 0.7]) > 0 else None
                                accuracy_90 = temp_df[temp_df['test_size'] == 0.9]['accuracy'].values[0] if len(temp_df[temp_df['test_size'] == 0.9]) > 0 else None

                                # NOTE(review): "num_features" is set to len(df), the row
                                # count of df - presumably a placeholder; TODO confirm.
                                new_df = new_df.append({"System": system,
                                                        "Algorithm": algorithm, 
                                                        "Feature Selection": sampling, 
                                                        "Technique": technique,
                                                        "num_features": len(df),
                                                        "N=30": accuracy_30,
                                                        "N=50": accuracy_50, 
                                                        "N=70": accuracy_70, 
                                                        "N=90": accuracy_90}, ignore_index=True)

                                

                # Reduce each text summary to its 'mean' value, then format it
                # as a two-decimal string (the columns end up holding strings).
                new_df['N=30'] = new_df['N=30'].apply(extract_mean)
                new_df['N=50'] = new_df['N=50'].apply(extract_mean)
                new_df['N=70'] = new_df['N=70'].apply(extract_mean)
                new_df['N=90'] = new_df['N=90'].apply(extract_mean)

                new_df['N=30'] = new_df['N=30'].apply(lambda x: '{:.2f}'.format(x))
                new_df['N=50'] = new_df['N=50'].apply(lambda x: '{:.2f}'.format(x))
                new_df['N=70'] = new_df['N=70'].apply(lambda x: '{:.2f}'.format(x))
                new_df['N=90'] = new_df['N=90'].apply(lambda x: '{:.2f}'.format(x))

In [None]:
import xml.etree.ElementTree as ET
from anytree import Node, RenderTree

def generate_feature_model_tree(root_element):
    """Recursively mirror an XML element and its descendants as anytree nodes.

    Parameters
    ----------
    root_element : xml.etree.ElementTree.Element
        Element to convert; every child element becomes a child node.

    Returns
    -------
    anytree.Node
        Node named after the element's ``name`` attribute. Elements without
        a ``name`` attribute fall back to their tag name instead of raising
        KeyError.
    """
    # The stray print() that used to sit here emitted one blank line per
    # element ahead of the rendered diagram.
    root = Node(root_element.attrib.get("name", root_element.tag))
    for child_element in root_element:
        child_node = generate_feature_model_tree(child_element)
        child_node.parent = root
    return root

def generate_feature_model_diagram(file_path):
    """Parse the XML file at ``file_path`` and print its tree, one node per line.

    Each printed line is the branch prefix produced by ``RenderTree``
    followed by the node's name.
    """
    xml_root = ET.parse(file_path).getroot()
    model_root = generate_feature_model_tree(xml_root)
    for row in RenderTree(model_root):
        print(f"{row.pre}{row.node.name}")

# Usage example: render the feature model stored in FeatureModel.xml.
file_path = "FeatureModel.xml"
generate_feature_model_diagram(file_path)


# RQ4: Training time

In [None]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# system = 'LLVM'
# Subject systems and sampling strategies covered by the timing analysis.
systems = [
    'Dune', 'LLVM', 'x264', 'BerkeleyDBC',
    'Irzip', 'Polly', '7z', 'Hipacc',
]
samples_config = [
    'distBased', 'divDistBased', 'henard',
    'random', 'solverBased', 'twise',
]

# Empty, column-only frames describing the tables built further down.
df_time_summary = pd.DataFrame(
    columns=[
        "System", "Sample Algorithm", "Feature Selection", "Algorithm", "%Features",
        "N=10%", "N=20%", "N=50%", "N=80%", "N=90%",
    ]
)
df_time_per_sample = pd.DataFrame(
    columns=["System", "Sample Algorithm", "Feature Selection", "Total Elapsed Time"]
)

# Aliases of the two empty frames above (same objects, not copies).
df_time_summary_all = df_time_summary
df_time_per_sample_all = df_time_per_sample

df_time_all_systems = pd.DataFrame(
    columns=[
        "System",
        "Total Elapsed Time",
        "Total Elapsed Time with Feature Selection",
    ]
)

# Collects the per-system tables; filled by the loop over `systems`.
df_list = []

def _load_timing_frames(system, infix, uses_feature_selection):
    """Load one system's timing log (original file plus its ``_v2`` companion).

    Parameters
    ----------
    system : str
        System name; used for the directory and the file-name prefix.
    infix : str
        Text inserted after ``simulation_time_information`` in the file name
        ('' or '-with_feature_importance').
    uses_feature_selection : bool
        Value written to the ``FT`` column of every returned row.

    Returns
    -------
    pandas.DataFrame
        Both CSV files concatenated, with ``StartTime``/``EndTime`` parsed as
        datetimes and the added columns ``ElapsedTime`` (seconds, float),
        ``FT`` and ``System``.
    """
    base = f'results/{system}/{system}-simulation_time_information{infix}'
    frame = pd.concat(
        [pd.read_csv(f'{base}.csv', sep=','), pd.read_csv(f'{base}_v2.csv', sep=',')],
        ignore_index=True,
    )
    frame["StartTime"] = pd.to_datetime(frame["StartTime"])
    frame["EndTime"] = pd.to_datetime(frame["EndTime"])
    # total_seconds() keeps whole days and fractions of a second; `.dt.seconds`
    # is only the 0-86399 seconds component of each duration.
    frame["ElapsedTime"] = (frame["EndTime"] - frame["StartTime"]).dt.total_seconds()
    frame["FT"] = uses_feature_selection
    frame["System"] = system
    return frame


# Column label -> fraction of configurations used for training.
config_fractions = {"N=10%": 0.1, "N=20%": 0.2, "N=50%": 0.5, "N=80%": 0.8, "N=90%": 0.9}

all_system_records = []

for system in systems:
    df_time = _load_timing_frames(system, '', False)
    df_time_ft = _load_timing_frames(system, '-with_feature_importance', True)
    # Every row of the combined frame belongs to `system`, so no System filter is needed.
    df_time_combined = pd.concat([df_time, df_time_ft], ignore_index=True)

    all_system_records.append({
        "System": system,
        "Total Elapsed Time": df_time_combined.loc[df_time_combined["FT"] == False, "ElapsedTime"].sum(),
        "Total Elapsed Time with Feature Selection": df_time_combined.loc[df_time_combined["FT"] == True, "ElapsedTime"].sum(),
    })

    # NOTE(review): this is a single boolean for the whole frame, not a per-row
    # mask: totals below cover every row of the sample when at least one
    # %Configurations value is missing, and are 0 otherwise. Behaviour kept
    # as-is; confirm whether a row-wise `.isna()` filter was intended.
    any_missing_cfg = df_time_combined["%Configurations"].isna().any()

    data_list = []
    data_list_sample = []

    for sample in samples_config:
        in_sample = df_time_combined["SampleAlgorithm"] == sample

        # One per-sample total per feature-selection mode. This does not depend
        # on the learning algorithm, so it is built outside the algorithm loop
        # (previously it was appended once per algorithm, duplicating rows).
        for ft in [True, False]:
            in_mode = df_time_combined["FT"] == ft
            data_list_sample.append({
                "System": system,
                "Sample Algorithm": sample,
                "Feature Selection": ft,
                "Total Elapsed Time": df_time_combined.loc[in_mode & in_sample & any_missing_cfg, "ElapsedTime"].sum(),
            })

        for algorithm in df_time["Algorithm"].unique():
            in_algorithm = df_time_combined["Algorithm"] == algorithm
            for ft in [True, False]:
                in_mode = df_time_combined["FT"] == ft
                for features_number in [0.3, 0.4, 0.5, 0.6]:
                    at_fraction = df_time_combined["%FeatureSelection"] == features_number
                    # Restrict to the sample and algorithm the row is labelled
                    # with; otherwise every row repeats the same overall mean.
                    subset = df_time_combined[in_mode & in_sample & in_algorithm & at_fraction]
                    # presumably constant per system and fraction - TODO confirm
                    feature_counts = df_time_combined.loc[at_fraction, "#Features"].dropna().unique()
                    data_dict = {
                        "System": system,
                        "Sample Algorithm": sample,
                        "Feature Selection": ft,
                        "Algorithm": algorithm,
                        "%Features": features_number,
                        "#Features": feature_counts[0] if len(feature_counts) else None,
                    }
                    for label, fraction in config_fractions.items():
                        data_dict[label] = subset.loc[subset["%Configurations"] == fraction, "ElapsedTime"].mean()
                    # Append inside the loop so every feature fraction is kept,
                    # not only the last one.
                    data_list.append(data_dict)

    df_time_summary = pd.DataFrame(data_list)
    df_time_per_sample = pd.DataFrame(data_list_sample)

    print(system)
    display(df_time_summary)

    # Layout relied on by choose_df: detailed table at 2*i, per-sample table at 2*i + 1.
    df_list.append(df_time_summary)
    df_list.append(df_time_per_sample)

# Build the cross-system totals once; it is stored as the last entry of df_list.
df_time_all_systems = pd.DataFrame(all_system_records, columns=df_time_all_systems.columns)
df_list.append(df_time_all_systems)
    




In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# %pip install fpdf

# from fpdf import FPDF



# Total training time per system, with and without feature selection.
# The totals are coerced to a numeric dtype first: if they arrive as an
# object column, DataFrame.plot refuses them with "no numeric data to plot".
df_Graph = pd.DataFrame(
    {
        'without Feature Selection': pd.to_numeric(df_time_all_systems['Total Elapsed Time']).values,
        'with Feature Selection': pd.to_numeric(df_time_all_systems['Total Elapsed Time with Feature Selection']).values,
    },
    index=df_time_all_systems['System'],
)
display(df_Graph)

ax = df_Graph.plot(kind='bar')
ax.set_xlabel('Systems')
ax.set_ylabel('Elapsed Time (seconds)')
# A tight bounding box keeps the rotated system names inside the saved image.
ax.figure.savefig('results/simulation_time_all_system-Ntest.png', bbox_inches='tight')



In [None]:
#choose the graph to be plotted
#systems = ['Dune', 'LLVM','x264','BerkeleyDBC','Irzip','Polly','Hipacc','7z']

# system: name of the system (None selects the totals over all systems); detailed: True = detailed table, False = per-sample summary
def choose_df(detailed=True, system=None):
    """Pick one of the timing tables stored in the module-level ``df_list``.

    The lookup assumes the layout ``[detailed_0, per_sample_0, detailed_1,
    per_sample_1, ..., totals]``, where position ``i`` follows the order of
    the module-level ``systems`` list.

    Parameters
    ----------
    detailed : bool, default True
        For a named system, return its detailed table when true and its
        per-sample table otherwise. Ignored when ``system`` is None.
    system : str or None, default None
        System name; None returns the last entry of ``df_list``.

    Returns
    -------
    The selected entry of ``df_list``.

    Raises
    ------
    IndexError
        If ``system`` is not in ``systems`` or ``df_list`` is too short.
    """
    if system is None:
        return df_list[-1]

    try:
        position = list(systems).index(system)
    except ValueError:
        # IndexError is kept for unknown names, as before, but now says why.
        raise IndexError(f"unknown system {system!r}; expected one of {list(systems)}") from None

    return df_list[2 * position] if detailed else df_list[2 * position + 1]




In [None]:
# Per-sample training time for one system, with and without feature selection.
df_to_graph = choose_df(False, 'Dune')
# Drop sample/mode combinations for which no time was recorded.
mask = df_to_graph['Total Elapsed Time']!=0
df_to_graph = df_to_graph[mask]
display(df_to_graph)

# Reshape to one row per sampling algorithm and one column per mode.
# Building the frame from the two filtered Series with the sample names as
# index aligns on labels that the Series do not have, which yields only NaN;
# pivoting on the actual columns keeps the values. Repeated
# (sample, mode) rows collapse to their first value.
df_to_graph = (
    df_to_graph
    .pivot_table(index='Sample Algorithm', columns='Feature Selection',
                 values='Total Elapsed Time', aggfunc='first')
    .reindex(columns=[False, True])
    .rename(columns={False: 'without Feature Selection', True: 'with Feature Selection'})
    .rename_axis(columns=None)
)

ax = df_to_graph.plot(kind='bar')
ax.set_xlabel('Sample Algorithm')
ax.set_ylabel('Elapsed Time (seconds)')
plt.show()