In [3]:
import pandas as pd
import glob
from pathlib import Path
from functools import reduce
import operator

In [4]:
# Root directory of the rank CSV files. It already ends with '/', and the cells
# below build paths as f'{ROOT_PATH}/...', which is why the printed paths
# contain a double slash ('ranks//runTime_...').
ROOT_PATH='../datasets/results/ranks/'
# Names of all algorithm variants.
# NOTE(review): neither list below is referenced by the visible cells - confirm
# they are still needed.
algorithms = ["EA-Ei-LM", "TS-Ei-LM", "SA-Ei-LM", "EA-Ei-LM-10", "EA-Ei-LM-50",
          "EAL-Ei-LM", "TSL-Ei-LM", "SAL-Ei-LM", "EAL-Ei-LM-10", "EAL-Ei-LM-50",
          "EA-Si-LM", "TS-Si-LM", "SA-Si-LM",
          "EAL-Si-LM", "TSL-Si-LM", "SAL-Si-LM",
          ]
# The eight TS/SA variants; the same eight names are the columns aggregated by
# the Wilcoxon and multilabel helpers defined further down.
algorithms_ts_sa = ["TS-Ei-LM", "SA-Ei-LM", 
           "TSL-Ei-LM", "SAL-Ei-LM", 
           "TS-Si-LM", "SA-Si-LM",
           "TSL-Si-LM", "SAL-Si-LM",
          ]

In [6]:
def equivalent_algorithms(input_directory, filter="*"):
    """
    Identify, for every rank file, the problems on which all algorithms are equivalent.

    A problem (row) is "all equivalent" when every numeric column of that row
    holds the same value and the row has at least two numeric values.

    Parameters
    ----------
    input_directory : str
        Directory containing the rank CSV files. Every file needs a 'Problem'
        column, and its stem must contain an underscore because the dataset
        name is the second '_'-separated token of the stem
        (e.g. 'rank_dafjs_1min_8alg' -> 'dafjs').
    filter : str, optional
        Glob pattern selecting the files inside ``input_directory``.

    Returns
    -------
    dict
        Column-oriented dict with one entry per file (files sorted by path):
        'dataset' (name), 'dataset_size' (number of rows),
        'all_algs_eqivalent' (number of all-equivalent rows) and
        'istances_name' (list of the 'Problem' values of those rows).
        The misspelled keys are kept because other cells read them.
    """
    ranks_files = sorted(glob.glob(f'{input_directory}/{filter}'))

    info = {'dataset':[], 'dataset_size':[], 'all_algs_eqivalent':[], 'istances_name': []}
    for rank_file in ranks_files:
        data = pd.read_csv(rank_file)

        # Compare the row minimum with the row maximum instead of testing
        # std == 0: the standard deviation of identical floats such as
        # (0.1, 0.1, 0.1) is not exactly zero because the mean is rounded.
        row_min = data.min(axis=1, numeric_only=True)
        row_max = data.max(axis=1, numeric_only=True)
        # A sample standard deviation needs two values; keep that requirement.
        enough_values = data.count(axis=1, numeric_only=True) >= 2
        df = data[(row_min == row_max) & enough_values]

        info['dataset'].append(Path(rank_file).stem.split("_")[1])
        info['dataset_size'].append(data.shape[0])
        info['all_algs_eqivalent'].append(df.shape[0])
        info['istances_name'].append(list(df['Problem']))

    return info


In [7]:
# Per-dataset summary of the instances on which all algorithms tie.
# NOTE(review): `info` is not referenced again in the visible cells.
info = equivalent_algorithms(f'{ROOT_PATH}/runTime_1min_sa_ts')

In [8]:
def remove_instances(input_directory, output_directory,
                     ranks_directory, filter="*", instance_id_key='Problem'):
    """
    Copy CSV files while dropping the instances on which all algorithms are equivalent.

    The instances to drop are computed from the rank files in
    ``ranks_directory`` (see ``equivalent_algorithms``) and removed from every
    file of ``input_directory``. The shape of each file before and after
    filtering is printed.

    Parameters
    ----------
    input_directory : str
        Directory with the CSV files to filter.
    output_directory : str
        Directory that receives the filtered files (same file names). It is
        created, including missing parents, when absent. If it equals
        ``input_directory`` the files are overwritten in place.
    ranks_directory : str
        Directory with the rank files used to detect all-equivalent instances.
    filter : str, optional
        Glob pattern selecting the files inside ``input_directory``.
    instance_id_key : str, optional
        Name of the column holding the instance identifier in the input files.
    """
    Path(output_directory).mkdir(parents=True, exist_ok=True)

    ranks_files = sorted(glob.glob(f'{input_directory}/{filter}'))

    # Flatten the per-file name lists; unlike reduce(operator.concat, ...) this
    # also works when no rank file was found (empty list).
    per_file_instances = equivalent_algorithms(ranks_directory)['istances_name']
    list_instances = [name for names in per_file_instances for name in names]
    print(list_instances)

    for rank_file in ranks_files:
        print(rank_file)
        data = pd.read_csv(rank_file)
        print(data.shape)
        # Boolean mask instead of a string-built query, so column names that
        # are not valid Python identifiers work too.
        data = data[~data[instance_id_key].isin(list_instances)]
        fileName = Path(rank_file).name
        data.to_csv(f'{output_directory}/{fileName}', index=False)
        print(data.shape)


In [145]:
# Write filtered copies of the rank files; the same directory is used both as
# the input to filter and as the reference for detecting all-tied instances.
remove_instances(f'{ROOT_PATH}/runTime_1min_sa_ts', 
                 f'{ROOT_PATH}/runTime_1min_sa_ts_filtered',
                 f'{ROOT_PATH}/runTime_1min_sa_ts')


['DAFJS03', 'DAFJS04', 'DAFJS08', 'P1', 'P11', 'P15', 'P17', 'P18', 'P2', 'P24', 'P25', 'P26', 'P27', 'P3', 'P4', 'P5', 'P6', 'P7', 'P9', 'la16.fjs', 'la17.fjs', 'la18.fjs', 'la20.fjs', 'bom_wide_2_10_10_5_No2', 'bom_wide_2_7_20_10_No2', 'bom_wide_2_7_20_5_No2', 'YFJS01', 'YFJS04', 'YFJS08', 'YFJS09', 'YFJS10', 'YFJS14']
../datasets/results/ranks//runTime_1min_sa_ts/rank_2asp_1min_8alg.csv
(50, 9)
(50, 9)
../datasets/results/ranks//runTime_1min_sa_ts/rank_dafjs_1min_8alg.csv
(30, 9)
(27, 9)
../datasets/results/ranks//runTime_1min_sa_ts/rank_deep_1min_8alg.csv
(50, 9)
(50, 9)
../datasets/results/ranks//runTime_1min_sa_ts/rank_dyuthi_1min_8alg.csv
(28, 9)
(12, 9)
../datasets/results/ranks//runTime_1min_sa_ts/rank_fjssp_1min_8alg.csv
(50, 9)
(46, 9)
../datasets/results/ranks//runTime_1min_sa_ts/rank_mixed_1min_8alg.csv
(50, 9)
(50, 9)
../datasets/results/ranks//runTime_1min_sa_ts/rank_wide_1min_8alg.csv
(50, 9)
(47, 9)
../datasets/results/ranks//runTime_1min_sa_ts/rank_yfjs_1min_8alg.csv


In [146]:
def multilabel_output(input_directory, filter="*", remove_all_eq_alg=False, alg_no=8):
    """
    Print multilabel statistics derived from the rank files.

    Every rank file is converted to a binary table in which a cell is 1 when
    the algorithm holds the minimum rank of its row and 0 otherwise (NaN ranks
    are ignored when locating the minimum). For each file the rows with more
    than one best algorithm are printed; at the end the percentage of rows on
    which each algorithm is best and the number of rows per dataset are
    printed.

    Parameters
    ----------
    input_directory : str
        Directory with the rank CSV files. The first column is treated as the
        identifier and a 'Problem' column is required; the dataset name is the
        second '_'-separated token of the file stem.
    filter : str, optional
        Glob pattern selecting the files inside ``input_directory``.
    remove_all_eq_alg : bool, optional
        When True, rows whose number of best algorithms is not below
        ``alg_no`` are dropped from every statistic.
    alg_no : int, optional
        Number of algorithms; rows with ``Sum < alg_no`` are kept.

    Returns
    -------
    None
    """
    ranks_files = sorted(glob.glob(f'{input_directory}/{filter}'))

    binary_frames = []
    labelled_frames = []

    for rank_file in ranks_files:
        print(rank_file)
        ranks = pd.read_csv(rank_file)

        # Vectorised rank -> binary transform: 1 where the cell equals the row minimum.
        rank_values = ranks.iloc[:, 1:]
        df_binary = rank_values.eq(rank_values.min(axis=1), axis=0).astype(int)

        data = pd.concat([ranks['Problem'], df_binary], axis=1)
        # Number of algorithms sharing the best rank in each row.
        data['Sum'] = df_binary.sum(axis=1)
        data['Dataset'] = Path(rank_file).stem.split("_")[1]

        if remove_all_eq_alg:
            data = data.query(f'Sum<{alg_no}')
            # Keep the binary table aligned with the filtered rows so that the
            # percentages below are computed on the same instances.
            df_binary = df_binary.loc[data.index]

        print(data[data['Sum']>1][['Problem', 'Sum']])
        binary_frames.append(df_binary)
        labelled_frames.append(data)

    if not labelled_frames:
        print(f"No rank files found in {input_directory!r} matching {filter!r}")
        return

    all_binary = pd.concat(binary_frames)
    all_data = pd.concat(labelled_frames)

    # For each algorithm: percentage of rows on which it is (one of) the best.
    best_counts = all_binary.sum()
    print((best_counts / all_binary.shape[0]) * 100)

    print("Dataset instances no")
    print(all_data.groupby(['Dataset'])['Dataset'].count())

In [147]:
# Print the tie summary per dataset, dropping rows where all alg_no (default 8)
# algorithms are marked as best.
multilabel_output(f'{ROOT_PATH}/runTime_1min_sa_ts', remove_all_eq_alg=True)

../datasets/results/ranks//runTime_1min_sa_ts/rank_2asp_1min_8alg.csv
Empty DataFrame
Columns: [Problem, Sum]
Index: []
../datasets/results/ranks//runTime_1min_sa_ts/rank_dafjs_1min_8alg.csv
    Problem  Sum
4   DAFJS05    2
10  DAFJS11    2
14  DAFJS15    2
18  DAFJS19    2
21  DAFJS22    2
22  DAFJS23    2
../datasets/results/ranks//runTime_1min_sa_ts/rank_deep_1min_8alg.csv
                    Problem  Sum
1    bom_deep_10_3_10_5_No2    2
9   bom_deep_10_5_30_15_No2    2
17   bom_deep_6_5_20_10_No2    2
38    bom_deep_8_5_10_5_No1    2
49   bom_deep_9_5_30_15_No2    2
../datasets/results/ranks//runTime_1min_sa_ts/rank_dyuthi_1min_8alg.csv
   Problem  Sum
1      P10    5
3      P12    4
4      P13    2
5      P14    6
7      P16    4
10     P19    2
13     P21    4
15     P23    2
20     P28    3
26      P8    2
../datasets/results/ranks//runTime_1min_sa_ts/rank_fjssp_1min_8alg.csv
                        Problem  Sum
11                     la12.fjs    3
12                     la13.f

In [24]:
def generate_multilabel(input_directory, output_directory,  filter="*"):
    """
    Transform ranking files into binary multilabel files.

    In every output file a cell is 1 when the algorithm holds the minimum rank
    of its row and 0 otherwise (NaN ranks are ignored when locating the
    minimum). The first column of the input is treated as the identifier and
    the 'Problem' column is copied unchanged.

    Parameters
    ----------
    input_directory : str
        Directory with the rank CSV files.
    output_directory : str
        Directory that receives the multilabel files (same file names). It is
        created, including missing parents, when absent.
    filter : str, optional
        Glob pattern selecting the files inside ``input_directory``.
    """
    Path(output_directory).mkdir(parents=True, exist_ok=True)

    ranks_files = sorted(glob.glob(f'{input_directory}/{filter}'))

    for rank_file in ranks_files:
        print(rank_file)
        ranks = pd.read_csv(rank_file)

        # Vectorised rank -> binary transform; the row minimum is computed once
        # per row instead of once per cell.
        rank_values = ranks.iloc[:, 1:]
        df_binary = rank_values.eq(rank_values.min(axis=1), axis=0).astype(int)

        # Put the instance name back in front of the labels.
        data = pd.concat([ranks['Problem'], df_binary], axis=1)

        fileName = Path(rank_file).name
        data.to_csv(f'{output_directory}/{fileName}', index=False)

from scipy.stats import wilcoxon

def wilcoxon_test(data1, data2, label):
    """
    Run a Wilcoxon signed-rank test on two paired samples.

    ``label`` is accepted for the callers' convenience but is not used.

    Returns a tuple ``(p_value, verdict)`` where ``verdict`` is "yes" when the
    p-value is below the 0.05 significance level (null hypothesis rejected)
    and "no" otherwise.
    """
    _, p_value = wilcoxon(data1, data2)

    significance_level = 0.05
    verdict = "yes" if p_value < significance_level else "no"
    return p_value, verdict
        
import re
def get_dataset_name(dataset_file_name):
    """
    Extract the dataset name from a rank file name.

    The name must start with 'rank_<dataset>_1min_8alg', where <dataset> is
    made of ASCII letters and digits. Returns the dataset part, or None when
    the file name does not follow that scheme.
    """
    found = re.match(r"rank_([a-zA-Z0-9]+)_1min_8alg", dataset_file_name)
    return found.group(1) if found else None

def build_df_wilcoxon_row(data, dataset_name):
    """
    Aggregate the eight algorithm columns pairwise and test each pair.

    Three comparisons are made (strategy SA vs TS, initialization LETSA vs
    random, perturbation Ei vs Si). For each one, the row-wise mean of the
    four algorithm columns on either side is stored in ``data`` as a new
    column (``data`` is modified in place) and the two columns are compared
    with ``wilcoxon_test``.

    Returns ``(row, data)`` where ``row`` is
    ``[dataset_name, p, verdict, p, verdict, p, verdict]`` in the comparison
    order given above.
    """
    # (test label, first column, its algorithms, second column, its algorithms)
    comparisons = (
        ('staregy SAvsTS',
         'strategy-sa', ["SA-Ei-LM", "SAL-Ei-LM", "SA-Si-LM", "SAL-Si-LM"],
         'strategy-ts', ["TS-Ei-LM", "TSL-Ei-LM", "TS-Si-LM", "TSL-Si-LM"]),
        ('init LETSAvsRandom',
         'init-letsa', ["TSL-Ei-LM", "SAL-Ei-LM", "TSL-Si-LM", "SAL-Si-LM"],
         'init-random', ["TS-Ei-LM", "SA-Ei-LM", "TS-Si-LM", "SA-Si-LM"]),
        ('perturbation EivsSi',
         'perturbation_ei', ["TS-Ei-LM", "SA-Ei-LM", "TSL-Ei-LM", "SAL-Ei-LM"],
         'perturbation_si', ["TS-Si-LM", "SA-Si-LM", "TSL-Si-LM", "SAL-Si-LM"]),
    )

    result_row = [dataset_name]
    for test_label, first_col, first_algs, second_col, second_algs in comparisons:
        data[first_col] = data[first_algs].mean(axis=1)
        data[second_col] = data[second_algs].mean(axis=1)
        p_value, verdict = wilcoxon_test(data[first_col], data[second_col], test_label)
        result_row += [p_value, verdict]

    return result_row, data
                        
def generate_agregated_multilabel(input_directory, output_directory,  filter="*"):
    """
    Aggregate rank files along three axes and report Wilcoxon tests.

    For every rank file the eight algorithm columns are averaged per heuristic
    strategy, initialization and perturbation (see ``build_df_wilcoxon_row``);
    the six aggregated columns plus 'Problem' are written to
    ``output_directory`` under the same file name. A table with the Wilcoxon
    p-values per dataset, followed by an 'all' row computed on the
    concatenation of all files, is printed.

    Parameters
    ----------
    input_directory : str
        Directory with the rank CSV files.
    output_directory : str
        Directory that receives the aggregated files. It is created, including
        missing parents, when absent.
    filter : str, optional
        Glob pattern selecting the files inside ``input_directory``.
    """
    Path(output_directory).mkdir(parents=True, exist_ok=True)

    ranks_files = sorted(glob.glob(f'{input_directory}/{filter}'))

    df_wilcoxon = pd.DataFrame(columns=['DataSet',
                                       'staregy SAvsTS -pval', 'staregy SAvsTS - Significant difference',
                                       'init LETSAvsRandom - pval', 'init LETSAvsRandom - Significant difference',
                                       'perturbation EivsSi - pval', 'perturbation EivsSi - Significant difference'])
    output_columns = ['Problem', 'strategy-sa','strategy-ts', 'init-letsa','init-random','perturbation_ei','perturbation_si']

    # Collect the per-file frames and concatenate once after the loop.
    per_file_frames = []
    for rank_file in ranks_files:
        print(rank_file)
        data = pd.read_csv(rank_file)
        row, data = build_df_wilcoxon_row(data, get_dataset_name(Path(rank_file).name))
        df_wilcoxon.loc[len(df_wilcoxon)] = row
        per_file_frames.append(data)

        fileName = Path(rank_file).name
        data[output_columns].to_csv(f'{output_directory}/{fileName}', index=False)

    # The 'all' row only makes sense when at least one file was read.
    if per_file_frames:
        df_all = pd.concat(per_file_frames, ignore_index=True)
        row, df_all = build_df_wilcoxon_row(df_all, 'all')
        df_wilcoxon.loc[len(df_wilcoxon)] = row

    print(df_wilcoxon)

def find_value(row, name1, name2):
    """
    Compare two entries of ``row`` and encode the outcome as a class label.

    Returns 2 when ``row[name1]`` equals ``row[name2]``, 0 when the first is
    smaller than the second, and 1 in every other case.
    """
    first, second = row[name1], row[name2]
    if first == second:
        return 2
    return 0 if first < second else 1
    

def generate_agregated_multilabel_binary(input_directory, output_directory,  filter="*", multilabel=False):
    """
    Transform ranking files into aggregated binary / multilabel files.

    Every rank file is first converted to a binary table (1 where the
    algorithm holds the minimum rank of its row, NaN ranks ignored). Then, for
    each of three comparisons (strategy SA vs TS, initialization LETSA vs
    random, perturbation Ei vs Si), the number of best algorithms on each side
    is counted and a label column is derived with ``find_value``: 2 when both
    counts are equal, 0 when the first count is smaller, 1 otherwise.

    Parameters
    ----------
    input_directory : str
        Directory with the rank CSV files (first column = identifier, a
        'Problem' column is required).
    output_directory : str
        Directory that receives the output files (same file names). It is
        created, including missing parents, when absent.
    filter : str, optional
        Glob pattern selecting the files inside ``input_directory``.
    multilabel : bool, optional
        When True only 'Problem' and the three label columns ('stategy',
        'init', 'perturbation') are written; otherwise every column is written.
    """
    Path(output_directory).mkdir(parents=True, exist_ok=True)

    ranks_files = sorted(glob.glob(f'{input_directory}/{filter}'))

    # (first count column, its algorithms, second count column, its algorithms, label column)
    # The 'stategy' spelling is part of the output file format and is kept.
    comparisons = [
        ('strategy-sa', ["SA-Ei-LM", "SAL-Ei-LM", "SA-Si-LM", "SAL-Si-LM"],
         'strategy-ts', ["TS-Ei-LM", "TSL-Ei-LM", "TS-Si-LM", "TSL-Si-LM"],
         'stategy'),
        ('init-letsa', ["TSL-Ei-LM", "SAL-Ei-LM", "TSL-Si-LM", "SAL-Si-LM"],
         'init-random', ["TS-Ei-LM", "SA-Ei-LM", "TS-Si-LM", "SA-Si-LM"],
         'init'),
        ('perturbation_ei', ["TS-Ei-LM", "SA-Ei-LM", "TSL-Ei-LM", "SAL-Ei-LM"],
         'perturbation_si', ["TS-Si-LM", "SA-Si-LM", "TSL-Si-LM", "SAL-Si-LM"],
         'perturbation'),
    ]

    for rank_file in ranks_files:
        print(rank_file)
        ranks = pd.read_csv(rank_file)

        # Vectorised rank -> binary transform: 1 where the cell equals the row minimum.
        rank_values = ranks.iloc[:, 1:]
        df_binary = rank_values.eq(rank_values.min(axis=1), axis=0).astype(int)

        for first_col, first_algs, second_col, second_algs, label_col in comparisons:
            df_binary[first_col] = df_binary[first_algs].sum(axis=1)
            df_binary[second_col] = df_binary[second_algs].sum(axis=1)
            df_binary[label_col] = df_binary.apply(find_value, axis=1, name1=first_col, name2=second_col)

        # Put the instance name back in front of the labels.
        data = pd.concat([ranks['Problem'], df_binary], axis=1)

        fileName = Path(rank_file).name
        if multilabel:
            data[['Problem', 'stategy', 'init', 'perturbation']].to_csv(f'{output_directory}/{fileName}', index=False)
        else:
            data.to_csv(f'{output_directory}/{fileName}', index=False)

In [15]:
# Earlier variants of this cell, kept for reference (different output folders).
# generate_agregated_multilabel(f'{ROOT_PATH}/runTime_1min_sa_ts', 
#                     f'{ROOT_PATH}/runTime_1min_sa_ts_agredated'
#                     )
# generate_agregated_multilabel_binary(f'{ROOT_PATH}/runTime_1min_sa_ts', 
#                     f'{ROOT_PATH}/runTime_1min_sa_ts_agredated_binary'
#                     )

# Write the aggregated mean-rank files and print the Wilcoxon table.
generate_agregated_multilabel(f'{ROOT_PATH}/runTime_1min_sa_ts', 
                    f'{ROOT_PATH}/runTime_1min_sa_ts_agregated_multilabel')#,
                    #multilabel=True)

# Drop the all-tied instances (detected on the raw rank files) from the
# aggregated files written above.
remove_instances(f'{ROOT_PATH}/runTime_1min_sa_ts_agregated_multilabel', 
                 f'{ROOT_PATH}/runTime_1min_sa_ts_agregated_multilabel_filtered',
                 f'{ROOT_PATH}/runTime_1min_sa_ts')

../datasets/results/ranks//runTime_1min_sa_ts\rank_2asp_1min_8alg.csv
../datasets/results/ranks//runTime_1min_sa_ts\rank_dafjs_1min_8alg.csv
../datasets/results/ranks//runTime_1min_sa_ts\rank_deep_1min_8alg.csv
../datasets/results/ranks//runTime_1min_sa_ts\rank_dyuthi_1min_8alg.csv
../datasets/results/ranks//runTime_1min_sa_ts\rank_fjssp_1min_8alg.csv
../datasets/results/ranks//runTime_1min_sa_ts\rank_mixed_1min_8alg.csv
../datasets/results/ranks//runTime_1min_sa_ts\rank_wide_1min_8alg.csv
../datasets/results/ranks//runTime_1min_sa_ts\rank_yfjs_1min_8alg.csv
  DataSet  staregy SAvsTS -pval staregy SAvsTS - Significant difference  \
0    2asp          5.335711e-01                                      no   
1   dafjs          7.269918e-05                                     yes   
2    deep          1.422338e-03                                     yes   
3  dyuthi          2.661661e-01                                      no   
4   fjssp          3.641284e-02                             

In [28]:
# Drop the all-tied instances from the files in the '_lehmer' directory.
# NOTE(review): presumably these hold Lehmer-code encodings of the rankings;
# the cell that produces them is not visible here - confirm.
remove_instances(f'{ROOT_PATH}/runTime_1min_sa_ts_lehmer', 
                 f'{ROOT_PATH}/runTime_1min_sa_ts_lehmer_filtered',
                 f'{ROOT_PATH}/runTime_1min_sa_ts')

['DAFJS03', 'DAFJS04', 'DAFJS08', 'P1', 'P11', 'P15', 'P17', 'P18', 'P2', 'P24', 'P25', 'P26', 'P27', 'P3', 'P4', 'P5', 'P6', 'P7', 'P9', 'la16.fjs', 'la17.fjs', 'la18.fjs', 'la20.fjs', 'bom_wide_2_10_10_5_No2', 'bom_wide_2_7_20_10_No2', 'bom_wide_2_7_20_5_No2', 'YFJS01', 'YFJS04', 'YFJS08', 'YFJS09', 'YFJS10', 'YFJS14']
../datasets/results/ranks//runTime_1min_sa_ts_lehmer\rank_2asp_1min_8alg.csv
(50, 9)
(50, 9)
../datasets/results/ranks//runTime_1min_sa_ts_lehmer\rank_dafjs_1min_8alg.csv
(30, 9)
(27, 9)
../datasets/results/ranks//runTime_1min_sa_ts_lehmer\rank_deep_1min_8alg.csv
(50, 9)
(50, 9)
../datasets/results/ranks//runTime_1min_sa_ts_lehmer\rank_dyuthi_1min_8alg.csv
(28, 9)
(12, 9)
../datasets/results/ranks//runTime_1min_sa_ts_lehmer\rank_fjssp_1min_8alg.csv
(50, 9)
(46, 9)
../datasets/results/ranks//runTime_1min_sa_ts_lehmer\rank_mixed_1min_8alg.csv
(50, 9)
(50, 9)
../datasets/results/ranks//runTime_1min_sa_ts_lehmer\rank_wide_1min_8alg.csv
(50, 9)
(47, 9)
../datasets/results/r

In [149]:
# Convert the rank files to binary multilabel files (1 = algorithm has the best rank).
generate_multilabel(f'{ROOT_PATH}/runTime_1min_sa_ts', 
                    f'{ROOT_PATH}/runTime_1min_sa_ts_multilabel'
                    )

# Then drop the all-tied instances, detected on the raw rank files.
remove_instances(f'{ROOT_PATH}/runTime_1min_sa_ts_multilabel', 
                 f'{ROOT_PATH}/runTime_1min_sa_ts_multilabel_filtered',
                 f'{ROOT_PATH}/runTime_1min_sa_ts')

../datasets/results/ranks//runTime_1min_sa_ts/rank_2asp_1min_8alg.csv
../datasets/results/ranks//runTime_1min_sa_ts/rank_dafjs_1min_8alg.csv
../datasets/results/ranks//runTime_1min_sa_ts/rank_deep_1min_8alg.csv
../datasets/results/ranks//runTime_1min_sa_ts/rank_dyuthi_1min_8alg.csv
../datasets/results/ranks//runTime_1min_sa_ts/rank_fjssp_1min_8alg.csv
../datasets/results/ranks//runTime_1min_sa_ts/rank_mixed_1min_8alg.csv
../datasets/results/ranks//runTime_1min_sa_ts/rank_wide_1min_8alg.csv
../datasets/results/ranks//runTime_1min_sa_ts/rank_yfjs_1min_8alg.csv
['DAFJS03', 'DAFJS04', 'DAFJS08', 'P1', 'P11', 'P15', 'P17', 'P18', 'P2', 'P24', 'P25', 'P26', 'P27', 'P3', 'P4', 'P5', 'P6', 'P7', 'P9', 'la16.fjs', 'la17.fjs', 'la18.fjs', 'la20.fjs', 'bom_wide_2_10_10_5_No2', 'bom_wide_2_7_20_10_No2', 'bom_wide_2_7_20_5_No2', 'YFJS01', 'YFJS04', 'YFJS08', 'YFJS09', 'YFJS10', 'YFJS14']
../datasets/results/ranks//runTime_1min_sa_ts_multilabel/rank_2asp_1min_8alg.csv
(50, 9)
(50, 9)
../datasets/res

In [86]:
60 antrenare
20 testare
20 validare

96

In [46]:
#remove from instances characteristics
# Root directory of the instance-characteristics (feature) files; only the
# commented-out calls below use it.
ROOT_PATH_FEATURES='../datasets/results/instances_characteristics/februarie2025-v3/'
# Feature files use the lowercase 'problem' column as the instance identifier.
# remove_instances(f'{ROOT_PATH_FEATURES}/heterogeneous', 
#                  f'{ROOT_PATH_FEATURES}/heterogeneous_filtered',
#                  f'{ROOT_PATH}/runTime_1min_sa_ts', instance_id_key='problem')
# remove_instances(f'{ROOT_PATH_FEATURES}/operations', 
#                  f'{ROOT_PATH_FEATURES}/operations_filtered',
#                 f'{ROOT_PATH}/runTime_1min_sa_ts', instance_id_key='problem')
# remove_instances(f'{ROOT_PATH_FEATURES}/statistics', 
#                  f'{ROOT_PATH_FEATURES}/statistics_filtered',
#                  f'{ROOT_PATH}/runTime_1min_sa_ts', instance_id_key='problem')

# remove_instances(f'../datasets/results/ranks/class_alg_unfiltered', 
#                  f'../datasets/results/ranks/class_alg',
#                  f'{ROOT_PATH}/runTime_1min_sa_ts', instance_id_key='Problem')
# NOTE: input and output directory are identical here, so the files are
# overwritten in place with their filtered versions.
# NOTE(review): the output stored under this cell lists class_alg_unfiltered
# paths, so it appears to come from the commented-out call above - re-run to refresh.
remove_instances(f'../datasets/results/forLehmerCode/runTime_1min_sa_ts', 
                 f'../datasets/results/forLehmerCode/runTime_1min_sa_ts',
                 f'{ROOT_PATH}/runTime_1min_sa_ts', instance_id_key='Problem')

['DAFJS03', 'DAFJS04', 'DAFJS08', 'P1', 'P11', 'P15', 'P17', 'P18', 'P2', 'P24', 'P25', 'P26', 'P27', 'P3', 'P4', 'P5', 'P6', 'P7', 'P9', 'la16.fjs', 'la17.fjs', 'la18.fjs', 'la20.fjs', 'bom_wide_2_10_10_5_No2', 'bom_wide_2_7_20_10_No2', 'bom_wide_2_7_20_5_No2', 'YFJS01', 'YFJS04', 'YFJS08', 'YFJS09', 'YFJS10', 'YFJS14']
../datasets/results/ranks/class_alg_unfiltered\class_2asp.csv
(50, 2)
(50, 2)
../datasets/results/ranks/class_alg_unfiltered\class_dafjs.csv
(30, 2)
(27, 2)
../datasets/results/ranks/class_alg_unfiltered\class_deep.csv
(50, 2)
(50, 2)
../datasets/results/ranks/class_alg_unfiltered\class_dyuthi.csv
(28, 2)
(12, 2)
../datasets/results/ranks/class_alg_unfiltered\class_fjssp.csv
(50, 2)
(46, 2)
../datasets/results/ranks/class_alg_unfiltered\class_mixed.csv
(50, 2)
(50, 2)
../datasets/results/ranks/class_alg_unfiltered\class_wide.csv
(50, 2)
(47, 2)
../datasets/results/ranks/class_alg_unfiltered\class_yfjs.csv
(20, 2)
(14, 2)


# Train - Test - Validate split

In [181]:

import  random

def get_train_validate_split(rank_files, train_size = 0.8, seed=None):
    """
    Print a random, per-dataset stratified split of instance indices.

    Instances are numbered consecutively across the datasets in a fixed
    dataset order, counting only the instances on which the algorithms are
    not all equivalent. Within every dataset the indices are shuffled and the
    first ``int(size * train_size)`` go to the train list, the rest to the
    test list. Both lists are shuffled again and printed.

    Parameters
    ----------
    rank_files : str
        Directory with the rank CSV files (passed to ``equivalent_algorithms``).
        It must contain a file for every dataset listed below.
    train_size : float, optional
        Fraction of each dataset assigned to the train list.
    seed : int or None, optional
        When given, a dedicated random generator seeded with this value is
        used, making the split reproducible. With None (default) the global
        ``random`` module state is used.

    Returns
    -------
    None
    """
    # Dataset order used in the features files.
    datasets  = ["deep", "wide", "fjssp", "dyuthi", "2asp", "mixed", "dafjs", "yfjs"]

    # The `random` module and a `random.Random` instance both expose shuffle().
    rng = random if seed is None else random.Random(seed)

    df = pd.DataFrame(equivalent_algorithms(rank_files))
    df['actual_dataset_size'] = df['dataset_size'] -  df['all_algs_eqivalent']

    index = 0
    train = []
    test = []
    for dataset in datasets:
        ds_dim = int(df.loc[df["dataset"] == dataset, "actual_dataset_size"].iloc[0])
        indexes = list(range(index, index + ds_dim))
        rng.shuffle(indexes)
        cut_point = int(ds_dim * train_size)

        index += ds_dim
        train.extend(indexes[:cut_point])
        test.extend(indexes[cut_point:])

    rng.shuffle(test)
    rng.shuffle(train)

    print("test", test)
    print("train", train)
    
    
# Print one random split with the default train_size of 0.8. No random seed is
# set in this cell, so every run produces a different split.
get_train_validate_split(f'{ROOT_PATH}/runTime_1min_sa_ts')


test [99, 161, 95, 175, 45, 166, 224, 50, 249, 102, 107, 284, 177, 121, 136, 241, 34, 260, 119, 199, 40, 24, 264, 169, 280, 51, 152, 110, 290, 4, 6, 55, 77, 167, 217, 124, 278, 56, 144, 91, 170, 74, 159, 232, 27, 41, 245, 164, 214, 230, 246, 263, 101, 18, 292, 92, 147, 115, 277, 15, 254, 62]
train [30, 194, 248, 17, 81, 16, 58, 188, 160, 287, 129, 240, 213, 130, 256, 79, 282, 84, 145, 36, 257, 220, 28, 134, 265, 286, 142, 195, 201, 66, 273, 157, 128, 279, 150, 125, 109, 96, 26, 29, 227, 259, 276, 61, 73, 209, 178, 215, 11, 80, 218, 163, 98, 253, 20, 225, 168, 205, 104, 200, 197, 94, 106, 105, 118, 22, 187, 112, 202, 60, 237, 153, 75, 7, 294, 219, 285, 151, 204, 222, 196, 156, 90, 193, 10, 72, 155, 1, 247, 57, 13, 131, 113, 35, 5, 266, 139, 182, 38, 47, 12, 141, 207, 233, 123, 43, 88, 180, 165, 46, 267, 203, 179, 242, 184, 3, 198, 25, 39, 281, 87, 234, 138, 132, 126, 149, 68, 173, 216, 33, 171, 100, 86, 44, 255, 231, 23, 174, 71, 235, 172, 283, 250, 89, 192, 143, 8, 14, 65, 78, 93, 146,

# Class assign
C1 = instantele pt care e TS-Ei-LM algoritmul cu cea mai buna performanta

C2 =  instantele pt care e SA-Ei-LM algoritmul cu cea mai buna performanta  

...

C8 = instantele pt care e SAL-Si-LM algoritmul cu cea mai buna performanta

In [18]:
def find_column_name(row, value):
    """
    Build a label from the names of all entries of ``row`` equal to ``value``.

    Parameters
    ----------
    row : pandas.Series
        One row of a table; its index holds the column names.
    value : object
        Value to look for.

    Returns
    -------
    str
        The matching column names joined with ' & ' in index order, or an
        empty string when no entry matches.
    """
    matching = [col for col in row.index if row[col] == value]
    return " & ".join(matching)
    
def class_by_alg(input_directory, filter='*'):
    """
    Print the distinct label combinations found in the multilabel files.

    Every row of every file is labelled with the names of the columns whose
    value is 1 (see ``find_column_name``); the number of distinct labels and
    the labels themselves are printed at the end.
    """
    print(input_directory)
    csv_paths = glob.glob(f'{input_directory}/{filter}')
    csv_paths.sort()
    print(csv_paths)

    seen_labels = set()
    for csv_path in csv_paths:
        print(csv_path)
        frame = pd.read_csv(csv_path)

        # Label = names of the columns flagged with 1 in the row.
        frame['Label'] = frame.apply(find_column_name, axis=1, value=1)
        seen_labels.update(frame['Label'])

    print(len(seen_labels), seen_labels)
    
# List the distinct best-algorithm combinations in the multilabel files.
class_by_alg(f'{ROOT_PATH}/runTime_1min_sa_ts_multilabel')                  

../datasets/results/ranks//runTime_1min_sa_ts_multilabel
['../datasets/results/ranks//runTime_1min_sa_ts_multilabel\\rank_2asp_1min_8alg.csv', '../datasets/results/ranks//runTime_1min_sa_ts_multilabel\\rank_dafjs_1min_8alg.csv', '../datasets/results/ranks//runTime_1min_sa_ts_multilabel\\rank_deep_1min_8alg.csv', '../datasets/results/ranks//runTime_1min_sa_ts_multilabel\\rank_dyuthi_1min_8alg.csv', '../datasets/results/ranks//runTime_1min_sa_ts_multilabel\\rank_fjssp_1min_8alg.csv', '../datasets/results/ranks//runTime_1min_sa_ts_multilabel\\rank_mixed_1min_8alg.csv', '../datasets/results/ranks//runTime_1min_sa_ts_multilabel\\rank_wide_1min_8alg.csv', '../datasets/results/ranks//runTime_1min_sa_ts_multilabel\\rank_yfjs_1min_8alg.csv']
../datasets/results/ranks//runTime_1min_sa_ts_multilabel\rank_2asp_1min_8alg.csv
../datasets/results/ranks//runTime_1min_sa_ts_multilabel\rank_dafjs_1min_8alg.csv
../datasets/results/ranks//runTime_1min_sa_ts_multilabel\rank_deep_1min_8alg.csv
../datasets/r

In [42]:
import random
import re
from pathlib import Path

def get_best_alg(row, value):
    """
    Return the name of one entry of ``row`` that equals ``value``.

    When several entries match, one of them is picked at random with
    ``random.randint`` (global random state).
    """
    candidates = [col for col in row.index if row[col] == value]
    # Several algorithms may tie: choose one of them at random.
    chosen = random.randint(0, len(candidates) - 1)
    return candidates[chosen]
    
def class_by_algorithm(input_directory, output_directory, filter='*'):
    """
    Turn multilabel files into single-label (multiclass) files.

    For every file a 'BestAlg' column is derived with ``get_best_alg`` (the
    name of a column whose value is 1; ties are broken at random, so results
    vary between runs unless the ``random`` module is seeded beforehand).
    'Problem' and 'BestAlg' are written to
    ``<output_directory>/class_<dataset>.csv``, where <dataset> is taken from
    a file stem of the form 'rank_<dataset>_...'. The frame of the last
    processed file is printed at the end.

    Parameters
    ----------
    input_directory : str
        Directory with the multilabel CSV files.
    output_directory : str
        Directory that receives the class files. It is created, including
        missing parents, when absent.
    filter : str, optional
        Glob pattern selecting the files inside ``input_directory``.

    Raises
    ------
    IndexError
        If a file stem does not match 'rank_<dataset>_...'.
    """
    print(input_directory)
    multiclass_files = sorted(glob.glob(f'{input_directory}/{filter}'))
    print(multiclass_files)

    Path(output_directory).mkdir(parents=True, exist_ok=True)

    # Compiled once: the pattern does not depend on the file.
    pattern = re.compile( r'^rank_(\w+?)_')

    data = None
    for rank_file in multiclass_files:
        print(rank_file)
        data = pd.read_csv(rank_file)

        data['BestAlg'] = data.apply(get_best_alg, axis=1, value=1)

        matches = pattern.findall(Path(rank_file).stem)
        print(matches)

        dataset_name = matches[0]

        data[['Problem','BestAlg']].to_csv(f'{output_directory}/class_{dataset_name}.csv', index=False)

    # Nothing to show when no file matched the glob pattern.
    if data is not None:
        print(data)
    
# Write one class_<dataset>.csv per multilabel file into the class_alg folder.
class_by_algorithm(f'{ROOT_PATH}/runTime_1min_sa_ts_multilabel', f'{ROOT_PATH}/class_alg')                  

../datasets/results/ranks//runTime_1min_sa_ts_multilabel
['../datasets/results/ranks//runTime_1min_sa_ts_multilabel\\rank_2asp_1min_8alg.csv', '../datasets/results/ranks//runTime_1min_sa_ts_multilabel\\rank_dafjs_1min_8alg.csv', '../datasets/results/ranks//runTime_1min_sa_ts_multilabel\\rank_deep_1min_8alg.csv', '../datasets/results/ranks//runTime_1min_sa_ts_multilabel\\rank_dyuthi_1min_8alg.csv', '../datasets/results/ranks//runTime_1min_sa_ts_multilabel\\rank_fjssp_1min_8alg.csv', '../datasets/results/ranks//runTime_1min_sa_ts_multilabel\\rank_mixed_1min_8alg.csv', '../datasets/results/ranks//runTime_1min_sa_ts_multilabel\\rank_wide_1min_8alg.csv', '../datasets/results/ranks//runTime_1min_sa_ts_multilabel\\rank_yfjs_1min_8alg.csv']
../datasets/results/ranks//runTime_1min_sa_ts_multilabel\rank_2asp_1min_8alg.csv
['2asp']
../datasets/results/ranks//runTime_1min_sa_ts_multilabel\rank_dafjs_1min_8alg.csv
['dafjs']
../datasets/results/ranks//runTime_1min_sa_ts_multilabel\rank_deep_1min_8al

In [86]:
from sklearn.datasets import make_classification
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import shuffle
import numpy as np
# Toy multi-output classification example on synthetic data:
# 10 samples, 100 features, one 3-class target.
X, y1 = make_classification(n_samples=10, n_features=100,
                            n_informative=30, n_classes=3,
                            random_state=1)
# Two more targets obtained by shuffling the first one.
y2 = shuffle(y1, random_state=1)
y3 = shuffle(y1, random_state=2)
# Stack the targets as columns: one row per sample, one column per output.
Y = np.vstack((y1, y2, y3)).T
print(y1,y2,y3, Y)
# NOTE(review): the four names below are not used afterwards in the visible code.
n_samples, n_features = X.shape # 10,100
n_outputs = Y.shape[1] # 3
n_classes = 3
forest = RandomForestClassifier(random_state=1)
# Wrap the forest in MultiOutputClassifier so it is fitted on the 3-column Y.
multi_target_forest = MultiOutputClassifier(forest, n_jobs=2)
# Fit and predict on the same data (no held-out set in this demo).
multi_target_forest.fit(X, Y).predict(X)

[2 1 2 0 0 0 1 1 0 2] [2 2 1 0 2 0 1 1 0 0] [0 1 0 2 1 2 0 1 2 0] [[2 2 0]
 [1 2 1]
 [2 1 0]
 [0 0 2]
 [0 2 1]
 [0 0 2]
 [1 1 0]
 [1 1 1]
 [0 0 2]
 [2 0 0]]


array([[2, 2, 0],
       [1, 2, 1],
       [2, 1, 0],
       [0, 0, 2],
       [0, 2, 1],
       [0, 0, 2],
       [1, 1, 0],
       [1, 1, 1],
       [0, 0, 2],
       [2, 0, 0]])