# Database Information

In [2]:
import pandas as pd
import numpy as np


# Names of the CSV files (without extension) expected under ../databases/.
databases = [
    'depressed',
    'ionosphere',
    'iris',
    'lung_cancer',
    'mushroom',
    'parkinsons',
    'pima',
    'titanic',
    'toms',
    'transfusion',
    'twitter',
    'wdbc',
]

sizes = []

for database in databases:
    Xy = pd.read_csv(f'../databases/{database}.csv')

    # Separate the 'Class' label column from the remaining (feature) columns.
    y = Xy['Class']
    X = Xy.drop(columns=['Class'])
    n_samples, n_features = X.shape

    sizes.append({
        'database': database,
        'Features': n_features,
        'Samples': n_samples,
        # Count the rows labelled 0 and the rows labelled 1.
        'E^-': int((y == 0).sum()),
        'E^+': int((y == 1).sum()),
    })

# Report the databases from the fewest samples to the most.
database_sorted = sorted(sizes, key=lambda entry: entry['Samples'])

for entry in database_sorted:
    print(f"Database: {entry['database']}")
    for field in ('Features', 'Samples', 'E^-', 'E^+'):
        print(f"{field}: {entry[field]}")
    print('-------------------')

Database: lung_cancer
Features: 6
Samples: 59
E^-: 31
E^+: 28
-------------------
Database: iris
Features: 4
Samples: 150
E^-: 100
E^+: 50
-------------------
Database: parkinsons
Features: 22
Samples: 195
E^-: 48
E^+: 147
-------------------
Database: ionosphere
Features: 33
Samples: 351
E^-: 126
E^+: 225
-------------------
Database: wdbc
Features: 30
Samples: 569
E^-: 357
E^+: 212
-------------------
Database: transfusion
Features: 4
Samples: 748
E^-: 570
E^+: 178
-------------------
Database: pima
Features: 8
Samples: 768
E^-: 500
E^+: 268
-------------------
Database: titanic
Features: 6
Samples: 1309
E^-: 809
E^+: 500
-------------------
Database: depressed
Features: 22
Samples: 1429
E^-: 1191
E^+: 238
-------------------
Database: mushroom
Features: 22
Samples: 8124
E^-: 3916
E^+: 4208
-------------------
Database: toms
Features: 96
Samples: 28179
E^-: 19104
E^+: 9075
-------------------
Database: twitter
Features: 77
Samples: 49999
E^-: 34612
E^+: 15387
-------------------


# Script to generate a table to compare the models in all databases

In [5]:
import pandas as pd


databases = [
    'lung_cancer',
    'iris',
    'parkinsons',
    'ionosphere',
    'wdbc',
    'transfusion',
    'pima',
    'titanic',
    'depressed',
    'mushroom',
    'toms',
    'twitter'
]
models = ['imli', 'ikkrr', 'iminds3']
results_path = './tests/imli_vs_ikkrr_vs_iminds3_results/imli_vs_ikkrr_vs_iminds3_results/'
create_table_path = './tests/imli_vs_ikkrr_vs_iminds3_results/imli_vs_ikkrr_vs_iminds3_tables/'
config_combination_number = 12
realization_number = 10

table_columns = ['Databases', 'Models', 'Number of rules', '|R|', 'Largest rule size', 'Accuracy', 'Training time']
# Every column after 'Databases' and 'Models' is a metric summarised as mean ± std.
metric_columns = table_columns[2:]

# Model whose best configuration (highest averaged accuracy) is used for every model.
best_config_model = 'iminds3'


def _mean_std_cells(df_results, first_row):
    """Format each metric as 'mean ± std' over one configuration's realizations.

    Parameters
    ----------
    df_results : pd.DataFrame
        Results frame containing every column listed in ``metric_columns``.
    first_row : int
        Position of the first realization row of the configuration; the
        ``realization_number`` rows starting there are summarised.

    Returns
    -------
    list of str
        One formatted cell per metric, in ``metric_columns`` order.
    """
    realizations = df_results[metric_columns].iloc[first_row:first_row + realization_number]

    cells = []
    for column in metric_columns:
        # 'Training time' is reported with four decimals, every other metric with two.
        decimals = 4 if column == 'Training time' else 2
        column_mean = realizations[column].mean()
        column_std = realizations[column].std()
        cells.append(f'{column_mean:,.{decimals}f} ± {column_std:,.{decimals}f}')
    return cells


# Each configuration spans `realization_number` result rows followed by one
# row holding the averages, hence the stride of realization_number + 1.
rows_per_config = realization_number + 1

# Rows are collected in a plain list and turned into a DataFrame once, instead
# of concatenating one-row frames in the loop (which is quadratic and leaves
# every row with index label 0).
table_rows = []

for database in databases:
    df_results = pd.read_csv(f'{results_path}{database}_{best_config_model}.csv')

    best_averages_line_index = 0
    best_average_accuracy = 0
    best_found = False

    for averages_line_index in range(0, config_combination_number * rows_per_config, rows_per_config):
        average_accuracy = df_results['Accuracy'].iloc[averages_line_index + realization_number]
        # Strict '>' keeps the earliest configuration when accuracies tie.
        if average_accuracy > best_average_accuracy:
            best_averages_line_index = averages_line_index
            best_average_accuracy = average_accuracy
            best_found = True

    # No row is emitted for the reference model when no configuration has an
    # averaged accuracy above zero.
    if best_found:
        table_rows.append(
            [database, best_config_model] + _mean_std_cells(df_results, best_averages_line_index)
        )

    for model in models:
        if model == best_config_model:
            continue

        df_results = pd.read_csv(f'{results_path}{database}_{model}.csv')

        # The other models are summarised at the same row offset as the
        # reference model's best configuration.
        table_rows.append(
            ['', model] + _mean_std_cells(df_results, best_averages_line_index)
        )

table = pd.DataFrame(table_rows, columns=table_columns)

table.to_csv(create_table_path+f'best_config_{best_config_model}.csv', index=False)

table

Unnamed: 0,Databases,Models,Number of rules,|R|,Largest rule size,Accuracy,Training time
0,lung_cancer,iminds3,1.00 ± 0.00,2.60 ± 1.35,2.60 ± 1.35,0.94 ± 0.06,0.0091 ± 0.0014
0,,imli,1.00 ± 0.00,3.00 ± 1.83,3.00 ± 1.83,0.89 ± 0.10,0.0075 ± 0.0010
0,,ikkrr,1.00 ± 0.00,2.80 ± 1.87,2.80 ± 1.87,0.88 ± 0.10,0.0073 ± 0.0010
0,iris,iminds3,1.00 ± 0.00,4.20 ± 0.92,4.20 ± 0.92,0.91 ± 0.04,0.0026 ± 0.0003
0,,imli,1.00 ± 0.00,4.30 ± 0.67,4.30 ± 0.67,0.90 ± 0.08,0.0021 ± 0.0004
0,,ikkrr,1.00 ± 0.00,4.40 ± 0.70,4.40 ± 0.70,0.91 ± 0.08,0.0022 ± 0.0004
0,parkinsons,iminds3,3.00 ± 0.00,8.50 ± 1.18,3.50 ± 0.71,0.81 ± 0.08,0.0260 ± 0.0068
0,,imli,3.00 ± 0.00,8.30 ± 2.21,3.80 ± 1.40,0.82 ± 0.06,0.0172 ± 0.0035
0,,ikkrr,3.00 ± 0.00,7.80 ± 1.40,3.50 ± 0.53,0.82 ± 0.07,0.0166 ± 0.0031
0,ionosphere,iminds3,1.00 ± 0.00,3.10 ± 0.88,3.10 ± 0.88,0.83 ± 0.09,0.0213 ± 0.0029


In [173]:
database = 'depressed'
model = 'imli'
start_line = 101

# Metric columns to summarise, in display order.
columns = ['Number of rules', '|R|', 'Largest rule size', 'Accuracy', 'Training time']

all_results = pd.read_csv(f'./tests/imlib_vs_imli_results_means_fixed/{database}_{model}.csv')

# NOTE(review): start_line looks like a 1-based line number of the CSV file
# (header included), hence the -2 to get a 0-based row position; confirm.
first_row = start_line - 2
especific_results = all_results[columns].iloc[first_row:first_row + 10]

averages_std = []
for column in columns:
    values = especific_results[column]
    # 'Training time' is shown with four decimals, every other metric with two.
    if column == 'Training time':
        cell = f'{values.mean():,.4f}' + ' ± ' + f'{values.std():,.4f}'
    else:
        cell = f'{values.mean():,.2f}' + ' ± ' + f'{values.std():,.2f}'
    averages_std.append(cell)

pd.DataFrame([averages_std], columns=columns)

Unnamed: 0,Number of rules,|R|,Largest rule size,Accuracy,Training time
0,1.90 ± 0.32,9.70 ± 2.58,6.20 ± 1.62,0.71 ± 0.11,0.1892 ± 0.0152


# Test Mean and Script to fix Means

In [169]:
# Accuracy values (presumably one per realization) used to hand-check a mean.
acc = [
    0.7447552447552448,
    0.7587412587412588,
    0.7902097902097902,
    0.7657342657342657,
    0.7622377622377622,
    0.8181818181818182,
    0.6573426573426573,
    0.7762237762237763,
    0.7972027972027972,
    0.8076923076923077
]
# Divide by the actual number of values rather than a hard-coded 10, so the
# check stays correct if the list is edited.
mean_acc = sum(acc) / len(acc)
mean_acc

0.7678321678321678

In [165]:
databases = [
    'depressed',
    'ionosphere',
    'iris',
    'lung_cancer',
    'pima',
    'transfusion',
    'wdbc',
    'titanic',
    'mushroom',
    'parkinsons'
]
models = ['imli', 'imlib']

# Row layout assumed for each results file: n_configs groups, each made of
# n_realizations result rows followed by one averages row.
n_configs = 12
n_realizations = 10
rows_per_config = n_realizations + 1

# Metric columns (as named in the input files) whose means are recomputed.
columns = ['Rule set size', 'Sum rules size', 'Larger rule size', 'Accuracy', 'Training time']

# Input column names mapped to the names used in the corrected files.
renamed_columns = {
    'Rule set size': 'Number of rules',
    'Sum rules size': '|R|',
    'Larger rule size': 'Largest rule size',
}

for database in databases:
    for model in models:
        all_results = pd.read_csv(f'./tests/imlib_vs_imli_results/{database}_{model}.csv')

        for first_row in range(0, n_configs * rows_per_config, rows_per_config):
            especific_results = all_results[columns].iloc[first_row:first_row + n_realizations]

            # The row is 'Averages', an empty cell, then one mean per metric;
            # the two leading cells fill the two columns that precede the
            # metrics (the frame must therefore have exactly seven columns).
            averages = ['Averages', ''] + [especific_results[column].mean() for column in columns]

            # Overwrite (or create) the row right after the group's realizations.
            all_results.loc[first_row + n_realizations, :] = averages

        all_results = all_results.rename(columns=renamed_columns)
        all_results.to_csv(f'./tests/imlib_vs_imli_results_means_fixed/{database}_{model}.csv', index=False)