In [None]:
import pandas as pd
import numpy as np
import importlib

import rank_eval_pipeline as rep
import rank_algos
import eval_algos

%load_ext autoreload
%autoreload 2

In [None]:
# Only the first 100 rows are used in this notebook, so let the parser stop
# there instead of loading the whole file and discarding the rest.
# NOTE(review): column dtypes are now inferred from these 100 rows only; if a
# column changes type further down the file, confirm the dtypes still match.
data = pd.read_csv('data/full_data.csv', nrows=100)

In [13]:
# Ranking algorithm and evaluation metric handed to the pipeline object.
rank_method, eval_method = rank_algos.mutual_info_score, eval_algos.fuzzy_jaccard

# Pipeline instance reused (and mutated) by the evaluation cells below.
rank_eval = rep.RankEval(
    data,
    rank_method,
    eval_method,
)

### Mutual info results (on all data)

In [None]:
# Mutual-information scores, hard-coded here (per the original note, obtained on the
# full data set) so the ranking step does not have to be re-run.
# 100 values; a later cell labels them with feature ids 0..99 in this order.
results = np.array([0.09717625, 0.06888329, 0.06126243, 0.06269187, 0.17329753, 0.14716816
        , 0.19778081, 0.07532905, 0.0766885, 0.06631544, 0.06275608, 0.04218489
        , 0.15705678, 0.13656907, 0.19607233, 0.20788328, 0.23036554, 0.10723845
        , 0.05922656, 0.06632573, 0.07530304, 0.0628231, 0.0557336, 0.08918849
        , 0.20410476, 0.13251569, 0.05500895, 0.18283456, 0.02394768, 0.03650781
        , 0.07144077, 0.15222733, 0.18366797, 0.06169769, 0.00195838, 0.18430796
        , 0.14075448, 0.1427426, 0.12788617, 0.17901007, 0.17435444, 0.1229047
        , 0.13191568, 0.09527463, 0.0694764, 0.14802611, 0.12330489, 0.12995531
        , 0.16931972, 0.08140305, 0.11965908, 0.11227323, 0.10220764, 0.17162563
        , 0.14953368, 0.06142728, 0.1352397, 0.06594784, 0.06246859, 0.01162358
        , 0.06505164, 0.11061847, 0.15131267, 0.09317203, 0.10165634, 0.15379381
        , 0.16504244, 0.11004817, 0.10467595, 0.04368644, 0.24423437, 0.03381416
        , 0.21592521, 0.17266281, 0.22476187, 0.21147982, 0.25214441, 0.1834268
        , 0.20025225, 0.21258738, 0.2518063, 0.04038459, 0.07872028, 0.05843621
        , 0.05958591, 0.08510234, 0.06077662, 0.07338735, 0.06669493, 0.06683989
        , 0.19029739, 0.18159669, 0.17475133, 0.18460458, 0.21162839, 0.22964289
        , 0.20058981, 0.20365378, 0.53797382, 0.70876002])

In [14]:
gds = eval_algos.get_ground_truths()

# "rig" column of the first ([0]) and second ([1]) ground-truth tables as float arrays
gd_singles = gds[0]["rig"].to_numpy().astype(float)
gd_first_gen = gds[1]["rig"].to_numpy().astype(float)

# offset each array by the absolute value of its own minimum
gd_singles += abs(gd_singles.min())
gd_first_gen += abs(gd_first_gen.min())

In [15]:
# NOTE(review): rank_eval was instantiated before this reload and %autoreload 2 is
# already active, so this explicit reload looks redundant — confirm before removing.
importlib.reload(rep)

# Inject the hard-coded scores and the shifted ground truths into the pipeline
# object instead of letting it compute them.
rank_eval.df_scores = results
rank_eval.ground_truth_singles = gd_singles
rank_eval.ground_truth_first_gen = gd_first_gen

# eval_only=True presumably skips the ranking step and only scores df_scores — TODO confirm
rank_eval.evaluate(eval_only=True)

(0.8635530367032218, 0.7392145047450169)

In [16]:
importlib.reload(eval_algos)

gds = eval_algos.get_ground_truths_ordered()

# keep only the first ten rows of each ordered ground-truth table
gd_singles, gd_first_gen = gds[0][:10], gds[1][:10]

# wrap the scores in a frame and label every row with its positional feature id
results = pd.DataFrame(results, columns=["score"]).assign(
    feature=lambda frame: range(len(frame))
)

# full frame, both ground-truth slices, then the first ten score rows
print(results, gd_singles, gd_first_gen, results[:10], sep="\n")

       score  feature
0   0.097176        0
1   0.068883        1
2   0.061262        2
3   0.062692        3
4   0.173298        4
..       ...      ...
95  0.229643       95
96  0.200590       96
97  0.203654       97
98  0.537974       98
99  0.708760       99

[100 rows x 2 columns]
   change       rig
0      98  0.940000
1      99  0.940000
0      23  0.763097
1      12  0.763097
2      10  0.759176
3      21  0.754557
4      16  0.752791
5      19  0.751879
6      15  0.751157
7      49  0.750405
   change                  rig
0      99   0.9985502828799538
1      98   0.9985502828799538
2      23  0.23865014720115485
3      22   0.1873707610194254
4      86  0.18647041667281428
5      85   0.1864668119884656
6      88  0.18645454168679054
7      89  0.18645454168679054
8      87  0.18645454168679054
9      84   0.1861464287539384
      score  feature
0  0.097176        0
1  0.068883        1
2  0.061262        2
3  0.062692        3
4  0.173298        4
5  0.147168        5
6  0

In [17]:
# "rig" values of the ground-truth slices as float arrays; this expects gd_singles and
# gd_first_gen to still be the 10-row frames created in the previous cell.
a_singles = gd_singles["rig"].to_numpy(float)
a_first_gen = gd_first_gen["rig"].to_numpy(float)

# NOTE(review): `results` is still in feature order (it is never sorted by score), so
# results[:10]["feature"] is simply 0..9 rather than the ten best-scoring features.
# Both arrays below are identical, and they hold feature ids that are then compared
# against "rig" values — confirm this is the comparison that was intended.
b_singles = results[:10]["feature"].to_numpy(float)
b_first_gen = results[:10]["feature"].to_numpy(float)

# only element [1] of whatever fuzzy_jaccard returns is reported
print(eval_algos.fuzzy_jaccard(a_singles, b_singles)[1])
print(eval_algos.fuzzy_jaccard(a_first_gen, b_first_gen)[1])

0.7004396806244233
0.6393842363753732


### ReliefF results

In [41]:
def results_transform(results: dict):
    """Turn a ``{feature_id: score}`` dict into a non-negative, index-sorted frame.

    Args:
        results: mapping from feature id to a numeric score.

    Returns:
        A float DataFrame with one row per key, sorted by key. When a column
        contains negative values, every value in it is shifted up by the absolute
        value of the column minimum, so the smallest value becomes 0. Columns that
        are already non-negative are returned unchanged.
    """
    results_pd = (
        pd.DataFrame.from_dict(results, orient='index')  # one row per dict key
        .sort_index()
        .astype(float)
    )

    # Shift only when needed: the offset is |min| for a negative minimum and 0
    # otherwise. The previous unconditional `+ abs(min)` also moved data that was
    # already non-negative up by its minimum.
    offset = results_pd.min().clip(upper=0).abs()

    return results_pd + offset

def prepare_gd():
    """Load both ground-truth tables and return their offset "rig" arrays.

    Returns:
        Tuple ``(gd_singles, gd_first_gen)`` of float arrays taken from the "rig"
        column of the first and second items of ``eval_algos.get_ground_truths()``,
        each increased by the absolute value of its own minimum.
    """
    ground_truths = eval_algos.get_ground_truths()

    shifted = []
    for position in (0, 1):
        rig = ground_truths[position]["rig"].to_numpy().astype(float)
        shifted.append(rig + abs(rig.min()))

    return shifted[0], shifted[1]

def evaluate_results(results_pd, gd_singles, gd_first_gen, evaluator=None):
    """Score one set of feature rankings against both ground truths.

    Args:
        results_pd: DataFrame whose column ``0`` holds the per-feature scores.
        gd_singles: ground truth assigned to ``ground_truth_singles``.
        gd_first_gen: ground truth assigned to ``ground_truth_first_gen``.
        evaluator: object that receives the three attributes and exposes
            ``evaluate(eval_only=...)``. Defaults to the notebook-level
            ``rank_eval`` so existing calls behave as before; passing it
            explicitly avoids relying on hidden kernel state.

    Returns:
        Whatever ``evaluator.evaluate(eval_only=True)`` returns.
    """
    if evaluator is None:
        # fall back to the RankEval instance created earlier in the notebook
        evaluator = rank_eval

    # the evaluator is mutated in place: scores as a plain float array plus both ground truths
    evaluator.df_scores = results_pd[0].to_numpy().astype(float)
    evaluator.ground_truth_singles = gd_singles
    evaluator.ground_truth_first_gen = gd_first_gen

    return evaluator.evaluate(eval_only=True)

In [None]:
# Hard-coded ReliefF scores: {feature id: score}. Keys are listed unsorted and some
# values are negative; results_transform sorts by key and shifts the values.
# "promil" presumably means the run used a per-mille sample of the data — TODO confirm.
relieff_promil = {99: 157200.0,
 98: 157200.0,
 6: 98138.0,
 74: 98138.0,
 75: 98138.0,
 76: 98138.0,
 77: 98138.0,
 78: 98138.0,
 79: 77578.0,
 80: 78786.0,
 70: -88722.0,
 73: -88722.0,
 92: -88722.0,
 93: -88722.0,
 94: -88722.0,
 95: -88726.0,
 96: -88718.0,
 97: -88636.0,
 72: -88146.0,
 16: 98138.0,
 34: 98138.0,
 35: 98138.0,
 14: 98138.0,
 48: 98138.0,
 27: 98138.0,
 90: 98138.0,
 32: 98138.0,
 15: 96842.0,
 91: -25630.0,
 24: 98138.0,
 4: 10452.0,
 39: 7998.0,
 25: -14532.0,
 42: 36396.0,
 40: 42478.0,
 65: -22012.0,
 62: -15334.0,
 53: 40912.0,
 37: 18610.0,
 38: -82762.0,
 36: 5904.0,
 5: -86480.0,
 66: -94110.0,
 28: 14004.0,
 12: -93936.0,
 45: 20234.0,
 13: 40816.0,
 54: 18148.0,
 46: 9612.0,
 61: -484.0,
 52: -52750.0,
 31: 82902.0,
 47: 16064.0,
 56: 18654.0,
 41: 29386.0,
 69: -72916.0,
 51: 3936.0,
 68: 56116.0,
 59: 10962.0,
 43: 55094.0,
 50: 56562.0,
 11: 40414.0,
 67: 40748.0,
 63: 38318.0,
 64: 84274.0,
 71: 87800.0,
 30: -98130.0,
 17: 78704.0,
 29: 17346.0,
 49: -35198.0,
 18: -45148.0,
 26: 30502.0,
 0: 82826.0,
 7: -63702.0,
 44: 56232.0,
 20: 66908.0,
 8: -85698.0,
 9: -93780.0,
 19: -92534.0,
 60: -74840.0,
 23: -81542.0,
 58: -56948.0,
 81: -41030.0,
 82: 93800.0,
 83: 78222.0,
 85: 82956.0,
 86: 26508.0,
 87: 30282.0,
 88: -14068.0,
 89: -93778.0,
 84: -80580.0,
 1: -80578.0,
 21: -70330.0,
 10: 98138.0,
 22: 38122.0,
 55: 56834.0,
 57: -94806.0,
 3: -96098.0,
 2: -92044.0,
 33: -64788.0}

# Hard-coded ReliefF scores: {feature id: score}. Keys are listed unsorted and some
# values are negative; results_transform sorts by key and shifts the values.
# "procent" presumably means the run used a one-percent sample of the data — TODO confirm.
relieff_procent = {99: 1572800.0,
 98: 1572800.0,
 75: 1097214.0,
 77: 1097214.0,
 78: 1097214.0,
 79: 1097214.0,
 80: 1097214.0,
 74: 1097214.0,
 73: 912950.0,
 70: 940584.0,
 76: -990000.0,
 92: -990000.0,
 93: -990000.0,
 94: -989994.0,
 95: -990000.0,
 96: -989960.0,
 97: -989932.0,
 6: -989538.0,
 72: -982026.0,
 16: 1097214.0,
 34: 1097214.0,
 14: 1097214.0,
 90: 1097214.0,
 32: 1097214.0,
 35: 1097214.0,
 91: 1097214.0,
 48: 1097214.0,
 27: 1077812.0,
 15: -120804.0,
 24: 1097214.0,
 4: 210694.0,
 39: 241728.0,
 42: -85136.0,
 40: 438014.0,
 25: 545664.0,
 62: -127670.0,
 65: -33552.0,
 45: 584622.0,
 5: 444442.0,
 53: -862940.0,
 28: 89444.0,
 13: -945620.0,
 31: -1048056.0,
 61: 395596.0,
 37: -993510.0,
 66: 299750.0,
 38: 498046.0,
 36: 255932.0,
 12: 165972.0,
 56: -4156.0,
 54: -562128.0,
 52: 906466.0,
 46: 206042.0,
 68: 249064.0,
 69: 516584.0,
 47: -572368.0,
 51: 165628.0,
 43: 686630.0,
 41: 156432.0,
 59: 661390.0,
 50: 695066.0,
 63: 433542.0,
 11: 441630.0,
 67: 408366.0,
 71: 935474.0,
 64: 985724.0,
 30: -1096884.0,
 17: 938548.0,
 29: 456018.0,
 26: -233278.0,
 18: -328524.0,
 49: 497980.0,
 44: 902202.0,
 0: -456852.0,
 7: 622916.0,
 8: 792492.0,
 9: -928200.0,
 20: -1039468.0,
 19: -942056.0,
 60: -749604.0,
 1: -792516.0,
 23: -557236.0,
 10: -314724.0,
 21: 1042880.0,
 58: 875246.0,
 3: 971768.0,
 81: 491172.0,
 82: 407722.0,
 83: -72348.0,
 84: -935652.0,
 86: -730650.0,
 85: -729920.0,
 87: -663996.0,
 88: 1095702.0,
 89: 504550.0,
 55: 732768.0,
 2: -975644.0,
 22: -1009346.0,
 57: -890614.0,
 33: -592118.0}

# Hard-coded ReliefF scores: {feature id: score}. Keys are listed unsorted and some
# values are negative; results_transform sorts by key and shifts the values.
# "10procentov" presumably means the run used a ten-percent sample of the data — TODO confirm.
relieff_10procentov = {99: 15728700.0,
 98: 15728700.0,
 70: 12141824.0,
 97: 12141824.0,
 96: 12141824.0,
 95: 12141824.0,
 94: 12141824.0,
 93: 12141824.0,
 92: 10619714.0,
 75: 10998412.0,
 76: -10736482.0,
 77: -10736482.0,
 78: -10736482.0,
 79: -10736450.0,
 80: -10736482.0,
 74: -10735960.0,
 73: -10735508.0,
 6: -10729812.0,
 72: -10646330.0,
 16: 12140002.0,
 14: 12140002.0,
 90: 12140002.0,
 32: 12140002.0,
 34: 12140002.0,
 91: 12140002.0,
 48: 12140002.0,
 35: 12140002.0,
 27: 11937076.0,
 15: 294892.0,
 24: 12141824.0,
 4: 3511436.0,
 39: 4447360.0,
 42: -385500.0,
 25: 5024478.0,
 31: 6476978.0,
 62: -749820.0,
 40: 409780.0,
 45: 7294686.0,
 5: 6092582.0,
 13: -9142888.0,
 65: 1344154.0,
 56: -10098828.0,
 28: -11472552.0,
 61: 6377216.0,
 53: -10105972.0,
 12: 4312314.0,
 37: 5435600.0,
 66: 3541550.0,
 38: 2856160.0,
 36: 187390.0,
 68: -5752988.0,
 54: 10371082.0,
 43: 2326250.0,
 52: 3004622.0,
 69: 7096854.0,
 46: -2314792.0,
 51: 3789598.0,
 47: 7698936.0,
 41: 1915902.0,
 59: 7234264.0,
 63: 7834936.0,
 71: 4934584.0,
 50: 5056292.0,
 11: 4469552.0,
 67: 10337318.0,
 64: 10849062.0,
 17: -12131116.0,
 26: 10970072.0,
 30: 7390012.0,
 44: -1683034.0,
 29: -2719398.0,
 0: 6222192.0,
 18: 10298882.0,
 7: -1228896.0,
 8: 7558094.0,
 9: 8911430.0,
 49: -9883166.0,
 20: -11361618.0,
 19: -8282190.0,
 1: -6214150.0,
 10: -6573784.0,
 21: -4255182.0,
 3: -1095396.0,
 2: 11384942.0,
 60: 9499810.0,
 23: 11112332.0,
 58: 6534996.0,
 55: 5188838.0,
 81: -74320.0,
 82: -7462824.0,
 83: -4925592.0,
 84: -4918006.0,
 86: -4482190.0,
 87: 12115218.0,
 88: 6813108.0,
 89: 8871874.0,
 85: -8396194.0,
 22: -9119718.0,
 57: -6747704.0,
 33: -3857856.0}

In [43]:
# Run every raw ReliefF dict through the same transform.
relieff_promil_pd, relieff_procent_pd, relieff_10procentov_pd = map(
    results_transform,
    (relieff_promil, relieff_procent, relieff_10procentov),
)

# display one transformed frame as the cell output
relieff_promil_pd

Unnamed: 0,0
0,180956.0
1,17552.0
2,6086.0
3,2032.0
4,108582.0
...,...
95,9404.0
96,9412.0
97,9494.0
98,255330.0


In [39]:
# Rebuild the shifted ground-truth arrays: an earlier cell rebinds gd_singles and
# gd_first_gen to 10-row slices of the ordered ground truths.
gd_singles, gd_first_gen = prepare_gd()

In [44]:
# Evaluate the three transformed ReliefF frames, in order, against the same ground truths.
(
    relieff_promil_evaluated,
    relieff_procent_evaluated,
    relieff_10procentov_evaluated,
) = (
    evaluate_results(frame, gd_singles, gd_first_gen)
    for frame in (relieff_promil_pd, relieff_procent_pd, relieff_10procentov_pd)
)

# one line per run
print(
    f"relieff_promil_evaluated: {relieff_promil_evaluated}",
    f"relieff_procent_evaluated: {relieff_procent_evaluated}",
    f"relieff_10procentov_evaluated: {relieff_10procentov_evaluated}",
    sep="\n",
)

relieff_promil_evaluated: (0.8224700130720496, 0.744032342866586)
relieff_procent_evaluated: (0.8370984227548818, 0.733525658248738)
relieff_10procentov_evaluated: (0.8514649310560692, 0.7461393376478086)
