In [1]:
import numpy as np
import pandas as pd
from rank_eval_pipeline import RankEval
import rank_algos
from collections import defaultdict
from operator import itemgetter
from helper_functions import get_true_baseline, area_under_the_curve

import time
import timeit

In [2]:
def average_ranking(rankings):
    """
    Combine several feature rankings into one consensus ranking (Borda count).

    Within each ranking a feature earns ``len(ranking) - position`` points, so
    the first entry gets the most points and the last entry gets one point.
    Points are summed over all rankings; they are not divided by the number
    of rankings.

    Returns: a list of features ordered from highest to lowest total points,
                a list of the matching point totals in the same order
    """
    borda_totals = defaultdict(int)
    for ranked_features in rankings:
        n_features = len(ranked_features)
        for position, name in enumerate(ranked_features):
            borda_totals[name] += n_features - position

    # Stable descending sort: features with equal totals keep the order in
    # which they were first encountered.
    ordered = list(borda_totals.items())
    ordered.sort(key=lambda pair: pair[1], reverse=True)

    consensus_features = [name for name, _ in ordered]
    consensus_totals = [total for _, total in ordered]
    return consensus_features, consensus_totals

In [3]:
def evaluate_ranking(features, scores):
    """
    Evaluate a feature ranking and return a single baseline-normalised value.

    The ranking and scores are assigned onto a RankEval instance built with
    empty-string placeholder arguments, and its evaluate_ranking() is run.

    Returns: the mean of (eval_res_first_gen[0] - baseline) divided by
                (1 - area_under_the_curve(baseline)), where baseline comes
                from get_true_baseline()
    """
    # No data or scoring function is supplied here: the ranking is injected
    # directly instead of being computed by RankEval.
    evaluator = RankEval("","")
    evaluator.ranking = features
    evaluator.scores = scores
    evaluator.evaluate_ranking()

    baseline = get_true_baseline()

    # Numerator: mean difference between the first-generation result and the baseline.
    mean_gain = np.mean(evaluator.eval_res_first_gen[0] - baseline)
    # Denominator: presumably the headroom left above the baseline's AUC —
    # TODO confirm against helper_functions.
    headroom = 1 - area_under_the_curve(baseline)

    return mean_gain/headroom

In [4]:
def main_regular(subsampling):
    """
    Compute one random-forest feature ranking on the full data set and evaluate it.

    Parameters:
        subsampling: forwarded to RankEval as ``subsampling_proportion``.

    Returns: a tuple of (value returned by evaluate_ranking, elapsed seconds).
                The elapsed time covers reading the CSV, constructing the
                RankEval object and computing the scores; the evaluation
                itself happens after the timer is stopped.
    """
    # perf_counter is monotonic, so the measured interval cannot be distorted
    # by system clock adjustments.
    start_time = time.perf_counter()
    # load data
    data = pd.read_csv('data/full_data.csv')

    # create the RankEval object
    RE = RankEval(data, rank_algos.random_forest_score,
                    subsampling_proportion=subsampling)

    # Call get_scores() exactly once: a second call would repeat the scoring
    # work inside the timed region and could pair the features of one run
    # with the scores of another.
    results = RE.get_scores()
    features, scores = results[0], results[1]

    end_time = time.perf_counter()
    return evaluate_ranking(features, scores), end_time - start_time

In [5]:
def main_ensemble(seeds, subsampling):
    """
    Build one ranking per seed, merge them with average_ranking and evaluate the result.

    Parameters:
        seeds: iterable of seeds; one RankEval run is performed per seed.
        subsampling: forwarded to RankEval as ``subsampling_proportion``.

    Returns: a tuple of (value returned by evaluate_ranking, elapsed seconds).
                The elapsed time covers reading the CSV, all per-seed runs and
                the rank aggregation; the evaluation itself happens after the
                timer is stopped.
    """
    # perf_counter is monotonic, so the measured interval cannot be distorted
    # by system clock adjustments.
    start_time = time.perf_counter()
    # load data
    data = pd.read_csv('data/full_data.csv')

    all_rankings = []

    for seed in seeds:
        # create the RankEval object
        RE = RankEval(data, rank_algos.random_forest_score,
                      seed=seed,
                      subsampling_proportion=subsampling)

        # only the ranking (first element of the result) is kept; the
        # per-run scores are replaced by the aggregated Borda totals below
        results = RE.get_scores()
        all_rankings.append(list(results[0]))

    # (features, scores) pair produced by the Borda aggregation
    avg = average_ranking(all_rankings)

    end_time = time.perf_counter()
    return evaluate_ranking(*avg), end_time - start_time

In [6]:
# results = {}
# subsampling = []
# seeds = list(range(100))
# for sub in subsampling:
#     print(f"Subsampling: {sub}")
#     print(f"Number of runs: 100 , approximated by {len(seeds)}")
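#     # NOTE: rebinding `time` here shadows the imported `time` module that main_regular/main_ensemble use; rename this variable before re-enabling the cell.
#     time = timeit.timeit('main_ensemble(seeds, sub)', number=1, globals=globals())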
#     time_without_dataread = time -  12.6
#     time_per_seed = time_without_dataread/len(seeds)
#     time_for_100 = 12.6 + time_per_seed*100
#     print(f"Ensemble runtime: {time_for_100} seconds.")

In [7]:
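# NOTE: this cell is commented out; the saved output below appears to come from an earlier run of this code.
# results = {}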
# subsampling = [0.0001, 0.001, 0.01, 0.1]
# seeds = list(range(10))
# for sub in subsampling:
#     print(f"Subsampling: {sub}")
#     print((f"Number of runs: {len(seeds)}"))
#     auc, runtime = main_ensemble(seeds, sub)
#     print(f"Ensemble runtime: {runtime} seconds.")
#     print(f"Ensemble AUC: {auc}")

Subsampling: 0.0001
Number of runs: 10
Ensemble runtime: 16.044456958770752 seconds.
Ensemble AUC: 0.4028707745528648
Subsampling: 0.001
Number of runs: 10
Ensemble runtime: 15.81665825843811 seconds.
Ensemble AUC: 0.4838189506482014
Subsampling: 0.01
Number of runs: 10
Ensemble runtime: 28.3675217628479 seconds.
Ensemble AUC: 0.5029717173636098
Subsampling: 0.1
Number of runs: 10
Ensemble runtime: 189.19193720817566 seconds.
Ensemble AUC: 0.5051891350445996
