# Notebook for results concatenation and preprocessing

In [1]:
from pathlib import Path
import pandas as pd

## Random seed selector processing

In [2]:
random_path = Path("experiments/random/results.csv")
random_df = pd.read_csv(random_path, index_col=0)
random_df.head()

Unnamed: 0,network,protocol,seeding_budget,mi_value,repetition_run,diffusion_len,active_actors_prct,seed_actors_prct,gain
0,aucs,OR,1,0.1,1,3,100.0,1.639344,100.0
1,aucs,OR,1,0.1,2,3,100.0,1.639344,100.0
2,aucs,OR,1,0.1,3,3,100.0,1.639344,100.0
3,aucs,OR,1,0.1,4,3,100.0,1.639344,100.0
4,aucs,OR,1,0.1,5,4,100.0,1.639344,100.0


In [3]:
experiment_params = set(random_df.columns)
experiment_params.remove("repetition_run")
experiment_params.remove("gain")
experiment_params.remove("diffusion_len")
experiment_params.remove("active_actors_prct")
experiment_params.remove("seed_actors_prct")

experiment_metrics = set(random_df.columns).difference(experiment_params)

experiment_params = list(experiment_params)
experiment_metrics = list(experiment_metrics)

print(f"Columns that are multi-indices: {experiment_params}")
print(f"Columns that have been left: {experiment_metrics}")

Columns that are multi-indices: ['mi_value', 'protocol', 'network', 'seeding_budget']
Columns that have been left: ['repetition_run', 'seed_actors_prct', 'diffusion_len', 'gain', 'active_actors_prct']


In [4]:
reindexed_df = random_df.set_index(experiment_params)
reindexed_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,repetition_run,diffusion_len,active_actors_prct,seed_actors_prct,gain
mi_value,protocol,network,seeding_budget,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.1,OR,aucs,1,1,3,100.0,1.639344,100.0
0.1,OR,aucs,1,2,3,100.0,1.639344,100.0
0.1,OR,aucs,1,3,3,100.0,1.639344,100.0
0.1,OR,aucs,1,4,3,100.0,1.639344,100.0
0.1,OR,aucs,1,5,4,100.0,1.639344,100.0


In [5]:
averaged_random_df = pd.DataFrame()
for metric in experiment_metrics:
    avg = reindexed_df.groupby(reindexed_df.index)[str(metric)].mean()
    averaged_random_df = pd.concat([averaged_random_df, avg], axis=1)
averaged_random_df.index.set_names(experiment_params, inplace=True)
averaged_random_df = averaged_random_df.reset_index()

averaged_random_df["selection_metric"] = "random"

averaged_random_df.head()

Unnamed: 0,mi_value,protocol,network,seeding_budget,repetition_run,seed_actors_prct,diffusion_len,gain,active_actors_prct,selection_metric
0,0.1,AND,aucs,1,10.5,1.639344,0.7,3.25,4.836066,random
1,0.1,AND,aucs,2,10.5,3.278689,2.8,13.813559,16.639344,random
2,0.1,AND,aucs,3,10.5,3.278689,2.0,9.915254,12.868852,random
3,0.1,AND,aucs,4,10.5,4.918033,2.85,18.62069,22.622951,random
4,0.1,AND,aucs,5,10.5,6.557377,4.4,32.192982,36.639344,random


## Greedy seed selector processing

In [6]:
reference_df = averaged_random_df

greedy_path = Path("experiments/greedy/results.csv")
greedy_df = pd.read_csv(greedy_path)
greedy_df = greedy_df.drop("Unnamed: 0", axis=1)

greedy_df.head()

Unnamed: 0,network,protocol,seeding_budget,mi_value,repetition_run,diffusion_len,active_actors_prct,seed_actors_prct,gain
0,aucs,OR,1.639344,0.1,1,3,100.0,1.639344,100.0
1,ckm_physicians,OR,0.414938,0.1,1,4,48.547718,0.414938,48.333333
2,ckm_physicians,OR,0.829876,0.1,1,4,68.46473,0.829876,68.200837
3,ckm_physicians,OR,1.244813,0.1,1,4,85.477178,1.244813,85.294118
4,ckm_physicians,OR,1.659751,0.1,1,4,100.0,1.659751,100.0


In [7]:
print(f"Length of raw dataframe: {len(greedy_df)}")

for net in greedy_df["network"].unique():

    # take unique seed_actors_prct vals for all ssm except greedy given the net
    allowed_values = reference_df.loc[
        reference_df["network"] == net
    ]["seed_actors_prct"].unique()

    # take all rows where ssm is greedy and seed_actors_prct is not a good val
    greedy_rows_to_be_dropped = greedy_df.loc[
        (greedy_df["network"] == net) &
        (~greedy_df["seed_actors_prct"].round(2).isin(allowed_values.round(2)))
    ]

    print(f"Removing {len(greedy_rows_to_be_dropped)} rows for net: {net}")
    greedy_df = greedy_df.drop(greedy_rows_to_be_dropped.index)

print(f"Length of processed dataframe: {len(greedy_df)}")

greedy_df["selection_metric"] = "greedy"
# greedy_df.to_csv("greedy_imporved.csv")
greedy_df.head()


Length of raw dataframe: 1136
Removing 64 rows for net: aucs
Removing 541 rows for net: ckm_physicians
Removing 104 rows for net: lazega
Length of processed dataframe: 427


Unnamed: 0,network,protocol,seeding_budget,mi_value,repetition_run,diffusion_len,active_actors_prct,seed_actors_prct,gain,selection_metric
0,aucs,OR,1.639344,0.1,1,3,100.0,1.639344,100.0,greedy
3,ckm_physicians,OR,1.244813,0.1,1,4,85.477178,1.244813,85.294118,greedy
5,lazega,OR,1.408451,0.1,1,2,100.0,1.408451,100.0,greedy
6,aucs,OR,1.639344,0.2,1,4,100.0,1.639344,100.0,greedy
9,ckm_physicians,OR,1.244813,0.2,1,4,85.477178,1.244813,85.294118,greedy


## Processing of another metrics

In [8]:
root_path = Path("experiments")
experiments = [*root_path.glob("*")]

In [9]:
def prepare_csv(metric_path):
    df = pd.read_csv(metric_path.joinpath("results.csv"), index_col=0)
    df["selection_metric"] = metric_path.stem
    return df

In [10]:
exp_dfs = [
    prepare_csv(e_name) for e_name in experiments if 
    ("random" not in str(e_name) or "greedy" not in str(e_name))
]

## Saving final dataframe

In [11]:
exp_dfs.append(averaged_random_df)
exp_dfs.append(greedy_df)
final_df = pd.concat(exp_dfs)
final_df = final_df.drop("repetition_run", axis=1)
final_df = final_df.reset_index().drop("index", axis=1)

In [None]:
final_df.to_csv(root_path.joinpath("all_results.csv"))