# Analyse obtained results

## Imports and definitions shared by all analyses

In [None]:
from pathlib import Path

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import seaborn as sns

from matplotlib.backends.backend_pdf import PdfPages
from matplotlib.colors import ListedColormap
from matplotlib.patches import Rectangle
from pandas_profiling import ProfileReport

%load_ext autoreload
%autoreload 2

In [None]:
# Load the merged experiment results and discard the unnamed first column
# (presumably an index saved together with the data).
df = pd.read_csv("experiments/all_results.csv").drop(columns="Unnamed: 0")
df.head()

In [None]:
# In some experiments mi_value was saved with too high a precision
# (differences at the 15th decimal place), so round it before collecting
# the distinct values.
df["mi_value"] = df["mi_value"].round(1)

# distinct values of the evaluated parameters
ss_methods = df["selection_metric"].unique()
protocols = df["protocol"].unique()
mi_vals = df["mi_value"].unique()

# Fixed ordering of the networks: real networks first, then ER, then SF.
nets_ml = [
    "aucs", "ckm_physicians", "eu_transportation", "lazega",
    "er2", "er3", "er5",
    "sf2", "sf3", "sf5",
]
nets_sl = ["eu_trans_1", "er1", "sf1"]

# sanity check: the results cover exactly the networks listed above
assert set(df["network"].unique()) == {*nets_ml, *nets_sl}


# <----------- SELECT networks to analyse (multilayer or singlelayer)
_net_choice = "ml"

# Supported choices mapped to (networks to analyse, label used in the names
# of the output files).
_net_choices = {
    "ml": (nets_ml, "multilayer"),
    "sl": (nets_sl, "singlelayer"),
    "all": (nets_ml + nets_sl, "all"),
}
if _net_choice not in _net_choices:
    raise ValueError(
        f"Unknown network choice {_net_choice!r}; "
        f"expected one of {sorted(_net_choices)}"
    )
nets, net_case = _net_choices[_net_choice]
print(f"Choice: {net_case}")


# Short labels of the seed selection methods, used on the plot axes.
ss_methods_abbrv_map = {
    "degree_centrality": "deg-c",
    "greedy": "greedy",
    "k_shell": "k-sh",
    "k_shell_mln": "k-sh-m",
    "neighbourhood_size": "nghb-1s",
    "neighbourhood_2_hop_size": "nghb-2s",
    "page_rank": "p-rnk",
    "page_rank_mln": "p-rnk-m",
    "random": "random",
    "vote_rank": "v-rnk",
    "vote_rank_mln": "v-rnk-m",
}

# Short labels of the networks: real networks are listed explicitly, the
# synthetic ones ("er<N>", "sf<N>") follow the pattern "<kind>-<N>".
nets_abbrv_map = {
    "aucs": "aucs",
    "ckm_physicians": "ckmp",
    "lazega": "lazega",
    "eu_transportation": "eutr-A",
    "eu_trans_1": "eutr-1",
    **{
        f"{kind}{size}": f"{kind}-{size}"
        for kind in ("er", "sf")
        for size in (1, 2, 3, 5)
    },
}

In [None]:
# Budgets evaluated per protocol:
#   OR:  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30]
#   AND: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 31, ..., 40]
# For AND we skip the very low budgets, where all methods are inaccurate, and
# keep only [15, 20, 25, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40].
_and_budgets_kept = {15, 20, 25, 30, *range(31, 41)}
_is_and = df["protocol"] == "AND"
_budget_is_kept = df["seeding_budget"].isin(_and_budgets_kept)
rows_to_drop = df.loc[_is_and & ~_budget_is_kept]
df = df.drop(index=rows_to_drop.index)
s_budgets = df["seeding_budget"].unique()

# drop networks that were not selected for this analysis
_net_is_selected = df["network"].isin(set(nets))
rows_to_drop = df.loc[~_net_is_selected]
df = df.drop(index=rows_to_drop.index)


ss_methods, protocols, mi_vals, s_budgets, nets

In [None]:
from typing import Optional, Union

def prepare_ticklabels(series: pd.Index) -> Union[np.ndarray, str]:
    """Return tick labels for a heatmap axis.

    Numeric labels are rounded to two decimal places. Labels that cannot be
    rounded (e.g. strings) yield "auto", which `plot_heatmap` forwards to
    seaborn as the tick-label setting.
    """
    try:
        return series.to_numpy().round(2)
    except (TypeError, AttributeError):
        # Non-numeric labels cannot be rounded. Only these errors are caught
        # so that unrelated failures and interrupts are not silently hidden.
        return "auto"

def plot_heatmap(
    vis_df: pd.DataFrame,
    heatmap_ax: plt.Axes,
    bar_ax: plt.Axes,
    vrange=(0, 100),
    cmap="RdYlGn",
    mask: Optional[pd.DataFrame] = None,
    fmt: Optional[str] = ".0f",
) -> None:
    """Draw `vis_df` as an annotated seaborn heatmap on `heatmap_ax`.

    The colour scale spans `vrange[0]`..`vrange[1]`. The colour bar is drawn
    on `bar_ax`, or omitted when `bar_ax` is None. Cells selected by `mask`
    are left blank. The y axis is inverted after plotting.
    """
    lowest, highest = vrange[0], vrange[1]
    row_labels = prepare_ticklabels(vis_df.index)
    column_labels = prepare_ticklabels(vis_df.columns)
    draw_colour_bar = bar_ax is not None

    sns.heatmap(
        vis_df,
        ax=heatmap_ax,
        cbar_ax=bar_ax,
        cmap=cmap,
        vmin=lowest,
        vmax=highest,
        annot=True,
        annot_kws={"size": 7},
        fmt=fmt,
        yticklabels=row_labels,
        xticklabels=column_labels,
        linewidth=.5,
        mask=mask,
        cbar=draw_colour_bar,
    )
    heatmap_ax.invert_yaxis()
    # heatmap_ax.tick_params(axis="x", rotation=80)

## Detailed heatmaps for all cases

Charts of f(mi_value, seeding_budget) = gain for each network, protocol, ssm case

### single plot

In [None]:
# plotted quantities and the single (network, selection method) case shown
x, y, z = "mi_value", "seeding_budget", "gain"
net, ssm = "aucs", "greedy"

# canvas: one heatmap per protocol plus a narrow axis for the colour bar
fig, ax = plt.subplots(
    nrows=1, ncols=3, figsize=(10, 4), gridspec_kw={"width_ratios": [49, 49, 2]}
)
fig.tight_layout(pad=0.5, rect=(0.05, 0.05, 0.95, 0.95))
title = f"{z} achieved by {ssm} s.s. method on {net} net; protocols from left:"

# rows of the selected case; narrowed down per protocol inside the loop
_case_rows = df.loc[(df["network"] == net) & (df["selection_metric"] == ssm)]
for idx, proto in enumerate(df["protocol"].unique()):
    df_plotted = pd.pivot_table(
        _case_rows.loc[_case_rows["protocol"] == proto],
        index=x,
        columns=y,
        values=z,
    )
    plot_heatmap(df_plotted, ax[idx], ax[2])
    title += f" {proto}"

fig.suptitle(title)

### bulk plot (pdf)

In [None]:
# define values to visualise as well as attributes of plots
x = "mi_value"
y = "seeding_budget"
Z = {
    "gain": {"vrange": (0, 100), "cmap": "RdYlGn"},
    "diffusion_len": {"vrange": (0, df["diffusion_len"].max()), "cmap": "BuPu"},
}

# output file; the working directory is created if it does not exist yet
workdir = Path(".")
workdir.mkdir(exist_ok=True)
pdf_path = workdir.joinpath(f"heatmaps_{'_'.join(Z)}_by_{x}_{y}.pdf")

# The context manager closes (finalises) the PDF even if plotting fails
# midway, so no half-written file handle is left open.
with PdfPages(pdf_path) as pdf:

    for net in sorted(nets):

        # a dummy page that contains just the name of the processed network
        fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 4))
        ax.set_visible(False)
        fig.suptitle(f"{net} network - results", x=0.5, y=.5, fontsize=15)
        fig.savefig(pdf, format="pdf")
        plt.close(fig)

        for ssm in sorted(ss_methods):

            for z, z_attrs in Z.items():
                print(f"processing: {net}, {ssm}, {z}")

                # prepare canvas - proto, proto, legend
                fig, ax = plt.subplots(
                    nrows=1,
                    ncols=3,
                    figsize=(12, 4),
                    gridspec_kw={"width_ratios": [49, 49, 2]},
                )
                fig.tight_layout(pad=0.5, rect=(0.1, 0.1, 0.9, 0.9))
                title = (
                    f"{z} achieved by {ssm} s.s. method on {net} network; "
                    "protocols from left:"
                )

                # prepare and plot heatmap for each proto
                for idx, proto in enumerate(protocols):
                    df_plot = df.loc[
                        (df["network"] == net) &
                        (df["protocol"] == proto) &
                        (df["selection_metric"] == ssm)
                    ]
                    df_plot = pd.pivot_table(
                        df_plot, index=x, columns=y, values=z
                    )
                    if len(df_plot) == 0:  # greedy wasn't evaluated for all nets
                        continue
                    plot_heatmap(
                        df_plot, ax[idx], ax[2], z_attrs["vrange"], z_attrs["cmap"]
                    )
                    title += f" {proto}"

                # add title and save plot to pdf
                fig.suptitle(title)
                fig.savefig(pdf, format="pdf")
                plt.close(fig)

## Ranking of each selection method: f(mi_value, seeding_budget) -> position in the ranking

First create a tensor of shape (ss_methods x nets x mi_vals x s_budgets_proto)
filled with gains

In [None]:
proto = "AND"  # <---------- protocol to compute the ranking for ("AND" or "OR")
s_budgets_proto = df.loc[df["protocol"] == proto, "seeding_budget"].unique()

# cube[ssm][net] -> 2D array of gains (rows: mi_value, columns: seeding_budget)
cube = {}
# Shape shared by all gain arrays. It is taken from the first NON-empty
# pivot, so a missing first case cannot produce (0, 0) placeholders.
consist_shape = None

for ssm in ss_methods:
    cube[ssm] = {}
    for net in nets:
        ddf = df.loc[
            (df["network"] == net) &
            (df["protocol"] == proto) &
            (df["selection_metric"] == ssm)
        ]
        ddf = pd.pivot_table(
            ddf, index="mi_value", columns="seeding_budget", values="gain"
        ).to_numpy()
        if len(ddf) == 0:
            # case not evaluated; replaced by a NaN array once the common
            # shape is known
            ddf = None
        elif consist_shape is None:
            consist_shape = ddf.shape
        cube[ssm][net] = ddf

if consist_shape is None:
    raise ValueError(f"No results found for protocol {proto!r}")

# fill the cases that were not evaluated with NaNs of the common shape
for _net_dict in cube.values():
    for _net, _gains in _net_dict.items():
        if _gains is None:
            _net_dict[_net] = np.full(consist_shape, np.nan)

len(cube)

In [None]:
# labels of axis 0 (selection methods) and axis 1 (networks) of the tensor
ssm_axis = list(cube)
nets_axis = []
list_arrays = []

for net_dict in cube.values():
    # The first method defines the order of networks; every following method
    # must list its networks in the same order.
    for idx, net in enumerate(net_dict):
        if len(nets_axis) == len(nets):
            assert nets_axis[idx] == net
        else:
            nets_axis.append(net)

    list_arrays.append(np.stack(list(net_dict.values())))

cube_gain = np.stack(list_arrays)

# array with all gains concatenated for all test runs - check shape (L==R)
cube_gain.shape, (len(ss_methods), len(nets), len(mi_vals), len(s_budgets_proto))

Once the tensor is obtained, compute the rankings of the ss methods, i.e. the place
of each ss method in every evaluated case (net, mi_val, s_budgets_proto) according to the obtained gain

In [None]:
from typing import Dict, List
from scipy.stats import rankdata

def rank_result(arr: np.ndarray) -> np.ndarray:
    """
    Create a descending dense ranking: the highest value gets place 1, equal
    values share a place and no place is skipped, e.g. for this array:
    [52.63, 3.50, 100., 100., 3.50, 100., 100., 100., 100., 22.54, 100.]
    return [2, 4, 1, 1, 4, 1, 1, 1, 1, 3, 1]

    Raises ValueError if `arr` contains NaN, because a missing result cannot
    be placed in the ranking.
    """
    values = np.asarray(arr, dtype=float)
    # `np.nan in arr` is always False (NaN never compares equal to anything),
    # so NaNs have to be detected explicitly.
    if np.isnan(values).any():
        raise ValueError("cannot rank an array that contains NaN values")
    # dense-ranking the negated values puts the highest value in first place
    return rankdata(-values, method="dense").astype(int)

def stat_to_dict(stat: np.ndarray, names: List) -> Dict:
    """Map every name to the entry of `stat` found at the same position."""
    labelled = {}
    for position, name in enumerate(names):
        labelled[name] = stat[position]
    return labelled

# Rank along axis 0 of cube_gain, i.e. compare the selection methods with
# each other separately for every (net, mi_value, seeding_budget) case.
ranks = np.apply_along_axis(rank_result, 0, cube_gain)
print(ranks.shape)

Then reduce the dimensionality of the tensor by averaging to obtain a flat vector

In [None]:
# Average the ranks step by step until one value per selection method is
# left. `ranks` has axes (ss method, net, mi_value, seeding_budget).

# step 1: mean over the networks (axis 1)
ranks_reduded_by_mean_in_nets = np.mean(ranks, 1)
print(ranks_reduded_by_mean_in_nets.shape)

# step 2: mean over axis 1 of the remaining array. NOTE: despite the `_sb`
# suffix this is the mi_value axis; the seeding budgets are averaged next.
ranks_reduded_by_mean_in_nets_sb = np.mean(ranks_reduded_by_mean_in_nets, 1)
print(ranks_reduded_by_mean_in_nets_sb.shape)

# step 3: mean over the last remaining parameter axis (seeding budgets)
ranks_reduded_by_mean_in_nets_sb_mi = np.mean(ranks_reduded_by_mean_in_nets_sb, 1)
print(ranks_reduded_by_mean_in_nets_sb_mi.shape)

# NOTE: despite its name, `overall_gain` holds the mean place in the ranking
# of every selection method, not a gain.
overall_gain = stat_to_dict(ranks_reduded_by_mean_in_nets_sb_mi, ssm_axis)

print("Average performance of ssm in total")
overall_gain

In [None]:
# Group the networks on the nets axis by type. Synthetic networks are named
# "er<N>" / "sf<N>"; every other network is treated as a real one. Deriving
# the groups from the names keeps them valid for the multilayer, singlelayer
# and "all" selections alike.
print(nets_axis)
_er_nets = [n for n in nets_axis if n[:2] == "er" and n[2:].isdigit()]
_sf_nets = [n for n in nets_axis if n[:2] == "sf" and n[2:].isdigit()]
_real_nets = [n for n in nets_axis if n not in _er_nets and n not in _sf_nets]

rn_names = ("real", _real_nets)
er_names = ("er", _er_nets)
sf_names = ("sf", _sf_nets)

print(rn_names)
print(er_names)
print(sf_names)

In [None]:
# compute the same ranks but split by type of network
def _net_type(name: str) -> str:
    """Classify a network name as "er", "sf" (named "<kind><N>") or "real"."""
    for prefix in ("er", "sf"):
        if name.startswith(prefix) and name[len(prefix):].isdigit():
            return prefix
    return "real"


# Positions of the networks of every type on axis 1 of `ranks`. They are
# derived from the names, so they stay correct whichever set of networks
# was selected.
_positions = {"real": [], "er": [], "sf": []}
for _pos, _name in enumerate(nets_axis):
    _positions[_net_type(_name)].append(_pos)

# (ranks restricted to the networks of one type, type label, network names)
ranks_real_nets, ranks_er_nets, ranks_sf_nets = (
    (
        ranks[:, np.asarray(_positions[_type], dtype=int), ...],
        _type,
        [nets_axis[_pos] for _pos in _positions[_type]],
    )
    for _type in ("real", "er", "sf")
)

net_type_gain = {}

for rank_ in (ranks_real_nets, ranks_er_nets, ranks_sf_nets):
    arr_, net_type_, names_ = rank_
    if arr_.shape[1] == 0:
        continue  # no network of this type among the selected ones

    # mean over the networks of this type
    rank_mean_nets = np.mean(arr_, 1)
    print(rank_mean_nets.shape)

    # mean over the mi values
    rank_mean_nets_mi = np.mean(rank_mean_nets, 1)
    print(rank_mean_nets_mi.shape)

    # mean over the seeding budgets -> one value per selection method
    rank_mean_nets_mi_sb = np.mean(rank_mean_nets_mi, 1)
    print(rank_mean_nets_mi_sb.shape)

    # label with ssm_axis, i.e. the actual order of axis 0 of the tensor
    net_type_gain[net_type_] = stat_to_dict(rank_mean_nets_mi_sb, ssm_axis)

print("Average performance of ssm by network type")
net_type_gain

In [None]:
# add the ranking computed over all networks as an extra column
net_type_gain["all"] = overall_gain
ranked_ssms = pd.DataFrame(net_type_gain)

# make sure the output directory exists before writing to it
_findings_dir = Path("data/findings")
_findings_dir.mkdir(parents=True, exist_ok=True)
ranked_ssms.to_csv(_findings_dir / f"ssm_ranking_{proto}_{net_case}.csv")
ranked_ssms

In [None]:
# debugging: for the first network only, show the ranks next to the raw
# gains of all methods at position [4, 8] of the (mi_value, seeding_budget)
# grid
_  = cube_gain[:, :1, ...]
np.apply_along_axis(rank_result, 0, _)[:, 0, 4, 8], _[:, 0, 4, 8]

## General, averaged heatmap

Charts of f(network, ssm) = average(gain) for each protocol

In [None]:
jet = plt.get_cmap("jet")

def plot_series(
    series: pd.DataFrame, ax: plt.Axes, x: str, y: str, label: str
) -> None:
    """Scatter the mean of column `y` for every distinct value of column `x`.

    The points are drawn on `ax` and labelled with `label`. The standard
    deviation band is currently disabled, so it is no longer computed.
    """
    avg = series.groupby(x)[y].mean()
    ax.scatter(x=avg.index, y=avg.values, label=label, alpha=0.8)

In [None]:
x = "network"
y = "selection_metric"
z = "gain"

# canvas: one heatmap per protocol plus a narrow axis for the colour bar
fig, ax = plt.subplots(
    nrows=1, ncols=3, figsize=(10, 4), gridspec_kw={"width_ratios": [49, 49, 2]}
)
fig.tight_layout(pad=.5, rect=(0.05, 0.15, 0.95, 0.95))
title = f"Average {z} achieved by s.s. methods on nets; protocols from left:"

for idx, proto in enumerate(df["protocol"].unique()):
    df_plotted = df.loc[df["protocol"] == proto]
    # "mean" (rather than np.mean) avoids the pandas deprecation warning for
    # passing NumPy callables as aggfunc
    df_plotted = pd.pivot_table(
        df_plotted, index=x, columns=y, values=z, aggfunc="mean"
    )
    # keep the predefined order of networks, then shorten the labels
    df_plotted = df_plotted.reindex(index=nets)
    df_plotted = df_plotted.rename(
        index=nets_abbrv_map, columns=ss_methods_abbrv_map
    )
    # cases that were not evaluated (NaN) are masked out
    plot_heatmap(df_plotted, ax[idx], ax[2], mask=df_plotted.isnull())
    title += f" {proto}"

fig.subplots_adjust(wspace=.4)
fig.suptitle(title)

# make sure the output directory exists before saving
_findings_dir = Path("data/findings")
_findings_dir.mkdir(parents=True, exist_ok=True)
fig.savefig(_findings_dir / f"means_{net_case}.png", dpi=300)
plt.close(fig)

In [None]:
# debugging: index of the best method per network, without and with the
# missing values (NaN) replaced by a value lower than any gain

print(df_plotted.to_numpy().argmax(axis=1))
# `nan` has to be passed by keyword: the second positional argument of
# np.nan_to_num is `copy`, so a positional -100 left NaNs replaced by 0.0
for net_idx, ssm_idx in enumerate(
    np.nan_to_num(df_plotted.to_numpy(), nan=-100).argmax(axis=1)
):
    print(net_idx, ssm_idx)

## Wilcoxon tests

In [None]:
from misc.wilcoxon_test import test_samples
from itertools import combinations

def get_sorted_gain_series(network, selection_method, protocol):
    """Return the gains of one (network, selection method, protocol) case.

    The gains are ordered by seeding budget and then by mi value, so series
    obtained for different selection methods can be compared pairwise.

    Raises ValueError if no results exist for the requested case.
    """
    selected = df.loc[
        (df["network"] == network) &
        (df["selection_metric"] == selection_method) &
        (df["protocol"] == protocol)
    ].sort_values(by=["seeding_budget", "mi_value"])
    if len(selected) == 0:
        raise ValueError(
            f"no results for network={network!r}, "
            f"selection_method={selection_method!r}, protocol={protocol!r}"
        )
    return selected["gain"].to_numpy()

def get_selection_metrics_for_net(net):
    """Return the set of selection metrics that have results for `net`."""
    metrics_of_net = df.loc[df["network"] == net, "selection_metric"]
    return set(metrics_of_net.unique())

ss_methods, nets, protocols

### single plot

In [None]:
net = nets[0]

fig, ax = plt.subplots(
    nrows=1, ncols=2, figsize=(10, 4), gridspec_kw={"width_ratios": [50, 50]}
)
fig.tight_layout(pad=0.5, rect=(0.05, 0.05, 0.95, 0.95))
title = f"P-values of Wilcoxon test for network: {net}; protocols from left:"

# methods that were actually evaluated on this network
_net_methods = get_selection_metrics_for_net(net)

for idx, proto in enumerate(protocols):
    title += f" {proto}"

    # p_dict[ss1][ss2] -> p-value of the test comparing methods ss1 and ss2
    p_dict = {}
    for ss1, ss2 in combinations(sorted(ss_methods), 2):
        if not {ss1, ss2}.issubset(_net_methods):
            continue  # greedy has not been computed for all nets
        gain_ss1 = get_sorted_gain_series(net, ss1, proto)
        gain_ss2 = get_sorted_gain_series(net, ss2, proto)
        p_value = test_samples(gain_ss1, gain_ss2)
        p_dict.setdefault(ss1, {})[ss2] = p_value
    p_values = pd.DataFrame.from_dict(p_dict, orient='index')
    # Make the table square: add an empty row for the last method and an
    # empty column for the first one (np.nan, as np.NaN is gone in NumPy 2).
    p_values.loc[p_values.columns[-1]] = np.nan
    p_values.insert(0, p_values.iloc[0].name, np.nan)

    # shorten names of records
    p_values = p_values.rename(
        index=ss_methods_abbrv_map, columns=ss_methods_abbrv_map
    )

    # plot heatmap
    plot_heatmap(
        p_values,
        ax[idx],
        bar_ax=None,
        cmap=ListedColormap(['whitesmoke']),
        fmt=".2f"
    )

    # highlight the pairs with p-value >= 0.05
    rows, cols = np.where(p_values.to_numpy() >= 0.05)
    for r_idx, c_idx in zip(rows, cols):
        ax[idx].add_patch(
            Rectangle((c_idx, r_idx), 1, 1, fill=True, color="lightcoral", lw=0)
        )

fig.subplots_adjust(wspace=.4)
fig.suptitle(title)

In [None]:
# debugging: inspect the p-value table of the last protocol processed above
p_values

In [None]:
# debugging: shapes of the last compared gain series and the case they belong to
gain_ss1.shape, gain_ss2.shape, ss1, ss2, net, proto

### bulk plot

In [None]:
# output file; make sure its directory exists before opening it
_findings_dir = Path("data/findings")
_findings_dir.mkdir(parents=True, exist_ok=True)

# The context manager closes (finalises) the PDF even if plotting fails
# midway.
with PdfPages(_findings_dir / f"heatmaps_wilcoxon_test_{net_case}.pdf") as pdf:

    for net in sorted(nets):

        # prepare canvas
        fig, ax = plt.subplots(
            nrows=1, ncols=2, figsize=(10, 4), gridspec_kw={"width_ratios": [50, 50]}
        )
        fig.tight_layout(pad=0.5, rect=(0.05, 0.05, 0.95, 0.95))
        title = f"P-values of Wilcoxon test for network: {net}; protocols from left:"

        # methods that were actually evaluated on this network
        _net_methods = get_selection_metrics_for_net(net)

        # prepare and plot heatmap for each proto
        for idx, proto in enumerate(protocols):
            title += f" {proto}"

            # p_dict[ss1][ss2] -> p-value of the test comparing ss1 and ss2
            p_dict = {}
            for ss1, ss2 in combinations(sorted(ss_methods), 2):
                if not {ss1, ss2}.issubset(_net_methods):
                    continue  # greedy has not been computed for all nets
                gain_ss1 = get_sorted_gain_series(net, ss1, proto)
                gain_ss2 = get_sorted_gain_series(net, ss2, proto)
                p_value = test_samples(gain_ss1, gain_ss2)
                p_dict.setdefault(ss1, {})[ss2] = p_value
            p_values = pd.DataFrame.from_dict(p_dict, orient='index')
            # Make the table square: add an empty row for the last method
            # and an empty column for the first one (np.nan, as np.NaN is
            # gone in NumPy 2).
            p_values.loc[p_values.columns[-1]] = np.nan
            p_values.insert(0, p_values.iloc[0].name, np.nan)

            # shorten names of records
            p_values = p_values.rename(
                index=ss_methods_abbrv_map, columns=ss_methods_abbrv_map
            )

            # plot heatmap
            plot_heatmap(
                p_values, ax[idx], bar_ax=None,
                cmap=ListedColormap(['whitesmoke']), fmt=".2f"
            )

            # highlight the pairs with p-value >= 0.05
            rows, cols = np.where(p_values.to_numpy() >= 0.05)
            for r_idx, c_idx in zip(rows, cols):
                ax[idx].add_patch(
                    Rectangle((c_idx, r_idx), 1, 1, fill=True, color="lightcoral", lw=0)
                )

        # add title and save plot to pdf
        fig.subplots_adjust(wspace=.4)
        fig.suptitle(title)
        fig.savefig(pdf, format="pdf")
        plt.close(fig)

### merged stats

In [None]:
import warnings

# tests[net][proto] -> square table of p-values (rows/columns: methods)
tests = {}

for net in sorted(nets):
    tests[net] = {}

    # methods that were actually evaluated on this network
    _net_methods = get_selection_metrics_for_net(net)

    for proto in protocols:

        p_dict = {}
        for ss1, ss2 in combinations(sorted(ss_methods), 2):
            # NaN marks a pair without a usable p-value: either a method was
            # not computed for this net (greedy) or the test raised a warning.
            # A fresh default per pair also prevents a value from the
            # previous pair being stored by accident.
            p_value = np.nan
            if {ss1, ss2}.issubset(_net_methods):
                gain_ss1 = get_sorted_gain_series(net, ss1, proto)
                gain_ss2 = get_sorted_gain_series(net, ss2, proto)
                try:
                    # Escalate warnings to errors only around the test itself;
                    # the previous warning filters are restored afterwards,
                    # even when an exception is raised.
                    with warnings.catch_warnings():
                        warnings.simplefilter("error")
                        p_value = test_samples(gain_ss1, gain_ss2)
                except Warning as warn:
                    print(ss1, ss2, net, proto, warn)
            p_dict.setdefault(ss1, {})[ss2] = p_value

        p_values = pd.DataFrame.from_dict(p_dict, orient='index')
        # Make the table square: add an empty row for the last method and an
        # empty column for the first one (np.nan, as np.NaN is gone in
        # NumPy 2).
        p_values.loc[p_values.columns[-1]] = np.nan
        p_values.insert(0, p_values.iloc[0].name, np.nan)
        tests[net][proto] = p_values

In [None]:
# aggregate the obtained statistics by counting occurrences of p_value > 0.05
reference_df = tests[nets[0]]["AND"]
reference_cols = list(reference_df.columns)
reference_idx = list(reference_df.index)

# Containers for the counts of p_values > 0.05. They are explicitly float,
# so that NaN can be stored in them whatever the dtype of the p-value tables.
p_counts_or = np.zeros(reference_df.shape, dtype=float)
p_counts_and = np.zeros(reference_df.shape, dtype=float)

# for each statistic computed for given net and proto determine p_values > 0.05
# and count them in the container of the protocol
for net, net_tests in tests.items():
    for proto, test in net_tests.items():
        assert list(test.columns) == reference_cols
        assert list(test.index) == reference_idx
        # The cast turns missing entries stored as None into NaN, which
        # compares as False instead of raising a TypeError.
        p_counts = (test.astype(float) > 0.05).to_numpy().astype(int)
        if proto == "OR":
            p_counts_or += p_counts
        if proto == "AND":
            p_counts_and += p_counts

# The obtained matrices ought to be upper triangular, so replace the zeros
# below the diagonal with nans.
p_counts_or[np.tril_indices(p_counts_or.shape[0], -1)] = np.nan
p_counts_and[np.tril_indices(p_counts_and.shape[0], -1)] = np.nan

# convert matrices to dataframes
p_counts_or = pd.DataFrame(p_counts_or, columns=reference_cols, index=reference_idx)
p_counts_and = pd.DataFrame(p_counts_and, columns=reference_cols, index=reference_idx)

In [None]:
# visualise obtained results: one heatmap of counts per protocol plus a
# narrow axis for the shared colour bar
fig, ax = plt.subplots(
    nrows=1, ncols=3, figsize=(10, 4), gridspec_kw={"width_ratios": [49, 49, 2]}
)
fig.tight_layout(pad=0.5, rect=(0.05, 0.1, 0.95, 0.85))
title = (
    "Occurences 'p-value > 0.05' for Wilcoxon tests performed on series "
    + "'f(mi_value, seeding_budget)->gain' \n aggregated over all evaluated "
    + "networks; protocols from left:"
)

# common colour scale for both protocols
vrange = (0, max(p_counts_or.max().max(), p_counts_and.max().max()))

for idx, (proto, proto_df) in enumerate(zip(["OR", "AND"], [p_counts_or, p_counts_and])):
    proto_df = proto_df.rename(index=ss_methods_abbrv_map, columns=ss_methods_abbrv_map)
    plot_heatmap(proto_df, ax[idx], ax[2], cmap="Reds", fmt=".0f", vrange=vrange)
    title += f" {proto}"

fig.subplots_adjust(wspace=.4)
fig.suptitle(title)

# make sure the output directory exists before saving
_findings_dir = Path("data/findings")
_findings_dir.mkdir(parents=True, exist_ok=True)
fig.savefig(_findings_dir / f"p_value_occurences_{net_case}.png", dpi=300)
plt.close(fig)