In [None]:
from pathlib import Path

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import seaborn as sns

from matplotlib.backends.backend_pdf import PdfPages
from matplotlib.colors import ListedColormap
from matplotlib.patches import Rectangle
from pandas_profiling import ProfileReport

%load_ext autoreload
%autoreload 2

In [None]:
# Read the collected experiment results and discard the "Unnamed: 0" column
# (presumably an index written out when the CSV was saved - TODO confirm).
df = pd.read_csv("experiments/all_results.csv").drop(columns="Unnamed: 0")
df.head()

In [None]:
# Distinct values of the three experiment dimensions present in the results.
protocols = df["protocol"].unique()
nets = df["network"].unique()
ss_methods = df["selection_metric"].unique()

# Short labels used on plot axes in place of the full selection-metric names.
ss_methods_abbrv_map = dict(
    degree_centrality="deg-c",
    greedy="greedy",
    k_shell="k-sh",
    k_shell_mln="k-sh-m",
    neighbourhood_size="nghb-s",
    page_rank="p-rnk",
    page_rank_mln="p-rnk-m",
    random="random",
    vote_rank="v-rnk",
    vote_rank_mln="v-rnk-m",
)

ss_methods, nets, protocols

## Detailed heatmaps for all cases

Heatmaps of f(mi_value, seeding_budget) = gain, one for each combination of network, protocol and selection method (ssm).

In [None]:
from typing import Optional, Union

def prepare_ticklabels(series: pd.Index) -> Union[np.ndarray, str]:
    """Build tick labels for one heatmap axis.

    Args:
        series: index (rows or columns) of the frame that is being plotted.

    Returns:
        The index values rounded to two decimals when they can be rounded,
        otherwise the string "auto".
    """
    try:
        return series.to_numpy().round(2)
    except (TypeError, AttributeError):
        # Rounding fails for non-numeric labels (e.g. strings). Only these
        # errors are handled, so that unrelated failures and interrupts are
        # no longer silently swallowed.
        return "auto"

def plot_heatmap(
    vis_df: pd.DataFrame,
    heatmap_ax: plt.Axes,
    bar_ax: plt.Axes,
    vrange=(0, 100),
    cmap="RdYlGn",
    mask: Optional[pd.DataFrame] = None,
    fmt: Optional[str] = ".0f",
) -> None:
    """Draw an annotated seaborn heatmap of ``vis_df`` on ``heatmap_ax``.

    Args:
        vis_df: frame whose cells are drawn; its index and columns provide
            the tick labels (see ``prepare_ticklabels``).
        heatmap_ax: axes receiving the heatmap; its y axis is inverted after
            drawing.
        bar_ax: axes receiving the colour bar; pass ``None`` to draw no
            colour bar at all.
        vrange: pair ``(vmin, vmax)`` anchoring the colour scale.
        cmap: colormap forwarded to seaborn.
        mask: optional mask forwarded to seaborn.
        fmt: format string used for the cell annotations.
    """
    heatmap_options = {
        "ax": heatmap_ax,
        "cbar_ax": bar_ax,
        "cbar": bar_ax is not None,
        "cmap": cmap,
        "vmin": vrange[0],
        "vmax": vrange[1],
        "annot": True,
        "annot_kws": {"size": 7},
        "fmt": fmt,
        "yticklabels": prepare_ticklabels(vis_df.index),
        "xticklabels": prepare_ticklabels(vis_df.columns),
        "linewidth": .5,
        "mask": mask,
    }
    sns.heatmap(vis_df, **heatmap_options)
    heatmap_ax.invert_yaxis()

### single plot

In [None]:
# Pivot layout (x -> rows, y -> columns) and the quantity shown in the cells.
x, y, z = "mi_value", "seeding_budget", "gain"

# The single network / selection method shown in this figure.
net, ssm = "aucs", "greedy"

# Three panels with width ratios 49:49:2; the last one receives the colour bar.
fig, ax = plt.subplots(
    nrows=1, ncols=3, figsize=(10, 4), gridspec_kw={"width_ratios": [49, 49, 2]}
)
fig.tight_layout(pad=0.5, rect=(0.05, 0.05, 0.95, 0.95))
title = f"{z} achieved by {ssm} s.s. method on {net} net; protocols from left:"

# Rows of the chosen network and selection method, shared by all protocols.
is_net_ssm = (df["network"] == net) & (df["selection_metric"] == ssm)

for idx, proto in enumerate(df["protocol"].unique()):
    df_plotted = pd.pivot_table(
        df.loc[is_net_ssm & (df["protocol"] == proto)],
        index=x,
        columns=y,
        values=z,
    )
    plot_heatmap(df_plotted, ax[idx], ax[2])
    title = f"{title} {proto}"

fig.suptitle(title)

### bulk plot (pdf)

In [None]:
# define values to visualise as well as attributes of plots
x = "mi_value"
y = "seeding_budget" 
Z = {
    "gain": {"vrange": (0, 100), "cmap": "RdYlGn"},
    "diffusion_len": {"vrange": (0, df["diffusion_len"].max()), "cmap": "BuPu"},
}

# create file descriptor where to save visualisations
workdir = Path(".")
workdir.mkdir(exist_ok=True)
pdf = PdfPages(workdir.joinpath(f"heatmaps_{"_".join(Z)}_by_{x}_{y}.pdf"))

for net in sorted(nets):

    # a dummy plot that contains just name of processed network
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 4))
    ax.set_visible(False)
    fig.suptitle(f"{net} network - results", x=0.5, y=.5, fontsize = 15)
    fig.savefig(pdf, format="pdf")
    plt.close(fig)

    for ssm in sorted(ss_methods):

        for z, z_attrs in Z.items():
            print(f"processing: {net}, {ssm}, {z}")

            # prepare canvas - proto, proto, legend
            fig, ax = plt.subplots(
                nrows=1,
                ncols=3,
                figsize=(12, 4),
                gridspec_kw={"width_ratios": [49, 49, 2]}
            )
            fig.tight_layout(pad=0.5, rect=(0.1, 0.1, 0.9, 0.9))
            title = (
                f"{z} achieved by {ssm} s.s. method on {net} network; "
                "protocols from left:"
            )

            # prepare and plot heatmap for each proto
            for idx, proto in enumerate(protocols):
                df_plot = df.loc[
                    (df["network"] == net) &
                    (df["protocol"] == proto) &
                    (df["selection_metric"] == ssm)
                ]
                df_plot = pd.pivot_table(df_plot, index=x, columns=y, values=z)
                if len(df_plot) == 0:  # greedy wasn"t evaluated for all nets
                    continue
                plot_heatmap(
                    df_plot, ax[idx], ax[2], z_attrs["vrange"], z_attrs["cmap"]
                )
                title += f" {proto}"

            # add title and save plot to pdf
            fig.suptitle(title)
            fig.savefig(pdf, format="pdf")
            plt.close(fig)

pdf.close()

## General, averaged heatmap

Charts of f(network, ssm) = average(gain) for each protocol

In [None]:
jet = plt.get_cmap("jet")

def plot_series(
    series: pd.DataFrame, ax: plt.Axes, x: str, y: str, label: str
) -> None:
    """Scatter the mean of column ``y`` for every distinct value of column ``x``.

    Args:
        series: frame holding at least the columns named by ``x`` and ``y``.
        ax: axes the points are drawn on.
        x: name of the column the rows are grouped by (horizontal axis).
        y: name of the column that is averaged within each group.
        label: legend label attached to the drawn points.
    """
    # Only the group means are drawn; the standard deviation that used to be
    # computed alongside was never used, so that extra groupby pass is gone.
    avg = series.groupby(x)[y].mean()
    ax.scatter(x=avg.index, y=avg.values, label=label, alpha=0.8)


In [None]:
# Inputs of this figure are derived here from `df`, `nets` and `protocols`,
# so the cell runs in notebook order instead of relying on variables that are
# only created by a cell further down.
x = "mi_value"
y = "gain"
proto = protocols[0]
net = nets[3]
net_proto_df = df.loc[
    (df["protocol"] == proto) & (df["network"] == net)
].drop(["protocol", "network"], axis=1)

mi_values = net_proto_df["mi_value"].unique()

# squeeze=False always returns a 2-D array of axes, which keeps `ax[idx]`
# valid even when there is only a single mi_value
fig, ax = plt.subplots(nrows=1, ncols=len(mi_values), squeeze=False)
ax = ax[0]
fig.tight_layout()

# one panel per mi_value, one point per selection method in each panel
for idx, mi in enumerate(mi_values):
    mi_net_proto_df = net_proto_df.loc[net_proto_df["mi_value"] == mi]
    for ssm in ss_methods:
        ssm_net_proto_df = mi_net_proto_df.loc[mi_net_proto_df["selection_metric"] == ssm]
        plot_series(series=ssm_net_proto_df, ax=ax[idx], x=x, y=y, label=ssm)

ax[-1].legend(loc="center left", bbox_to_anchor=(1, 0.5))
for idx, axx in enumerate(ax):
    axx.set_xticks([mi_values[idx]])
ax[0].set_ylabel(y)
ax[len(ax) // 2].set_xlabel(x)
fig.suptitle(f"protocole: {proto}, network: {net}")


In [None]:
x, y = "mi_value", "gain"

# Narrow the results down to one protocol, then to one network; the column
# used for filtering is dropped after each step.
proto = protocols[0]
proto_df = df.loc[df["protocol"] == proto].drop("protocol", axis=1)

net = nets[3]
net_proto_df = proto_df.loc[proto_df["network"] == net].drop("network", axis=1)

fig, ax = plt.subplots(nrows=1, ncols=1)

# Iterator over evenly spaced colours of the "jet" colormap, one per
# selection method (not consumed by the plotting below).
colors = iter(jet(np.linspace(0, 1, len(ss_methods))))

# One scatter series per selection method.
for idx, ssm in enumerate(ss_methods):
    is_ssm = net_proto_df["selection_metric"] == ssm
    ssm_net_proto_df = net_proto_df.loc[is_ssm].drop("selection_metric", axis=1)
    plot_series(series=ssm_net_proto_df, ax=ax, x=x, y=y, label=ssm)

ax.legend(loc="center left", bbox_to_anchor=(1, 0.5))
ax.set(xlabel=x, ylabel=y)
fig.suptitle(f"protocole: {proto}, network: {net}")

In [None]:
x = "network"
y = "selection_metric"
z = "gain"

# Networks in the order they appear as heatmap rows, mapped to the short
# labels shown on the axis.
net_abbrv_map = {
    "aucs": "aucs",
    "ckm_physicians": "ckmp",
    "lazega": "lazega",
    "eu_transportation": "eutr",
    "er2": "er-2",
    "er3": "er-3",
    "er5": "er-5",
    "sf2": "sf-2",
    "sf3": "sf-3",
    "sf5": "sf-5",
}

fig, ax = plt.subplots(
    nrows=1, ncols=3, figsize=(10, 4), gridspec_kw={"width_ratios": [49, 49, 2]}
)
fig.tight_layout(pad=.5, rect=(0.05, 0.15, 0.95, 0.95))
title = f"Average {z} achieved by s.s. methods on nets; protocols from left:"

for idx, proto in enumerate(df["protocol"].unique()):
    # mean gain per (network, selection method) for this protocol
    df_plotted = df.loc[df["protocol"] == proto]
    df_plotted = pd.pivot_table(
        df_plotted, index=x, columns=y, values=z, aggfunc="mean"
    )
    df_plotted = df_plotted.reindex(index=list(net_abbrv_map))
    df_plotted = df_plotted.rename(
        index=net_abbrv_map, columns=ss_methods_abbrv_map
    )
    plot_heatmap(df_plotted, ax[idx], ax[2], mask=df_plotted.isnull())
    title += f" {proto}"

    # Mark the best value of every row. Missing cells are replaced by -inf in
    # a copy, so they can never win and `df_plotted` itself stays untouched
    # (the second positional argument of np.nan_to_num is `copy`, not the
    # fill value). Rows without any data get no marker.
    has_data = df_plotted.notna().any(axis=1).to_numpy()
    best_cols = df_plotted.fillna(-np.inf).to_numpy().argmax(axis=1)
    for net_idx, ssm_idx in enumerate(best_cols):
        if not has_data[net_idx]:
            continue
        ax[idx].add_patch(
            Rectangle((ssm_idx, net_idx), 1, 1, fill=False, edgecolor="blue", lw=2)
        )

fig.subplots_adjust(wspace=.4)
fig.suptitle(title)

fig.savefig("means.png", dpi=300)
plt.close(fig)




In [None]:
# Debug output: per-row argmax of the last plotted frame, first on the raw
# values and then with missing cells replaced by -100. The fill value has to
# be passed as `nan=`; the second positional argument of np.nan_to_num is
# `copy`, which left the NaNs replaced by the default 0.0.
print(df_plotted.to_numpy().argmax(axis=1))
for net_idx, ssm_idx in enumerate(np.nan_to_num(df_plotted.to_numpy(), nan=-100).argmax(axis=1)):
    print(net_idx, ssm_idx)

## Wilcoxon tests

In [None]:
from misc.wilcoxon_test import test_samples
from itertools import combinations

def get_sorted_gain_series(network, selection_method, protocol):
    """Return the "gain" values of one (network, selection method, protocol) case.

    Rows of the notebook-level ``df`` matching all three arguments are ordered
    by "seeding_budget" and then by "mi_value"; their "gain" column is
    returned as a NumPy array.
    """
    is_case = (
        (df["network"] == network)
        & (df["selection_metric"] == selection_method)
        & (df["protocol"] == protocol)
    )
    case_rows = df.loc[is_case].sort_values(by=["seeding_budget", "mi_value"])
    return case_rows["gain"].to_numpy()

def get_selection_metrics_for_net(net):
    """Return the set of "selection_metric" values present in ``df`` for ``net``."""
    net_rows = df.loc[df["network"] == net]
    metrics = net_rows["selection_metric"].unique()
    return set(metrics)

# Show the available selection metrics, networks and protocols as cell output.
ss_methods, nets, protocols

### single plot

In [None]:
net = "aucs"

# Two equally wide panels, one per protocol.
fig, ax = plt.subplots(
    nrows=1, ncols=2, figsize=(10, 4), gridspec_kw={"width_ratios": [50, 50]}
)
fig.tight_layout(pad=0.5, rect=(0.05, 0.05, 0.95, 0.95))
title = f"P-values of Wilcoxon test for network: {net}; protocols from left:"

for idx, proto in enumerate(protocols):
    title = f"{title} {proto}"

    # p-value for every unordered pair of selection methods, stored as
    # p_dict[ss1][ss2]
    p_dict = {}
    for ss1, ss2 in combinations(sorted(ss_methods), 2):
        gain_ss1 = get_sorted_gain_series(net, ss1, proto)
        gain_ss2 = get_sorted_gain_series(net, ss2, proto)
        p_value = test_samples(gain_ss1, gain_ss2)
        p_dict.setdefault(ss1, {})[ss2] = p_value

    # table of p-values with abbreviated method names on both axes
    p_values = pd.DataFrame.from_dict(p_dict, orient='index').rename(
        index=ss_methods_abbrv_map, columns=ss_methods_abbrv_map
    )

    # single-colour heatmap without a colour bar - only the numbers matter
    plot_heatmap(
        p_values,
        ax[idx],
        bar_ax=None,
        cmap=ListedColormap(['whitesmoke']),
        fmt=".2f",
    )

    # highlight every cell whose p-value is at least 0.05
    rows, cols = np.nonzero(p_values.to_numpy() >= 0.05)
    for r_idx, c_idx in zip(rows, cols):
        highlight = Rectangle(
            (c_idx, r_idx), 1, 1, fill=True, color="lightcoral", lw=0
        )
        ax[idx].add_patch(highlight)

fig.subplots_adjust(wspace=.4)
fig.suptitle(title)

### bulk plot

In [None]:
# Multi-page PDF with one figure per network. The context manager closes the
# file even when a test or a plot raises midway.
with PdfPages("heatmaps_wilcoxon_test.pdf") as pdf:

    for net in sorted(nets):

        # selection metrics available for this network; looked up once per
        # network instead of once per compared pair
        available_metrics = get_selection_metrics_for_net(net)

        # prepare canvas
        fig, ax = plt.subplots(
            nrows=1, ncols=2, figsize=(10, 4), gridspec_kw={"width_ratios": [50, 50]}
        )
        fig.tight_layout(pad=0.5, rect=(0.05, 0.05, 0.95, 0.95))
        title = f"P-values of Wilcoxon test for network: {net}; protocols from left:"

        # prepare and plot heatmap for each proto
        for idx, proto in enumerate(protocols):
            title += f" {proto}"

            # p-value for every unordered pair of methods: p_dict[ss1][ss2]
            p_dict = {}
            for ss1, ss2 in combinations(sorted(ss_methods), 2):
                if not {ss1, ss2}.issubset(available_metrics):
                    continue  # greedy has not been computed for all nets
                gain_ss1 = get_sorted_gain_series(net, ss1, proto)
                gain_ss2 = get_sorted_gain_series(net, ss2, proto)
                p_value = test_samples(gain_ss1, gain_ss2)
                p_dict.setdefault(ss1, {})[ss2] = p_value
            p_values = pd.DataFrame.from_dict(p_dict, orient='index')

            # nothing to draw when no pair could be compared; leave the
            # panel empty instead of handing an empty frame to the heatmap
            if p_values.empty:
                continue

            # shorten names of records
            p_values = p_values.rename(
                index=ss_methods_abbrv_map, columns=ss_methods_abbrv_map
            )

            # plot heatmap
            plot_heatmap(
                p_values, ax[idx], bar_ax=None,
                cmap=ListedColormap(['whitesmoke']), fmt=".2f"
            )

            # highlight every cell whose p-value is at least 0.05
            rows, cols = np.where(p_values.to_numpy() >= 0.05)
            for r_idx, c_idx in zip(rows, cols):
                ax[idx].add_patch(
                    Rectangle((c_idx, r_idx), 1, 1, fill=True, color="lightcoral", lw=0)
                )

        # add title and save plot to pdf
        fig.subplots_adjust(wspace=.4)
        fig.suptitle(title)
        fig.savefig(pdf, format="pdf")
        plt.close(fig)

### merged stats

In [None]:
import warnings

# tests[net][proto] -> table of p-values (rows: ss1, columns: ss2)
tests = {}

# Turn warnings into exceptions only while the tests run. Leaving the context
# restores the previously active warning filters, also when an error occurs,
# instead of wiping every filter afterwards.
with warnings.catch_warnings():
    warnings.simplefilter("error")

    for net in sorted(nets):
        tests[net] = {}

        # selection metrics available for this network (same for all pairs)
        available_metrics = get_selection_metrics_for_net(net)

        for proto in protocols:

            p_dict = {}
            for ss1, ss2 in combinations(sorted(ss_methods), 2):
                # NaN marks a pair without a usable result: either one of the
                # methods was not evaluated on this network (greedy has not
                # been computed for all nets) or the test emitted a warning.
                p_value = np.nan
                if {ss1, ss2}.issubset(available_metrics):
                    gain_ss1 = get_sorted_gain_series(net, ss1, proto)
                    gain_ss2 = get_sorted_gain_series(net, ss2, proto)
                    try:
                        p_value = test_samples(gain_ss1, gain_ss2)
                    except Warning as warning:
                        # report the offending case; p_value stays NaN rather
                        # than silently reusing the value of the previous pair
                        print(ss1, ss2, net, proto, warning)
                p_dict.setdefault(ss1, {})[ss2] = p_value

            p_values = pd.DataFrame.from_dict(p_dict, orient='index')
            tests[net][proto] = p_values

In [None]:
# Example case showing why zsplit was applied instead of dropping zero
# differences.
net, proto = "er3", "OR"
ss1, ss2 = "degree_centrality", "k_shell_mln"

gain_ss1 = get_sorted_gain_series(net, ss1, proto)
gain_ss2 = get_sorted_gain_series(net, ss2, proto)
p_value = test_samples(gain_ss1, gain_ss2)

# element-wise gain differences together with the resulting p-value
(gain_ss1 - gain_ss2, p_value)


In [None]:
# aggregate the obtained statistics by counting occurrences of p_value > 0.05
reference_df = tests["aucs"]["AND"]
reference_cols = list(reference_df.columns)
reference_idx = list(reference_df.index)

# containers for counts of p_values > 0.05; explicitly float so that NaN can
# be stored in them later, whatever the dtype of the reference table is
p_counts_or = np.zeros(reference_df.shape, dtype=float)
p_counts_and = p_counts_or.copy()

# for each statistic computed for given net and proto determine p_values > 0.05
# and count them in container
for net, net_tests in tests.items():
    for proto, test in net_tests.items():
        assert list(test.columns) == reference_cols
        assert list(test.index) == reference_idx
        # cast first, so that missing entries stored as None become NaN and
        # compare as "not greater" instead of breaking the comparison
        p_counts = (test.astype(float) > 0.05).to_numpy().astype(int)
        if proto == "OR":
            p_counts_or += p_counts
        if proto == "AND":
            p_counts_and += p_counts

# the obtained matrices ought to be triangular: cells strictly below the main
# diagonal never hold a tested pair, so replace their zeros with NaN
rows_n, cols_n = p_counts_or.shape
p_counts_or[np.tril_indices(rows_n, -1, cols_n)] = np.nan
p_counts_and[np.tril_indices(rows_n, -1, cols_n)] = np.nan

# convert matrices to dataframes
p_counts_or = pd.DataFrame(p_counts_or, columns=reference_cols, index=reference_idx)
p_counts_and = pd.DataFrame(p_counts_and, columns=reference_cols, index=reference_idx)

In [None]:
# Figure with two heatmap panels and a narrow third panel for the colour bar.
fig, ax = plt.subplots(
    nrows=1, ncols=3, figsize=(10, 4), gridspec_kw={"width_ratios": [49, 49, 2]}
)
fig.tight_layout(pad=0.5, rect=(0.05, 0.05, 0.95, 0.85))

# Common colour scale: from zero up to the largest count of either table.
vrange = (0, max(p_counts_or.max().max(), p_counts_and.max().max()))

title = (
    "Occurences 'p-value > 0.05' for Wilcoxon tests performed on series "
    "'f(mi_value, seeding_budget)->gain' \n aggregated over all evaluated "
    "networks; protocols from left:"
)

# Count tables in the order in which they are drawn, left to right.
counts_by_proto = {"OR": p_counts_or, "AND": p_counts_and}

for idx, (proto, proto_df) in enumerate(counts_by_proto.items()):
    proto_df = proto_df.rename(index=ss_methods_abbrv_map, columns=ss_methods_abbrv_map)
    plot_heatmap(proto_df, ax[idx], ax[2], vrange=vrange, cmap="Reds", fmt=".0f")
    title = f"{title} {proto}"

fig.subplots_adjust(wspace=.4)
fig.suptitle(title)