In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Project root, relative to the current working directory; the data and
# output paths used in this notebook are built from it.
path_main_dir = ".."

## Facebook Large

Attribute: facebook_id

In [None]:
# Node labels of the Facebook pages dataset; the first CSV column is used as the index.
df_fb = pd.read_csv(
    f"{path_main_dir}/data/fb-pages/node-labels-fb-pages.csv",
    index_col=0,
)

# Number of distinct values in the facebook_id column.
print("Unique facebook_id's:", len(df_fb["facebook_id"].unique()))

Attribute: page_name

In [None]:
from collections import Counter

# Tally every page name, then keep only the names carried by at least two rows.
page_name_counts = Counter(df_fb["page_name"].values)
cnt = Counter({name: n for name, n in page_name_counts.items() if n >= 2})

print("Number of page names appearing more than once:", len(cnt))
print("Most common page names:", cnt.most_common(10))

Attribute: target

In [None]:
import os


def draw_pie_chart(result, name):
    """Draw a pie chart of per-label counts, save it as a PNG and print the counts.

    Parameters
    ----------
    result : dict
        Maps a label to its count; keys become the wedge labels and values
        the wedge sizes.
    name : str
        Figure title. Also used as the file name (``<name>.png``) inside the
        module-level ``plots_path`` directory.
    """
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.set_title(name)
    ax.pie(result.values(), labels=result.keys(), autopct='%1.1f%%')
    # plots_path is looked up at call time, so it only has to exist before the first call.
    fig.savefig(f"{plots_path}/{name}.png", bbox_inches="tight")
    print("Precise numbers per label:", result)


def get_labels_and_node_stats(plot_name, path_label, df):
    """Plot the class distribution of the nodes and return the label lookup.

    Parameters
    ----------
    plot_name : str
        Dataset name; the chart is titled and saved as ``"<plot_name> - target"``.
    path_label : str
        Text file with one label name per line. The label on line ``i``
        (counting from 0) is assigned to class id ``i``.
    df : pandas.DataFrame
        Node table; its ``target`` column holds the class id of every node.

    Returns
    -------
    dict
        Mapping from class id (0-based line position) to label name.

    Raises
    ------
    KeyError
        If ``df['target']`` contains an id that has no line in the label file.
    """
    # Strip only the line terminator. Slicing off the last character instead
    # would truncate the final label when the file has no trailing newline.
    with open(path_label) as f:
        labels = [line.rstrip("\n") for line in f]

    labels_map = dict(enumerate(labels))

    # Count nodes per class id, re-key by label name, then order by frequency.
    cnt = Counter(df['target'])
    cnt = Counter({labels_map[k]: c for k, c in cnt.items()})
    result = dict(cnt.most_common())

    draw_pie_chart(result, f"{plot_name} - target")

    print("Number of nodes:", df.shape[0])

    return labels_map


# Directory the figures of this notebook are saved into; created up front
# (no error if it already exists) so that saving never hits a missing folder.
plots_path = f"{path_main_dir}/output/plots/analysis"
os.makedirs(plots_path, exist_ok=True)

In [None]:
# Dataset name used as the prefix of chart titles and file names.
plot_name = "Facebook Large"
path_label = f"{path_main_dir}/data/fb-pages/label-names-fb-pages.txt"

# Draws the node class distribution and returns the class-id -> label-name mapping.
labels_map = get_labels_and_node_stats(plot_name, path_label, df_fb)

Basic statistics about hyperedges and number of connections per class.

In [None]:
from statistics import mean, median


def get_hyperedges_stats(plot_name, hyperedges_path, df, labels_map):
    """Print hyperedge size statistics and plot the class make-up of hyperedges.

    Every line of the hyperedge file is split on whitespace. The first token
    is skipped and each remaining token is read as an integer row position
    into ``df``. Each node occurrence adds one to the count of that node's
    class.

    Parameters
    ----------
    plot_name : str
        Dataset name; the chart is titled and saved as
        ``"<plot_name> - target distribution in hyperedges"``.
    hyperedges_path : str
        Path of the text file with one hyperedge per line.
    df : pandas.DataFrame
        Node table with a ``target`` column, addressed by row position.
    labels_map : dict
        Mapping from class id to label name.

    Returns
    -------
    None
        Results are printed and drawn. For a file without any line only the
        hyperedge count is printed.
    """
    # Extract the column once: df.iloc[i]['target'] would build a whole row
    # Series for every single node occurrence.
    targets = df['target'].values

    hyperedges_len = []
    connections_per_class = dict()
    with open(hyperedges_path) as f_edges:
        for line in f_edges:
            hyperedge = line.split()[1:]
            hyperedges_len.append(len(hyperedge))

            for node in hyperedge:
                target = targets[int(node)]
                connections_per_class[target] = connections_per_class.get(target, 0) + 1

    print("Number of hyperedges:", len(hyperedges_len))
    if not hyperedges_len:
        # mean/median/max are undefined on an empty sequence and would raise.
        return

    print("Mean value:", mean(hyperedges_len))
    print("Median value:", median(hyperedges_len))
    print("Rank of hypergraph (max edge length):", max(hyperedges_len))

    # Re-key the counts by label name for display.
    connections_per_class = {labels_map[k]: v for k, v in connections_per_class.items()}
    draw_pie_chart(connections_per_class, f"{plot_name} - target distribution in hyperedges")


In [None]:
# Hyperedge file of the Facebook pages dataset.
hyperedges_path = f"{path_main_dir}/data/fb-pages/hyperedges-fb-pages.txt"

# Reuses plot_name and labels_map set by the preceding Facebook cell.
get_hyperedges_stats(plot_name, hyperedges_path, df_fb, labels_map)

## Walmart trips

In [None]:
# Node labels of the Walmart trips dataset; the first CSV column is used as the index.
df_walmart = pd.read_csv(f"{path_main_dir}/data/walmart-trips/node-labels-walmart-trips.csv", index_col=0)

In [None]:
# Dataset name used as the prefix of chart titles and file names.
plot_name = "Walmart trips"
path_label = f"{path_main_dir}/data/walmart-trips/label-names-walmart-trips.txt"

# Draws the node class distribution and returns the class-id -> label-name mapping.
labels_map = get_labels_and_node_stats(plot_name, path_label, df_walmart)

Basic statistics about hyperedges and number of connections per class.

In [None]:
# Hyperedge file of the Walmart trips dataset.
hyperedges_path = f"{path_main_dir}/data/walmart-trips/hyperedges-walmart-trips.txt"

# Reuses plot_name and labels_map set by the preceding Walmart cell.
get_hyperedges_stats(plot_name, hyperedges_path, df_walmart, labels_map)

## Trivago clicks

In [None]:
# Node labels of the Trivago clicks dataset; the first CSV column is used as the index.
df_trivago = pd.read_csv(f"{path_main_dir}/data/trivago-clicks/node-labels-trivago-clicks.csv", index_col=0)

In [None]:
# Label names: line i (0-based) of the file names class id i. Only the line
# terminator is stripped, so a final line without a trailing newline keeps
# its last character.
with open(f"{path_main_dir}/data/trivago-clicks/label-names-trivago-clicks.txt") as f:
    labels = [line.rstrip("\n") for line in f]

labels_map = dict(enumerate(labels))

# Count nodes per class id and re-key the counts by label name.
cnt = Counter(df_trivago['target'])
cnt = Counter({labels_map[k]: c for k, c in cnt.items()})

# Classes ordered from most to least frequent.
result = dict(cnt.most_common())

# Line plot of the sorted counts: the x position is the frequency rank of
# the class, because `result` is ordered by descending count.
plt.figure(figsize=(8, 6), dpi=80)
plt.plot(range(len(result)), list(result.values()))
plt.xlabel("Class index")
plt.ylabel("Number of nodes")
plt.savefig(plots_path + "/Trivago clicks - target.png")

print("Number of nodes:", df_trivago.shape[0])
# print("Precise numbers per label:", result)
# print("in %:", [round(x * 100, 2) / df_trivago.shape[0] for x in result.values()])

In [None]:
# Dataset name used as the prefix of the chart title and file name.
plot_name = "Trivago clicks"
hyperedges_path = f"{path_main_dir}/data/trivago-clicks/hyperedges-trivago-clicks.txt"

# labels_map comes from the preceding Trivago cell.
get_hyperedges_stats(plot_name, hyperedges_path, df_trivago, labels_map)

## Trivago clicks continents

In [None]:
# Node labels of the Trivago clicks (continents) dataset; the first CSV column is used as the index.
df_trivago_cont = pd.read_csv(
    f"{path_main_dir}/data/trivago-clicks-continents/node-labels-trivago-clicks-continents.csv",
    index_col=0)

In [None]:
# Dataset name used as the prefix of chart titles and file names.
plot_name = "Trivago clicks (continents)"
path_label = f"{path_main_dir}/data/trivago-clicks-continents/label-names-trivago-clicks-continents.txt"

# Draws the node class distribution and returns the class-id -> label-name mapping.
labels_map = get_labels_and_node_stats(plot_name, path_label, df_trivago_cont)

Basic statistics about hyperedges and number of connections per class.

In [None]:
# Hyperedge file of the Trivago clicks (continents) dataset.
hyperedges_path = f"{path_main_dir}/data/trivago-clicks-continents/hyperedges-trivago-clicks-continents.txt"

# Reuses plot_name and labels_map set by the preceding continents cell.
get_hyperedges_stats(plot_name, hyperedges_path, df_trivago_cont, labels_map)