In [None]:
import pandas as pd

# Base directory (relative path) under which the `data/<dataset>/` folders
# used by the cells below are looked up.
path_main_dir = ".."

In [None]:
import random


class LabelsConverter:
    """Two-way mapping between label names and their integer codes.

    The code of a label is the zero-based position of its line in the
    label-names file.
    """

    def __init__(self, labels_path):
        """Read one label name per line from ``labels_path``.

        Only the line terminator is stripped, so the last label is read
        in full even when the file does not end with a newline.
        """
        with open(labels_path) as f:
            labels = [line.rstrip("\n") for line in f]

        # code -> name
        self.DECODE_MAP = dict(enumerate(labels))
        # name -> code (for a duplicated name the last occurrence wins)
        self.ENCODE_MAP = {name: code for code, name in enumerate(labels)}

    def decode_label(self, label):
        """Return the label name for an integer code (KeyError if unknown)."""
        return self.DECODE_MAP[label]

    def encode_label(self, label):
        """Return the integer code for a label name (KeyError if unknown)."""
        return self.ENCODE_MAP[label]

    def decode_dct(self, dct):
        """Return a copy of ``dct`` whose integer-code keys are replaced by names."""
        return {self.decode_label(k): v for k, v in dct.items()}

    def encode_dct(self, dct):
        """Return a copy of ``dct`` whose label-name keys are replaced by codes."""
        return {self.encode_label(k): v for k, v in dct.items()}

    def get_targets(self):
        """Return the list of all integer label codes."""
        return list(self.DECODE_MAP.keys())


def load_hyperedges(hyperedges_path):
    """Read hyperedges from a whitespace-separated text file.

    Each line yields one hyperedge. The first token of the line is skipped
    and every remaining token is parsed as an integer node id. A line with
    fewer than two tokens yields an empty set.

    Returns a list of sets of ints, in file order.
    """
    with open(hyperedges_path) as f_edges:
        rows = f_edges.readlines()

    return [{int(token) for token in row.split()[1:]} for row in rows]


def load_nodes(nodes_df_path, converter=None):
    """Group the node ids of a node-label CSV by class.

    Parameters
    ----------
    nodes_df_path : str
        CSV file whose first column is used as the index (node id) and
        which contains a ``target`` column.
    converter : object with ``get_targets()``, optional
        Supplies the class codes to group by. When omitted, the
        module-level ``conv`` is used, which keeps existing one-argument
        calls working.

    Returns
    -------
    dict
        ``{target: set of index values}`` with one entry per code returned
        by ``get_targets()``; a class with no rows maps to an empty set.
    """
    if converter is None:
        # Backward-compatible fallback to the notebook-level converter.
        converter = conv

    df = pd.read_csv(nodes_df_path, index_col=0)

    return {
        target: set(df.loc[df['target'] == target].index)
        for target in converter.get_targets()
    }


def calc_distribution(nodes, hyperedges, method):
    """Count how strongly each class is represented.

    Parameters
    ----------
    nodes : dict
        ``{class: set of node ids}``.
    hyperedges : iterable of set
        Hyperedges as sets of node ids (ignored for ``method="node"``).
    method : str
        ``"node"``      - number of nodes in each class;
        ``"hyperedge"`` - number of (node, hyperedge) memberships of each
        class, i.e. a node is counted once per hyperedge containing it.

    Returns
    -------
    dict
        ``{class: count}`` with the same keys as ``nodes``. Counts are raw
        integers, not percentages.

    Raises
    ------
    ValueError
        If ``method`` is neither ``"node"`` nor ``"hyperedge"``.
    """
    if method == "node":
        return {k: len(members) for k, members in nodes.items()}

    if method == "hyperedge":
        distrib = dict.fromkeys(nodes, 0)
        for edge in hyperedges:
            for k, members in nodes.items():
                distrib[k] += len(members.intersection(edge))
        return distrib

    raise ValueError("Method not recognized")


# methods are: node, hyperedge
def hypergraph_undersampling(nodes_df_path, hyperedges_path, conv, method='node', seed=None):
    """Randomly drop nodes from the larger classes until they match the smallest.

    Classes are processed one after another (the initial minority class is
    skipped). For the class being processed, a random node is removed from
    the class and from every hyperedge, and the distribution is recomputed,
    until the class count is no longer above the current minimum over all
    classes. Hyperedges left empty are dropped at the end.

    Parameters
    ----------
    nodes_df_path : str
        Node-label CSV, passed to ``load_nodes``.
    hyperedges_path : str
        Hyperedge text file, passed to ``load_hyperedges``.
    conv : object with ``decode_dct()``
        Used here only to print distributions with readable label names.
        ``load_nodes`` is called with the path alone.
    method : str
        ``'node'`` or ``'hyperedge'``; how class size is measured
        (forwarded to ``calc_distribution``).
    seed : optional
        When given, node selection uses a private ``random.Random(seed)``
        so runs are repeatable. When ``None`` (default), the global
        ``random`` module state is used, as before.

    Returns
    -------
    (dict, list)
        The remaining ``{class: set of node ids}`` and the list of
        non-empty hyperedges.
    """
    rng = random if seed is None else random.Random(seed)

    nodes = load_nodes(nodes_df_path)
    hyperedges = load_hyperedges(hyperedges_path)

    distrib = calc_distribution(nodes, hyperedges, method)
    print("Before resampling:", conv.decode_dct(distrib))

    minority_class = min(distrib, key=distrib.get)
    classes_to_resample = [c for c in distrib if c != minority_class]

    for class_under_resampling in classes_to_resample:
        # The threshold is re-evaluated after every removal.
        while distrib[class_under_resampling] > min(distrib.values()):
            node_to_remove = rng.choice(list(nodes[class_under_resampling]))

            nodes[class_under_resampling].remove(node_to_remove)
            for edge in hyperedges:
                edge.discard(node_to_remove)

            distrib = calc_distribution(nodes, hyperedges, method)

        print(conv.decode_dct(distrib))

    # Drop hyperedges that lost all of their nodes.
    hyperedges = [edge for edge in hyperedges if edge]

    return nodes, hyperedges

In [None]:
# Dataset to resample and the balancing criterion ('node' or 'hyperedge').
dataset_name = "fb-pages"
method = 'node'

# Input files of the source dataset.
label_names_path = f"{path_main_dir}/data/{dataset_name}/label-names-{dataset_name}.txt"
nodes_df_path = f"{path_main_dir}/data/{dataset_name}/node-labels-{dataset_name}.csv"
hyperedges_path = f"{path_main_dir}/data/{dataset_name}/hyperedges-{dataset_name}.txt"

# `conv` must exist at module level before the call below: load_nodes() reads it as a global.
conv = LabelsConverter(label_names_path)
nodes, hyperedges = hypergraph_undersampling(nodes_df_path, hyperedges_path, conv, method)

# Recompute the distribution on the returned structures as a final check.
result = conv.decode_dct(calc_distribution(nodes, hyperedges, method))
print("After resampling:", result)

# TODO: Re-index the nodes so that ids start at 1 and are consecutive, and update the edges accordingly?

Save to file

In [None]:
import os

# From here on `dataset_name` names the OUTPUT dataset.
# The guard keeps this cell idempotent: re-running it no longer appends the
# suffix a second time. Caveat: a source dataset whose own name already ends
# with "-resampled" keeps its name, so output would go to the input folder.
if not dataset_name.endswith("-resampled"):
    dataset_name += "-resampled"
os.makedirs(f"{path_main_dir}/data/{dataset_name}/", exist_ok=True)

In [None]:
hyperedges_resampled_path = f"{path_main_dir}/data/{dataset_name}/hyperedges-{dataset_name}.txt"

# One hyperedge per line: "<edge index> <node id> <node id> ...".
# This is the layout load_hyperedges() reads back (it skips the first token).
with open(hyperedges_resampled_path, "w") as f:
    for i, edge in enumerate(hyperedges):
        # Join the ids explicitly instead of stripping "{", "}" and ","
        # from the set's repr.
        line = f"{i} " + " ".join(map(str, edge)) + "\n"
        f.write(line)

In [None]:
nodes_df_resampled_path = f"{path_main_dir}/data/{dataset_name}/node-labels-{dataset_name}.csv"

# Invert {class: {node ids}} into {node id: class}.
nodes_reversed = dict()
for k, v in nodes.items():
    nodes_reversed.update(dict.fromkeys(v, k))

# Same layout load_nodes() reads: node id as the index, class in "target".
df_nodes = pd.DataFrame.from_dict(nodes_reversed, orient="index", columns=["target"])
df_nodes.index.name = "id"

df_nodes.to_csv(nodes_df_resampled_path)

# Preview last: only a cell's final expression is displayed.
df_nodes.head(20)

Update star schema

In [None]:
# Source: the resampled hyperedge file written above; target: the star file.
filepath_in = hyperedges_resampled_path
filepath_out = f"{path_main_dir}/data/{dataset_name}/star-{dataset_name}.txt"

# Each input line "<edge> <n1> <n2> ..." is expanded into tab-separated
# pairs "<edge>\t<token>", one pair per token on the line.
# NOTE(review): the first token (the edge index) is paired with itself as
# well, whereas load_hyperedges() skips it - confirm this is intended.
with open(filepath_in, 'r') as f_in, open(filepath_out, 'w') as f_out:
    for line in f_in:
        tokens = line.split()
        f_out.writelines(f"{tokens[0]}\t{token}\n" for token in tokens)

Copy label names file

In [None]:
import shutil

# Destination of the label-names file inside the resampled dataset folder.
label_names_path_new = f"{path_main_dir}/data/{dataset_name}/label-names-{dataset_name}.txt"

# The label names are reused unchanged, so the source file is copied as-is.
shutil.copy2(label_names_path, label_names_path_new)