In [None]:
import pandas as pd
import shutil
from sklearn.preprocessing import LabelEncoder

# Root directory (relative to the current working directory) from which every
# input path ("data-source/...") and output path ("data/...") below is built.
path_main_dir = ".."

# Facebook Large

In [None]:
# Edge list of the Facebook Large dataset; the columns id_1 and id_2 are
# consumed by the export cell below.
fb_edges_csv = f"{path_main_dir}/data-source/facebook_large/musae_facebook_edges.csv"
df_cleora = pd.read_csv(fb_edges_csv)

In [None]:
import os

# Output files of the Facebook pages export: one line per group of nodes
# (the "clique" input) and the tab-separated star expansion of the same groups.
fb_cleora_input_clique_filename = f"{path_main_dir}/data/fb-pages/hyperedges-fb-pages.txt"
fb_cleora_input_star_filename = f"{path_main_dir}/data/fb-pages/star-fb-pages.txt"

# Both files live in the same directory; create it if it is missing.
os.makedirs(f"{path_main_dir}/data/fb-pages/", exist_ok=True)

In [None]:
# Export the edge list in two layouts at once:
#   * clique file: "<id_1> <id_2> <id_2> ..."      (one line per id_1 value)
#   * star file:   "<group index>\t<node id>"      (one line per group member)
with open(fb_cleora_input_clique_filename, "w") as f_cleora_clique, \
        open(fb_cleora_input_star_filename, "w") as f_cleora_star:
    grouped_train = df_cleora.groupby('id_1')
    for n, (name, group) in enumerate(grouped_train):
        # All id_2 partners of the current id_1 value, as strings.
        group_elems = [str(node_id) for node_id in group['id_2'].tolist()]

        f_cleora_clique.write(f"{name} {' '.join(group_elems)}\n")

        # The id_1 node itself is listed first, followed by every partner,
        # all tagged with the running group index n.
        f_cleora_star.write(f"{n}\t{name}\n")
        f_cleora_star.writelines(f"{n}\t{elem}\n" for elem in group_elems)

In [None]:
# Load the node targets and expose the page type under the column name 'target'.
df = pd.read_csv(
    f"{path_main_dir}/data-source/facebook_large/musae_facebook_target.csv"
).rename(columns={'page_type': 'target'})

# Swap the textual page types for the integer codes produced by LabelEncoder.
enc = LabelEncoder()
df['target'] = enc.fit_transform(df['target'])

df.to_csv(f"{path_main_dir}/data/fb-pages/node-labels-fb-pages.csv", index=False)

# Persist the class names, one per line, in the order held by enc.classes_.
with open(f"{path_main_dir}/data/fb-pages/label-names-fb-pages.txt", 'w') as f:
    f.writelines(class_name + '\n' for class_name in enc.classes_)

# Trivago clicks and Walmart trips

Copy required files

In [None]:
# Datasets that share the same source layout and are processed identically below.
dataset_names = ['trivago-clicks', 'walmart-trips']

for dataset_name in dataset_names:
    label_names_src = f"{path_main_dir}/data-source/{dataset_name}/label-names-{dataset_name}.txt"
    label_names_dst = f"{path_main_dir}/data/{dataset_name}/label-names-{dataset_name}.txt"

    # Create the per-dataset output directory, then copy the label names
    # over unchanged (copy2 also preserves file metadata).
    os.makedirs(f"{path_main_dir}/data/{dataset_name}", exist_ok=True)
    shutil.copy2(label_names_src, label_names_dst)

Convert node labels from txt file to csv

In [None]:
for dataset_name in dataset_names:
    filepath_in = f"{path_main_dir}/data-source/{dataset_name}/node-labels-{dataset_name}.txt"
    filepath_out = f"{path_main_dir}/data/{dataset_name}/node-labels-{dataset_name}.csv"

    # The source file has no header: a single column that is named 'target' here.
    df = pd.read_csv(filepath_in, header=None, names=['target'])

    # Shift the label indexes down by one so that they start from 0; this
    # helps in the next step (feature selection).
    df['target'] = df['target'] - 1

    # The row index is written out as the 'id' column.
    df.to_csv(filepath_out, index=True, index_label='id')

Create star format file based on hyperedges file

In [None]:
for dataset_name in dataset_names:
    filepath_in = f"{path_main_dir}/data-source/{dataset_name}/hyperedges-{dataset_name}.txt"
    filepath_out = f"{path_main_dir}/data/{dataset_name}/star-{dataset_name}.txt"
    filepath_out2 = f"{path_main_dir}/data/{dataset_name}/hyperedges-{dataset_name}.txt"

    with open(filepath_in, 'r') as f_in, open(filepath_out, 'w') as f_out, open(filepath_out2, 'w') as f_out2:
        for i, raw_line in enumerate(f_in):
            # Source lines are comma-separated ids; every id is decreased by one.
            members = [str(int(token) - 1) for token in raw_line.split(',')]

            # Hyperedge file: the line number followed by the space-separated members.
            f_out2.write(f"{i} {' '.join(members)}\n")

            # Star file: the line number paired with itself, then with each member.
            f_out.write(f"{i}\t{i}\n")
            f_out.writelines(f"{i}\t{member}\n" for member in members)

# Trivago clicks - continents version

In [None]:
# Start the continents variant from a fresh copy of the processed
# trivago-clicks directory.
path = f'{path_main_dir}/data/trivago-clicks-continents/'

shutil.rmtree(path, ignore_errors=True)
shutil.copytree(f"{path_main_dir}/data/trivago-clicks/", path, dirs_exist_ok=True)

# Insert "-continents" between each entry's name and its extension.
# os.path.splitext is used instead of slicing off the last four characters,
# so the suffix lands in the right place whatever the extension length is
# (or when there is no extension at all).
for filename in os.listdir(path):
    stem, extension = os.path.splitext(filename)
    os.rename(os.path.join(path, filename),
              os.path.join(path, f"{stem}-continents{extension}"))


In [None]:
import json

continents_path = f"{path_main_dir}/data-source/continents.json"

# Read the JSON inside a context manager so the file handle is closed
# right away instead of being left for the garbage collector.
with open(continents_path, 'r') as continents_file:
    data = json.load(continents_file)

# Each record carries a 'country' and a 'continent' entry.
keys = [x['country'] for x in data]
vals = [x['continent'] for x in data]

# Lookup table: country name -> continent name.
continents = dict(zip(keys, vals))

In [None]:
# Replace the per-country labels of the trivago-clicks-continents copy with
# per-continent labels: rewrite the label-name file and remap the 'target'
# column of the node-label file accordingly.

# lines[i] is the continent of the country stored on line i of the label-name
# file, i.e. the continent for the old label index i. Countries missing from
# the lookup table fall into "Other". rstrip('\n') removes only the newline,
# so a final line without a trailing newline keeps its last character.
with open(f"{path_main_dir}/data/trivago-clicks-continents/label-names-trivago-clicks-continents.txt", 'r') as f:
    lines = [continents.get(line.rstrip('\n'), "Other") for line in f]

# Continent name -> new label index. dict.fromkeys de-duplicates while keeping
# first-appearance order, so the numbering is identical on every run (iterating
# a set of strings is not).
continents_map = {continent: index for index, continent in enumerate(dict.fromkeys(lines))}

# Overwrite the label names; line k holds the continent whose new label is k.
with open(f"{path_main_dir}/data/trivago-clicks-continents/label-names-trivago-clicks-continents.txt", 'w') as f:
    for key in continents_map:
        f.write(key + "\n")

print(continents_map)

df = pd.read_csv(f"{path_main_dir}/data/trivago-clicks-continents/node-labels-trivago-clicks-continents.csv")

# Old label index -> new continent label, applied to every row in one pass.
# Replacing the indexes one after another in place would re-translate values
# that an earlier step had already converted (old index 0 -> 3, then 3 -> ...).
# Labels without an entry are left untouched. The result is assigned back to
# the column rather than relying on an in-place replace of a column selection.
old_to_new_label = {old_index: continents_map[continent] for old_index, continent in enumerate(lines)}
df['target'] = df['target'].map(lambda label: old_to_new_label.get(label, label))

df.to_csv(f"{path_main_dir}/data/trivago-clicks-continents/node-labels-trivago-clicks-continents.csv", index=False)