In [None]:
import gzip
import pandas as pd
from glob import glob

In [None]:
def get_files(path, dataset):
    """Return the sorted paths of the ``.gz`` files found under ``path/dataset``.

    The directory is searched with the glob pattern ``**.gz``. ``glob`` is
    called without ``recursive=True``, so ``**`` behaves like ``*`` and only
    direct children of the directory are matched.
    """
    matches = glob(f"{path}/{dataset}/**.gz")
    matches.sort()
    return matches

In [None]:
# Root directory of the raw input; every dataset is read from "<path>/<dataset>/".
path = "../data_raw"

In [None]:
# Names of the non-synthetic datasets, in the order of their sections below.
datasets = [
    "aps-a", "aps-av", "aps-cv",
    "dblp", "dblp-v",
    "mus",
    "ndc-ai", "ndc-pc",
    "sha",
    "stex",
]

### aps-a

In [None]:
# Section configuration: raw sub-directory name, the CSV column that is kept,
# and the suffix used in the output file name.
dataset = "aps-a"
nodes = "authors"
dataset_type = "ihg"
files = get_files(path, dataset)
file = files[0]  # only the first archive (in sorted order) is read
# read_csv opens the path itself and infers gzip compression from the ".gz"
# suffix, so no separate gzip handle is needed.
df = pd.read_csv(file)
df = df[[nodes]]

In [None]:
# Keep only rows whose raw cell has at least three characters; this drops
# missing values (mapped to "") and two-character cells such as "[]".
has_content = df[nodes].fillna("").map(len) >= 3
df_filtered = df[has_content].copy()
# NOTE(review): eval() executes whatever text is stored in the CSV cell. If the
# raw files are not fully trusted, ast.literal_eval is the safer parser.
df_filtered[nodes] = df_filtered[nodes].map(eval)

# Flatten the per-row node collections once and reuse the result for both the
# filled-cell count and the node vocabulary.
exploded = df_filtered[nodes].explode()
n_cells_filled = len(exploded)
n_edges = len(df_filtered)

# Distinct nodes get 1-based integer ids in sorted order.
node_mapping = {node: node_id for node_id, node in enumerate(sorted(exploded.unique()), start=1)}
n_nodes = len(node_mapping)

print(
    f"n: {n_nodes}, m:{n_edges}",
    f"c: {n_cells_filled}, c/nm: {n_cells_filled / (n_nodes * n_edges)}"
)

In [None]:
# Encode every row as a tab-separated line of integer node ids. The result is
# kept in a separate Series so that df_filtered is not overwritten and this
# cell can be re-run on its own.
edge_lines = df_filtered[nodes].map(
    lambda members: "\t".join(str(node_mapping[member]) for member in members)
)
# to_csv infers gzip compression from the ".gz" suffix of the target path.
edge_lines.to_csv(f"../data/{dataset}.{dataset_type}.tsv.gz", header=False, index=False)

### aps-av

In [None]:
# Section configuration: raw sub-directory name, output file-name suffix, and
# the CSV column read from every input file.
dataset = "aps-av"
dataset_type = "chg"
nodes = "authors"
# Sorted order determines the 1-based index each file receives in the loop below.
files = get_files(path, dataset)

In [None]:
# Convert every input file separately: dfs collects the encoded rows and stats
# collects one (n_nodes, n_edges, n_cells_filled, density) tuple per file.
dfs = list()
stats = list()
for idx, file in enumerate(files, start=1):
    # read_csv opens the path itself and infers the compression from the file
    # suffix, so no separate gzip handle is needed.
    df = pd.read_csv(file)
    df = df[[nodes]]
    # Drop missing cells and cells shorter than three characters (e.g. "[]").
    df_filtered = df[df[nodes].fillna("").map(len) >= 3].copy()
    # NOTE(review): eval() executes the cell text; prefer ast.literal_eval if
    # the raw files are not fully trusted.
    df_filtered[nodes] = df_filtered[nodes].map(eval)
    # Flatten once; reused for the filled-cell count and the node vocabulary.
    exploded = df_filtered[nodes].explode()
    n_cells_filled = len(exploded)
    n_edges = len(df_filtered)
    # Node ids are 1-based and assigned per file, in sorted order.
    node_mapping = {node: node_id for node_id, node in enumerate(sorted(exploded.unique()), start=1)}
    n_nodes = len(node_mapping)
    # Each output line is "<file index>\t<node id>\t<node id>...".
    df_filtered[nodes] = df_filtered[nodes].map(
        lambda members: f"{idx}\t" + "\t".join(str(node_mapping[member]) for member in members)
    )
    dfs.append(df_filtered.copy())
    stats.append((n_nodes, n_edges, n_cells_filled, n_cells_filled / (n_nodes * n_edges)))

In [None]:
# Largest n_nodes/n_edges ratio and largest last-field value over all stats
# tuples (fields 0 and 1 are the node and edge counts).
max_ratio = max(entry[0] / entry[1] for entry in stats)
max_density = max(entry[-1] for entry in stats)
print(f"(n/m)_max: {max_ratio}")
print(f"(c/nm)_max: {max_density}")

In [None]:
# Stack the per-file frames and write them to a single output file; to_csv
# infers gzip compression from the ".gz" suffix.
combined = pd.concat(dfs, ignore_index=True)
out_path = f"../data/{dataset}.{dataset_type}.tsv.gz"
combined.to_csv(out_path, header=False, index=False)

### aps-cv

In [None]:
# Section configuration: raw sub-directory name, output file-name suffix, and
# the CSV column read from every input file.
dataset = "aps-cv"
dataset_type = "chg"
nodes = "cited_doi"
# Sorted order determines the 1-based index each file receives in the loop below.
files = get_files(path, dataset)

In [None]:
# Convert every input file separately: dfs collects the encoded rows and stats
# collects one (n_nodes, n_edges, n_cells_filled, density) tuple per file.
dfs = list()
stats = list()
for idx, file in enumerate(files, start=1):
    # read_csv opens the path itself and infers the compression from the file
    # suffix, so no separate gzip handle is needed.
    df = pd.read_csv(file)
    df = df[[nodes]]
    # Drop missing cells and cells shorter than three characters (e.g. "[]").
    df_filtered = df[df[nodes].fillna("").map(len) >= 3].copy()
    # NOTE(review): eval() executes the cell text; prefer ast.literal_eval if
    # the raw files are not fully trusted.
    df_filtered[nodes] = df_filtered[nodes].map(eval)
    # Rows whose parsed value is empty are removed as well.
    df_filtered = df_filtered[df_filtered[nodes].map(len) >= 1].copy()
    # Flatten once; reused for the filled-cell count and the node vocabulary.
    exploded = df_filtered[nodes].explode()
    n_cells_filled = len(exploded)
    n_edges = len(df_filtered)
    # Node ids are 1-based and assigned per file, in sorted order.
    node_mapping = {node: node_id for node_id, node in enumerate(sorted(exploded.unique()), start=1)}
    n_nodes = len(node_mapping)
    # Each output line is "<file index>\t<node id>\t<node id>...".
    df_filtered[nodes] = df_filtered[nodes].map(
        lambda members: f"{idx}\t" + "\t".join(str(node_mapping[member]) for member in members)
    )
    dfs.append(df_filtered.copy())
    stats.append((n_nodes, n_edges, n_cells_filled, n_cells_filled / (n_nodes * n_edges)))

In [None]:
# Largest n_nodes/n_edges ratio and largest last-field value over all stats
# tuples (fields 0 and 1 are the node and edge counts).
max_ratio = max(entry[0] / entry[1] for entry in stats)
max_density = max(entry[-1] for entry in stats)
print(f"(n/m)_max: {max_ratio}")
print(f"(c/nm)_max: {max_density}")

In [None]:
# Stack the per-file frames and write them to a single output file; to_csv
# infers gzip compression from the ".gz" suffix.
combined = pd.concat(dfs, ignore_index=True)
out_path = f"../data/{dataset}.{dataset_type}.tsv.gz"
combined.to_csv(out_path, header=False, index=False)

### dblp

In [None]:
# Section configuration: raw sub-directory name, output file-name suffix, and
# the CSV column that is kept.
dataset = "dblp"
dataset_type = "ihg"
nodes = "author"
files = get_files(path, dataset)
file = files[0]  # only the first archive (in sorted order) is read
# read_csv opens the path itself and infers gzip compression from the ".gz"
# suffix, so no separate gzip handle is needed.
df = pd.read_csv(file)
df = df[[nodes]]

In [None]:
# Keep only rows whose raw cell has at least three characters; missing values
# are mapped to "" first and therefore dropped.
has_content = df[nodes].fillna("").map(len) >= 3
df_filtered = df[has_content].copy()
# Cells are split on ";" into a list of node labels.
df_filtered[nodes] = df_filtered[nodes].map(lambda cell: cell.split(";"))
# NOTE(review): str.split always returns at least one element, so this filter
# never removes a row; it is kept unchanged.
df_filtered = df_filtered[df_filtered[nodes].map(len) >= 1].copy()

# Flatten the per-row lists once and reuse the result for both the
# filled-cell count and the node vocabulary.
exploded = df_filtered[nodes].explode()
n_cells_filled = len(exploded)
n_edges = len(df_filtered)

# Distinct nodes get 1-based integer ids in sorted order.
node_mapping = {node: node_id for node_id, node in enumerate(sorted(exploded.unique()), start=1)}
n_nodes = len(node_mapping)

print(
    f"n: {n_nodes}, m:{n_edges}",
    f"c: {n_cells_filled}, c/nm: {n_cells_filled / (n_nodes * n_edges)}"
)

In [None]:
# Encode every row as a tab-separated line of integer node ids. The result is
# kept in a separate Series so that df_filtered is not overwritten and this
# cell can be re-run on its own.
edge_lines = df_filtered[nodes].map(
    lambda members: "\t".join(str(node_mapping[member]) for member in members)
)
# to_csv infers gzip compression from the ".gz" suffix of the target path.
edge_lines.to_csv(f"../data/{dataset}.{dataset_type}.tsv.gz", header=False, index=False)

### dblp-v

In [None]:
# Section configuration: raw sub-directory name, output file-name suffix, and
# the CSV column read from every input file.
dataset = "dblp-v"
dataset_type = "chg"
nodes = "author"
# Sorted order determines the 1-based index each file receives in the loop below.
files = get_files(path, dataset)

In [None]:
# Convert every input file separately: dfs collects the encoded rows and stats
# collects one (n_nodes, n_edges, n_cells_filled, density) tuple per file.
dfs = list()
stats = list()
for idx, file in enumerate(files, start=1):
    # read_csv opens the path itself and infers the compression from the file
    # suffix, so no separate gzip handle is needed.
    df = pd.read_csv(file)
    df = df[[nodes]]
    # Drop missing cells and cells shorter than three characters.
    df_filtered = df[df[nodes].fillna("").map(len) >= 3].copy()
    # Cells are split on ";" into a list of node labels.
    df_filtered[nodes] = df_filtered[nodes].map(lambda cell: cell.split(";"))
    # NOTE(review): str.split always returns at least one element, so this
    # filter never removes a row; it is kept unchanged.
    df_filtered = df_filtered[df_filtered[nodes].map(len) >= 1].copy()
    # Flatten once; reused for the filled-cell count and the node vocabulary.
    exploded = df_filtered[nodes].explode()
    n_cells_filled = len(exploded)
    n_edges = len(df_filtered)
    # Node ids are 1-based and assigned per file, in sorted order.
    node_mapping = {node: node_id for node_id, node in enumerate(sorted(exploded.unique()), start=1)}
    n_nodes = len(node_mapping)
    # Each output line is "<file index>\t<node id>\t<node id>...".
    df_filtered[nodes] = df_filtered[nodes].map(
        lambda members: f"{idx}\t" + "\t".join(str(node_mapping[member]) for member in members)
    )
    dfs.append(df_filtered.copy())
    stats.append((n_nodes, n_edges, n_cells_filled, n_cells_filled / (n_nodes * n_edges)))

In [None]:
# Largest n_nodes/n_edges ratio and largest last-field value over all stats
# tuples (fields 0 and 1 are the node and edge counts).
max_ratio = max(entry[0] / entry[1] for entry in stats)
max_density = max(entry[-1] for entry in stats)
print(f"(n/m)_max: {max_ratio}")
print(f"(c/nm)_max: {max_density}")

In [None]:
# Stack the per-file frames and write them to a single output file; to_csv
# infers gzip compression from the ".gz" suffix.
combined = pd.concat(dfs, ignore_index=True)
out_path = f"../data/{dataset}.{dataset_type}.tsv.gz"
combined.to_csv(out_path, header=False, index=False)

### mus

In [None]:
# Section configuration: raw sub-directory name, output file-name suffix, and
# the CSV column read from every input file.
dataset = "mus"
dataset_type = "chg"
nodes = "frequencies440"
# This section globs for ".csv" files directly instead of calling get_files,
# which only matches ".gz" files.
files = sorted(glob(f"{path}/{dataset}/**.csv"))

In [None]:
# Convert every input file separately: dfs collects the encoded rows and stats
# collects one (n_nodes, n_edges, n_cells_filled, density) tuple per file.
dfs = list()
stats = list()
for idx, file in enumerate(files, start=1):
    # The inputs of this section are ".csv" files, so they are read directly;
    # wrapping the read in gzip.open served no purpose.
    df = pd.read_csv(file)
    df = df[[nodes]]
    # Drop cells shorter than three characters (e.g. "[]"). Missing values are
    # mapped to "" first so that len() does not fail on NaN, as in the other
    # sections.
    df_filtered = df[df[nodes].fillna("").map(len) >= 3].copy()
    # NOTE(review): eval() executes the cell text; prefer ast.literal_eval if
    # the raw files are not fully trusted.
    df_filtered[nodes] = df_filtered[nodes].map(eval)
    # Flatten once; reused for the filled-cell count and the node vocabulary.
    exploded = df_filtered[nodes].explode()
    n_cells_filled = len(exploded)
    n_edges = len(df_filtered)
    # Node ids are 1-based and assigned per file, in sorted order.
    node_mapping = {node: node_id for node_id, node in enumerate(sorted(exploded.unique()), start=1)}
    n_nodes = len(node_mapping)
    # Each output line is "<file index>\t<node id>\t<node id>...".
    df_filtered[nodes] = df_filtered[nodes].map(
        lambda members: f"{idx}\t" + "\t".join(str(node_mapping[member]) for member in members)
    )
    dfs.append(df_filtered.copy())
    stats.append((n_nodes, n_edges, n_cells_filled, n_cells_filled / (n_nodes * n_edges)))

In [None]:
# Largest n_nodes/n_edges ratio and largest last-field value over all stats
# tuples (fields 0 and 1 are the node and edge counts).
max_ratio = max(entry[0] / entry[1] for entry in stats)
max_density = max(entry[-1] for entry in stats)
print(f"(n/m)_max: {max_ratio}")
print(f"(c/nm)_max: {max_density}")

In [None]:
# Stack the per-file frames and write them to a single output file; to_csv
# infers gzip compression from the ".gz" suffix.
combined = pd.concat(dfs, ignore_index=True)
out_path = f"../data/{dataset}.{dataset_type}.tsv.gz"
combined.to_csv(out_path, header=False, index=False)

### ndc-ai

In [None]:
# Section configuration: raw sub-directory name, the CSV column that is kept,
# and the suffix used in the output file name.
dataset = "ndc-ai"
nodes = "active_ingredients_names"
dataset_type = "ihg"
files = get_files(path, dataset)
file = files[0]  # only the first archive (in sorted order) is read
# read_csv opens the path itself and infers gzip compression from the ".gz"
# suffix, so no separate gzip handle is needed.
df = pd.read_csv(file)
df = df[[nodes]]

In [None]:
# Keep only rows whose raw cell has at least three characters (drops cells
# such as "[]"). Missing values are mapped to "" first so that len() does not
# fail on NaN, as in the other sections.
has_content = df[nodes].fillna("").map(len) >= 3
df_filtered = df[has_content].copy()
# NOTE(review): eval() executes whatever text is stored in the CSV cell. If the
# raw files are not fully trusted, ast.literal_eval is the safer parser.
df_filtered[nodes] = df_filtered[nodes].map(eval)

# Flatten the per-row node collections once and reuse the result for both the
# filled-cell count and the node vocabulary.
exploded = df_filtered[nodes].explode()
n_cells_filled = len(exploded)
n_edges = len(df_filtered)

# Distinct nodes get 1-based integer ids in sorted order.
node_mapping = {node: node_id for node_id, node in enumerate(sorted(exploded.unique()), start=1)}
n_nodes = len(node_mapping)

print(
    f"n: {n_nodes}, m:{n_edges}",
    f"c: {n_cells_filled}, c/nm: {n_cells_filled / (n_nodes * n_edges)}"
)

In [None]:
# Encode every row as a tab-separated line of integer node ids. The result is
# kept in a separate Series so that df_filtered is not overwritten and this
# cell can be re-run on its own.
edge_lines = df_filtered[nodes].map(
    lambda members: "\t".join(str(node_mapping[member]) for member in members)
)
# to_csv infers gzip compression from the ".gz" suffix of the target path.
edge_lines.to_csv(f"../data/{dataset}.{dataset_type}.tsv.gz", header=False, index=False)

### ndc-pc

In [None]:
# Section configuration: raw sub-directory name, the CSV column that is kept,
# and the suffix used in the output file name.
dataset = "ndc-pc"
nodes = "pharm_class"
dataset_type = "ihg"
files = get_files(path, dataset)
file = files[0]  # only the first archive (in sorted order) is read
# read_csv opens the path itself and infers gzip compression from the ".gz"
# suffix, so no separate gzip handle is needed.
df = pd.read_csv(file)
df = df[[nodes]]

In [None]:
# Rows whose cell is missing or shorter than three characters are discarded.
non_trivial = df[nodes].fillna("").map(len) >= 3
df_filtered = df.loc[non_trivial].copy()
# NOTE(review): eval() executes the cell text; prefer ast.literal_eval if the
# raw files are not fully trusted.
df_filtered[nodes] = df_filtered[nodes].map(eval)
n_edges = len(df_filtered)
n_cells_filled = len(df_filtered[nodes].explode())

In [None]:
# Number the distinct nodes from 1, following their sorted order.
distinct_nodes = sorted(df_filtered[nodes].explode().unique())
node_mapping = dict(zip(distinct_nodes, range(1, len(distinct_nodes) + 1)))
n_nodes = len(node_mapping)

In [None]:
# Report node count n, edge count m, filled cells c and the ratio c/(n*m).
size_summary = f"n: {n_nodes}, m:{n_edges}"
fill_summary = f"c: {n_cells_filled}, c/nm: {n_cells_filled / (n_nodes * n_edges)}"
print(size_summary, fill_summary)

In [None]:
# Encode every row as a tab-separated line of integer node ids. The result is
# kept in a separate Series so that df_filtered is not overwritten and this
# cell can be re-run on its own.
edge_lines = df_filtered[nodes].map(
    lambda members: "\t".join(str(node_mapping[member]) for member in members)
)
# to_csv infers gzip compression from the ".gz" suffix of the target path.
edge_lines.to_csv(f"../data/{dataset}.{dataset_type}.tsv.gz", header=False, index=False)

### sha

In [None]:
# Section configuration: raw sub-directory name, output file-name suffix, and
# the CSV column read from every input file.
dataset = "sha"
dataset_type = "chg"
nodes = "onstage"
# This section globs for ".csv" files directly instead of calling get_files,
# which only matches ".gz" files.
files = sorted(glob(f"{path}/{dataset}/**.csv"))

In [None]:
# Convert every input file separately: dfs collects the encoded rows and stats
# collects one (n_nodes, n_edges, n_cells_filled, density) tuple per file.
dfs = list()
stats = list()
for idx, file in enumerate(files, start=1):
    # The inputs of this section are ".csv" files, so they are read directly;
    # wrapping the read in gzip.open served no purpose.
    df = pd.read_csv(file)
    df = df[[nodes]]
    # Drop cells shorter than three characters. Missing values are mapped to
    # "" first so that len() does not fail on NaN, as in the other sections.
    df_filtered = df[df[nodes].fillna("").map(len) >= 3].copy()
    # Cells are whitespace-separated tokens.
    df_filtered[nodes] = df_filtered[nodes].map(lambda cell: cell.split())
    # Discard tokens whose part before the first "_" is entirely upper case.
    df_filtered[nodes] = df_filtered[nodes].map(
        lambda tokens: [token for token in tokens if not token.split("_")[0].isupper()]
    )
    # Rows left without any token are removed.
    df_filtered = df_filtered[df_filtered[nodes].map(len) >= 1].copy()
    # Flatten once; reused for the filled-cell count and the node vocabulary.
    exploded = df_filtered[nodes].explode()
    n_cells_filled = len(exploded)
    n_edges = len(df_filtered)
    # Node ids are 1-based and assigned per file, in sorted order.
    node_mapping = {node: node_id for node_id, node in enumerate(sorted(exploded.unique()), start=1)}
    n_nodes = len(node_mapping)
    # Each output line is "<file index>\t<node id>\t<node id>...".
    df_filtered[nodes] = df_filtered[nodes].map(
        lambda members: f"{idx}\t" + "\t".join(str(node_mapping[member]) for member in members)
    )
    dfs.append(df_filtered.copy())
    stats.append((n_nodes, n_edges, n_cells_filled, n_cells_filled / (n_nodes * n_edges)))

In [None]:
# Largest n_nodes/n_edges ratio and largest last-field value over all stats
# tuples (fields 0 and 1 are the node and edge counts).
max_ratio = max(entry[0] / entry[1] for entry in stats)
max_density = max(entry[-1] for entry in stats)
print(f"(n/m)_max: {max_ratio}")
print(f"(c/nm)_max: {max_density}")

In [None]:
# Stack the per-file frames and write them to a single output file; to_csv
# infers gzip compression from the ".gz" suffix.
combined = pd.concat(dfs, ignore_index=True)
out_path = f"../data/{dataset}.{dataset_type}.tsv.gz"
combined.to_csv(out_path, header=False, index=False)

### stex

In [None]:
# Section configuration: raw sub-directory name, output file-name suffix, and
# the CSV column read from every input file.
dataset = "stex"
dataset_type = "chg"
nodes = "tags"
# Sorted order determines the 1-based index each file receives in the loop below.
files = get_files(path, dataset)

In [None]:
# Convert every input file separately: dfs collects the encoded rows and stats
# collects one (n_nodes, n_edges, n_cells_filled, density) tuple per file.
dfs = list()
stats = list()
for idx, file in enumerate(files, start=1):
    # read_csv opens the path itself and infers the compression from the file
    # suffix, so no separate gzip handle is needed.
    df = pd.read_csv(file)
    # No length filter is applied in this section: every row is parsed.
    df_filtered = df[[nodes]].copy()
    # NOTE(review): eval() executes the cell text; prefer ast.literal_eval if
    # the raw files are not fully trusted.
    df_filtered[nodes] = df_filtered[nodes].map(eval)
    # Flatten once; reused for the filled-cell count and the node vocabulary.
    exploded = df_filtered[nodes].explode()
    n_cells_filled = len(exploded)
    n_edges = len(df_filtered)
    # Node ids are 1-based and assigned per file, in sorted order.
    node_mapping = {node: node_id for node_id, node in enumerate(sorted(exploded.unique()), start=1)}
    n_nodes = len(node_mapping)
    # Each output line is "<file index>\t<node id>\t<node id>...".
    df_filtered[nodes] = df_filtered[nodes].map(
        lambda members: f"{idx}\t" + "\t".join(str(node_mapping[member]) for member in members)
    )
    dfs.append(df_filtered.copy())
    stats.append((n_nodes, n_edges, n_cells_filled, n_cells_filled / (n_nodes * n_edges)))

In [None]:
# Largest n_nodes/n_edges ratio and largest last-field value over all stats
# tuples (fields 0 and 1 are the node and edge counts).
max_ratio = max(entry[0] / entry[1] for entry in stats)
max_density = max(entry[-1] for entry in stats)
print(f"(n/m)_max: {max_ratio}")
print(f"(c/nm)_max: {max_density}")

In [None]:
# Stack the per-file frames and write them to a single output file; to_csv
# infers gzip compression from the ".gz" suffix.
combined = pd.concat(dfs, ignore_index=True)
out_path = f"../data/{dataset}.{dataset_type}.tsv.gz"
combined.to_csv(out_path, header=False, index=False)

### syn

In [None]:
import json

In [None]:
# Output file-name suffix used for the synthetic datasets.
dataset_type = "chg"
# Raw sub-directories that the next cell converts one by one.
syn_datasets = ["syn_hcm", "syn_hnmp", "syn_hsbm"]

In [None]:
# For every synthetic dataset, read each gzip-compressed JSON file, turn the
# entries of its "cr" field into lines prefixed with the 1-based file index,
# and write all lines of the dataset to one gzip-compressed text file.
for dataset in syn_datasets:
    blocks = []
    for graph_no, archive in enumerate(get_files(path, dataset), start=1):
        with gzip.open(archive) as handle:
            payload = json.load(handle)
        rows = []
        for entry in payload["cr"]:
            rows.append(f"{graph_no}\t" + "\t".join(map(str, entry)))
        blocks.append("\n".join(rows))
    target = f"../data/{dataset}.{dataset_type}.tsv.gz"
    with gzip.open(target, "wt") as sink:
        sink.write("\n".join(blocks))

In [None]:
# Combine the files of syn_hcm and syn_hsbm into one output file. The 1-based
# index runs over the concatenated file list, so the syn_hsbm files are
# numbered after the syn_hcm ones.
dataset_type = "chg"
# The output name is set once; the earlier placeholder value was never used.
dataset = "syn_hcm-hsbm"
files = get_files(path, "syn_hcm") + get_files(path, "syn_hsbm")
graph_strings = []
for idx, file in enumerate(files, start=1):
    with gzip.open(file) as f:
        data = json.load(f)
    # One line per entry of the "cr" field: "<file index>\t<value>\t<value>...".
    graph_strings.append(
        "\n".join(f"{idx}\t" + "\t".join(str(x) for x in y) for y in data["cr"])
    )
joined = "\n".join(graph_strings)
with gzip.open(f"../data/{dataset}.{dataset_type}.tsv.gz", "wt") as f:
    f.write(joined)