In [4]:
import pandas as pd
import numpy as np
import networkx as nx
import os
from math import comb

## Table 1 Code
The data for the large datasets (e.g., the social networks) have been removed from this folder due to size constraints, so those datasets will not be shown if this code is rerun.

In [11]:
# Datasets whose data is stored as several sub-folders; the statistics cell
# merges those sub-folders before counting.
lsn_datasets = [
    "soc-youtube",
    "soc-orkut",
    "soc-livejournal",
    "soc-flickr",
]

# Datasets for which the statistics cell additionally reads the
# `<name>-nverts.txt` / `<name>-times.txt` files under ../../LinkPrediction/data.
pred_datasets = [
    "cont-hospital",
    "cont-workplace-13",
    "cont-workplace-15",
    "hosp-DAWN",
    "bills-senate",
    "bills-house",
    "coauth-dblp",
    "cont-primary-school",
    "cont-high-school",
]

# Every dataset reported in the table, in processing order.
all_datasets = [
    *pred_datasets,
    *lsn_datasets,
    'retail-trivago',
    'email-Enron',
    'cont-village',
]

In [18]:
# want to report: nodes, edges, filled triangles, number of classes

# Folder holding the per-dataset `<name>-nverts.txt` / `<name>-times.txt` files.
LINK_PRED_DIR = '../../LinkPrediction/data'


def _count_data_rows(path):
    """Return the number of lines in `path`, not counting the first (header) line."""
    # `with` guarantees the handle is closed; an empty file yields 0 rather than -1.
    with open(path) as handle:
        return max(sum(1 for _ in handle) - 1, 0)


def _read_nonblank_lines(path):
    """Return the whitespace-stripped, non-empty lines of the text file at `path`."""
    with open(path, 'r') as handle:
        return [text for text in (raw.strip() for raw in handle) if text]


def _multi_folder_counts(dataset):
    """Return (nodes, classes, edges, triangles) for a dataset split into sub-folders.

    Each sub-folder whose name starts with `dataset[4:]` contributes a
    labels.csv, edges.csv and triangles.csv; the files of one kind are
    concatenated and exact duplicate rows are dropped before counting.
    Returns None when no matching sub-folder exists.
    """
    prefix = dataset[4:]  # drops the first four characters ("soc-" for every lsn dataset)
    parts = sorted(
        name for name in os.listdir(dataset)
        if name.startswith(prefix) and os.path.isdir(os.path.join(dataset, name))
    )
    if not parts:
        return None

    def _merged(filename):
        frames = [pd.read_csv(f'{dataset}/{part}/{filename}') for part in parts]
        return pd.concat(frames).drop_duplicates()

    # NOTE(review): unlike the single-folder case, rows with group_code == -1
    # are NOT filtered out here -- confirm that this difference is intended.
    label_df = _merged('labels.csv')
    edge_df = _merged('edges.csv')
    tri_df = _merged('triangles.csv')
    return len(label_df), label_df['group_code'].nunique(), len(edge_df), len(tri_df)


def _single_folder_counts(dataset):
    """Return (nodes, classes, edges, triangles) for a dataset stored in one folder.

    Nodes whose `group_code` is -1 are excluded from the node and class counts
    (presumably -1 marks unlabeled nodes -- TODO confirm). Edges and triangles
    are counted as the data rows of edges.csv / triangles.csv.
    """
    label_df = pd.read_csv(f'{dataset}/labels.csv')
    label_df = label_df[label_df['group_code'] != -1]
    return (
        len(label_df),
        label_df['group_code'].nunique(),
        _count_data_rows(f'{dataset}/edges.csv'),
        _count_data_rows(f'{dataset}/triangles.csv'),
    )


def _temporal_counts(dataset):
    """Return (time_edges, time_triangles, time_steps) from the nverts/times files.

    Every line of `<dataset>-nverts.txt` is one simplex size k; it contributes
    C(k, 2) to the edge total and C(k, 3) to the triangle total.
    `time_steps` is the number of distinct lines in `<dataset>-times.txt`.
    """
    base = f'{LINK_PRED_DIR}/{dataset}/{dataset}'
    sizes = [int(text) for text in _read_nonblank_lines(f'{base}-nverts.txt')]
    time_edges = sum(comb(k, 2) for k in sizes if k > 1)
    time_triangles = sum(comb(k, 3) for k in sizes if k > 2)
    # A set of stripped lines replaces pd.unique(list), which pandas deprecates
    # for plain lists, and is insensitive to a missing final newline.
    time_steps = len(set(_read_nonblank_lines(f'{base}-times.txt')))
    return time_edges, time_triangles, time_steps


all_info = []
for dataset in all_datasets:
    print(dataset)

    # Some large datasets are not shipped with this folder; leave them out of
    # the table instead of aborting the whole cell with FileNotFoundError.
    if not os.path.isdir(dataset):
        print(f'  skipped: no data folder for {dataset}')
        continue

    if dataset in lsn_datasets:
        counts = _multi_folder_counts(dataset)
        if counts is None:
            print(f'  skipped: no sub-folders found for {dataset}')
            continue
    else:
        counts = _single_folder_counts(dataset)
    num_nodes, num_labels, num_edges, num_tris = counts

    # Temporal statistics exist only for the link-prediction datasets;
    # everything else reports 0 for them.
    if dataset in pred_datasets:
        num_time_edges, num_time_triangles, ct = _temporal_counts(dataset)
    else:
        num_time_edges, num_time_triangles, ct = 0, 0, 0

    all_info.append({
        'Dataset': dataset,
        "nodes": num_nodes,
        "classes": num_labels,
        "time_edges": num_time_edges,
        "edges": num_edges,
        "time_triangles": num_time_triangles,
        "triangles": num_tris,
        "time_steps": ct
    })

cont-hospital
cont-workplace-13
cont-workplace-15
hosp-DAWN
bills-senate
bills-house
coauth-dblp
cont-primary-school
cont-high-school
soc-youtube
soc-orkut
soc-livejournal
soc-flickr
retail-trivago
email-Enron
cont-village


In [19]:
# LaTeX cell text for each dataset identifier: a \texttt label followed by
# its \cite key(s). Used to replace the raw identifiers in the 'Dataset'
# column before the table is printed.
dataset_names = {
    'cont-village': '\\texttt{cont-village}~\\cite{ozella2021using}',
    'cont-hospital': '\\texttt{cont-hospital}~\\cite{genois2018can}',
    # the trailing space inside this literal is kept as-is
    'cont-workplace-13': '\\texttt{cont-workplace-13}~\\cite{genois2018can} ',
    'email-Enron': '\\texttt{email-Enron}~\\cite{benson2018simplicial}',
    'cont-workplace-15': '\\texttt{cont-workplace-15}~\\cite{genois2018can}',
    'cont-primary-school': '\\texttt{cont-primary-school}~\\cite{genois2018can}',
    'bills-senate': '\\texttt{bills-senate}~\\cite{fowler2006connecting, fowler2006legislative}',
    'cont-high-school': '\\texttt{cont-high-school}~\\cite{genois2018can}',
    # the trailing space inside this literal is kept as-is
    'bills-house': '\\texttt{bills-house}~\\cite{fowler2006connecting, fowler2006legislative} ',
    'hosp-DAWN': '\\texttt{hosp-DAWN}~\\cite{benson2018simplicial}',
    'soc-youtube': '\\texttt{soc-youtube}~\\cite{mislove2007measurement}',
    'soc-flickr': '\\texttt{soc-flickr}~\\cite{mislove2007measurement}',
    'coauth-dblp': '\\texttt{coauth-dblp}~\\cite{agarwal2016women}',
    # the 'retail-trivago' identifier is displayed under the label clicks-trivago
    'retail-trivago': '\\texttt{clicks-trivago}~\\cite{benson2018simplicial}',
    'soc-livejournal': '\\texttt{soc-livejournal}~\\cite{mislove2007measurement}',
    'soc-orkut': '\\texttt{soc-orkut}~\\cite{mislove2007measurement}',
}

In [20]:
# LaTeX horizontal rule printed in place of any count that is not positive.
LATEX_DASH = "\\rule[.5ex]{1em}{0.5pt}"


def _format_count(value):
    """Return `value` with thousands separators if it is > 0, else the LaTeX dash rule."""
    return "{:,}".format(value) if value > 0 else LATEX_DASH


# One row per dataset, smallest node count first.
df = pd.DataFrame(all_info).set_index("Dataset").sort_values(by='nodes')
# Format each column with Series.map: DataFrame.applymap is deprecated since
# pandas 2.1, whereas this form behaves the same on old and new versions.
df = df.apply(lambda column: column.map(_format_count)).reset_index()
# Swap the raw dataset identifiers for their LaTeX label + citation.
df['Dataset'] = df['Dataset'].map(dataset_names)
# Widen the column limit so the long LaTeX labels are not truncated when printed.
with pd.option_context("max_colwidth", 1000):
    print(df.to_latex(index=False, escape=False))

\begin{tabular}{llllllll}
\toprule
                                                                  Dataset &    nodes & classes &               time_edges &      edges &           time_triangles &  triangles &               time_steps \\
\midrule
                             \texttt{cont-village}~\cite{ozella2021using} &       46 &       5 &  \rule[.5ex]{1em}{0.5pt} &        329 &  \rule[.5ex]{1em}{0.5pt} &        610 &  \rule[.5ex]{1em}{0.5pt} \\
                              \texttt{cont-hospital}~\cite{genois2018can} &       81 &       5 &                  150,126 &      1,381 &                   97,263 &      6,268 &                   12,605 \\
                         \texttt{cont-workplace-13}~\cite{genois2018can}  &      100 &       5 &                  394,247 &      3,915 &                  778,057 &     80,173 &                   20,129 \\
                         \texttt{email-Enron}~\cite{benson2018simplicial} &      148 &       2 &  \rule[.5ex]{1em}{0.5pt} &      1,344 &