In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import os
import pickle

def create_triangle_list(G):
    """Return every closed triangle of ``G`` exactly once.

    For each edge ``(i, j)``, every node ``t`` adjacent to both ``i`` and ``j``
    closes the triangle ``{i, j, t}``.  Each triangle is reached once per edge
    it contains, so candidates are collected in a set of sorted tuples to
    deduplicate them as they are found.

    Parameters
    ----------
    G : graph
        Any object exposing ``edges()`` (iterable of node pairs) and
        ``neighbors(node)`` (iterable of adjacent nodes), e.g. a
        ``networkx.Graph``.  Node labels must be mutually orderable.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(num_triangles, 3)``.  Each row lists the three nodes
        of one triangle in ascending order; rows are unique and sorted
        lexicographically.  When the graph has no triangle the result has
        shape ``(0, 3)`` so callers can still treat it as a three-column table.
    """
    triangles = set()
    for i, j in G.edges():
        # Nodes adjacent to both endpoints of the edge close a triangle with it.
        common_neighbors = set(G.neighbors(i)) & set(G.neighbors(j))
        for t in common_neighbors:
            triangles.add(tuple(sorted((i, j, t))))

    if not triangles:
        # Keep the (k, 3) layout even when k == 0.
        return np.empty((0, 3), dtype=int)
    return np.array(sorted(triangles))

## Simplicial Stochastic Block Model
This is an example of a stochastic block model for simplicial complexes, which is built generatively much like the $\Delta$-ensemble of Kahle (Topology of random simplicial complexes: A survey) but with two communities.

The model is built by first establishing nodes, then edges, and then triangles.
Nodes are split into two communities (we use the same number of nodes in each); edges are then placed independently at random, with probability $p_1$ between two nodes of the same community and probability $q_1$ between nodes of different communities.
Then, triangles are placed inside closed triangles of the graph, with probability $p_2$ if all nodes are within the same community and probability $q_2$ otherwise. 

In [2]:
## 2 types of sweeps: first on p1/q1 and second on p2/q2
# convention: (n, p1, q1, p2, q2): base model of (1000, 0.05, 0.01, 0.15, 0.7)
# Each entry maps a model name to its parameter tuple in that order.
models = {}

# sweep 1: vary the within-community edge probability p1 as a multiple of q1,
# keeping q1, p2 and q2 fixed. The multiplier is 30**((idx - 5) / 5), so
# idx = 5 gives p1 == q1.
q1 = 0.01
for idx in range(1, 10):
    p1mult = 30**((idx - 5) / 5)
    models[f'ssbm-sweep1-{idx}'] = (1000, p1mult*q1, q1, 0.15, 0.7)

# sweep 2: vary the mixed-community triangle probability q2 as a multiple of
# p2, keeping p1, q1 and p2 fixed. The multiplier is 5**((idx - 5) / 5), so
# idx = 5 gives q2 == p2.
p2 = 0.15
for idx in range(1, 10):
    q2mult = 5**((idx - 5) / 5)
    models[f'ssbm-sweep2-{idx}'] = (1000, 0.05, 0.01, p2, q2mult * p2)

# Persist the parameter table; the context manager guarantees the file is
# flushed and closed even if pickling fails.
with open("models.pkl", "wb") as models_file:
    pickle.dump(models, models_file)

In [None]:
# number of datasets per model
num_trials = 50

# Fixed seed so that a fresh Restart & Run All regenerates the same datasets;
# change SEED to draw a different collection.
SEED = 0
rng = np.random.default_rng(SEED)

for model_name, params in models.items():
    n, p1, q1, p2, q2 = params

    # Quantities that depend only on the model parameters are built once per
    # model rather than once per trial.
    ns = np.array([n, n])
    P = np.array([[p1, q1], [q1, p1]])

    # Nodes 0..n-1 get group_code 0 and nodes n..2n-1 get group_code 1.
    labels = [(k, (k >= n)*1) for k in range(2*n)]
    ldf = pd.DataFrame(labels, columns=['id', 'group_code'])

    for trial_no in range(num_trials):
        # Determine edges with a stochastic block model; the graph seed is
        # drawn from `rng` so the whole run is governed by SEED.
        G = nx.stochastic_block_model(ns, P, seed=int(rng.integers(2**32)))

        # Force a (k, 3) layout so the code below also works when the graph
        # contains no closed triangle at all.
        closed_tlist = np.asarray(create_triangle_list(G)).reshape(-1, 3)

        elist = [sorted(e) for e in G.edges()]

        # A closed triangle is filled with probability p2 when its three nodes
        # share a community (all ids < n or all ids >= n) and q2 otherwise.
        same_community = (closed_tlist.max(axis=1) < n) | (closed_tlist.min(axis=1) >= n)
        fill_prob = np.where(same_community, p2, q2)
        tlist = closed_tlist[rng.random(len(closed_tlist)) < fill_prob]

        # Passing `columns=` to the constructor keeps the headers correct even
        # for empty tables (assigning `.columns` afterwards raises on them).
        edf = pd.DataFrame(elist, columns=['node_1', 'node_2'])
        tdf = pd.DataFrame(tlist, columns=['node_1', 'node_2', 'node_3'])
        cdf = pd.DataFrame(closed_tlist, columns=['node_1', 'node_2', 'node_3'])

        out_dir = f"ssbm/{model_name}-{trial_no}"
        os.makedirs(out_dir, exist_ok=True)
        ldf.to_csv(f"{out_dir}/labels.csv", index=False)
        edf.to_csv(f"{out_dir}/edges.csv", index=False)
        tdf.to_csv(f"{out_dir}/triangles.csv", index=False)
        cdf.to_csv(f"{out_dir}/all_closed_triangles.csv", index=False)