In [1]:
import pandas as pd
import networkx as nx
import numpy as np

def create_triangle_list(G):
    '''
    Enumerate every closed triangle (three mutually adjacent nodes) in the graph G.

    Parameters
    ----------
    G : graph-like object
        Must expose ``edges()`` (an iterable of ``(i, j)`` pairs) and
        ``neighbors(node)`` (an iterable of adjacent nodes), as a
        ``networkx.Graph`` does.

    Returns
    -------
    numpy.ndarray of shape (n_triangles, 3)
        One row per distinct triangle. Node ids are sorted ascending within
        a row and rows are sorted lexicographically. When G contains no
        triangle an empty array of shape (0, 3) is returned, so callers can
        always rely on three columns.
    '''
    # Build each node's neighbor set once instead of once per incident edge.
    neighbor_sets = {}

    def neighbors_of(node):
        if node not in neighbor_sets:
            neighbor_sets[node] = set(G.neighbors(node))
        return neighbor_sets[node]

    # A set of sorted tuples de-duplicates on the fly: every triangle is
    # discovered once from each of its three edges.
    triangles = set()

    for i, j in G.edges():
        if i == j:
            # a self-loop can never be the side of a triangle
            continue

        # nodes adjacent to both endpoints close a triangle with edge (i, j);
        # drop i and j themselves, which only show up here through self-loops
        common_neighbors = (neighbors_of(i) & neighbors_of(j)) - {i, j}

        for t in common_neighbors:
            triangles.add(tuple(sorted((i, j, t))))

    if not triangles:
        return np.empty((0, 3), dtype=int)

    return np.array(sorted(triangles))

In [2]:
# Names of the datasets to analyse; each one is used as the directory
# prefix when the per-dataset CSV files are read.
all_datasets = [
    "cont-hospital",
    "cont-workplace-13",
    "bills-senate",
    "bills-house",
    "hosp-DAWN",
    "cont-workplace-15",
    "coauth-dblp",
    "cont-primary-school",
    "cont-high-school",
]

In [3]:
def _group_labels(nodes_df, labels_dict):
    '''
    Replace every node id in nodes_df by its group code.

    Ids that are missing from labels_dict become NaN, and any row containing
    a NaN is dropped, so only fully labelled rows are returned.
    '''
    # Series.map with a dict yields NaN for unknown ids; mapping column by
    # column avoids DataFrame.applymap, which is deprecated in recent pandas.
    return nodes_df.apply(lambda col: col.map(labels_dict)).dropna()


def _homophily_counts(tlabels_df):
    '''
    Count the rows of tlabels_df whose three nodes share one group code.

    Returns (num_homogeneous, total) as plain Python ints so that the tuples
    stored in the results render as "(513, 3921)" regardless of how the
    installed NumPy prints its scalar types.
    '''
    same_group = (tlabels_df['node_1'] == tlabels_df['node_2']) & \
                 (tlabels_df['node_2'] == tlabels_df['node_3'])
    return int(same_group.sum()), len(tlabels_df)


results = []
for dataset in all_datasets:
    print(dataset)
    labels_df = pd.read_csv(f'{dataset}/labels.csv')
    labels_dict = dict(zip(labels_df['id'], labels_df['group_code']))
    edges_df = pd.read_csv(f'{dataset}/edges.csv')
    triangles_df = pd.read_csv(f'{dataset}/triangles.csv')

    # Closed triangles: every three mutually adjacent nodes of the pairwise graph.
    G = nx.from_pandas_edgelist(edges_df, 'node_1', 'node_2')
    closed_tlist = create_triangle_list(G)
    # reshape(-1, 3) keeps the three-column layout even if the triangle
    # list comes back as a flat empty array for a triangle-free graph
    closed_tdf = pd.DataFrame(np.asarray(closed_tlist).reshape(-1, 3),
                              columns=['node_1', 'node_2', 'node_3'])

    tlabels_df = _group_labels(triangles_df, labels_dict)
    closed_tlabels_df = _group_labels(closed_tdf, labels_dict)

    # Observed proportion: share of the triangles listed in triangles.csv
    # whose nodes all carry the same group code. np.float64 keeps NumPy
    # division semantics (nan/inf plus a warning instead of an exception).
    num_hom_filled, tot_filled = _homophily_counts(tlabels_df)
    obs = np.float64(num_hom_filled) / tot_filled

    # Closed baseline: the same proportion among closed triangles of the graph.
    num_hom_closed, tot_closed = _homophily_counts(closed_tlabels_df)
    simp_b = np.float64(num_hom_closed) / tot_closed

    # Node baseline: chance that three distinct rows of labels.csv drawn at
    # random all belong to the same group.
    group_counts = labels_df.groupby('group_code').count()
    node_b = 0
    total_nodes = len(labels_df)
    total_triples = total_nodes * (total_nodes - 1) * (total_nodes - 2)
    for code, ct in group_counts.iterrows():
        num = ct['id']
        node_b += float(num) * (num - 1) * (num - 2) / total_triples

    results.append({
        'dataset': dataset,
        'observed_proportion': obs,
        'closed_baseline': simp_b,
        'node_baseline': node_b,
        'simplicial_ratio': obs / simp_b,
        'hypergraph_ratio': obs / node_b,
        'number filled (hom, total)': (num_hom_filled, tot_filled),
        'number closed (hom, total)': (num_hom_closed, tot_closed)
    })
        

cont-hospital
cont-workplace-13
bills-senate
bills-house
hosp-DAWN
cont-workplace-15
coauth-dblp
cont-primary-school
cont-high-school


In [4]:
# Write the per-dataset summary to disk, keyed by dataset name.
(
    pd.DataFrame(results)
    .set_index("dataset")
    .to_csv("homophily_comps.csv")
)

In [5]:
# Show the per-dataset summary table (one row per entry in `results`) as the cell output.
pd.DataFrame(results)

Unnamed: 0,dataset,observed_proportion,closed_baseline,node_baseline,simplicial_ratio,hypergraph_ratio,"number filled (hom, total)","number closed (hom, total)"
0,cont-hospital,0.130834,0.072924,0.083931,1.794116,1.558826,"(513, 3921)","(627, 8598)"
1,cont-workplace-13,0.101643,0.097207,0.078417,1.045627,1.296184,"(8007, 78776)","(8998, 92565)"
2,bills-senate,0.42357,0.366289,0.240297,1.15638,1.762694,"(1984, 4684)","(20184, 55104)"
3,bills-house,0.505818,0.55102,0.251715,0.917967,2.009488,"(2695, 5328)","(16254, 29498)"
4,hosp-DAWN,0.002004,0.000848,0.000294,2.36391,6.82176,"(685, 341900)","(2088, 2463599)"
5,cont-workplace-15,0.093807,0.080281,0.024262,1.168485,3.866503,"(11537, 122986)","(25530, 318007)"
6,coauth-dblp,0.554586,0.557863,0.493573,0.994124,1.123613,"(163900, 295536)","(175891, 315294)"
7,cont-primary-school,0.016105,0.012036,0.007946,1.338137,2.026736,"(17315, 1075115)","(17328, 1439731)"
8,cont-high-school,0.096093,0.033851,0.011938,2.838737,8.049001,"(67774, 705296)","(68978, 2037718)"
