Analysing the data from Gui's simulations - extracting the relevant info

Import the libraries used below (the data path itself is set further down):

In [83]:
import time

import os

import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import networkx as nx
import re
from tqdm import tqdm

Inequity function (note on the counting part: the node attribute `m` is compared against the strings "1" and "2" rather than integers):

In [32]:
def compute_inequity(g, k, node_pageranks=None):
    """Compute the proportion of allies and minorities in the top k PageRank ranks of g.

    Parameters
    ----------
    g : graph
        Graph whose nodes carry an ``'m'`` attribute. The value ``"1"`` marks an
        ally and ``"2"`` a minority node (both compared as strings); any other
        value is counted in neither group.
    k : int
        Number of top-ranked nodes to inspect. Must be positive.
    node_pageranks : dict, optional
        Mapping ``node -> PageRank score``. When omitted, ``nx.pagerank(g)`` is
        computed here. Supplying it lets a caller evaluate several values of
        ``k`` on the same graph without recomputing PageRank each time.

    Returns
    -------
    tuple of float
        ``(allies_in_top_k / k, minorities_in_top_k / k)``. The denominator is
        always ``k``, even if fewer than ``k`` nodes are available.

    Raises
    ------
    ValueError
        If ``k`` is not positive (the proportions would be undefined).
    """
    if k <= 0:
        raise ValueError("k must be a positive integer, got {!r}".format(k))

    if node_pageranks is None:
        node_pageranks = nx.pagerank(g)

    # Highest score first; sorted() is stable, so tied nodes keep the
    # iteration order of the score mapping.
    ranked = sorted(node_pageranks.items(), key=lambda item: item[1], reverse=True)
    top_k_labels = [g.nodes[node_id]['m'] for node_id, _ in ranked[:k]]

    return top_k_labels.count("1") / k, top_k_labels.count("2") / k

Get all the files to read:

In [74]:
# Directory containing the simulation result files.
dir_path = r'E:\\Projects\\Homophily\\gui_results\\results'

# File-name endings accepted as network files. A plain substring test for
# "gml" would also pick up unrelated files that merely contain those letters.
# nx.read_gml transparently opens the .gz / .bz2 variants as well.
gml_suffixes = (".gml", ".gml.gz", ".gml.bz2")

# Bare file names (not full paths) of the networks to read. Sorted because
# os.listdir returns entries in arbitrary order, which would otherwise make
# the row order of the results depend on the file system.
network_paths = sorted(
    path
    for path in os.listdir(dir_path)
    if os.path.isfile(os.path.join(dir_path, path))
    and path.lower().endswith(gml_suffixes)
)

In [78]:
# Make the results directory the working directory: the cells below open the
# network files by bare file name and write the output CSV with a relative path.
os.chdir(dir_path)

Extract the data we need and compute the inequity:

In [96]:
# Fractions (beta) of the N nodes that define the "top k" cut-offs:
beta_list = [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4,
            0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8,
            0.85, 0.9, 0.95, 1]

# One tuple per (file, beta):
# (fm, fa, beta, ally share in top k, minority share in top k, iteration)
results = []

# For each file:
for path in tqdm(network_paths):
    # read in the network (bare file name, resolved against the working directory)
    g = nx.read_gml(path)

    # The run parameters are encoded in the "_"-separated parts of the file
    # name. Raw-string patterns are used so that "\d" and "\." reach the regex
    # engine unchanged instead of being invalid string escapes.
    # fm, fa and i are kept as the matched strings, exactly as they appear in the name.
    name_parts = path.split("_")
    N = int(re.findall(r"\d+", name_parts[2])[0])
    fm = re.findall(r"\d+\.\d+", name_parts[3])[0]
    fa = re.findall(r"\d+\.\d+", name_parts[4])[0]
    i = re.findall(r"\d+", name_parts[5])[0]

    # PageRank does not depend on beta, so rank the nodes once per graph
    # instead of once per beta (calling compute_inequity(g, k) for every beta
    # would redo the same PageRank computation each time). The per-beta
    # proportions below are the same values compute_inequity(g, k) returns.
    node_pageranks = nx.pagerank(g)
    ranked = sorted(node_pageranks.items(), key=lambda item: item[1], reverse=True)
    ranked_labels = [g.nodes[node_id]['m'] for node_id, _ in ranked]

    # computing the inequity for different values of beta
    for beta in beta_list:
        k = int(N * beta)  # int() truncates the float product toward zero
        top_k_labels = ranked_labels[:k]
        prop_top_k_ally = top_k_labels.count("1") / k      # 'm' == "1": ally
        prop_top_k_minority = top_k_labels.count("2") / k  # 'm' == "2": minority
        results.append((fm, fa, beta, prop_top_k_ally, prop_top_k_minority, i))

100%|████████████████████████████████████████████████████████████████████████████| 3810/3810 [3:59:38<00:00,  3.77s/it]


In [99]:
# Column names, in the same order as the fields of each tuple in `results`.
result_columns = [
    'prop_min',
    'prop_ally',
    'beta',
    'prop_ally_top_k',
    'prop_min_top_k',
    "iter",
]

# One row per (network file, beta) pair.
results_df = pd.DataFrame(results, columns=result_columns)
results_df

Unnamed: 0,prop_min,prop_ally,beta,prop_ally_top_k,prop_min_top_k,iter
0,0.216,0.000,0.05,0.000000,0.040000,1
1,0.216,0.000,0.10,0.000000,0.090000,1
2,0.216,0.000,0.15,0.000000,0.086667,1
3,0.216,0.000,0.20,0.000000,0.110000,1
4,0.216,0.000,0.25,0.000000,0.120000,1
...,...,...,...,...,...,...
76195,0.071,0.258,0.80,0.171250,0.053750,4
76196,0.071,0.258,0.85,0.161176,0.050588,4
76197,0.071,0.258,0.90,0.206667,0.047778,4
76198,0.071,0.258,0.95,0.248421,0.045263,4


In [100]:
# Save the extracted table. The relative file name resolves against the current
# working directory, and the DataFrame index is left out of the file.
output_csv = "extracted_ally_sweep_per_fm_top_k_new.csv"
results_df.to_csv(output_csv, index=False)