In [None]:
import math
import networkx as nx

from tqdm import tqdm

In [None]:
def read_graphml_file(filename):
    """Load ``filename`` as GraphML and hand back the resulting NetworkX graph.

    Any exception raised while reading is reported on stdout as
    ``Error: <message>`` and ``None`` is returned in place of a graph, so
    callers must be prepared to receive ``None``.
    """
    try:
        return nx.read_graphml(filename)
    except Exception as err:
        print(f"Error: {err}")
    return None

In [None]:
# Alternative topic list for the 2023 data set (inactive):
# TOPICS = [
#     "CLIMATE_2023", "IMMIGRATION_2023", "SOCIAL_SECURITY_2023", "ECONOMIC_POLICY_2023",
#     "EDUCATION_2023", "SDP_2023", "FINNS_2023", "NATIONAL_2023", "CENTER_2023",
#     "GREEN_2023", "LEFT_2023", "PARTIES_2023", "EUROVISION_2023", "WILMAMURTO_2023",
# ]

# Active topic list (2019 data set). Each entry is "<TOPIC>_<YEAR>".
TOPICS = [
    "CLIMATE_2019",
    "IMMIGRATION_2019",
    "SOCIAL_SECURITY_2019",
    "ECONOMIC_POLICY_2019",
    "EDUCATION_2019",
    "SDP_2019",
    "FINNS_2019",
    "NATIONAL_2019",
    "CENTER_2019",
    "GREEN_2019",
    "LEFT_2019",
    "PARTIES_2019",
]

# NOTE: the analysis cell reads ./rich-networks/<year>/RICH_<topic>_NET.graphml,
# so the year directory used there has to match the year suffix of these topics.

In [None]:
def _xlog2(p, q=1.0):
    """Return ``p * log2(p / q)``, using the convention ``0 * log2(0) = 0``.

    The guard avoids the ``ValueError`` that ``math.log2(0)`` raises when a
    cluster combination is empty.
    """
    return p * math.log2(p / q) if p > 0 else 0.0


def _load_cluster_sets(topic):
    """Read the network of ``topic`` and return its user-id sets.

    The year directory is taken from the topic's ``_<YEAR>`` suffix, so the
    path follows whatever year the entries in ``TOPICS`` carry.

    Returns:
        ``(whole, cluster_1, cluster_2)`` where ``whole`` holds every node's
        ``user_id`` and ``cluster_1`` / ``cluster_2`` hold the ids of nodes
        whose ``finetuned_cluster`` equals 0 / 1, or ``None`` when
        ``read_graphml_file`` could not load the file.
    """
    year = topic.rsplit("_", 1)[-1]
    graph = read_graphml_file(f"./rich-networks/{year}/RICH_{topic}_NET.graphml")
    if graph is None:
        print(f"Skipping {topic}: network could not be loaded")
        return None

    node_data = [data for _, data in graph.nodes(data=True)]
    whole = {data["user_id"] for data in node_data}
    cluster_1 = {data["user_id"] for data in node_data if data['finetuned_cluster'] == 0}
    cluster_2 = {data["user_id"] for data in node_data if data['finetuned_cluster'] == 1}
    return whole, cluster_1, cluster_2


def _mi_and_nmi(sets_x, sets_y):
    """Compute (MI, NMI) in bits between the two-cluster splits of two topics.

    Both arguments are ``(whole, cluster_1, cluster_2)`` tuples as returned by
    ``_load_cluster_sets``. Only users present in both networks are considered
    ("closed system"). ``(nan, nan)`` is returned when the topics share no
    labelled users; NMI alone is ``nan`` when both marginal entropies are 0.
    """
    whole_x, x_1, x_2 = sets_x
    whole_y, y_1, y_2 = sets_y

    shared = whole_x & whole_y
    n_shared = len(shared)

    # Restrict every cluster to the users that occur in both networks.
    x_clusters = (x_1 & shared, x_2 & shared)
    y_clusters = (y_1 & shared, y_2 & shared)
    n_x = len(x_clusters[0]) + len(x_clusters[1])
    n_y = len(y_clusters[0]) + len(y_clusters[1])
    if n_shared == 0 or n_x == 0 or n_y == 0:
        return math.nan, math.nan

    # NOTE(review): marginals are normalised by the number of shared users that
    # carry a 0/1 cluster label, joints by the number of all shared users. The
    # two agree only if every shared user has finetuned_cluster in {0, 1} -- confirm.
    p_x = [len(members) / n_x for members in x_clusters]
    p_y = [len(members) / n_y for members in y_clusters]
    p_xy = [[len(x_members & y_members) / n_shared for y_members in y_clusters]
            for x_members in x_clusters]

    cells = [(0, 0), (0, 1), (1, 0), (1, 1)]

    # MI = sum_xy p(x,y) * log2( p(x,y) / (p(x) * p(y)) )
    mi = sum([_xlog2(p_xy[i][j], p_x[i] * p_y[j]) for i, j in cells])

    h_x = -sum([_xlog2(p) for p in p_x])
    h_y = -sum([_xlog2(p) for p in p_y])
    # H(X|Y) = -sum_xy p(x,y) * log2( p(x,y) / p(y) )
    h_x_given_y = -sum([_xlog2(p_xy[i][j], p_y[j]) for i, j in cells])

    entropy_sum = h_x + h_y
    if entropy_sum == 0:
        # Both splits are degenerate (a single non-empty cluster): NMI is undefined.
        return mi, math.nan
    return mi, 2 * ((h_x - h_x_given_y) / entropy_sum)


# Kept for compatibility with the original cell; it is not populated here.
results = dict()

# Read every network exactly once instead of once per (T1, T2) pair.
cluster_sets = {topic: _load_cluster_sets(topic) for topic in tqdm(TOPICS)}

whole_array_MI = []
whole_array_NMI = []

for T1 in TOPICS:

    row_vector_MI = []
    row_vector_NMI = []

    for T2 in TOPICS:
        if T1 == T2:
            # The diagonal is filled with 1 by convention.
            # NOTE(review): MI of a split with itself is its entropy, not 1 -- confirm
            # that 1 is the intended placeholder for the MI matrix.
            row_vector_MI.append(1)
            row_vector_NMI.append(1)
            continue

        if cluster_sets[T1] is None or cluster_sets[T2] is None:
            # Keep the matrices rectangular: a missing network yields NaN cells
            # instead of a shorter row.
            MI, NMI = math.nan, math.nan
        else:
            MI, NMI = _mi_and_nmi(cluster_sets[T1], cluster_sets[T2])

        row_vector_MI.append(MI)
        row_vector_NMI.append(NMI)

    whole_array_MI.append(row_vector_MI)
    whole_array_NMI.append(row_vector_NMI)

In [None]:
import numpy as np

# Convert the nested per-topic row lists into NumPy arrays.
data_MI, data_NMI = (
    np.asarray(rows) for rows in (whole_array_MI, whole_array_NMI)
)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Boolean mask that is True on the diagonal and everything below it; seaborn
# leaves masked cells blank, so only the upper triangle is drawn.
mask = np.zeros_like(data_NMI, dtype=bool)
lower_rows, lower_cols = np.tril_indices_from(mask)
mask[lower_rows, lower_cols] = True

# Create the figure, then draw the annotated NMI heatmap on its current axes.
fig = plt.figure(figsize=(12, 8))
ax = sns.heatmap(
    data_NMI,
    mask=mask,
    annot=True,
    cmap='YlGnBu',
    xticklabels=TOPICS,
    yticklabels=TOPICS,
)

# Put the x-axis ticks and label on top and rotate the topic names.
ax.xaxis.tick_top()
ax.xaxis.set_label_position('top')
plt.xticks(rotation=90)

# No axis titles: the tick labels already name the topics.
ax.set_xlabel('')
ax.set_ylabel('')


In [None]:
# `df` is not defined in any earlier cell visible here, so displaying it raised
# NameError on a fresh kernel. Build the table explicitly before showing it.
# NOTE(review): assumes the intent was a topic-by-topic table of data_NMI -- confirm.
df = pd.DataFrame(data_NMI, index=TOPICS, columns=TOPICS)
df

In [None]:
# Load the two 2023 networks used in the step-by-step walkthrough below.
# Either name is None if read_graphml_file failed to read its file.
CLIMATE_2023, IMMIGRATION_2023 = (
    read_graphml_file(graph_path)
    for graph_path in (
        "./rich-networks/2023/RICH_CLIMATE_2023_NET.graphml",
        "./rich-networks/2023/RICH_IMMIGRATION_2023_NET.graphml",
    )
)

In [None]:
# Node attribute dicts of each network, collected once and reused below.
climate_node_data = [attrs for _, attrs in CLIMATE_2023.nodes(data=True)]
immigration_node_data = [attrs for _, attrs in IMMIGRATION_2023.nodes(data=True)]

# Climate network: all user ids, then the ids with finetuned_cluster 0 and 1.
CLIMATE_WHOLE = {attrs["user_id"] for attrs in climate_node_data}
CLIMATE_CLUSTER_1 = {attrs["user_id"] for attrs in climate_node_data if attrs['finetuned_cluster'] == 0}
CLIMATE_CLUSTER_2 = {attrs["user_id"] for attrs in climate_node_data if attrs['finetuned_cluster'] == 1}

# Immigration network: same three sets.
IMMIGRATION_WHOLE = {attrs["user_id"] for attrs in immigration_node_data}
IMMIGRATION_CLUSTER_1 = {attrs["user_id"] for attrs in immigration_node_data if attrs['finetuned_cluster'] == 0}
IMMIGRATION_CLUSTER_2 = {attrs["user_id"] for attrs in immigration_node_data if attrs['finetuned_cluster'] == 1}

In [None]:
# Users falling into each (climate cluster, immigration cluster) combination.
CLIMATE_CLUSTER_1_IMMIGRATION_CLUSTER_1 = CLIMATE_CLUSTER_1.intersection(IMMIGRATION_CLUSTER_1)
CLIMATE_CLUSTER_1_IMMIGRATION_CLUSTER_2 = CLIMATE_CLUSTER_1.intersection(IMMIGRATION_CLUSTER_2)
CLIMATE_CLUSTER_2_IMMIGRATION_CLUSTER_1 = CLIMATE_CLUSTER_2.intersection(IMMIGRATION_CLUSTER_1)
CLIMATE_CLUSTER_2_IMMIGRATION_CLUSTER_2 = CLIMATE_CLUSTER_2.intersection(IMMIGRATION_CLUSTER_2)

# Size of the overlap between the two networks.
N_NODES_IN_BOTH = len(CLIMATE_WHOLE.intersection(IMMIGRATION_WHOLE))

n_climate_users = len(CLIMATE_WHOLE)
n_immigration_users = len(IMMIGRATION_WHOLE)

# NOTE(review): the denominator adds the two network sizes, so shared users are
# counted twice; if "all nodes" means distinct users, the union size would be
# the right denominator -- confirm.
FRACTION_OF_ALL_NODES = N_NODES_IN_BOTH / (n_climate_users + n_immigration_users)
FRACTION_OF_CLIMATE = N_NODES_IN_BOTH / n_climate_users

In [None]:
# Closed system: only users present in both networks take part in the statistics.
RELEVANT_NODES = CLIMATE_WHOLE.intersection(IMMIGRATION_WHOLE)

REDUCED_CLIMATE_CLUSTER_1 = RELEVANT_NODES & CLIMATE_CLUSTER_1
REDUCED_CLIMATE_CLUSTER_2 = RELEVANT_NODES & CLIMATE_CLUSTER_2

REDUCED_IMMIGRATION_CLUSTER_1 = RELEVANT_NODES & IMMIGRATION_CLUSTER_1
REDUCED_IMMIGRATION_CLUSTER_2 = RELEVANT_NODES & IMMIGRATION_CLUSTER_2

# Marginal probabilities: share of each cluster among the shared users that
# belong to cluster 1 or cluster 2 of the respective network.
n_climate_1 = len(REDUCED_CLIMATE_CLUSTER_1)
n_climate_2 = len(REDUCED_CLIMATE_CLUSTER_2)
P_CLIMATE_1 = n_climate_1 / (n_climate_1 + n_climate_2)
P_CLIMATE_2 = n_climate_2 / (n_climate_1 + n_climate_2)

n_immigration_1 = len(REDUCED_IMMIGRATION_CLUSTER_1)
n_immigration_2 = len(REDUCED_IMMIGRATION_CLUSTER_2)
P_IMMIGRATION_1 = n_immigration_1 / (n_immigration_1 + n_immigration_2)
P_IMMIGRATION_2 = n_immigration_2 / (n_immigration_1 + n_immigration_2)

# Joint probabilities: size of each cluster combination relative to the
# number of users present in both networks.
(
    P_CLIMATE_1_AND_IMMIGRATION_1,
    P_CLIMATE_1_AND_IMMIGRATION_2,
    P_CLIMATE_2_AND_IMMIGRATION_1,
    P_CLIMATE_2_AND_IMMIGRATION_2,
) = (
    len(combo_members) / N_NODES_IN_BOTH
    for combo_members in (
        CLIMATE_CLUSTER_1_IMMIGRATION_CLUSTER_1,
        CLIMATE_CLUSTER_1_IMMIGRATION_CLUSTER_2,
        CLIMATE_CLUSTER_2_IMMIGRATION_CLUSTER_1,
        CLIMATE_CLUSTER_2_IMMIGRATION_CLUSTER_2,
    )
)

In [None]:
# Conditional probabilities P(climate cluster | immigration cluster),
# computed as joint probability / marginal probability of the condition.

# BELONGS TO CLIMATE_X GIVEN IMMIGRATION_1
P_CLIMATE_1_GIVEN_IMMIGRATION_1 = P_CLIMATE_1_AND_IMMIGRATION_1/P_IMMIGRATION_1
P_CLIMATE_2_GIVEN_IMMIGRATION_1 = P_CLIMATE_2_AND_IMMIGRATION_1/P_IMMIGRATION_1

# BELONGS TO CLIMATE_X GIVEN IMMIGRATION_2
# These were previously assigned to the *_GIVEN_IMMIGRATION_1 names, which
# silently overwrote the two values computed just above.
P_CLIMATE_1_GIVEN_IMMIGRATION_2 = P_CLIMATE_1_AND_IMMIGRATION_2/P_IMMIGRATION_2
P_CLIMATE_2_GIVEN_IMMIGRATION_2 = P_CLIMATE_2_AND_IMMIGRATION_2/P_IMMIGRATION_2

In [None]:
import math

# Joint probabilities in (climate cluster, immigration cluster) order:
# (1, 1), (1, 2), (2, 1), (2, 2).
joint_probs = [P_CLIMATE_1_AND_IMMIGRATION_1, P_CLIMATE_1_AND_IMMIGRATION_2, P_CLIMATE_2_AND_IMMIGRATION_1, P_CLIMATE_2_AND_IMMIGRATION_2]

# Product of the matching marginals, i.e. the joint probability expected if
# the two cluster assignments were independent (same order as joint_probs).
independent_probs = [P_CLIMATE_1 * P_IMMIGRATION_1,
                     P_CLIMATE_1 * P_IMMIGRATION_2,
                     P_CLIMATE_2 * P_IMMIGRATION_1,
                     P_CLIMATE_2 * P_IMMIGRATION_2]

# MI = sum over combinations of joint * log2(joint / (marginal_1 * marginal_2)).
# An empty combination (joint == 0) contributes 0 by the usual convention
# 0 * log2(0) = 0; without the guard math.log2(0) raises ValueError.
MI_VEC = [p_joint * math.log2(p_joint / p_indep) if p_joint > 0 else 0.0
          for p_joint, p_indep in zip(joint_probs, independent_probs)]

MI = sum(MI_VEC)

In [None]:
# Display the mutual information computed in the previous cell (base-2 logarithms are used there).
MI

In [None]:
# Shannon entropies (base-2 logarithms) of the two cluster assignments.
# Zero probabilities are skipped (0 * log2(0) is taken as 0); without that
# math.log2(0) raises ValueError when a cluster has no shared users.
H_x = -sum([p * math.log2(p) for p in (P_CLIMATE_1, P_CLIMATE_2) if p > 0])
H_y = -sum([p * math.log2(p) for p in (P_IMMIGRATION_1, P_IMMIGRATION_2) if p > 0])

# Conditional entropy H(X|Y) = -sum joint * log2(joint / marginal_of_Y),
# again skipping empty cluster combinations.
H_x_given_y = -sum([p_joint * math.log2(p_joint / p_cond)
                    for p_joint, p_cond in ((P_CLIMATE_1_AND_IMMIGRATION_1, P_IMMIGRATION_1),
                                            (P_CLIMATE_1_AND_IMMIGRATION_2, P_IMMIGRATION_2),
                                            (P_CLIMATE_2_AND_IMMIGRATION_1, P_IMMIGRATION_1),
                                            (P_CLIMATE_2_AND_IMMIGRATION_2, P_IMMIGRATION_2))
                    if p_joint > 0])

# NMI = 2 * (H(X) - H(X|Y)) / (H(X) + H(Y)). It is undefined when both
# entropies are 0 (each network has a single non-empty cluster); report NaN
# there instead of raising ZeroDivisionError.
entropy_sum = H_x + H_y
NMI = 2 * ( (H_x - H_x_given_y) / entropy_sum ) if entropy_sum > 0 else float("nan")

In [None]:
# Display the normalized mutual information computed in the previous cell.
NMI