In [52]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [53]:
import networkx as nx
import pandas as pd
import numpy as np

from collections import defaultdict
from typing import Dict, Set, List, Callable, Tuple, TypedDict


class Stats(TypedDict):
    """Result of matching one calculated community to one real community.

    Entries of this shape are built by ``calculate_mapping_stats``, one per
    matched language.
    """

    # Number of nodes in the matched real (language) community.
    real_len: int
    # Number of nodes in the calculated community.
    calculated_len: int
    # Number of nodes that belong to both communities.
    intersection_len: int
    # intersection_len divided by real_len.
    accuracy: float


class Score(TypedDict):
    """Aggregate of all per-language ``Stats``, as built by ``calculate_score``."""

    # Sum of ``real_len`` over all matched communities.
    sum_real: int
    # Sum of ``intersection_len`` over all matched communities.
    sum_intersection: int
    # sum_intersection divided by sum_real.
    accuracy: float


def load_features(src: str) -> pd.DataFrame:
    """Read the node-feature CSV at ``src`` and keep only the columns used here.

    Returns:
        A DataFrame holding exactly the ``language`` and ``numeric_id``
        columns, in that order.

    Raises:
        KeyError: if either column is missing from the CSV.
    """
    wanted_columns = ["language", "numeric_id"]
    return pd.read_csv(src)[wanted_columns]


def remove_features(df: pd.DataFrame, features_to_remove: List[str]) -> pd.DataFrame:
    """Return ``df`` without the rows whose ``language`` is in ``features_to_remove``.

    The input frame is left untouched; the result gets a fresh 0..n-1 index.
    """
    is_removed = df["language"].isin(features_to_remove)
    return df.loc[~is_removed].reset_index(drop=True)


def keep_nodes(df: pd.DataFrame, number_of_nodes_to_keep: int) -> pd.DataFrame:
    """Return a random sample of distinct rows of ``df``.

    Args:
        df: frame to sample from; it is not modified.
        number_of_nodes_to_keep: how many rows to keep. Values larger than
            ``len(df)`` are clamped, so at most every row is returned once.

    Returns:
        A new DataFrame with the sampled rows and a fresh 0..n-1 index.

    The sample is drawn from numpy's global random state, so calling
    ``np.random.seed`` beforehand makes it reproducible.
    """
    available = len(df.index)
    sample_size = min(number_of_nodes_to_keep, available)
    # Sample *positions* (not index labels) because they are fed to ``iloc``,
    # and sample without replacement so no row is kept twice.
    positions = np.random.choice(available, size=sample_size, replace=False)
    return df.iloc[positions].reset_index(drop=True)


def load_edges(src: str, features: pd.DataFrame) -> pd.DataFrame:
    """Read the edge CSV at ``src`` and keep edges between known nodes only.

    An edge is kept when both its ``id_1`` and its ``id_2`` appear in
    ``features["numeric_id"]``. The surviving rows keep their file order and
    get a fresh 0..n-1 index.
    """
    all_edges = pd.read_csv(src)
    known_ids = features["numeric_id"]

    first_known = all_edges["id_1"].isin(known_ids)
    second_known = all_edges["id_2"].isin(known_ids)

    return all_edges[first_known & second_known].reset_index(drop=True)


def create_graph(edges: pd.DataFrame) -> nx.Graph:
    """Build a graph with one edge per row of ``edges``.

    Args:
        edges: frame with ``id_1`` and ``id_2`` columns holding the two
            endpoints of every edge.

    Returns:
        A new ``nx.Graph`` containing every (id_1, id_2) pair as an edge.
    """
    G = nx.Graph()
    # Iterate the two id columns directly instead of ``iterrows``: no Series is
    # built per row, and the ids are not coerced to a common row dtype when
    # the frame carries additional columns.
    G.add_edges_from(zip(edges["id_1"], edges["id_2"]))
    return G


def choose_largest_cc(graph: nx.Graph) -> nx.Graph:
    """Return the largest connected component of ``graph`` as its own graph.

    The result is an independent copy rather than a subgraph view, so edge
    attributes written to it later do not show up on ``graph`` and lookups do
    not go through the view's node filter. An empty input yields an empty
    graph instead of raising.
    """
    # ``default`` covers a graph with no nodes, for which there is no component.
    largest_cc = max(nx.connected_components(graph), key=len, default=set())
    return graph.subgraph(largest_cc).copy()


def get_real_communities(
    grah: nx.Graph, features: pd.DataFrame
) -> List[Tuple[str, Set[int]]]:
    """Group the nodes of ``grah`` by the language recorded in ``features``.

    Args:
        grah: graph whose nodes are ``numeric_id`` values.
        features: frame with ``numeric_id`` and ``language`` columns.

    Returns:
        A list of ``(language, nodes)`` pairs sorted by community size,
        largest first.

    Raises:
        IndexError: if a node of the graph has no row in ``features``.
    """
    # Build the id -> language table once instead of filtering the whole frame
    # for every node. Keeping the first duplicate matches a per-node lookup
    # that takes the first matching row.
    language_by_id = (
        features.drop_duplicates(subset="numeric_id", keep="first")
        .set_index("numeric_id")["language"]
        .to_dict()
    )

    real_communities: Dict[str, Set[int]] = defaultdict(set)
    for node in grah.nodes():
        try:
            language = language_by_id[node]
        except KeyError:
            # IndexError is what a positional lookup on an empty match raises.
            raise IndexError(f"node {node!r} has no row in features") from None
        real_communities[language].add(node)

    return sorted(
        real_communities.items(), key=lambda item: len(item[1]), reverse=True
    )


def set_weights(
    graph: nx.Graph,
    metric: Dict[int, float],
    aggregate: Callable[[float, float], float],
):
    """Store a ``"weights"`` attribute on every edge of ``graph``.

    The value for an edge is ``aggregate`` applied to the ``metric`` values of
    its two endpoints. The graph is modified in place; nothing is returned.
    """
    edge_weights = {
        (source, target): aggregate(metric[source], metric[target])
        for source, target in graph.edges()
    }
    nx.set_edge_attributes(graph, edge_weights, "weights")


def calculate_communities(graph: nx.Graph, seed=None) -> List[Set[int]]:
    """Detect communities with asynchronous label propagation.

    Args:
        graph: graph to partition; the ``"weights"`` edge attribute is used
            as the edge weight.
        seed: optional seed forwarded to networkx. Label propagation is
            randomized, so pass a value to get a repeatable partition. The
            default ``None`` keeps the unseeded behaviour.

    Returns:
        The detected communities as sets of nodes, largest first.
    """
    communities = nx.algorithms.community.asyn_lpa_communities(
        graph, weight="weights", seed=seed
    )
    return sorted(communities, key=len, reverse=True)


def calculate_mapping_stats(
    real_communities: List[Tuple[str, Set[int]]], calculated_communities: List[Set[int]]
) -> Dict[str, Stats]:
    """Greedily match calculated communities to real (language) communities.

    The calculated communities are visited in the given order. Each one is
    matched to the not-yet-matched real community it shares the most nodes
    with; on a tie the real community listed first wins. At most
    ``min(len(real_communities), len(calculated_communities))`` matches are
    made.

    Returns:
        A dict from the matched language to its ``Stats``. ``accuracy`` is
        ``intersection_len / real_len``, or ``0.0`` for an empty real
        community.
    """
    result: Dict[str, Stats] = {}
    already_mapped: Set[str] = set()

    # zip stops at the shorter list, which bounds the number of matches.
    for calculated, _ in zip(calculated_communities, real_communities):
        found = False
        best_language = ""
        best_real_len = 0
        best_overlap = -1

        for language, real in real_communities:
            if language in already_mapped:
                continue

            overlap = len(real.intersection(calculated))
            # Strict comparison keeps the first candidate on ties.
            if overlap > best_overlap:
                found = True
                best_language = language
                best_real_len = len(real)
                best_overlap = overlap

        if not found:
            # Every language is already matched (possible when a language is
            # listed twice); do not record a placeholder entry.
            break

        result[best_language] = {
            "real_len": best_real_len,
            "calculated_len": len(calculated),
            "intersection_len": best_overlap,
            # An empty real community cannot be recovered at all.
            "accuracy": best_overlap / best_real_len if best_real_len else 0.0,
        }
        already_mapped.add(best_language)

    return result


def calculate_score(mapping_stats: Dict[str, Stats]) -> Score:
    """Combine per-language stats into one overall score.

    Returns:
        A ``Score`` with the summed ``real_len`` and ``intersection_len`` and
        their ratio as ``accuracy``. When there is nothing to score (no stats,
        or a total ``real_len`` of zero) the accuracy is ``0.0``.
    """
    sum_real = sum(stats["real_len"] for stats in mapping_stats.values())
    sum_intersection = sum(
        stats["intersection_len"] for stats in mapping_stats.values()
    )
    return {
        "sum_real": sum_real,
        "sum_intersection": sum_intersection,
        # Guard the division: an empty mapping would otherwise raise.
        "accuracy": sum_intersection / sum_real if sum_real else 0.0,
    }


def eval(G: nx.Graph, real_communities: List[Tuple[str, Set[int]]], iterations: int):
    """Run community detection ``iterations`` times and collect the accuracies.

    Each run detects communities on ``G``, matches them against
    ``real_communities`` and records the overall accuracy. The accuracies are
    returned as a list in run order.

    Note: this name shadows Python's built-in ``eval`` in this notebook.
    """

    def _accuracy_of_one_run() -> float:
        detected: List[Set[int]] = calculate_communities(G)
        per_language = calculate_mapping_stats(real_communities, detected)
        return calculate_score(per_language)["accuracy"]

    return [_accuracy_of_one_run() for _ in range(iterations)]

In [54]:
# Load the node table; load_features keeps only the "language" and
# "numeric_id" columns.
features = load_features("./datasets/twitch/large_twitch_features.csv")
print("Loaded", len(features.index), "nodes")

Loaded 168114 nodes


In [55]:
# Drop every node whose language is "OTHER" and rebind `features` to the
# filtered frame.
features_to_remove = ["OTHER"]
features = remove_features(features, features_to_remove)
print("Kept", len(features.index), "nodes after removing", features_to_remove)

Kept 166685 nodes after removing ['OTHER']


In [56]:
# Optional random down-sampling of the node table (currently disabled):
# number_of_nodes_to_keep = 1_000
# features = keep_nodes(features, number_of_nodes_to_keep)
# print("Kept", len(features.index), "nodes")

# Also drop the "EN" nodes. `features_to_remove` is rebound here, so after
# this cell it holds only ["EN"].
features_to_remove = ["EN"]
features = remove_features(features, features_to_remove)
print("Kept", len(features.index), "nodes after removing", features_to_remove)

Kept 42274 nodes after removing ['EN']


In [57]:
# Load the edge list, keeping only edges whose two endpoints are both still
# present in `features`.
edges = load_edges("./datasets/twitch/large_twitch_edges.csv", features)
print("Loaded", len(edges.index), "edges")

Loaded 657915 edges


In [58]:
# Build the graph from the filtered edge list. Nodes that have no remaining
# edge never enter the graph, because it is built from edges only.
G = create_graph(edges)

print(
    "Created graph with", G.number_of_nodes(), "nodes and", G.number_of_edges(), "edges"
)

Created graph with 41309 nodes and 657915 edges


In [59]:
# Rebind G to its largest connected component; the full graph built above is
# no longer reachable through this name afterwards.
G = choose_largest_cc(G)
print(
    "Chose largest connected component with",
    G.number_of_nodes(),
    "nodes and",
    G.number_of_edges(),
    "edges",
)

Chose largest connected component with 41265 nodes and 657892 edges


In [60]:
# Group the graph's nodes by language. The result is a list of
# (language, nodes) pairs sorted by community size, largest first.
real_communities = get_real_communities(G, features)

print("There are ", len(real_communities), "communites with the folowing counts:")
for language, community in real_communities:
    print("-", language, len(community))

There are  19 communites with the folowing counts:
- DE 9270
- FR 6734
- ES 5562
- RU 4694
- ZH 2794
- PT 2460
- JA 1262
- IT 1210
- KO 1186
- PL 906
- SV 781
- TR 761
- NL 647
- TH 631
- FI 623
- CS 569
- DA 472
- HU 414
- NO 289


In [10]:
# metric = {}
# for node, degree in G.degree():
#     metric[node] = degree
# set_weights(G, metric, max)

In [11]:
# calculated_communities: List[Set[int]] = calculate_communities(G)

In [12]:
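# NOTE: this cell is disabled. The output shown below it was kept from an
# earlier execution (In [12]) and does not describe the graph built above.
# print("Found", len(calculated_communities), "communities with the folowing counts")
# for community in calculated_communities:
#     print("-", len(community))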

Found 15 communities with the folowing counts
- 18
- 10
- 9
- 7
- 7
- 5
- 4
- 3
- 3
- 3
- 3
- 2
- 2
- 2
- 2


In [13]:
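# NOTE: this cell is disabled and the output below it is stale. It reports an
# "EN" community although "EN" nodes are removed earlier in this notebook, so
# it appears to come from an earlier run on a different, much smaller sample.
# mapping_stats = calculate_mapping_stats(real_communities, calculated_communities)
# for language, stats in mapping_stats.items():
#     print("-", language, stats)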

- EN {'real_len': 76, 'calculated_len': 18, 'intersection_len': 18, 'accuracy': 0.23684210526315788}
- DE {'real_len': 3, 'calculated_len': 10, 'intersection_len': 0, 'accuracy': 0.0}
- NO {'real_len': 1, 'calculated_len': 9, 'intersection_len': 0, 'accuracy': 0.0}


In [14]:
# calculate_score(mapping_stats)

{'sum_real': 80, 'sum_intersection': 18, 'accuracy': 0.225}

In [18]:
# scores = eval(G, real_communities, 10)

In [21]:
# print(np.min(scores))
# print(np.max(scores))
# print(np.mean(scores))
# print(np.std(scores))

0.1875
0.3875
0.28874999999999995
0.07125000000000001


In [61]:
def metric_unitary(graph: nx.Graph) -> Dict[int, int]:
    """Give every node of ``graph`` the constant metric value 1."""
    return {node: 1 for node in graph.nodes()}


def metric_degree(graph: nx.Graph) -> Dict[int, int]:
    """Map every node of ``graph`` to its degree."""
    return {node: node_degree for node, node_degree in graph.degree()}


def metric_degree_centrality(graph: nx.Graph) -> Dict[int, float]:
    """Return ``nx.degree_centrality`` of ``graph``: one value per node."""
    return nx.degree_centrality(graph)


def metric_eigenvector_centrality(graph: nx.Graph) -> Dict[int, float]:
    """Return ``nx.eigenvector_centrality`` of ``graph`` with networkx's defaults."""
    return nx.eigenvector_centrality(graph)


def metric_pagerank(graph: nx.Graph) -> Dict[int, float]:
    """Return ``nx.pagerank`` of ``graph`` with networkx's default parameters."""
    return nx.pagerank(graph)


def metric_clustering(graph: nx.Graph) -> Dict[int, float]:
    """Return ``nx.clustering`` of ``graph``: one clustering value per node."""
    return nx.clustering(graph)


def metric_closeness_centrality(graph: nx.Graph) -> Dict[int, float]:
    """Return ``nx.closeness_centrality`` of ``graph``: one value per node."""
    return nx.closeness_centrality(graph)


def metric_betweeness_centrality(graph: nx.Graph) -> Dict[int, float]:
    """Return ``nx.betweenness_centrality`` of ``graph``: one value per node."""
    return nx.betweenness_centrality(graph)

In [64]:
# Experiments to run. Each entry is (label, metric function, aggregate):
# - label: used in the progress output and as the result file name,
# - metric function: called with the graph, returns one value per node,
# - aggregate: combines the metric values of an edge's two endpoints into the
#   edge weight (see set_weights).
tests = [
    ("unitary", metric_unitary, max),
    ("degree max", metric_degree, max),
    ("degree_centrality max", metric_degree_centrality, max),
    ("eigenvector_centrality max", metric_eigenvector_centrality, max),
    ("pagerank max", metric_pagerank, max),
    ("clustering max", metric_clustering, max),
    ("closeness_centrality max", metric_closeness_centrality, max),
    ("betweeness_centrality max", metric_betweeness_centrality, max),
]

In [67]:
import json
from pathlib import Path

# Create the output directory before any work is done, so that a missing
# "eval" folder cannot discard the result of a long evaluation at save time.
output_dir = Path("eval")
output_dir.mkdir(parents=True, exist_ok=True)

# For every experiment: weight the edges, run the repeated evaluation and
# store summary statistics plus the raw scores as JSON.
for name, metric_fn, aggregate in tests:
    result = {}
    print("Evaluating", name)
    set_weights(G, metric_fn(G), aggregate)
    print("Generated weights")
    print("Evaluating...")
    scores = eval(G, real_communities, 10)
    result["mean"] = np.mean(scores)
    result["std"] = np.std(scores)
    result["min"] = np.min(scores)
    result["max"] = np.max(scores)
    result["scores"] = scores
    print("Saving")
    with open(output_dir / f"{name}.json", "w") as f:
        json.dump(result, f, indent=4)

Evaluating unitary
Generated weights
Evaluating...
