In [92]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [93]:
import networkx as nx
import pandas as pd
import numpy as np

from collections import defaultdict
from typing import Dict, Set, List, Callable, Tuple, TypedDict
from utils import *
import json


class Stats(TypedDict):
    """Result of pairing one calculated community with one real community."""
    # Number of nodes in the real (language) community that was matched.
    real_len: int
    # Number of nodes in the calculated community.
    calculated_len: int
    # Number of nodes present in both communities.
    intersection_len: int
    # intersection_len divided by the size of the union of both communities.
    accuracy: float


class Score(TypedDict):
    """Aggregate score over all matched community pairs (see calculate_score)."""
    # Sum of the sizes of the matched real communities.
    sum_real: int
    # Sum of the overlaps between each matched pair.
    sum_intersection: int
    # Sum of the sizes of the calculated communities; calculate_score returns
    # this key, so it has to be declared here as well.
    sum_calculated: int
    # sum_intersection / max(sum_real, sum_calculated).
    accuracy: float


def load_features(src: str) -> pd.DataFrame:
    """Read the node-feature CSV and keep only the columns used downstream.

    Args:
        src: Path (or anything ``pd.read_csv`` accepts) of the features CSV.

    Returns:
        A frame with exactly the columns ``language`` and ``numeric_id``,
        in that order.

    Raises:
        KeyError: if either column is missing from the CSV.
    """
    wanted_columns = ["language", "numeric_id"]
    table = pd.read_csv(src)
    return table[wanted_columns]


def remove_features(df: pd.DataFrame, features_to_remove: List[str]) -> pd.DataFrame:
    """Drop every row whose ``language`` is listed in ``features_to_remove``.

    Args:
        df: Frame with a ``language`` column.
        features_to_remove: Language labels to filter out.

    Returns:
        A new frame without the matching rows, re-indexed from 0. The input
        frame is left untouched.
    """
    is_removed = df["language"].isin(features_to_remove)
    return df[~is_removed].reset_index(drop=True)


def keep_nodes(df: pd.DataFrame, number_of_nodes_to_keep: int) -> pd.DataFrame:
    """Keep a uniformly random subset of rows, each row at most once.

    Args:
        df: Frame with one row per node.
        number_of_nodes_to_keep: Number of rows to keep. Values larger than
            ``len(df)`` are clamped, so the whole frame is returned (shuffled).

    Returns:
        A new frame with the sampled rows, re-indexed from 0.

    Notes:
        Uses NumPy's global random state; call ``np.random.seed`` beforehand
        for a reproducible sample.
    """
    sample_size = min(number_of_nodes_to_keep, len(df))
    # Sample *positions* (not index labels) because they are fed to .iloc, and
    # sample without replacement so that no node is kept twice.
    positions = np.random.choice(len(df), size=sample_size, replace=False)
    return df.iloc[positions].reset_index(drop=True)


def load_edges(src: str, features: pd.DataFrame) -> pd.DataFrame:
    """Read the edge CSV and keep only edges between known nodes.

    Args:
        src: Path (or anything ``pd.read_csv`` accepts) of the edges CSV,
            which must have ``id_1`` and ``id_2`` columns.
        features: Frame whose ``numeric_id`` column lists the nodes to keep.

    Returns:
        The edges whose two endpoints both appear in ``features["numeric_id"]``,
        in file order and re-indexed from 0.
    """
    all_edges = pd.read_csv(src)
    known_ids = features["numeric_id"]

    # An edge survives only if *both* endpoints are known nodes.
    both_known = all_edges["id_1"].isin(known_ids) & all_edges["id_2"].isin(known_ids)
    return all_edges[both_known].reset_index(drop=True)


def create_graph(edges: pd.DataFrame) -> nx.Graph:
    """Build an undirected graph from an edge list.

    Args:
        edges: Frame with ``id_1`` and ``id_2`` columns, one row per edge.

    Returns:
        An ``nx.Graph`` containing one edge per row. Duplicate rows collapse
        into a single edge; nodes that appear in no edge are not added.
    """
    G = nx.Graph()
    # Add all edges in one bulk call instead of iterating row by row with
    # iterrows(), which builds a Series per row and is very slow on large
    # edge lists. tolist() yields plain Python scalars as node ids.
    G.add_edges_from(zip(edges["id_1"].tolist(), edges["id_2"].tolist()))
    return G


def choose_largest_cc(graph: nx.Graph) -> nx.Graph:
    """Restrict ``graph`` to its largest connected component.

    Args:
        graph: Undirected graph with at least one node.

    Returns:
        ``graph.subgraph(...)`` over the nodes of the biggest component.
        NOTE(review): networkx documents ``subgraph`` as a view of the
        original graph - callers that need to mutate it should ``.copy()``.

    Raises:
        ValueError: if ``graph`` has no nodes (``max`` of an empty sequence).
    """
    components = nx.connected_components(graph)
    biggest = max(components, key=len)
    return graph.subgraph(biggest)


def get_real_communities(
    grah: nx.Graph, features: pd.DataFrame
) -> List[Tuple[str, Set[int]]]:
    """Group the graph's nodes by their language label.

    Args:
        grah: Graph whose nodes are ``numeric_id`` values. (The parameter name
            is kept as-is for backward compatibility.)
        features: Frame with ``numeric_id`` and ``language`` columns.

    Returns:
        A list of ``(language, nodes)`` pairs, largest community first.

    Raises:
        IndexError: if a node of the graph has no row in ``features``.
    """
    # Build the id -> language lookup once instead of scanning the whole frame
    # for every node. keep="first" mirrors the previous `.iloc[0]` choice when
    # an id appears more than once.
    unique_rows = features.drop_duplicates(subset="numeric_id", keep="first")
    language_of = dict(zip(unique_rows["numeric_id"], unique_rows["language"]))

    real_communities: Dict[str, Set[int]] = defaultdict(set)
    for node in grah.nodes():
        try:
            language = language_of[node]
        except KeyError:
            # Same exception type the row-scan implementation raised.
            raise IndexError(f"node {node!r} has no row in features") from None
        real_communities[language].add(node)

    return sorted(
        real_communities.items(), key=lambda item: len(item[1]), reverse=True
    )


def set_weights(
    graph: nx.Graph,
    metric: Dict[int, float],
    aggregate: Callable[[List[float]], float],
):
    """Attach a weight to every edge, derived from its endpoints' metric.

    For each edge ``(u, v)`` the value ``aggregate([metric[u], metric[v]])`` is
    stored under the edge attribute ``"weights"``. The graph is modified in
    place and nothing is returned.

    Args:
        graph: Graph whose edges receive the attribute.
        metric: Per-node value; must contain every node that has an edge.
        aggregate: Combines the two endpoint values into one edge weight.

    Raises:
        KeyError: if an endpoint is missing from ``metric``.
    """
    edge_weights = {
        (u, v): aggregate([metric[u], metric[v]]) for u, v in graph.edges()
    }
    nx.set_edge_attributes(graph, edge_weights, "weights")


def calculate_communities(graph: nx.Graph, seed=None) -> List[Set[int]]:
    """Detect communities with networkx's ``asyn_lpa_communities``.

    The ``"weights"`` edge attribute is passed as the edge weight.

    Args:
        graph: Graph to partition.
        seed: Optional seed forwarded to ``asyn_lpa_communities``. The
            algorithm is randomized; pass a fixed value to get a reproducible
            partition. The default ``None`` keeps the previous behaviour.

    Returns:
        The detected communities as sets of nodes, largest first.
    """
    communities = nx.algorithms.community.asyn_lpa_communities(
        graph, weight="weights", seed=seed
    )
    return sorted(communities, key=len, reverse=True)


def calculate_mapping_stats(
    real_communities: List[Tuple[str, Set[int]]], calculated_communities: List[Set[int]]
) -> Dict[str, Stats]:
    """Greedily pair calculated communities with real (language) communities.

    The first ``min(len(real_communities), len(calculated_communities))``
    calculated communities are processed in the given order. Each one is
    paired with the not-yet-used real community that maximises
    ``|intersection| / |union|``; on ties the earliest real community wins.

    Args:
        real_communities: ``(language, nodes)`` pairs.
        calculated_communities: Detected communities as node sets.

    Returns:
        A dict keyed by language holding the stats of its pairing.

    Raises:
        ZeroDivisionError: if a real and a calculated community are both empty.
    """
    mapping: Dict[str, Stats] = {}
    taken = set()
    pair_count = min(len(real_communities), len(calculated_communities))

    for calculated in calculated_communities[:pair_count]:
        # Sentinel values: any real candidate (accuracy >= 0) beats them.
        best_language = ""
        best_real_len = -1
        best_overlap = -1
        best_accuracy = -1

        for language, real in real_communities:
            if language in taken:
                continue

            overlap = len(real.intersection(calculated))
            union_size = len(real) + len(calculated) - overlap
            accuracy = overlap / union_size

            # Strict comparison: the first of several equal candidates is kept.
            if accuracy > best_accuracy:
                best_language = language
                best_real_len = len(real)
                best_overlap = overlap
                best_accuracy = accuracy

        mapping[best_language] = {
            "real_len": best_real_len,
            "calculated_len": len(calculated),
            "intersection_len": best_overlap,
            "accuracy": best_accuracy,
        }
        taken.add(best_language)

    return mapping


def calculate_score(mapping_stats: Dict[str, Stats]) -> Score:
    """Aggregate per-language mapping stats into one overall score.

    Args:
        mapping_stats: Per-language stats, each with ``real_len``,
            ``intersection_len`` and ``calculated_len``.

    Returns:
        A dict with the three summed lengths and
        ``accuracy = sum_intersection / max(sum_real, sum_calculated)``.
        When there is nothing to score (empty mapping, or all lengths are 0)
        the accuracy is ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    sum_real = 0
    sum_intersection = 0
    sum_calculated = 0
    for stats in mapping_stats.values():
        sum_real += stats["real_len"]
        sum_intersection += stats["intersection_len"]
        sum_calculated += stats["calculated_len"]

    denominator = max(sum_real, sum_calculated)
    accuracy = sum_intersection / denominator if denominator else 0.0
    return {
        "sum_real": sum_real,
        "sum_intersection": sum_intersection,
        "sum_calculated": sum_calculated,
        "accuracy": accuracy,
    }


def eval(G: nx.Graph, real_communities: List[Tuple[str, Set[int]]], iterations: int):
    """Run community detection repeatedly and collect the accuracy of each run.

    Note: this function shadows Python's builtin ``eval`` in this namespace.

    Args:
        G: Graph to run community detection on.
        real_communities: Ground-truth ``(language, nodes)`` pairs.
        iterations: Number of independent detection runs.

    Returns:
        A list with one accuracy value per run, in run order.
    """

    def run_once() -> float:
        found = calculate_communities(G)
        stats = calculate_mapping_stats(real_communities, found)
        return calculate_score(stats)["accuracy"]

    return [run_once() for _ in range(iterations)]

In [94]:
# Load the per-node features (language and numeric id) of the Twitch dataset.
features = load_features("./datasets/twitch/large_twitch_features.csv")
print("Loaded", len(features), "nodes")

Loaded 168114 nodes


In [95]:
# Drop every node whose language label is "OTHER".
features_to_remove = ["OTHER"]
features = remove_features(features, features_to_remove)
print("Kept", len(features), "nodes after removing", features_to_remove)

Kept 166685 nodes after removing ['OTHER']


In [96]:
# Drop every node whose language label is "EN".
features_to_remove = ["EN"]
features = remove_features(features, features_to_remove)
print("Kept", len(features), "nodes after removing", features_to_remove)

Kept 42274 nodes after removing ['EN']


In [97]:
# Load the edges and keep only those whose two endpoints survived the filters.
edges = load_edges("./datasets/twitch/large_twitch_edges.csv", features)
print("Loaded", len(edges), "edges")

Loaded 657915 edges


In [98]:
# Build the undirected graph from the filtered edge list.
G = create_graph(edges)

print("Created graph with", len(G.nodes), "nodes and", len(G.edges), "edges")

Created graph with 41309 nodes and 657915 edges


In [99]:
# From here on, G is restricted to its largest connected component.
G = choose_largest_cc(G)
print(
    "Chose largest connected component with",
    len(G.nodes),
    "nodes and",
    len(G.edges),
    "edges",
)

Chose largest connected component with 41265 nodes and 657892 edges


In [101]:
# Ground-truth communities: the nodes of G grouped by language, largest first.
real_communities = get_real_communities(G, features)

# Report how many language communities exist and how many nodes each one has.
print("There are ", len(real_communities), "communites with the folowing counts:")
for language, community in real_communities:
    print("-", language, len(community))

There are  19 communites with the folowing counts:
- DE 9270
- FR 6734
- ES 5562
- RU 4694
- ZH 2794
- PT 2460
- JA 1262
- IT 1210
- KO 1186
- PL 906
- SV 781
- TR 761
- NL 647
- TH 631
- FI 623
- CS 569
- DA 472
- HU 414
- NO 289


In [102]:
# Select the "NO" community by its language label instead of by its position
# in the size-sorted list: the position silently changes whenever the filters
# or the dataset change, the label does not.
NO = dict(real_communities)["NO"]

In [103]:
# Every node of G that is not part of the NO community.
to_remove = [node for node in G.nodes() if node not in NO]

# Work on a copy so that G itself keeps all of its nodes and edges.
G_NO = G.copy()
G_NO.remove_nodes_from(to_remove)

In [104]:
# Size of the NO-only graph: node count, then edge count.
print(len(G_NO.nodes), len(G_NO.edges))

289 1265


In [106]:
# reset_colors / highlight_nodes come from utils; presumably they set node
# colour attributes used in the exported visualisation - confirm in utils.
reset_colors(G_NO)

# NO nodes that have at least one neighbour (in the full graph G) outside NO.
highlight = {
    node
    for node in G_NO.nodes()
    if any(nei not in NO for nei in G.neighbors(node))
}
print(len(highlight))

highlight_nodes(G_NO, highlight)

187


In [107]:
# Add to G_NO every edge of G that crosses the NO boundary, and remember the
# endpoints that lie outside NO.
externals = set()

for n1, n2 in G.edges:
    is_1 = n1 in NO
    is_2 = n2 in NO
    if is_1 == is_2:
        # Both endpoints inside NO (edge already present) or both outside
        # (edge is irrelevant here).
        continue
    internal, external = (n1, n2) if is_1 else (n2, n1)
    externals.add(external)
    G_NO.add_edge(internal, external)

In [108]:
# Lay the nodes out as a degree histogram: x grows with the node's degree and
# nodes sharing a degree are stacked along y.

# Count nodes per degree. The counts must come from G_NO - the same graph whose
# degrees are looked up below - because nodes outside NO have fewer edges in
# G_NO than in G; counting in G gives mismatched keys (KeyError or y <= 0).
deg = {}
for node, degree in G_NO.degree():
    deg[degree] = deg.get(degree, 0) + 1

for node in G_NO.nodes():
    degree = G_NO.degree(node)
    nbre_for_degree = deg[degree]
    # "viz" -> "position" is presumably picked up by the GEXF export as the
    # node's coordinates - confirm against the networkx GEXF writer.
    G_NO.nodes[node].setdefault("viz", {})["position"] = {
        "x": 20 * degree,
        "y": 20 * nbre_for_degree,
        "z": 0.0,
    }
    # Next node with the same degree goes one slot lower in the stack.
    deg[degree] -= 1

In [109]:
# Export the NO graph (with the boundary edges added above) as a GEXF file.
nx.write_gexf(G_NO, "graphs/twitch/NO.gexf")