In [97]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [98]:
from utils import reset_colors

import networkx as nx
import pandas as pd
import numpy as np

from collections import defaultdict
from typing import Dict, Set, List

In [104]:
features_to_remove = ["EN", "OTHER"]

features = pd.read_csv("./datasets/twitch/large_twitch_features.csv")
print("Loaded", len(features.index), "nodes")
features = features[~features["language"].isin(features_to_remove)]
features = features[["language", "numeric_id"]]
print("Kept", len(features.index), "nodes")
features.reset_index(inplace=True, drop=True)

print("languages ", features["language"].unique())

Loaded 168114 nodes
Kept 42274 nodes
languages ['FR' 'KO' 'JA' 'RU' 'PL' 'DE' 'ES' 'IT' 'PT' 'TR' 'ZH' 'SV' 'NL' 'TH'
 'CS' 'DA' 'HU' 'FI' 'NO']


In [105]:
ids = features["numeric_id"]

In [106]:
edges = pd.read_csv("./datasets/twitch/large_twitch_edges.csv")
print("Loaded", len(edges.index), "edges")
edges_1 = edges[edges["id_1"].isin(ids)]
edges_2 = edges[edges["id_2"].isin(ids)]
index = pd.Index.intersection(edges_1.index, edges_2.index)
edges = edges.iloc[index]
print("Kept", len(edges.index), "edges")
edges.reset_index(inplace=True, drop=True)

Loaded 6797557 edges
Kept 657915 edges


In [107]:
G = nx.Graph()

for row in edges.iterrows():
    G.add_edge(row[1]["id_1"], row[1]["id_2"])

print(
    "Created graph with", G.number_of_nodes(), "nodes and", G.number_of_edges(), "edges"
)

largest_cc = max(nx.connected_components(G), key=len)
G = G.subgraph(largest_cc)
print(
    "Chose largest connected component with",
    G.number_of_nodes(),
    "nodes and",
    G.number_of_edges(),
    "edges",
)

Created graph with 41309 nodes and 657915 edges
Chose largest connected component with 41265 nodes and 657892 edges


In [108]:
reset_colors(G)

In [109]:
nx.write_gexf(G, "graphs/twitch/graph.gexf")

In [None]:
real_communities: Dict[str, Set[int]] = defaultdict(set)
for node in G.nodes:
    language = features[features["numeric_id"] == node].iloc[0]["language"]
    real_communities[language].add(node)
real_communities_counts = [
    (language, len(nodes)) for language, nodes in real_communities.items()
]

In [131]:
print(
    "There are ", len(real_communities.keys()), "communites with the folowing counts:"
)
for x in sorted(real_communities_counts, key=lambda x: x[1], reverse=True):
    print("-", x[0], x[1])

There are  19 communites with the folowing counts:
- DE 9270
- FR 6734
- ES 5562
- RU 4694
- ZH 2794
- PT 2460
- JA 1262
- IT 1210
- KO 1186
- PL 906
- SV 781
- TR 761
- NL 647
- TH 631
- FI 623
- CS 569
- DA 472
- HU 414
- NO 289


In [125]:
# highlight_nodes_communities(G, [nodes for _, nodes in real_communities.items()])
# nx.write_gexf(G, "graphs/twitch/languages.gexf")

In [181]:
weights = {}
for edge in G.edges():
    n1, n2 = edge
    d1, d2 = G.degree(n1), G.degree(n2)
    # weights[edge] = 1
    weights[edge] = max(d1, d2)
    # weights[edge] = max(d1, d2) ** 2
    # weights[edge] = (d1 + d2)
    # weights[edge] = (d1 + d2) ** 2

nx.set_edge_attributes(G, weights, "weights")

calculated_communities: List[Set[int]] = list(
    nx.algorithms.community.asyn_lpa_communities(G, weight="weights")
)
calculated_communities_counts = [len(com) for com in calculated_communities]

In [182]:
print("Found", len(calculated_communities), "communities with the folowing counts")
for count in sorted(calculated_communities_counts, reverse=True):
    print("-", count)

Found 21 communities with the folowing counts
- 9496
- 6907
- 4706
- 3194
- 2817
- 2640
- 2551
- 1196
- 1189
- 925
- 867
- 746
- 700
- 633
- 566
- 561
- 544
- 425
- 382
- 216
- 4


In [117]:
# highlight_nodes_communities(G, calculated_communities)
# nx.write_gexf(G, "graphs/twitch/calculated_communities.gexf")

In [183]:
total_score = 0
for language, nodes in real_communities.items():
    best = (0, -1)
    for i, community in enumerate(calculated_communities):
        intersection_len = len(community.intersection(nodes))
        if intersection_len > best[0]:
            best = (intersection_len, i)
    total_score += best[0]
    print(f"Matched {language} ({len(nodes)}) with {best[1]} ({best[0]} in common)")

Matched NL (647) with 0 (529 in common)
Matched NO (289) with 1 (199 in common)
Matched DE (9270) with 2 (9175 in common)
Matched HU (414) with 3 (376 in common)
Matched JA (1262) with 4 (863 in common)
Matched DA (472) with 8 (413 in common)
Matched FR (6734) with 6 (6701 in common)
Matched FI (623) with 9 (551 in common)
Matched ES (5562) with 10 (2790 in common)
Matched SV (781) with 5 (638 in common)
Matched ZH (2794) with 16 (2773 in common)
Matched IT (1210) with 11 (1163 in common)
Matched PT (2460) with 12 (2394 in common)
Matched PL (906) with 14 (852 in common)
Matched TH (631) with 15 (624 in common)
Matched TR (761) with 17 (735 in common)
Matched RU (4694) with 7 (4598 in common)
Matched CS (569) with 18 (555 in common)
Matched KO (1186) with 19 (1143 in common)


In [184]:
print("Out of", G.number_of_nodes(), "nodes,", total_score, "nodes matched correctly")
print("Accuracy", total_score / G.number_of_nodes())

Out of 41265 nodes, 37072 nodes matched correctly
Accuracy 0.8983884648006786
