In [2]:
import csv
import networkx as nx
from networkx.algorithms import pagerank
from collections import defaultdict
from datetime import datetime

## Constants
We define `FINAL_DATE` as the cutoff date for the entire project. It is used when filtering the network, so that any game released or created on or after the cutoff date is not included.

In [3]:
# Project-wide cutoff: generate_network_filter keeps a game only when both its
# release date and its created date are strictly earlier than this instant.
FINAL_DATE = datetime(2023, 1, 1)

## Graph Generation

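We have three functions for generating directed graphs with the `networkx` package. `generate_network_filter` builds a lookup of the games whose release date and created date both fall before the cutoff date; `get_weighted_edges_from_csv` then drops every edge in the collated related games file whose two endpoints are not both in that lookup; and `generate_graph_from_edges` turns the remaining weighted edges into a directed graph.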

In [4]:
def generate_network_filter(filename: str):
    """Build a lookup of the game IDs that pass the project cutoff date.

    Reads the CSV at ``filename``, skips its first (header) row, and for every
    remaining non-blank row parses column 3 as a ``%Y-%m-%d`` release date and
    column 4 as a ``%Y-%m-%dT%H:%M:%SZ`` created date. The value in column 0 is
    marked ``True`` when both dates are strictly earlier than ``FINAL_DATE``.

    Args:
        filename: Path to the games information CSV.

    Returns:
        A ``defaultdict(bool)`` mapping accepted IDs (column 0) to ``True``.
        Rejected IDs are never inserted, so ``.get(game_id)`` returns ``None``
        for them. An empty or header-only file yields an empty mapping.

    Raises:
        ValueError: If a date column does not match its expected format.
        IndexError: If a non-blank row has fewer than five columns.
    """
    # Timestamp substituted when the created-date column holds the literal
    # string "None". NOTE(review): the origin of this exact value is not
    # visible in this file - confirm it is the intended placeholder.
    missing_created_fallback = "2017-10-22T05:21:29Z"

    filter_map = defaultdict(bool)
    # newline='' lets the csv module do its own newline handling, as its
    # documentation recommends for file objects passed to csv.reader.
    with open(filename, 'r', newline='') as openfile:
        csv_reader = csv.reader(openfile)
        # Skip the header; the default avoids StopIteration on an empty file.
        next(csv_reader, None)
        for row in csv_reader:
            if not row:
                # csv.reader yields [] for blank lines; nothing to evaluate.
                continue
            release_date = datetime.strptime(row[3], "%Y-%m-%d")
            created_raw = missing_created_fallback if row[4] == "None" else row[4]
            created_date = datetime.strptime(created_raw, "%Y-%m-%dT%H:%M:%SZ")
            # Both dates must be before the cutoff for the game to be kept.
            if created_date < FINAL_DATE and release_date < FINAL_DATE:
                filter_map[row[0]] = True

    return filter_map

def get_weighted_edges_from_csv(filename, filter=None):
    """Read ``(source, target, weight)`` edges from a CSV file.

    The first row of the file is treated as a header and skipped. For every
    other non-blank row, columns 0 and 1 are the edge endpoints and column 2
    is converted to ``int`` as the weight.

    Args:
        filename: Path to the edge-list CSV.
        filter: Optional mapping queried with ``.get(node_id)``. When given,
            an edge is kept only if the lookup is truthy for both endpoints.
            When ``None``, every edge is kept.

    Returns:
        A list of ``(source, target, weight)`` tuples in file order. An empty
        or header-only file yields an empty list.

    Raises:
        ValueError: If a kept row's weight column is not an integer literal.
        IndexError: If a non-blank row has too few columns.
    """
    edges = []
    # newline='' lets the csv module do its own newline handling, as its
    # documentation recommends for file objects passed to csv.reader.
    with open(filename, 'r', newline='') as openfile:
        csv_reader = csv.reader(openfile)
        # Skip the header; the default avoids StopIteration on an empty file.
        next(csv_reader, None)
        for row in csv_reader:
            if not row:
                # csv.reader yields [] for blank lines; there is no edge here.
                continue
            source, target = row[0], row[1]
            # With a filter, both endpoints must be present and truthy.
            if filter is not None and not (filter.get(source) and filter.get(target)):
                continue
            edges.append((source, target, int(row[2])))

    return edges

def generate_graph_from_edges(edges_list):
    """Create a ``networkx`` directed graph from weighted edge triples.

    Args:
        edges_list: Iterable of ``(source, target, weight)`` entries, passed
            unchanged to ``DiGraph.add_weighted_edges_from``.

    Returns:
        The populated ``nx.DiGraph``.
    """
    digraph = nx.DiGraph()
    digraph.add_weighted_edges_from(edges_list)
    return digraph

## Metric Functions

Here are some functions that we use to measure the network. Currently, we measure betweenness centrality and PageRank popularity; community detection may be added in the future.

In [5]:
def find_top_n_pagerank_nodes(g: nx.Graph, n=1):
    """Return the ``n`` nodes with the highest PageRank score, best first.

    Args:
        g: Graph handed to ``pagerank``.
        n: Number of top-ranked nodes to return.

    Returns:
        A list of at most ``n`` node keys in descending score order.
    """
    scores = pagerank(g)
    # Sorting the keys by their score is stable, so ties keep dict order.
    ranked_nodes = sorted(scores, key=scores.get, reverse=True)
    return ranked_nodes[:n]

def get_graph_order_size(g: nx.Graph):
    """Return ``(g.order(), g.size())`` for the given graph as a tuple."""
    node_count = g.order()
    edge_count = g.size()
    return node_count, edge_count

def find_top_n_betweenness_centrality_nodes(g: nx.Graph, n=10):
    """Return the nodes that are in the top ``n`` for both centrality measures.

    Degree centrality and (normalized, endpoint-inclusive) betweenness
    centrality are computed for ``g``. A node is returned only if it is among
    the ``n`` highest by degree centrality and also among the ``n`` highest by
    betweenness centrality, so the result may hold fewer than ``n`` nodes.

    Args:
        g: Graph to analyse.
        n: Size of each top-``n`` candidate list.

    Returns:
        A list of node keys ordered by descending betweenness centrality.
        The order is deterministic, unlike converting a set intersection to a
        list, whose order for string keys varies with hash randomization.
    """
    degree_centrality = nx.degree_centrality(g)
    betweenness_centrality = nx.betweenness_centrality(g, normalized=True, endpoints=True)

    # Membership-only view of the top-n nodes by degree centrality.
    top_degree = set(
        sorted(degree_centrality, key=degree_centrality.get, reverse=True)[:n]
    )
    # Ranked top-n by betweenness; this ordering is kept in the result.
    top_betweenness = sorted(
        betweenness_centrality, key=betweenness_centrality.get, reverse=True
    )[:n]

    return [node for node in top_betweenness if node in top_degree]

## Unlimited Related Games Network

We generate a network from the entire related games network, filtered using the previously mentioned function. This gives us a directed graph with **30,434 nodes** and **14,744,682 edges**. Without filtering, we get a network that has **30,970 nodes** and **14,857,762 edges**. The filtering removes **536 nodes** and **113,080 edges**.

In [6]:
filter_filename = "../data/games_information/all_games.csv"
graph_filename = "../data/too_big/all_games.csv"
filter_map = generate_network_filter(filter_filename)

# Build the full network twice - first with the date filter, then without -
# and report (node count, edge count) for each. `edges` and `graph` end up
# holding the unfiltered build, which is the last one in the sequence.
for label, active_filter in (("FILTERED", filter_map), ("UNFILTERED", None)):
    edges = get_weighted_edges_from_csv(graph_filename, filter=active_filter)
    graph = generate_graph_from_edges(edges)
    print(f"{label}: {get_graph_order_size(graph)}")

FILTERED: (30434, 14744682)
UNFILTERED: (30970, 14857762)


In [None]:
# Rebuild the date-filtered graph from the full edge file, then print the nodes
# returned by find_top_n_betweenness_centrality_nodes for n=25.
# NOTE(review): this cell has no execution count and no recorded output, so it
# appears not to have been run to completion - confirm before relying on it.
filter_filename = "../data/games_information/all_games.csv"
graph_filename = "../data/too_big/all_games.csv"
filter_map = generate_network_filter(filter_filename)
edges = get_weighted_edges_from_csv(graph_filename, filter=filter_map)
graph = generate_graph_from_edges(edges)
highest_bc_nodes = find_top_n_betweenness_centrality_nodes(graph, n=25)
print(highest_bc_nodes)

## Other Percentage Limited Networks 

Using limited versions of the full network (for example, the 10% network is built from the collated related games of the first 10% of games in the `data/related_games/` directory, ~3000 games), we can see how the network evolves over time. The number of nodes, or games, represented in the network grows roughly logarithmically as time progresses.

| Percentage | # Nodes | # Edges |
|----|---|---|
| 10 | 21713 | 1798823 | 
| 20 | 25919 | 3073789 | 
| 30 | 27426 | 4205049 | 
| 40 | 28594 | 5585984 | 
| 50 | 29345 | 7238562 | 
| 60 | 29776 | 9015808 | 
| 70 | 30088 | 10609558 | 
| 80 | 30292 | 12132973 | 
| 90 | 30398 | 13535490 | 

In [7]:
filter_filename = "../data/games_information/all_games.csv"
filter_map = generate_network_filter(filter_filename)

# One edge file per 10% step (10, 20, ..., 90), keyed by the percentage as a
# string; the generated paths match the files under ../data/too_big/.
percentages_files = {
    str(step): f"../data/too_big/all_games_{step}_percent.csv"
    for step in range(10, 100, 10)
}

# Report (node count, edge count) of the date-filtered graph for each step.
for percentage, filename in percentages_files.items():
    edges = get_weighted_edges_from_csv(filename, filter=filter_map)
    graph = generate_graph_from_edges(edges)
    print(f"{percentage}: {get_graph_order_size(graph)}")

10: (21713, 1798823)
20: (25919, 3073789)
30: (27426, 4205049)
40: (28594, 5585984)
50: (29345, 7238562)
60: (29776, 9015808)
70: (30088, 10609558)
80: (30292, 12132973)
90: (30398, 13535490)


In [9]:
# Build the date-filtered graph from the 10% edge file and print its
# (node count, edge count).
filter_filename = "../data/games_information/all_games.csv"
graph_filename = "../data/too_big/all_games_10_percent.csv"
filter_map = generate_network_filter(filter_filename)
edges = get_weighted_edges_from_csv(graph_filename, filter=filter_map)
graph = generate_graph_from_edges(edges)
print(get_graph_order_size(graph))

# Nodes that are in the top 25 by both degree centrality and betweenness
# centrality; the result can therefore contain fewer than 25 entries.
highest_bc_nodes = find_top_n_betweenness_centrality_nodes(graph, n=25)
print(highest_bc_nodes)

(21713, 1798823)
['k6qew96g', 'y65457de', 'm1mp9312', '76rmo418', '4d7eog67', 'w6jkle1j', 'kdkz7ldm', '9doye36p', 'kdkzvmld', '9d3rr0dl', 'kyd4gde4', 'l3dx51yv', 'y658506e', 'v1p0x468', 'm9do0odp', 'pd0wq31e', 'om1m3625', '46w2l76r', 'nd28gvd0', '3dxzqv1y', '369go31l']


In [None]:
# Draw the current `graph` with nodes placed on a circle. Nodes are hidden
# (size 0, no labels) and edges are drawn faint so that density shows up as
# darker regions.
# networkx exposes this layout as `circular_layout`; `nx.circular` is not a
# networkx function and would raise AttributeError.
pos = nx.circular_layout(graph)
nx.draw_networkx(
    graph,
    pos=pos,
    node_size=0,
    edge_color="#444444",
    alpha=0.05,
    with_labels=False)

In [None]:
# Draw the current `graph` with a force-directed (spring) layout. Nodes are
# hidden (size 0, no labels) and edges are drawn faint so that density shows
# up as darker regions.
# spring_layout starts from random positions; fixing the seed makes the figure
# reproducible across kernel restarts.
pos = nx.spring_layout(graph, k=0.1, seed=42)
nx.draw_networkx(
    graph,
    pos=pos,
    node_size=0,
    edge_color="#444444",
    alpha=0.05,
    with_labels=False)

## Trying `graph-tool` for Fast Network Algorithms

In [12]:
# NOTE(review): the output recorded for this cell is
# "ModuleNotFoundError: No module named 'graph_tool'", so graph-tool was not
# importable in the kernel that ran it; see the installation link below.
# NOTE(review): the wildcard import binds graph_tool.all names directly in the
# notebook namespace; a namespaced import would make their origin explicit -
# check later cells before changing it.
from graph_tool.all import *
# https://git.skewed.de/count0/graph-tool/-/wikis/installation-instructions

ModuleNotFoundError: No module named 'graph_tool'