In [1]:
import csv
import networkx as nx

## Related Games File and Games Information Discrepancy

In the cell below we look for games that appear in the collated version of `data/related_games/` but not in `data/games_metadata/all_games.csv`. This turns up two games that are present in the related-games data but missing from the games information: `k6qw78o6` and `4d7e8g67`. I have no idea what these games are, and they can no longer be found through the speedrun.com API.

In [2]:
def _read_first_column(csv_filename):
    """Return the set of first-column values of a CSV file, skipping its header row."""
    # newline='' is how the csv module expects files to be opened, so that
    # quoted fields containing line breaks are parsed correctly.
    with open(csv_filename, 'r', newline='') as openfile:
        csv_reader = csv.reader(openfile)
        # The default keeps a completely empty file from raising StopIteration.
        next(csv_reader, None)
        # csv.reader yields an empty list for blank lines; skip those rows.
        return {row[0] for row in csv_reader if row}


def find_difference_between_gamesinfo_and_toobig(gamesinfo_filename, toobig_filename):
    """Find IDs listed in the "too big" file but absent from the games-info file.

    Only the first column of each CSV is compared, and the first row of each
    file is treated as a header and ignored.

    Args:
        gamesinfo_filename: Path to the games-information CSV.
        toobig_filename: Path to the collated related-games CSV.

    Returns:
        A sorted list of the first-column values that occur in
        ``toobig_filename`` but not in ``gamesinfo_filename``. Sorting makes
        the result reproducible across runs.

    Raises:
        OSError: If either file cannot be opened.
    """
    games_from_toobig = _read_first_column(toobig_filename)
    games_from_gamesinfo = _read_first_column(gamesinfo_filename)
    return sorted(games_from_toobig - games_from_gamesinfo)

def find_number_of_games(filename):
    """Count the distinct values in the first two columns of a CSV file.

    The first row is treated as a header and ignored.

    Args:
        filename: Path to the CSV file to scan.

    Returns:
        The number of unique values seen across columns 0 and 1.

    Raises:
        OSError: If the file cannot be opened.
    """
    games = set()
    with open(filename, 'r', newline='') as openfile:
        csv_reader = csv.reader(openfile)
        # The default keeps a completely empty file from raising StopIteration.
        next(csv_reader, None)
        # Both columns must be collected in the same pass: the reader is a
        # one-shot iterator, so a second loop over it would see no rows.
        for row in csv_reader:
            # Slicing tolerates blank or single-column rows.
            games.update(row[:2])
    return len(games)

In [3]:
# IDs present in the collated related-games file but absent from the games metadata.
find_difference_between_gamesinfo_and_toobig(
    gamesinfo_filename="../data/games_metadata/all_games.csv",
    toobig_filename="../data/too_big/all_games.csv",
)

['k6qw78o6', '4d7e8g67']

## Related Games Graph and Games Information Discrepancy

In this series of functions, we look for the discrepancy between the games in the generated graph and the list of games in `data/games_metadata/all_games.csv`. We find that 2016 games from the list are missing from the generated network. There are several scenarios in which a game is not present in the network:
 1. A game has zero runs;
 2. A game is played by ONLY guests;
 3. A game is played by users, but those users have only played the one game. Since we are not allowing edges with the same source and target, they are not present in the network.

In [4]:
def get_weighted_edges_from_csv(filename, filter=None):
    """Load ``(source, target, weight)`` edge tuples from a CSV file.

    The first row is treated as a header and ignored, and blank rows are
    skipped. Columns 0 and 1 are kept as strings; column 2 is converted to
    ``int``.

    Args:
        filename: Path to the edge-list CSV.
        filter: Optional object with a ``.get`` method (e.g. a dict). When
            given, a row is kept only if ``filter.get(...)`` is truthy for
            both of its endpoints. ``None`` keeps every row. The name shadows
            the builtin but is kept so keyword callers keep working.

    Returns:
        A list of ``(source, target, weight)`` tuples in file order.

    Raises:
        OSError: If the file cannot be opened.
        IndexError: If a kept, non-blank row has fewer than three columns.
        ValueError: If the weight of a kept row is not an integer literal.
    """
    edges = []
    # newline='' is how the csv module expects files to be opened.
    with open(filename, 'r', newline='') as openfile:
        csv_reader = csv.reader(openfile)
        # The default keeps a completely empty file from raising StopIteration.
        next(csv_reader, None)

        for row in csv_reader:
            if not row:
                # csv.reader yields an empty list for blank lines.
                continue
            if filter is not None and not (filter.get(row[0]) and filter.get(row[1])):
                continue
            edges.append((row[0], row[1], int(row[2])))

    return edges

def generate_graph_from_edges(edges_list):
    """Build a directed NetworkX graph from weighted edge triples.

    Args:
        edges_list: Iterable of ``(source, target, weight)`` triples.

    Returns:
        A new ``nx.DiGraph`` containing one weighted edge per triple.
    """
    graph = nx.DiGraph()
    graph.add_weighted_edges_from(edges_list)
    return graph

def find_missing_games_from_network(games_filename, graph_filename):
    """List the games from the games file that are not nodes of the network.

    Args:
        games_filename: Path to a CSV whose first column holds game IDs; the
            first row is skipped as a header.
        graph_filename: Path to the edge-list CSV the network is built from.

    Returns:
        A list (in no particular order) of first-column values from
        ``games_filename`` that are not nodes of the graph.
    """
    # Build the network from every edge in the file (no endpoint filtering).
    network = generate_graph_from_edges(
        get_weighted_edges_from_csv(graph_filename, filter=None)
    )
    network_nodes = set(network.nodes)

    with open(games_filename, 'r') as openfile:
        rows = csv.reader(openfile)
        next(rows)  # skip the header row
        listed_games = {row[0] for row in rows}

    return list(listed_games - network_nodes)


In [5]:
games_filename = "../data/games_metadata/all_games.csv"
graph_filename = "../data/too_big/all_games.csv"

missing_games = find_missing_games_from_network(
    games_filename=games_filename,
    graph_filename=graph_filename,
)

# Only the count is shown here; print(missing_games) would list every ID.
print(len(missing_games))

2016


## Games Information Dataframe and Filtered Graph Discrepancy

We filter the games information dataframe to explain the discrepancy between the number of games and the number of nodes in the related games graph. We keep only games whose created date and release date are both before 2023-01-01, whose number of runs is greater than zero, and whose number of users is greater than zero. This leaves 30677 games, which differs by 243 from the number found in `network_analysis.ipynb`. I suspect the remaining difference is due to the third reason listed above: the users who have played such a game have *only* played that game, which removes it from the network.

In [6]:
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt

In [7]:
# Load the per-game metadata table.
df = pd.read_csv('../data/games_metadata/all_games.csv')

# Cleaning steps below: drop rows whose created_date is the literal string 'None',
# drop rows with any missing value, parse both date columns, and keep only games
# created AND released before 2023-01-01.
df.drop(df[df['created_date'] == 'None'].index, inplace=True)

# Remove every row that still has a missing value in any column.
df = df.dropna()

# Parse the date columns; created_date must match the exact 'YYYY-MM-DDTHH:MM:SSZ' layout.
df['release_date'] = pd.to_datetime(df['release_date'])
df['created_date'] = pd.to_datetime(df['created_date'], format='%Y-%m-%dT%H:%M:%SZ')

# Round-trip through a 'YYYY-MM-DD' string to discard the time-of-day component.
df['release_date'] = pd.to_datetime(df['release_date'].dt.strftime('%Y-%m-%d'))
df['created_date'] = pd.to_datetime(df['created_date'].dt.strftime('%Y-%m-%d'))

# Keep only rows where both dates fall strictly before 2023-01-01.
df = df[(df['created_date'] < '2023-01-01') & (df['release_date'] < '2023-01-01')]

# Coerce the count columns to numeric dtypes.
# NOTE(review): assigning into a boolean-filtered frame may emit SettingWithCopyWarning
# on some pandas versions — confirm, and consider an explicit .copy() after the filter.
df[['num_categories', 'num_levels', 'num_runs', 'num_users', 'num_guests']] = df[['num_categories', 'num_levels', 'num_runs', 'num_users', 'num_guests']].apply(pd.to_numeric)

# Keep only games with a non-zero number of runs and a non-zero number of users.
df = df[(df['num_runs'] != 0) & (df['num_users'] != 0)]
# Show the row count, column dtypes and memory usage of the filtered frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30677 entries, 0 to 32827
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   game_id         30677 non-null  object        
 1   game_name       30677 non-null  object        
 2   developers      30677 non-null  object        
 3   release_date    30677 non-null  datetime64[ns]
 4   created_date    30677 non-null  datetime64[ns]
 5   num_categories  30677 non-null  int64         
 6   num_levels      30677 non-null  int64         
 7   num_runs        30677 non-null  int64         
 8   num_users       30677 non-null  int64         
 9   num_guests      30677 non-null  int64         
dtypes: datetime64[ns](2), int64(5), object(3)
memory usage: 2.6+ MB
