In [1]:
# Standard-library modules and the MongoDB driver.
import sys, os
from pymongo import MongoClient

# Plotly offline plotting. The star import below is what supplies the Bar,
# Layout and Figure names used by the plotting cells in this notebook.
from plotly.offline import init_notebook_mode, iplot
from plotly.graph_objs import *

# https://stackoverflow.com/questions/41323423/plotly-inside-jupyter-notebook-python
init_notebook_mode(connected=True) # initiate notebook for offline plot

# pandas turns the Mongo query results into DataFrames.
import pandas as pd

In [2]:
# Make the parent directory importable so the shared project modules resolve.
# The guard keeps re-runs of this cell from piling duplicate ".." entries
# onto sys.path.
if ".." not in sys.path:
    sys.path.append("..")
import config # ../config.py
import common # ../common.py

# MongoDB host for this notebook. The default is the Dell machine that was
# hard-coded here before; set the MONGODB_IP environment variable to target
# another host (e.g. 192.168.1.124 for the Asus laptop) without editing code.
config.mongodb_ip = os.environ.get("MONGODB_IP", "192.168.1.171")

# Shared handles used by every query cell below.
client = MongoClient(host=config.mongodb_ip, port=config.mongodb_port)
db = client['steam']

## Failed appids excluded from future attempts or analysis

In [3]:
# Apps with three or more recorded failures. Only appid and name are
# projected; Mongo's _id field is suppressed.
failed_filter = {"failureCount": {"$gte": 3}}
failed_projection = {"appid": 1, "name": 1, "_id": False}
failed_appids_df = pd.DataFrame(list(db['apps'].find(failed_filter, failed_projection)))
failed_appids_df

Unnamed: 0,name,appid
0,Hammer Heads Deluxe Demo,3402
1,Rocket Mania Deluxe Demo,3442
2,Legend: Hand of God,12630
3,The Settlers: Heritage of Kings,15310
4,The Settlers: Rise of an Empire Gold Edition,19930
...,...,...
12839,DRAGON BALL Z: KAKAROT Dragon Palace Bowl,1144642
12840,DRAGON BALL Z: KAKAROT Steaming-Hot Grilled Fish,1144644
12841,DRAGON BALL Z: KAKAROT Aged Wild Steak,1144641
12842,GROUND BRANCH CTE Dedicated Server,1207110


In [4]:
# Count apps per last-updated day, skipping apps that have failed three or
# more times (a document without failureCount is treated as not failed).
apps_cursor = db['apps'].aggregate([
    {"$match": {"$or": [{"failureCount": {"$lt": 3}},
                        {"failureCount": {"$exists": False}}]}},
    {"$group": {"_id": {"$dateToString": {"format": "%Y-%m-%d", "date": "$updated_date"}},
                "count": {"$sum": 1}}},
    {"$sort": {"_id": 1}},
])

# Declaring the columns keeps both of them present when the aggregation
# returns no rows, and renaming by the "_id" key instead of by position
# avoids the IndexError that columns[0] raises on an empty frame.
grouped_df = pd.DataFrame(list(apps_cursor), columns=["_id", "count"])
grouped_df = grouped_df.rename(columns={"_id": "updated_date"})
grouped_df

Unnamed: 0,updated_date,count
0,2019-12-28,1622
1,2019-12-29,3926
2,2019-12-30,3561
3,2019-12-31,3497
4,2020-01-01,3497
5,2020-01-02,3993
6,2020-01-03,3480
7,2020-01-04,3473
8,2020-01-05,3996
9,2020-01-06,3494


## A large number of old AppID entries means `refreshsteam.py` needs to be run.  This chart excludes AppIDs that have failed consistently.

In [5]:
# Bar chart of the per-day app counts.
# Reference: https://plot.ly/python/bar-charts/

data = [
    Bar(x=grouped_df['updated_date'], y=grouped_df['count']),
]

layout = Layout(
    title='AppIDs By Last Updated Date',
    xaxis={'title': 'Date'},
    yaxis={'title': 'Count'},
)

fig = Figure(data=data, layout=layout)
iplot(fig)

In [6]:
# Count pricehistory records per day. There is no $match stage because every
# document is counted (the previous empty {"$match": {}} filtered nothing).
price_cursor = db['pricehistory'].aggregate([
    {"$group": {"_id": {"$dateToString": {"format": "%Y-%m-%d", "date": "$date"}},
                "count": {"$sum": 1}}},
    {"$sort": {"_id": 1}},
])

# Declaring the columns keeps both of them present when the aggregation
# returns no rows, and renaming by the "_id" key instead of by position
# avoids the IndexError that columns[0] raises on an empty frame.
grouped_df = pd.DataFrame(list(price_cursor), columns=["_id", "count"])
grouped_df = grouped_df.rename(columns={"_id": "date"})
grouped_df

Unnamed: 0,date,count
0,2018-09-04,8215
1,2018-09-05,43441
2,2018-09-06,37421
3,2018-09-07,23655
4,2018-09-08,37188
...,...,...
434,2020-01-15,28705
435,2020-01-16,28979
436,2020-01-17,28293
437,2020-01-18,28595


## This is historical data from the `pricehistory` Mongo collection.  The collection is updated via the `updatepricehistory.py` script.

In [7]:
# Bar chart of pricehistory record counts per day.
# Reference: https://plot.ly/python/bar-charts/

data = [
    Bar(x=grouped_df['date'], y=grouped_df['count']),
]

layout = Layout(
    title='Distribution of Price History',
    xaxis={'title': 'Date'},
    yaxis={'title': 'Records Count'},
)

fig = Figure(data=data, layout=layout)
iplot(fig)

In [8]:
# Count steamusers records per day. There is no $match stage because every
# document is counted (the previous empty {"$match": {}} filtered nothing).
users_cursor = db['steamusers'].aggregate([
    {"$group": {"_id": {"$dateToString": {"format": "%Y-%m-%d", "date": "$date"}},
                "count": {"$sum": 1}}},
    {"$sort": {"_id": 1}},
])

# Declaring the columns keeps both of them present when the aggregation
# returns no rows, and renaming by the "_id" key instead of by position
# avoids the IndexError that columns[0] raises on an empty frame.
grouped_df = pd.DataFrame(list(users_cursor), columns=["_id", "count"])
grouped_df = grouped_df.rename(columns={"_id": "date"})
grouped_df

Unnamed: 0,date,count
0,2018-12-19,119
1,2018-12-20,340
2,2018-12-21,751
3,2018-12-22,1021
4,2018-12-23,932
...,...,...
389,2020-01-14,627
390,2020-01-15,629
391,2020-01-16,631
392,2020-01-17,464


## This is the number of records we have per day in the `steamusers` Mongo collection.  The collection is populated by the `steamusers.py` script.

In [9]:
# Bar chart of steamusers record counts per day.
# Reference: https://plot.ly/python/bar-charts/

data = [
    Bar(x=grouped_df['date'], y=grouped_df['count']),
]

layout = Layout(
    title='Distribution of Records Per Day for steamusers',
    xaxis={'title': 'Date'},
    yaxis={'title': 'Records Count'},
)

fig = Figure(data=data, layout=layout)
iplot(fig)

In [10]:
# Count topgames records per day. There is no $match stage because every
# document is counted (the previous empty {"$match": {}} filtered nothing).
topgames_cursor = db['topgames'].aggregate([
    {"$group": {"_id": {"$dateToString": {"format": "%Y-%m-%d", "date": "$date"}},
                "count": {"$sum": 1}}},
    {"$sort": {"_id": 1}},
])

# Declaring the columns keeps both of them present when the aggregation
# returns no rows, and renaming by the "_id" key instead of by position
# avoids the IndexError that columns[0] raises on an empty frame.
grouped_df = pd.DataFrame(list(topgames_cursor), columns=["_id", "count"])
grouped_df = grouped_df.rename(columns={"_id": "date"})
grouped_df

Unnamed: 0,date,count
0,2019-01-08,300
1,2019-01-09,2300
2,2019-01-10,100
3,2019-01-13,4000
4,2019-01-14,9000
...,...,...
363,2020-01-15,8700
364,2020-01-16,8700
365,2020-01-17,8800
366,2020-01-18,8700


## This is the number of entries we have per day in the `topgames` Mongo collection.  Each query returns the current top 100 games being played, and the query is run every 15 minutes.

In [11]:
# Bar chart of topgames record counts per day.
# Reference: https://plot.ly/python/bar-charts/

data = [
    Bar(x=grouped_df['date'], y=grouped_df['count']),
]

layout = Layout(
    title='Distribution of Records Per Day for topgames',
    xaxis={'title': 'Date'},
    yaxis={'title': 'Records Count'},
)

fig = Figure(data=data, layout=layout)
iplot(fig)

In [12]:
# Per-day counts of opencritic records, split by whether the record carries a
# usable steamId. The two $match filters are complements of each other:
# "present and not null" versus "missing or null".
steam_cursor = db['opencritic'].aggregate([
    {"$match": {"steamId": {"$exists": True, "$ne": None}}},
    {"$group": {"_id": {"$dateToString": {"format": "%Y-%m-%d", "date": "$date"}},
                "count": {"$sum": 1}}},
    {"$sort": {"_id": 1}},
])

# Declaring the columns keeps both of them present when the aggregation
# returns no rows, and renaming by the "_id" key instead of by position
# avoids the IndexError that columns[0] raises on an empty frame.
steam_grouped_df = pd.DataFrame(list(steam_cursor), columns=["_id", "count"])
steam_grouped_df = steam_grouped_df.rename(columns={"_id": "date"})

nonsteam_cursor = db['opencritic'].aggregate([
    {"$match": {"$or": [{"steamId": {"$exists": False}}, {"steamId": None}]}},
    {"$group": {"_id": {"$dateToString": {"format": "%Y-%m-%d", "date": "$date"}},
                "count": {"$sum": 1}}},
    {"$sort": {"_id": 1}},
])

nonsteam_grouped_df = pd.DataFrame(list(nonsteam_cursor), columns=["_id", "count"])
nonsteam_grouped_df = nonsteam_grouped_df.rename(columns={"_id": "date"})

print(steam_grouped_df)
print(nonsteam_grouped_df)

          date  count
0   2020-01-04    134
1   2020-01-05    342
2   2020-01-06    306
3   2020-01-07    341
4   2020-01-08    314
5   2020-01-09    322
6   2020-01-10    330
7   2020-01-11    310
8   2020-01-12    341
9   2020-01-13    308
10  2020-01-14    347
11  2020-01-15    343
12  2020-01-16    328
13  2020-01-17    365
14  2020-01-18    339
15  2020-01-19     34
          date  count
0   2020-01-04     78
1   2020-01-05    187
2   2020-01-06    222
3   2020-01-07    191
4   2020-01-08    217
5   2020-01-09    208
6   2020-01-10    199
7   2020-01-11    227
8   2020-01-12    207
9   2020-01-13    221
10  2020-01-14    204
11  2020-01-15    208
12  2020-01-16    240
13  2020-01-17    198
14  2020-01-18    216
15  2020-01-19     16


## This is the breakdown of OpenCritic data we have by last accessed/updated date.

In [13]:
# Stacked bar chart: one trace per steamId group, stacked per day.
# Reference: https://plot.ly/python/bar-charts/#stacked-bar-chart

steamTrace = Bar(x=steam_grouped_df['date'],
                 y=steam_grouped_df['count'],
                 name='Games w/steamId')
nonSteamTrace = Bar(x=nonsteam_grouped_df['date'],
                    y=nonsteam_grouped_df['count'],
                    name='Games without steamId')

data = [steamTrace, nonSteamTrace]

layout = Layout(
    title='Distribution of OpenCritic Records Per Day Grouped By steamId',
    xaxis={'title': 'Date'},
    yaxis={'title': 'Records Count'},
    barmode='stack',
)

fig = Figure(data=data, layout=layout)
iplot(fig)

## These are the counts of Twitch records per day, grouped by whether the record is tied to Steam (has a `steamId`).

In [14]:
# Per-day counts of twitchhistorical records, split by whether the document
# has a steamId field at all.
# NOTE(review): unlike the opencritic cell, a document with steamId: null is
# counted as "with steamId" here because only $exists is tested; confirm that
# this is intended.
steam_cursor = db['twitchhistorical'].aggregate([
    {"$match": {"steamId": {"$exists": True}}},
    {"$group": {"_id": {"$dateToString": {"format": "%Y-%m-%d", "date": "$date"}},
                "count": {"$sum": 1}}},
    {"$sort": {"_id": 1}},
])

# Declaring the columns keeps both of them present when the aggregation
# returns no rows, and renaming by the "_id" key instead of by position
# avoids the IndexError that columns[0] raises on an empty frame.
steam_grouped_df = pd.DataFrame(list(steam_cursor), columns=["_id", "count"])
steam_grouped_df = steam_grouped_df.rename(columns={"_id": "date"})

nonsteam_cursor = db['twitchhistorical'].aggregate([
    {"$match": {"steamId": {"$exists": False}}},
    {"$group": {"_id": {"$dateToString": {"format": "%Y-%m-%d", "date": "$date"}},
                "count": {"$sum": 1}}},
    {"$sort": {"_id": 1}},
])

nonsteam_grouped_df = pd.DataFrame(list(nonsteam_cursor), columns=["_id", "count"])
nonsteam_grouped_df = nonsteam_grouped_df.rename(columns={"_id": "date"})

# Stacked bar chart: one trace per steamId group, stacked per day.
# Reference: https://plot.ly/python/bar-charts/#stacked-bar-chart

steamTrace = Bar(
    x=steam_grouped_df['date'],
    y=steam_grouped_df['count'],
    name='Games w/steamId'
)
nonSteamTrace = Bar(
    x=nonsteam_grouped_df['date'],
    y=nonsteam_grouped_df['count'],
    name='Games without steamId'
)

data = [steamTrace, nonSteamTrace]

layout = Layout(
        title='Distribution of Twitch Records Per Day Grouped By steamId',
        xaxis=dict(
            title='Date'
        ),
        yaxis=dict(
            title='Records Count'
        ),
        barmode='stack'
    )

fig = Figure(data=data, layout=layout)

iplot(fig)

## Storage and disk usage information

In [15]:
# Print the data size reported by collstats for each collection, followed by
# the dataSize that dbstats reports for the whole database.
for collection in db.list_collection_names():
    stats = db.command("collstats", str(collection))
    size_text = common.sizeof_fmt(stats['size'])
    print(collection + " size: " + size_text)

print("--------------------")
stats = db.command("dbstats")
total_text = common.sizeof_fmt(stats['dataSize'])
print("Total Database Size: " + total_text)

twitchhistorical size: 1.4 GiB
apps size: 617.8 MiB
pricehistory size: 943.7 MiB
topgames size: 523.7 MiB
steamusers size: 18.1 MiB
opencritic size: 112.5 MiB
--------------------
Total Database Size: 3.5 GiB


## Check index size across collections

In [16]:
# Print each collection's totalIndexSize from collstats and keep a running
# sum so the combined index footprint can be reported at the end.
total_index = 0
for collection in db.list_collection_names():
    stats = db.command("collstats", str(collection))
    index_bytes = stats['totalIndexSize']
    total_index += index_bytes
    print(collection + " size: " + common.sizeof_fmt(index_bytes))

print("--------------------")
print("Total Index Size: " + common.sizeof_fmt(total_index))

twitchhistorical size: 119.6 MiB
apps size: 8.5 MiB
pricehistory size: 279.6 MiB
topgames size: 54.3 MiB
steamusers size: 7.6 MiB
opencritic size: 528.0 KiB
--------------------
Total Index Size: 470.1 MiB
