In [23]:
import sys, os
from pymongo import MongoClient

from plotly.offline import init_notebook_mode, iplot
from plotly.graph_objs import *

# https://stackoverflow.com/questions/41323423/plotly-inside-jupyter-notebook-python
# One-time plotly setup for this notebook; the iplot() calls in later cells rely on it.
init_notebook_mode(connected=True) # initiate notebook for offline plot

import pandas as pd

In [24]:
# Make the parent directory importable so the project-level modules resolve.
# Guarded so that re-running this cell does not keep appending duplicates.
if ".." not in sys.path:
    sys.path.append("..")
import config # ../config.py
import common # ../common.py

# run on dellxps
# The MongoDB host defaults to the address used for that run; set the
# MONGODB_IP environment variable to point the notebook at another server
# without editing this cell.
config.mongodb_ip = os.environ.get("MONGODB_IP", "192.168.1.114")

# Shared handles used by every query cell below.
client = MongoClient(host=config.mongodb_ip, port=config.mongodb_port)
db = client['steam']

## Failed appids excluded from future attempts or analysis

In [25]:
# Apps whose failureCount is 3 or more; only appid and name are fetched.
failed_filter = {"failureCount": {"$gte": 3}}
failed_projection = {"appid": 1, "name": 1, "_id": False}

failed_appids_cursor = db['apps'].find(failed_filter, failed_projection)
failed_appids_df = pd.DataFrame(list(failed_appids_cursor))
failed_appids_df

Unnamed: 0,name,appid
0,Hammer Heads Deluxe Demo,3402
1,Rocket Mania Deluxe Demo,3442
2,Venice Demo,3492
3,Darwinia_ESRB,5803
4,Larva Mortus Demo,11350
...,...,...
19322,Yakyosho Demo,1983860
19323,KLONOA Digital Artbook & Soundtrack,1913570
19324,Portal Versus SDK,2080260
19325,Lightphobe Dedicated Server,2093570


In [26]:
# Count apps per day of their updated_date, leaving out apps whose
# failureCount is 3 or more (apps without a failureCount are kept).
# NOTE(review): apps without an updated_date presumably group under a null
# _id, which would be the blank-date row in the output - confirm.
updated_counts_cursor = db['apps'].aggregate([
    {"$match": {"$or": [{"failureCount": {"$lt": 3}},
                        {"failureCount": {"$exists": False}}]}},
    {"$group": {"_id": {"$dateToString": {"format": "%Y-%m-%d", "date": "$updated_date"}},
                "count": {"$sum": 1}}},
    {"$sort": {"_id": 1}}
])

# Naming the columns explicitly keeps the frame (and the rename) valid even
# when the aggregation returns no documents.
grouped_df = (
    pd.DataFrame(list(updated_counts_cursor), columns=['_id', 'count'])
    .rename(columns={'_id': 'updated_date'})
)
grouped_df

Unnamed: 0,updated_date,count
0,,2
1,2022-06-08,27
2,2022-06-09,3168
3,2022-06-10,3307
4,2022-06-11,3314
5,2022-06-12,3003
6,2022-06-13,3370
7,2022-06-14,3054
8,2022-06-15,3544
9,2022-06-16,3457


## A large number of old AppID entries means `refreshsteam.py` needs to be run.  This chart excludes consistently failed appids.

In [27]:
# https://plot.ly/python/bar-charts/

# One bar per last-updated date; bar height is the number of apps.
data = [Bar(x=grouped_df['updated_date'], y=grouped_df['count'])]

layout = Layout(
    title='AppIDs By Last Updated Date',
    xaxis={'title': 'Date'},
    yaxis={'title': 'Count'},
)

fig = Figure(data=data, layout=layout)
iplot(fig)

In [28]:
# Count pricehistory records per calendar day, oldest day first.
# The empty $match keeps every document.
pricehistory_cursor = db['pricehistory'].aggregate([
    {"$match": {}},
    {"$group": {"_id": {"$dateToString": {"format": "%Y-%m-%d", "date": "$date"}},
                "count": {"$sum": 1}}},
    {"$sort": {"_id": 1}}
])

# Naming the columns explicitly keeps the frame (and the rename) valid even
# when the aggregation returns no documents.
grouped_df = (
    pd.DataFrame(list(pricehistory_cursor), columns=['_id', 'count'])
    .rename(columns={'_id': 'date'})
)
grouped_df

Unnamed: 0,date,count
0,2018-09-04,8215
1,2018-09-05,43441
2,2018-09-06,37421
3,2018-09-07,23655
4,2018-09-08,37188
...,...,...
1309,2022-07-17,25816
1310,2022-07-18,26902
1311,2022-07-19,28481
1312,2022-07-20,27931


## This is historical data from the `pricehistory` Mongo collection.  This is updated via the `updatepricehistory.py` script.

In [29]:
# https://plot.ly/python/bar-charts/

# One bar per day; bar height is the number of price-history records.
data = [Bar(x=grouped_df['date'], y=grouped_df['count'])]

layout = Layout(
    title='Distribution of Price History',
    xaxis={'title': 'Date'},
    yaxis={'title': 'Records Count'},
)

fig = Figure(data=data, layout=layout)
iplot(fig)

In [30]:
# Count steamusers records per calendar day, oldest day first.
# The empty $match keeps every document.
steamusers_cursor = db['steamusers'].aggregate([
    {"$match": {}},
    {"$group": {"_id": {"$dateToString": {"format": "%Y-%m-%d", "date": "$date"}},
                "count": {"$sum": 1}}},
    {"$sort": {"_id": 1}}
])

# Naming the columns explicitly keeps the frame (and the rename) valid even
# when the aggregation returns no documents.
grouped_df = (
    pd.DataFrame(list(steamusers_cursor), columns=['_id', 'count'])
    .rename(columns={'_id': 'date'})
)
grouped_df

Unnamed: 0,date,count
0,2018-12-19,119
1,2018-12-20,340
2,2018-12-21,751
3,2018-12-22,1021
4,2018-12-23,932
...,...,...
1278,2022-07-17,651
1279,2022-07-18,628
1280,2022-07-19,629
1281,2022-07-20,550


## This is the number of records we have per day for the `steamusers` Mongo collection.  The collection is created using the `steamusers.py` script.

In [31]:
# https://plot.ly/python/bar-charts/

# One bar per day; bar height is the number of steamusers records.
data = [Bar(x=grouped_df['date'], y=grouped_df['count'])]

layout = Layout(
    title='Distribution of Records Per Day for steamusers',
    xaxis={'title': 'Date'},
    yaxis={'title': 'Records Count'},
)

fig = Figure(data=data, layout=layout)
iplot(fig)

In [32]:
# Count topgames entries per calendar day, oldest day first.
# The empty $match keeps every document.
topgames_cursor = db['topgames'].aggregate([
    {"$match": {}},
    {"$group": {"_id": {"$dateToString": {"format": "%Y-%m-%d", "date": "$date"}},
                "count": {"$sum": 1}}},
    {"$sort": {"_id": 1}}
])

# Naming the columns explicitly keeps the frame (and the rename) valid even
# when the aggregation returns no documents.
grouped_df = (
    pd.DataFrame(list(topgames_cursor), columns=['_id', 'count'])
    .rename(columns={'_id': 'date'})
)
grouped_df

Unnamed: 0,date,count
0,2019-01-08,300
1,2019-01-09,2300
2,2019-01-10,100
3,2019-01-13,4000
4,2019-01-14,9000
...,...,...
1238,2022-07-17,8300
1239,2022-07-18,8500
1240,2022-07-19,8700
1241,2022-07-20,8700


## This is the number of entries we have per day for the `topgames` Mongo collection.  Each query returns the current top 100 games being played.  The query is run every 15 minutes.

In [33]:
# https://plot.ly/python/bar-charts/

# One bar per day; bar height is the number of topgames entries.
data = [Bar(x=grouped_df['date'], y=grouped_df['count'])]

layout = Layout(
    title='Distribution of Records Per Day for topgames',
    xaxis={'title': 'Date'},
    yaxis={'title': 'Records Count'},
)

fig = Figure(data=data, layout=layout)
iplot(fig)

In [34]:
# Count apps of type "game" or "dlc" per day of reviews.last_updated,
# oldest day first.
# NOTE(review): apps without reviews.last_updated presumably group under a
# null _id, which would be the blank-date row in the output - confirm whether
# that row should be shown or filtered out.
reviews_cursor = db['apps'].aggregate([
    {"$match": {"type": {"$in": ["game", "dlc"]}}},
    {"$group": {"_id": {"$dateToString": {"format": "%Y-%m-%d", "date": "$reviews.last_updated"}},
                "count": {"$sum": 1}}},
    {"$sort": {"_id": 1}}
])

# Naming the columns explicitly keeps the frame (and the rename) valid even
# when the aggregation returns no documents.
grouped_df = (
    pd.DataFrame(list(reviews_cursor), columns=['_id', 'count'])
    .rename(columns={'_id': 'date'})
)
grouped_df

Unnamed: 0,date,count
0,,26536
1,2021-03-07,4
2,2021-03-08,3
3,2021-03-15,1
4,2021-03-20,1
...,...,...
236,2022-07-17,2050
237,2022-07-18,2100
238,2022-07-19,2200
239,2022-07-20,2158


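## This is the number of Steam games and DLC for which we have review information from Steam, grouped by the date the reviews were last updated (the blank date row counts apps without a `reviews.last_updated` value)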

In [35]:
# https://plot.ly/python/bar-charts/

# One bar per reviews.last_updated day; bar height is the number of apps.
data = [Bar(x=grouped_df['date'], y=grouped_df['count'])]

layout = Layout(
    title='Distribution of Steam game reviews per day',
    xaxis={'title': 'Date'},
    yaxis={'title': 'Records Count'},
)

fig = Figure(data=data, layout=layout)
iplot(fig)

In [36]:
# Shared tail of both pipelines: count records per calendar day, oldest first.
opencritic_per_day_stages = [
    {"$group": {"_id": {"$dateToString": {"format": "%Y-%m-%d", "date": "$date"}},
                "count": {"$sum": 1}}},
    {"$sort": {"_id": 1}},
]

# OpenCritic records that carry a non-null steamId.
steam_cursor = db['opencritic'].aggregate(
    [{"$match": {"steamId": {"$exists": True, "$ne": None}}}]
    + opencritic_per_day_stages
)
# Naming the columns explicitly keeps the frame (and the rename) valid even
# when the aggregation returns no documents.
steam_grouped_df = (
    pd.DataFrame(list(steam_cursor), columns=['_id', 'count'])
    .rename(columns={'_id': 'date'})
)

# OpenCritic records whose steamId is missing or null.
nonsteam_cursor = db['opencritic'].aggregate(
    [{"$match": {"$or": [{"steamId": {"$exists": False}}, {"steamId": None}]}}]
    + opencritic_per_day_stages
)
nonsteam_grouped_df = (
    pd.DataFrame(list(nonsteam_cursor), columns=['_id', 'count'])
    .rename(columns={'_id': 'date'})
)

print(steam_grouped_df)
print(nonsteam_grouped_df)

          date  count
0   2022-01-30    180
1   2022-01-31    216
2   2022-02-01    219
3   2022-02-02    210
4   2022-02-03    234
5   2022-02-04    148
6   2022-02-05    178
7   2022-02-06    247
8   2022-02-07    230
9   2022-02-08    228
10  2022-02-09    248
11  2022-02-10    219
12  2022-02-11    200
13  2022-02-12    239
14  2022-02-13     66
15  2022-02-14    214
16  2022-02-15    145
17  2022-02-16    207
18  2022-02-17    239
19  2022-02-18    221
20  2022-02-19    124
21  2022-02-22    197
22  2022-02-23    198
23  2022-02-24    203
          date  count
0   2021-09-05      1
1   2022-01-26      1
2   2022-01-30    224
3   2022-01-31    240
4   2022-02-01    309
5   2022-02-02    270
6   2022-02-03    294
7   2022-02-04    308
8   2022-02-05    350
9   2022-02-06    281
10  2022-02-07    298
11  2022-02-08    300
12  2022-02-09    280
13  2022-02-10    309
14  2022-02-11    328
15  2022-02-12    289
16  2022-02-13    102
17  2022-02-14    290
18  2022-02-15    237
19  2022-0

## This is the breakdown of OpenCritic data we have by last accessed/updated date.

In [37]:
# https://plot.ly/python/bar-charts/#stacked-bar-chart

# One trace per group; the layout's barmode='stack' stacks them per date.
steamTrace = Bar(name='Games w/steamId',
                 x=steam_grouped_df['date'],
                 y=steam_grouped_df['count'])
nonSteamTrace = Bar(name='Games without steamId',
                    x=nonsteam_grouped_df['date'],
                    y=nonsteam_grouped_df['count'])
data = [steamTrace, nonSteamTrace]

layout = Layout(
    title='Distribution of OpenCritic Records Per Day Grouped By steamId',
    barmode='stack',
    xaxis={'title': 'Date'},
    yaxis={'title': 'Records Count'},
)

fig = Figure(data=data, layout=layout)
iplot(fig)

## These are counts of Twitch records by day, grouped by whether the record has a `steamId`

In [38]:
# Shared tail of both pipelines: count records per calendar day, oldest first.
twitch_per_day_stages = [
    {"$group": {"_id": {"$dateToString": {"format": "%Y-%m-%d", "date": "$date"}},
                "count": {"$sum": 1}}},
    {"$sort": {"_id": 1}},
]

# Twitch records that have a steamId field.
steam_cursor = db['twitchhistorical'].aggregate(
    [{"$match": {"steamId": {"$exists": True}}}] + twitch_per_day_stages
)
# Naming the columns explicitly keeps the frame (and the rename) valid even
# when the aggregation returns no documents.
steam_grouped_df = (
    pd.DataFrame(list(steam_cursor), columns=['_id', 'count'])
    .rename(columns={'_id': 'date'})
)

# Twitch records without a steamId field.
nonsteam_cursor = db['twitchhistorical'].aggregate(
    [{"$match": {"steamId": {"$exists": False}}}] + twitch_per_day_stages
)
nonsteam_grouped_df = (
    pd.DataFrame(list(nonsteam_cursor), columns=['_id', 'count'])
    .rename(columns={'_id': 'date'})
)

# https://plot.ly/python/bar-charts/#stacked-bar-chart

# One trace per group; the layout's barmode='stack' stacks them per date.
steamTrace = Bar(name='Games w/steamId',
                 x=steam_grouped_df['date'],
                 y=steam_grouped_df['count'])
nonSteamTrace = Bar(name='Games without steamId',
                    x=nonsteam_grouped_df['date'],
                    y=nonsteam_grouped_df['count'])
data = [steamTrace, nonSteamTrace]

layout = Layout(
    title='Distribution of Twitch Records Per Day Grouped By steamId',
    barmode='stack',
    xaxis={'title': 'Date'},
    yaxis={'title': 'Records Count'},
)

fig = Figure(data=data, layout=layout)
iplot(fig)

In [39]:
# Sum the bytes field of bandwidth records per calendar day, oldest day first.
# The empty $match keeps every document.
bandwidth_cursor = db['bandwidth'].aggregate([
    {"$match": {}},
    {"$group": {"_id": {"$dateToString": {"format": "%Y-%m-%d", "date": "$date"}},
                "bytes": {"$sum": "$bytes"}}},
    {"$sort": {"_id": 1}}
])

# Naming the columns explicitly keeps the frame (and the rename) valid even
# when the aggregation returns no documents.
grouped_df = (
    pd.DataFrame(list(bandwidth_cursor), columns=['_id', 'bytes'])
    .rename(columns={'_id': 'date'})
)
grouped_df

Unnamed: 0,date,bytes
0,2020-08-24,47308192
1,2020-08-25,60355637
2,2020-08-26,61575373
3,2020-08-27,62138445
4,2020-08-28,65542520
...,...,...
684,2022-07-17,61918417
685,2022-07-18,53812176
686,2022-07-19,64666066
687,2022-07-20,59081721


## Bandwidth used

In [40]:
# https://plot.ly/python/bar-charts/

# One bar per day; bar height is the summed bytes for that day.
data = [Bar(x=grouped_df['date'], y=grouped_df['bytes'])]

layout = Layout(
    title='Data downloaded per day',
    xaxis={'title': 'Date'},
    yaxis={'title': 'Bytes'},
)

fig = Figure(data=data, layout=layout)
iplot(fig)

## Storage and disk usage information

In [41]:
# Print the collstats "size" of every collection, then the dbstats "dataSize".
for collection in db.list_collection_names():
    stats = db.command("collstats", collection)
    print(collection, " size: ", common.sizeof_fmt(stats['size']), sep="")

print("--------------------")
stats = db.command("dbstats")
print("Total Database Size: ", common.sizeof_fmt(stats['dataSize']), sep="")

twitchhistorical size: 7.5 GiB
pricehistory size: 3.0 GiB
steamusers size: 59.9 MiB
bandwidth size: 6.3 MiB
topgames size: 1.8 GiB
opencritic size: 200.8 MiB
apps size: 1.4 GiB
--------------------
Total Database Size: 13.9 GiB


## Check index size across collections

In [42]:
# Print each collection's totalIndexSize and accumulate the overall total.
total_index = 0
for collection in db.list_collection_names():
    stats = db.command("collstats", collection)
    index_bytes = stats['totalIndexSize']
    total_index += index_bytes
    print(collection, " size: ", common.sizeof_fmt(index_bytes), sep="")

print("--------------------")
print("Total Index Size: ", common.sizeof_fmt(total_index), sep="")

twitchhistorical size: 826.6 MiB
pricehistory size: 1.2 GiB
steamusers size: 29.7 MiB
bandwidth size: 4.2 MiB
topgames size: 279.5 MiB
opencritic size: 648.0 KiB
apps size: 13.7 MiB
--------------------
Total Index Size: 2.3 GiB
