In [17]:
import sys, os
from pymongo import MongoClient

from plotly.offline import init_notebook_mode, iplot
from plotly.graph_objs import *

# https://stackoverflow.com/questions/41323423/plotly-inside-jupyter-notebook-python
init_notebook_mode(connected=True) # initiate notebook for offline plot

import pandas as pd

In [18]:
# Make the parent directory importable so config.py / common.py resolve.
# Guarded so that re-running this cell does not keep appending duplicates.
if ".." not in sys.path:
    sys.path.append("..")
import config # ../config.py
import common # ../common.py

# MongoDB hosts used so far:
#   Asus laptop: 192.168.1.124
#   Dell:        192.168.1.171 (default)
# Set the STEAM_MONGODB_IP environment variable to point the notebook at a
# different host without editing this cell.
config.mongodb_ip = os.environ.get("STEAM_MONGODB_IP", "192.168.1.171")

# Every later cell reads from the `steam` database through `db`.
client = MongoClient(host=config.mongodb_ip, port=config.mongodb_port)
db = client['steam']

## Failed appids excluded from future attempts or analysis

In [19]:
# Apps whose failureCount is 3 or more; project only appid and name.
failed_filter = {"failureCount": {"$gte": 3}}
failed_fields = {"appid": 1, "name": 1, "_id": False}
failed_cursor = db['apps'].find(failed_filter, failed_fields)

failed_appids_df = pd.DataFrame(list(failed_cursor))
failed_appids_df

Unnamed: 0,appid,name
0,12630,Legend: Hand of God
1,15310,The Settlers: Heritage of Kings
2,19930,The Settlers: Rise of an Empire Gold Edition
3,23330,The Last Remnant Demo
4,33220,Tom Clancy's Splinter Cell: Conviction
5,33910,Arma 2
6,34000,Football Manager 2010
7,41010,Serious Sam HD: The Second Encounter
8,47900,Dragon Age II
9,55010,Flotilla Demo


In [20]:
# Count apps per calendar day of `updated_date`. Apps whose failureCount has
# reached 3 are left out; apps with no failureCount field at all are kept.
updated_pipeline = [
    {"$match":
        {"$or": [{"failureCount": {"$lt": 3}}, {"failureCount": {"$exists": False}}]}
    },
    {"$group": {"_id": {"$dateToString": {"format": "%Y-%m-%d", "date": "$updated_date"}},
                "count": {"$sum": 1}}},
    {"$sort": {"_id": 1}},
]
updated_rows = list(db['apps'].aggregate(updated_pipeline))

# Naming the columns up front keeps the frame well-formed when the aggregation
# returns no rows (the old positional `columns[0]` lookup raised IndexError
# there) and makes the rename independent of key order.
grouped_df = (
    pd.DataFrame(updated_rows, columns=["_id", "count"])
    .rename(columns={"_id": "updated_date"})
)
grouped_df

Unnamed: 0,updated_date,count
0,,1129
1,2019-11-26,907
2,2019-11-27,3309
3,2019-11-28,3873
4,2019-11-29,3333
5,2019-11-30,3350
6,2019-12-01,3349
7,2019-12-02,3846
8,2019-12-03,3354
9,2019-12-04,3436


## A large number of old AppID entries means `refreshsteam.py` needs to be run.  These counts exclude consistently failed appids.

In [21]:
# Bar-chart reference: https://plot.ly/python/bar-charts/

# One bar per `updated_date` value, bar height = number of apps in that group.
data = [Bar(x=grouped_df['updated_date'], y=grouped_df['count'])]

layout = Layout(
    title='AppIDs By Last Updated Date',
    xaxis={'title': 'Date'},
    yaxis={'title': 'Count'},
)

fig = Figure(data=data, layout=layout)
iplot(fig)

In [22]:
# Count `pricehistory` records per calendar day of their `date` field.
# (The former empty `$match` stage selected every document, so it is omitted.)
pricehistory_pipeline = [
    {"$group": {"_id": {"$dateToString": {"format": "%Y-%m-%d", "date": "$date"}},
                "count": {"$sum": 1}}},
    {"$sort": {"_id": 1}},
]
pricehistory_rows = list(db['pricehistory'].aggregate(pricehistory_pipeline))

# Explicit columns keep the frame usable when the collection is empty; the old
# positional `columns[0]` rename raised IndexError in that case.
grouped_df = (
    pd.DataFrame(pricehistory_rows, columns=["_id", "count"])
    .rename(columns={"_id": "date"})
)
grouped_df

Unnamed: 0,date,count
0,2018-09-04,8215
1,2018-09-05,43441
2,2018-09-06,37421
3,2018-09-07,23655
4,2018-09-08,37188
5,2018-09-10,2963
6,2018-09-11,45999
7,2018-09-12,20210
8,2018-09-13,269
9,2018-09-14,37390


## This is historical data from the `pricehistory` Mongo collection.  This is updated via the `updatepricehistory.py` script.

In [23]:
# Bar-chart reference: https://plot.ly/python/bar-charts/

# Daily record counts from the price-history aggregation.
price_bars = Bar(x=grouped_df['date'], y=grouped_df['count'])
data = [price_bars]

layout = Layout(
    title='Distribution of Price History',
    xaxis={'title': 'Date'},
    yaxis={'title': 'Records Count'},
)

fig = Figure(data=data, layout=layout)
iplot(fig)

In [24]:
# Count `steamusers` records per calendar day of their `date` field.
# (The former empty `$match` stage selected every document, so it is omitted.)
steamusers_pipeline = [
    {"$group": {"_id": {"$dateToString": {"format": "%Y-%m-%d", "date": "$date"}},
                "count": {"$sum": 1}}},
    {"$sort": {"_id": 1}},
]
steamusers_rows = list(db['steamusers'].aggregate(steamusers_pipeline))

# Explicit columns keep the frame usable when the collection is empty; the old
# positional `columns[0]` rename raised IndexError in that case.
grouped_df = (
    pd.DataFrame(steamusers_rows, columns=["_id", "count"])
    .rename(columns={"_id": "date"})
)
grouped_df

Unnamed: 0,date,count
0,2018-12-19,119
1,2018-12-20,340
2,2018-12-21,751
3,2018-12-22,1021
4,2018-12-23,932
5,2018-12-24,980
6,2018-12-25,938
7,2018-12-26,924
8,2018-12-27,1106
9,2018-12-28,1287


## This is the number of records we have per day for the `steamusers` Mongo collection.  The collection is created using the `steamusers.py` script.

In [25]:
# Bar-chart reference: https://plot.ly/python/bar-charts/

# Daily record counts from the steamusers aggregation.
steamusers_bars = Bar(x=grouped_df['date'], y=grouped_df['count'])
data = [steamusers_bars]

layout = Layout(
    title='Distribution of Records Per Day for steamusers',
    xaxis={'title': 'Date'},
    yaxis={'title': 'Records Count'},
)

fig = Figure(data=data, layout=layout)
iplot(fig)

In [26]:
# Count `topgames` records per calendar day of their `date` field.
# (The former empty `$match` stage selected every document, so it is omitted.)
topgames_pipeline = [
    {"$group": {"_id": {"$dateToString": {"format": "%Y-%m-%d", "date": "$date"}},
                "count": {"$sum": 1}}},
    {"$sort": {"_id": 1}},
]
topgames_rows = list(db['topgames'].aggregate(topgames_pipeline))

# Explicit columns keep the frame usable when the collection is empty; the old
# positional `columns[0]` rename raised IndexError in that case.
grouped_df = (
    pd.DataFrame(topgames_rows, columns=["_id", "count"])
    .rename(columns={"_id": "date"})
)
grouped_df

Unnamed: 0,date,count
0,2019-01-08,300
1,2019-01-09,2300
2,2019-01-10,100
3,2019-01-13,4000
4,2019-01-14,9000
5,2019-01-15,8400
6,2019-01-16,9200
7,2019-01-17,8500
8,2019-01-18,9100
9,2019-01-19,5200


## This is the number of entries we have per day for the `topgames` Mongo collection.  Each query returns the current top 100 games being played.  The query is run every 15 minutes.

In [27]:
# Bar-chart reference: https://plot.ly/python/bar-charts/

# Daily record counts from the topgames aggregation.
topgames_bars = Bar(x=grouped_df['date'], y=grouped_df['count'])
data = [topgames_bars]

layout = Layout(
    title='Distribution of Records Per Day for topgames',
    xaxis={'title': 'Date'},
    yaxis={'title': 'Records Count'},
)

fig = Figure(data=data, layout=layout)
iplot(fig)

In [28]:
# Count apps per calendar day of `reviews.last_updated`.
# (The former empty `$match` stage selected every document, so it is omitted.)
# NOTE(review): every app is grouped, so apps lacking `reviews.last_updated`
# appear to land in the blank-date row of the output. Confirm whether they
# should be filtered out before charting.
reviews_pipeline = [
    {"$group": {"_id": {"$dateToString": {"format": "%Y-%m-%d", "date": "$reviews.last_updated"}},
                "count": {"$sum": 1}}},
    {"$sort": {"_id": 1}},
]
reviews_rows = list(db['apps'].aggregate(reviews_pipeline))

# Explicit columns keep the frame usable when the collection is empty; the old
# positional `columns[0]` rename raised IndexError in that case.
grouped_df = (
    pd.DataFrame(reviews_rows, columns=["_id", "count"])
    .rename(columns={"_id": "date"})
)
grouped_df

Unnamed: 0,date,count
0,,91371
1,2020-07-25,51


## This is the number of Steam games for which we have review information from Steam, grouped by when that information was last updated

In [29]:
# Bar-chart reference: https://plot.ly/python/bar-charts/

# App counts per `reviews.last_updated` day from the preceding aggregation.
review_bars = Bar(x=grouped_df['date'], y=grouped_df['count'])
data = [review_bars]

layout = Layout(
    title='Distribution of Steam game reviews per day',
    xaxis={'title': 'Date'},
    yaxis={'title': 'Records Count'},
)

fig = Figure(data=data, layout=layout)
iplot(fig)

In [30]:
# Both queries bucket `opencritic` records by calendar day of `date`; they
# differ only in which documents are selected first.
opencritic_per_day = [
    {"$group": {"_id": {"$dateToString": {"format": "%Y-%m-%d", "date": "$date"}},
                "count": {"$sum": 1}}},
    {"$sort": {"_id": 1}},
]

# Records that carry a non-null steamId.
steam_match = {"$match": {"steamId": {"$exists": True, "$ne": None}}}
steam_rows = list(db['opencritic'].aggregate([steam_match] + opencritic_per_day))

# Records whose steamId is absent or null.
nonsteam_match = {"$match": {"$or": [{"steamId": {"$exists": False}}, {"steamId": None}]}}
nonsteam_rows = list(db['opencritic'].aggregate([nonsteam_match] + opencritic_per_day))

# Explicit columns keep each frame usable when its query returns no rows; the
# old positional `columns[0]` rename raised IndexError in that case.
steam_grouped_df = (
    pd.DataFrame(steam_rows, columns=["_id", "count"])
    .rename(columns={"_id": "date"})
)
nonsteam_grouped_df = (
    pd.DataFrame(nonsteam_rows, columns=["_id", "count"])
    .rename(columns={"_id": "date"})
)

print(steam_grouped_df)
print(nonsteam_grouped_df)

          date  count
0   2019-12-04     11
1   2019-12-05    351
2   2019-12-06    345
3   2019-12-07    353
4   2019-12-08    345
5   2019-12-09    336
6   2019-12-10    357
7   2019-12-11    325
8   2019-12-12    329
9   2019-12-13    330
10  2019-12-14    336
11  2019-12-15    336
12  2019-12-16    347
13  2019-12-17    338
14  2019-12-18    333
          date  count
0   2019-12-04     11
1   2019-12-05    209
2   2019-12-06    213
3   2019-12-07    195
4   2019-12-08    204
5   2019-12-09    215
6   2019-12-10    202
7   2019-12-11    226
8   2019-12-12    200
9   2019-12-13    224
10  2019-12-14    214
11  2019-12-15    216
12  2019-12-16    208
13  2019-12-17    214
14  2019-12-18    193


## This is the breakdown of OpenCritic data we have by last accessed/updated date.

In [31]:
# Stacked-bar reference: https://plot.ly/python/bar-charts/#stacked-bar-chart

# One trace per group; `barmode='stack'` below piles them up for each date.
steamTrace = Bar(x=steam_grouped_df['date'],
                 y=steam_grouped_df['count'],
                 name='Games w/steamId')
nonSteamTrace = Bar(x=nonsteam_grouped_df['date'],
                    y=nonsteam_grouped_df['count'],
                    name='Games without steamId')

data = [steamTrace, nonSteamTrace]

layout = Layout(
    title='Distribution of OpenCritic Records Per Day Grouped By steamId',
    xaxis={'title': 'Date'},
    yaxis={'title': 'Records Count'},
    barmode='stack',
)

fig = Figure(data=data, layout=layout)
iplot(fig)

## These are counts of Twitch records per day, grouped by whether the record has a `steamId`

In [32]:
# Both queries bucket `twitchhistorical` records by calendar day of `date`;
# they differ only in whether the record has a steamId field.
twitch_per_day = [
    {"$group": {"_id": {"$dateToString": {"format": "%Y-%m-%d", "date": "$date"}},
                "count": {"$sum": 1}}},
    {"$sort": {"_id": 1}},
]

steam_match = {"$match": {"steamId": {"$exists": True}}}
steam_rows = list(db['twitchhistorical'].aggregate([steam_match] + twitch_per_day))

nonsteam_match = {"$match": {"steamId": {"$exists": False}}}
nonsteam_rows = list(db['twitchhistorical'].aggregate([nonsteam_match] + twitch_per_day))

# Explicit columns keep each frame usable when its query returns no rows; the
# old positional `columns[0]` rename raised IndexError in that case.
steam_grouped_df = (
    pd.DataFrame(steam_rows, columns=["_id", "count"])
    .rename(columns={"_id": "date"})
)
nonsteam_grouped_df = (
    pd.DataFrame(nonsteam_rows, columns=["_id", "count"])
    .rename(columns={"_id": "date"})
)

# https://plot.ly/python/bar-charts/#stacked-bar-chart

# One trace per group; `barmode='stack'` below piles them up for each date.
steamTrace = Bar(
    x=steam_grouped_df['date'],
    y=steam_grouped_df['count'],
    name='Games w/steamId'
)
nonSteamTrace = Bar(
    x=nonsteam_grouped_df['date'],
    y=nonsteam_grouped_df['count'],
    name='Games without steamId'
)

data = [steamTrace, nonSteamTrace]

layout = Layout(
        title='Distribution of Twitch Records Per Day Grouped By steamId',
        xaxis=dict(
            title='Date'
        ),
        yaxis=dict(
            title='Records Count'
        ),
        barmode='stack'
    )

fig = Figure(data=data, layout=layout)

iplot(fig)

## Storage and disk usage information

In [33]:
# Print the `size` reported by collstats for every collection in the database.
for collection in db.list_collection_names():
    stats = db.command("collstats", str(collection))
    size_label = common.sizeof_fmt(stats['size'])
    print(collection + " size: " + size_label)

print("--------------------")

# Database-wide figure: `dataSize` from the dbstats command.
stats = db.command("dbstats")
total_label = common.sizeof_fmt(stats['dataSize'])
print("Total Database Size: " + total_label)

opencritic size: 110.8 MiB
topgames size: 475.6 MiB
pricehistory size: 865.3 MiB
twitchhistorical size: 1.2 GiB
apps size: 608.1 MiB
steamusers size: 16.7 MiB
--------------------
Total Database Size: 3.2 GiB


## Check index size across collections

In [34]:
# Print each collection's `totalIndexSize` from collstats and keep a running
# sum so the overall index footprint can be reported at the end.
total_index = 0
for collection in db.list_collection_names():
    stats = db.command("collstats", str(collection))
    index_bytes = stats['totalIndexSize']
    total_index += index_bytes
    print(collection + " size: " + common.sizeof_fmt(index_bytes))

print("--------------------")
print("Total Index Size: " + common.sizeof_fmt(total_index))

opencritic size: 392.0 KiB
topgames size: 52.1 MiB
pricehistory size: 256.3 MiB
twitchhistorical size: 108.0 MiB
apps size: 7.1 MiB
steamusers size: 7.0 MiB
--------------------
Total Index Size: 430.8 MiB
