In [14]:
import sys, os
from pymongo import MongoClient

from plotly.offline import init_notebook_mode, iplot
from plotly.graph_objs import *

# https://stackoverflow.com/questions/41323423/plotly-inside-jupyter-notebook-python
# Must run before any iplot() call so that figures render inline in the notebook.
# NOTE(review): per the plotly docs, connected=True loads plotly.js from a CDN rather than
# embedding it in the notebook, so rendering needs internet access -- confirm this is intended.
init_notebook_mode(connected=True) # initiate notebook for offline plot

import pandas as pd

In [15]:
# Make the parent directory importable so the project-level config module can be loaded.
# The path tweak has to happen before the import on the next line.
sys.path.append("..")
import config # ../config.py

# run on raspberrypi3 Mongo database
# (uncomment the override below to point the client at that host instead of the configured one)
# config.mongodb_ip = "192.168.1.224"

# Connect with the host/port taken from config; every query in the cells below goes through `db`,
# the handle to the 'steam' database.
client = MongoClient(host=config.mongodb_ip, port=config.mongodb_port)
db = client['steam']

## Failed appids excluded from future attempts or analysis

In [26]:
# Apps whose failureCount has reached 3 or more; only appid and name are fetched.
failed_filter = {"failureCount": {"$gte":3}}
failed_projection = {"appid":1,"name":1,"_id":False}
failed_appids_df = pd.DataFrame(list(db['apps'].find(failed_filter, failed_projection)))
failed_appids_df

Unnamed: 0,appid,name
0,7,Steam Client
1,8,winui2
2,90,Half-Life Dedicated Server
3,100,Counter-Strike: Condition Zero Deleted Scenes
4,205,Source Dedicated Server
5,210,Source Dedicated Server
6,211,Source SDK
7,215,Source SDK Base 2006
8,218,Source SDK Base 2007
9,260,Counter-Strike: Source Beta


In [17]:
# Per-day count of apps by last-updated date. Apps with a failureCount of 3 or more are
# left out; apps that have no failureCount field at all are kept.
# Grouper technique: http://pbpython.com/pandas-grouper-agg.html
not_failed_filter = {"$or": [{"failureCount": {"$lt": 3}}, {"failureCount": {"$exists": False}}]}
updated_date_projection = {"updated_date":1, '_id':False}
all_times_df = pd.DataFrame(list(db['apps'].find(not_failed_filter, updated_date_projection)))

# Bucket the timestamps by calendar day and count the rows that fall in each bucket.
daily_counts = all_times_df.groupby(pd.Grouper(key='updated_date', freq='D')).size()
grouped_df = daily_counts.to_frame(name='count')
# Mirror the date index as an ordinary column so it can be selected by name when plotting.
grouped_df['updated_date'] = grouped_df.index
grouped_df

Unnamed: 0_level_0,count,updated_date
updated_date,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-09-03,10895,2018-09-03
2018-09-04,22905,2018-09-04
2018-09-05,23586,2018-09-05
2018-09-06,60,2018-09-06
2018-09-07,9,2018-09-07
2018-09-08,0,2018-09-08
2018-09-09,0,2018-09-09
2018-09-10,0,2018-09-10
2018-09-11,0,2018-09-11
2018-09-12,0,2018-09-12


## A large number of AppID entries with old update dates means `refreshsteam.py` needs to be run.  AppIDs that have consistently failed are excluded from these counts.

In [18]:
# https://plot.ly/python/bar-charts/
# One bar per day: x is the day, y is how many apps were last updated on that day.
daily_bars = Bar(x=grouped_df['updated_date'], y=grouped_df['count'])
data = [daily_bars]

layout = Layout(
    title='AppIDs By Last Updated Date',
    xaxis={'title': 'Date'},
    yaxis={'title': 'Count'},
)

fig = Figure(data=data, layout=layout)

iplot(fig)

In [19]:
# Per-day count of documents in the pricehistory collection, keyed on their 'date' field.
# Grouper technique: http://pbpython.com/pandas-grouper-agg.html
date_projection = {"date":1, '_id':False}
all_times_df = pd.DataFrame(list(db['pricehistory'].find({}, date_projection)))

# Bucket the timestamps by calendar day and count the rows that fall in each bucket.
daily_counts = all_times_df.groupby(pd.Grouper(key='date', freq='D')).size()
grouped_df = daily_counts.to_frame(name='count')
# Mirror the date index as an ordinary column so it can be selected by name when plotting.
grouped_df['date'] = grouped_df.index
grouped_df

Unnamed: 0_level_0,count,date
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-09-04,8215,2018-09-04
2018-09-05,43441,2018-09-05
2018-09-06,37421,2018-09-06
2018-09-07,23655,2018-09-07
2018-09-08,37188,2018-09-08
2018-09-09,0,2018-09-09
2018-09-10,2963,2018-09-10
2018-09-11,45999,2018-09-11
2018-09-12,20210,2018-09-12
2018-09-13,269,2018-09-13


## This is historical data from the `pricehistory` Mongo collection.  This is run roughly every 48 hours.

In [20]:
# https://plot.ly/python/bar-charts/
# One bar per day: x is the day, y is the number of pricehistory records dated that day.
daily_bars = Bar(x=grouped_df['date'], y=grouped_df['count'])
data = [daily_bars]

layout = Layout(
    title='Distribution of Price History',
    xaxis={'title': 'Date'},
    yaxis={'title': 'Records Count'},
)

fig = Figure(data=data, layout=layout)

iplot(fig)

In [21]:
# Per-day count of documents in the steamusers collection, keyed on their 'date' field.
# Grouper technique: http://pbpython.com/pandas-grouper-agg.html
date_projection = {"date":1, '_id':False}
all_times_df = pd.DataFrame(list(db['steamusers'].find({}, date_projection)))

# Bucket the timestamps by calendar day and count the rows that fall in each bucket.
daily_counts = all_times_df.groupby(pd.Grouper(key='date', freq='D')).size()
grouped_df = daily_counts.to_frame(name='count')
# Mirror the date index as an ordinary column so it can be selected by name when plotting.
grouped_df['date'] = grouped_df.index
grouped_df

Unnamed: 0_level_0,count,date
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-11-17,43,2018-11-17
2018-11-18,302,2018-11-18
2018-11-19,259,2018-11-19
2018-11-20,0,2018-11-20
2018-11-21,0,2018-11-21
2018-11-22,0,2018-11-22
2018-11-23,0,2018-11-23
2018-11-24,0,2018-11-24
2018-11-25,0,2018-11-25
2018-11-26,0,2018-11-26


## This is the number of records we have per day in the `steamusers` Mongo collection.  The collection is created using the `steamusers.py` script.

In [22]:
# https://plot.ly/python/bar-charts/
# One bar per day: x is the day, y is the number of steamusers records dated that day.
daily_bars = Bar(x=grouped_df['date'], y=grouped_df['count'])
data = [daily_bars]

layout = Layout(
    title='Distribution of Records Per Day for steamusers',
    xaxis={'title': 'Date'},
    yaxis={'title': 'Records Count'},
)

fig = Figure(data=data, layout=layout)

iplot(fig)

In [23]:
# Per-day count of documents in the topgames collection, keyed on their 'date' field.
# Grouper technique: http://pbpython.com/pandas-grouper-agg.html
date_projection = {"date":1, '_id':False}
all_times_df = pd.DataFrame(list(db['topgames'].find({}, date_projection)))

# Bucket the timestamps by calendar day and count the rows that fall in each bucket.
daily_counts = all_times_df.groupby(pd.Grouper(key='date', freq='D')).size()
grouped_df = daily_counts.to_frame(name='count')
# Mirror the date index as an ordinary column so it can be selected by name when plotting.
grouped_df['date'] = grouped_df.index
grouped_df

Unnamed: 0_level_0,count,date
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-01-05,2500,2019-01-05
2019-01-06,300,2019-01-06
2019-01-07,0,2019-01-07
2019-01-08,100,2019-01-08


## This is the number of entries we have per day in the `topgames` Mongo collection.  Each query returns the current top 100 games being played.  The query is run every 15 minutes.

In [24]:
# https://plot.ly/python/bar-charts/
# One bar per day: x is the day, y is the number of topgames entries dated that day.
daily_bars = Bar(x=grouped_df['date'], y=grouped_df['count'])
data = [daily_bars]

layout = Layout(
    title='Distribution of Records Per Day for topgames',
    xaxis={'title': 'Date'},
    yaxis={'title': 'Records Count'},
)

fig = Figure(data=data, layout=layout)

iplot(fig)

## Storage and disk usage information

In [25]:
# https://stackoverflow.com/questions/1094841/reusable-library-to-get-human-readable-version-of-file-size
def sizeof_fmt(num, suffix='B'):
    """Format a byte count as a human-readable string with binary (1024-based) prefixes.

    Args:
        num: Size to format (int or float); negative values keep their sign.
        suffix: Unit suffix appended after the prefix, e.g. 'B' gives "KiB", "MiB".

    Returns:
        A string with one decimal place, such as "528.1 KiB" or "1.0 MiB".
        Values of 1024 Zi-units or more are reported in "Yi".
    """
    for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
        # Decide on the value as it will be *displayed* (one decimal place). Without the
        # rounding, a value just under the boundary (e.g. 1023.99) would be printed as
        # "1024.0 <unit>" instead of rolling over to "1.0 <next unit>".
        if abs(round(num, 1)) < 1024.0:
            return "%3.1f %s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f %s%s" % (num, 'Yi', suffix)

# Print the data size reported by collstats for each collection in the database.
for collection in db.list_collection_names():
    stats = db.command("collstats", str(collection))
    readable_size = sizeof_fmt(stats['size'])
    print("{}{}{}".format(collection, " size: ", readable_size))

print("--------------------")
# Database-wide total, taken from the dataSize field of dbstats.
stats = db.command("dbstats")
total_size = sizeof_fmt(stats['dataSize'])
print("{}{}".format("Total Steam DB Size: ", total_size))

topgames size: 528.1 KiB
apps size: 464.6 MiB
steamusers size: 1.1 MiB
pricehistory size: 275.0 MiB
systemreqs size: 5.7 MiB
--------------------
Total Steam DB Size: 747.0 MiB
