In [4]:
import sys, os
from pymongo import MongoClient

from plotly.offline import init_notebook_mode, iplot
from plotly.graph_objs import *

# https://stackoverflow.com/questions/41323423/plotly-inside-jupyter-notebook-python
init_notebook_mode(connected=True) # initiate notebook for offline plot

import pandas as pd

In [5]:
# Make the parent directory importable so that ../config.py can be loaded.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate ".." entry to sys.path each time.
if ".." not in sys.path:
    sys.path.append("..")
import config # ../config.py

# Optional overrides of the Mongo host taken from config; uncomment one to use it.
# run on raspberrypi3 Mongo database
#config.mongodb_ip = "192.168.1.224"
# run on Asus laptop Mongo database
#config.mongodb_ip = "192.168.1.124"

# Connect with the host/port from config and select the 'steam' database,
# which every query cell below reads through `db`.
client = MongoClient(host=config.mongodb_ip, port=config.mongodb_port)
db = client['steam']

## Failed appids excluded from future attempts or analysis

In [6]:
# Apps whose failureCount is 3 or more; only appid and name are fetched.
failed_filter = {"failureCount": {"$gte":3}}
failed_fields = {"appid":1,"name":1,"_id":False}
failed_cursor = db['apps'].find(failed_filter, failed_fields)
failed_appids_df = pd.DataFrame(list(failed_cursor))
failed_appids_df

Unnamed: 0,appid,name
0,12630,Legend: Hand of God
1,15310,The Settlers: Heritage of Kings
2,19930,The Settlers: Rise of an Empire Gold Edition
3,23330,The Last Remnant Demo
4,33910,Arma 2
5,33220,Tom Clancy's Splinter Cell: Conviction
6,63600,realMyst
7,81675,Ticket to Ride Trailer DE
8,81674,Ticket to Ride Trailer EN
9,81676,Ticket to Ride Trailer FR


In [7]:
# Select apps whose failureCount is below 3 or absent, fetching only updated_date.
apps_filter = {"$or": [{"failureCount": {"$lt": 3}}, {"failureCount": {"$exists": False}}]}
apps_fields = {"updated_date":1, '_id':False}
all_times_df = pd.DataFrame(list(db['apps'].find(apps_filter, apps_fields)))

# Count documents per calendar day with pd.Grouper
# (pattern from http://pbpython.com/pandas-grouper-agg.html).
daily_counts = all_times_df.groupby(pd.Grouper(key='updated_date', freq='D')).size()

# One row per day: the count, plus the day repeated as a regular column
# so the plotting cell can read it by name.
grouped_df = daily_counts.to_frame(name='count')
grouped_df['updated_date'] = grouped_df.index
grouped_df

Unnamed: 0_level_0,count,updated_date
updated_date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-03-13,955,2019-03-13
2019-03-14,3970,2019-03-14
2019-03-15,3538,2019-03-15
2019-03-16,3542,2019-03-16
2019-03-17,0,2019-03-17
2019-03-18,2984,2019-03-18
2019-03-19,3552,2019-03-19
2019-03-20,3531,2019-03-20
2019-03-21,2485,2019-03-21
2019-03-22,2862,2019-03-22


## A large number of old AppID entries means `refreshsteam.py` needs to be run.  This chart excludes consistently failed appids.

In [8]:
# Bar chart reference: https://plot.ly/python/bar-charts/

# One bar per day from grouped_df: x is the day, y is the number of rows counted for it.
daily_bar = Bar(x=grouped_df['updated_date'], y=grouped_df['count'])
data = [daily_bar]

layout = Layout(
    title='AppIDs By Last Updated Date',
    xaxis={'title': 'Date'},
    yaxis={'title': 'Count'},
)

fig = Figure(data=data, layout=layout)

# Render the figure inline in the notebook.
iplot(fig)

In [9]:
# Fetch only the date field of every pricehistory document.
date_only = {"date":1, '_id':False}
all_times_df = pd.DataFrame(list(db['pricehistory'].find({}, date_only)))

# Count documents per calendar day with pd.Grouper
# (pattern from http://pbpython.com/pandas-grouper-agg.html).
daily_counts = all_times_df.groupby(pd.Grouper(key='date', freq='D')).size()

# One row per day: the count, plus the day repeated as a regular column
# so the plotting cell can read it by name.
grouped_df = daily_counts.to_frame(name='count')
grouped_df['date'] = grouped_df.index
grouped_df

Unnamed: 0_level_0,count,date
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-09-04,8215,2018-09-04
2018-09-05,43441,2018-09-05
2018-09-06,37421,2018-09-06
2018-09-07,23655,2018-09-07
2018-09-08,37188,2018-09-08
2018-09-09,0,2018-09-09
2018-09-10,2963,2018-09-10
2018-09-11,45999,2018-09-11
2018-09-12,20210,2018-09-12
2018-09-13,269,2018-09-13


## This is historical data from the `pricehistory` Mongo collection.  The collection is updated roughly every 48 hours.

In [10]:
# Bar chart reference: https://plot.ly/python/bar-charts/

# One bar per day from grouped_df: x is the day, y is the number of rows counted for it.
daily_bar = Bar(x=grouped_df['date'], y=grouped_df['count'])
data = [daily_bar]

layout = Layout(
    title='Distribution of Price History',
    xaxis={'title': 'Date'},
    yaxis={'title': 'Records Count'},
)

fig = Figure(data=data, layout=layout)

# Render the figure inline in the notebook.
iplot(fig)

In [11]:
# Fetch only the date field of every steamusers document.
date_only = {"date":1, '_id':False}
all_times_df = pd.DataFrame(list(db['steamusers'].find({}, date_only)))

# Count documents per calendar day with pd.Grouper
# (pattern from http://pbpython.com/pandas-grouper-agg.html).
daily_counts = all_times_df.groupby(pd.Grouper(key='date', freq='D')).size()

# One row per day: the count, plus the day repeated as a regular column
# so the plotting cell can read it by name.
grouped_df = daily_counts.to_frame(name='count')
grouped_df['date'] = grouped_df.index
grouped_df

Unnamed: 0_level_0,count,date
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-12-19,119,2018-12-19
2018-12-20,340,2018-12-20
2018-12-21,751,2018-12-21
2018-12-22,1021,2018-12-22
2018-12-23,932,2018-12-23
2018-12-24,980,2018-12-24
2018-12-25,938,2018-12-25
2018-12-26,924,2018-12-26
2018-12-27,1106,2018-12-27
2018-12-28,1287,2018-12-28


## This is the number of records we have per day for the `steamusers` Mongo collection.  The collection is populated by the `steamusers.py` script.

In [12]:
# Bar chart reference: https://plot.ly/python/bar-charts/

# One bar per day from grouped_df: x is the day, y is the number of rows counted for it.
daily_bar = Bar(x=grouped_df['date'], y=grouped_df['count'])
data = [daily_bar]

layout = Layout(
    title='Distribution of Records Per Day for steamusers',
    xaxis={'title': 'Date'},
    yaxis={'title': 'Records Count'},
)

fig = Figure(data=data, layout=layout)

# Render the figure inline in the notebook.
iplot(fig)

In [13]:
# Fetch only the date field of every topgames document.
date_only = {"date":1, '_id':False}
all_times_df = pd.DataFrame(list(db['topgames'].find({}, date_only)))

# Count documents per calendar day with pd.Grouper
# (pattern from http://pbpython.com/pandas-grouper-agg.html).
daily_counts = all_times_df.groupby(pd.Grouper(key='date', freq='D')).size()

# One row per day: the count, plus the day repeated as a regular column
# so the plotting cell can read it by name.
grouped_df = daily_counts.to_frame(name='count')
grouped_df['date'] = grouped_df.index
grouped_df

Unnamed: 0_level_0,count,date
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-01-08,300,2019-01-08
2019-01-09,2300,2019-01-09
2019-01-10,100,2019-01-10
2019-01-11,0,2019-01-11
2019-01-12,0,2019-01-12
2019-01-13,4000,2019-01-13
2019-01-14,9000,2019-01-14
2019-01-15,8400,2019-01-15
2019-01-16,9200,2019-01-16
2019-01-17,8500,2019-01-17


## This is the number of entries we have per day for the `topgames` Mongo collection.  Each query returns the current top 100 games being played.  The query is run every 15 minutes.

In [14]:
# Bar chart reference: https://plot.ly/python/bar-charts/

# One bar per day from grouped_df: x is the day, y is the number of rows counted for it.
daily_bar = Bar(x=grouped_df['date'], y=grouped_df['count'])
data = [daily_bar]

layout = Layout(
    title='Distribution of Records Per Day for topgames',
    xaxis={'title': 'Date'},
    yaxis={'title': 'Records Count'},
)

fig = Figure(data=data, layout=layout)

# Render the figure inline in the notebook.
iplot(fig)

## Storage and disk usage information

In [15]:
# https://stackoverflow.com/questions/1094841/reusable-library-to-get-human-readable-version-of-file-size
def sizeof_fmt(num, suffix='B'):
    """Format a byte count as a human-readable string with binary prefixes.

    The value is divided by 1024 until its magnitude drops below 1024 or the
    prefixes up to 'Zi' are exhausted, in which case 'Yi' is used.

    Args:
        num: The quantity to format (e.g. a size in bytes).
        suffix: Unit symbol appended after the prefix; defaults to 'B'.

    Returns:
        A string such as "1.5 KiB": the scaled value with one decimal place,
        a space, then the prefix and suffix.
    """
    prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    scaled = num
    step = 0
    # Written as "not (... < 1024)" so non-comparable values such as NaN
    # keep scaling all the way up, as a plain "<" early-exit check would.
    while step < len(prefixes) and not (abs(scaled) < 1024.0):
        scaled = scaled / 1024.0
        step += 1
    if step == len(prefixes):
        return "%.1f %s%s" % (scaled, 'Yi', suffix)
    return "%3.1f %s%s" % (scaled, prefixes[step], suffix)

# Print the 'size' reported by collstats for every collection in the database.
for collection in db.list_collection_names():
    coll_stats = db.command("collstats", str(collection))
    readable_size = sizeof_fmt(coll_stats['size'])
    print(collection + " size: " + readable_size)

print("--------------------")
# Database-wide total, taken from the 'dataSize' field of dbstats.
stats = db.command("dbstats")
total_size = sizeof_fmt(stats['dataSize'])
print("Total Steam DB Size: " + total_size)

topgames size: 115.9 MiB
apps size: 507.8 MiB
steamusers size: 4.8 MiB
pricehistory size: 540.0 MiB
--------------------
Total Steam DB Size: 1.1 GiB
