In [1]:
# <!-- collapse=True -->
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource
from bokeh.models import HoverTool
from bokeh.models import Legend
from bokeh.palettes import Category20
from bokeh.plotting import figure
from bokeh.plotting import output_file
from bokeh.plotting import reset_output
from bokeh.plotting import save
from bokeh.plotting import show
from glob import glob
from IPython.display import HTML
from IPython.display import IFrame
from os.path import basename
import pandas as pd
import numpy as np

## Define a helper function to save figure to file and display in notebook

In [2]:
# <!-- collapse=True -->
def save_file_and_show(figure, filename: str) -> None:
    """Save a Bokeh figure to an HTML file, then display it inline in the notebook.

    Parameters
    ----------
    figure
        The Bokeh plot object to save and show. Inside this function the
        parameter name shadows the ``figure`` factory imported from
        ``bokeh.plotting``; that is harmless here because the factory is not
        called in the body.
    filename : str
        Path of the HTML file to write.

    Notes
    -----
    The statement order is significant: the file target must be configured
    before ``save``, and the output state is reset before switching to
    notebook output for ``show``.
    """
    # Point Bokeh's current output at `filename` so save() writes there.
    output_file(filename)
    save(figure)
    # Clear the file target before enabling notebook output -- presumably so
    # that show() below renders inline only instead of also re-opening the file.
    reset_output()
    # Alternative kept for reference: embed the saved HTML file in an IFrame.
    # display(IFrame(src=filename, width=800, height=550))
    output_notebook()
    show(figure)

## Get data recorded from every successful backup

In [3]:
# <!-- collapse=True -->
# Every successful backup appends a row to a per-user log named <username>.dat.
datfile_pattern = '/opt/share/rrbackup/datfiles/*'
# glob() returns paths in arbitrary, filesystem-dependent order. Sort them so
# the anonymised user_<n> labels assigned below are stable between runs.
datfiles = sorted(glob(datfile_pattern))
if not datfiles:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError(
        'No backup data files found matching {}'.format(datfile_pattern)
    )
df_list = []
for i, f in enumerate(datfiles):
    # Read this user's whitespace-delimited log; column 0 is parsed as a date.
    # sep=r'\s+' is the documented equivalent of the deprecated
    # delim_whitespace=True.
    df = pd.read_csv(f, sep=r'\s+', parse_dates=[0])
    # Tag every row with the user it belongs to. An anonymised label is used
    # rather than the real username; to show real names instead, use
    # basename(f).replace('.dat', '').
    df['user'] = 'user_{}'.format(i + 1)
    df_list.append(df)
# Combine the per-user frames into one, dropping rows with any missing field
rsync_data = pd.concat(df_list).dropna().reset_index(drop=True)
n_users = rsync_data['user'].nunique()
# Keep only the columns needed for plotting. The explicit copy makes the
# in-place unit conversion below act on an independent frame.
rsync_data = rsync_data[['date', 'total_size', 'user']].copy()
# Convert size from bytes to TB (2**40 bytes)
bytes_per_tb = 2**40
rsync_data['total_size'] /= bytes_per_tb
# Show last few rows to see what the dataframe looks like
rsync_data.tail()

Unnamed: 0,date,total_size,user
267,2018-03-06,1.080694,user_16
268,2018-03-20,1.017738,user_16
269,2018-05-01,1.00414,user_16
270,2018-05-14,1.00976,user_16
271,2018-05-09,1.668195e-08,user_17


## Get data from the daily record of available storage

In [4]:
# <!-- collapse=True -->
# Daily record of free space: two whitespace-separated columns with no header
# row, a date and the amount of storage remaining.
# sep=r'\s+' is the documented equivalent of the deprecated
# delim_whitespace=True.
storage_totals = pd.read_csv(
    '/opt/share/mem_tracking/remaining.dat',
    sep=r'\s+',
    header=None,
    names=['date', 'remaining'],
    parse_dates=[0],
)
# Capacity, in TB, that 'remaining' is subtracted from to get total usage.
max_tb = 19
# NOTE(review): the divisor name implies the file records remaining space in
# KB (2**30 KB per TB) -- confirm against whatever writes remaining.dat.
kb_per_tb = 2**30
storage_totals = storage_totals.set_index('date')
storage_totals['remaining'] /= kb_per_tb
storage_totals['total_used'] = max_tb - storage_totals['remaining']
storage_totals.tail()

Unnamed: 0_level_0,remaining,total_used
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-05-11,1.484445,17.515555
2018-05-12,1.405076,17.594924
2018-05-13,1.40016,17.59984
2018-05-14,1.326841,17.673159
2018-05-15,1.289808,17.710192


## Pivot storage data

We want the data in a dataframe indexed by date, with one column per user, plus a column for the storage remaining and a column for the storage in use that is not attributed to any tracked user ('other').

In [5]:
# <!-- collapse=True -->
# Reshape the long per-backup records into a wide frame: one row per date,
# one column per user, each cell holding that user's backup size in TB.
storage_data = rsync_data.pivot(index='date',
                                columns='user',
                                values='total_size')
users = list(storage_data.columns)
# Ignore storage totals recorded before the first backup date
storage_totals = storage_totals[storage_totals.index >= storage_data.index.min()]
# The outer join keeps every date that appears in either frame
storage_data = storage_data.join(storage_totals, how='outer')
# A user only has a value on days they backed up. Carry each value forward to
# the following days, then back-fill whatever is still missing at the start.
# (.ffill()/.bfill() replace the deprecated fillna(method=...) spelling.)
storage_data = storage_data.ffill().bfill()
# 'other' is whatever part of the used storage the per-user sizes do not
# account for
storage_data['other'] = (
    storage_data['total_used'] - storage_data[users].sum(axis=1)
)
storage_data = storage_data.drop('total_used', axis=1)
# Order the user columns by their size on the most recent date, largest first
users_high_to_low = list(
    storage_data.iloc[-1][users].sort_values(ascending=False).index
)
new_columns = users_high_to_low + ['other', 'remaining']
storage_data = storage_data[new_columns]
storage_data.tail()

Unnamed: 0_level_0,user_3,user_8,user_14,user_4,user_10,user_16,user_13,user_11,user_5,user_9,user_2,user_7,user_1,user_12,user_15,user_6,user_17,other,remaining
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2018-05-11,2.370526,2.023709,1.362028,1.216505,1.188948,1.00414,0.84054,0.823325,0.625269,0.60434,0.436009,0.446949,0.338751,0.312103,0.044454,0.024307,1.668195e-08,3.853652,1.484445
2018-05-12,2.370526,2.023709,1.362028,1.216505,1.188948,1.00414,0.84054,0.823325,0.625269,0.60434,0.436009,0.446949,0.338751,0.312103,0.044454,0.024307,1.668195e-08,3.933021,1.405076
2018-05-13,2.370526,2.023709,1.362028,1.216505,1.188948,1.00414,0.84054,0.823418,0.625269,0.60434,0.436009,0.446949,0.338751,0.312103,0.044454,0.024307,1.668195e-08,3.937845,1.40016
2018-05-14,2.370526,2.023709,1.362028,1.216505,1.188948,1.00976,0.84054,0.823418,0.625269,0.60434,0.436009,0.446949,0.338751,0.312103,0.044454,0.024307,1.668195e-08,4.005543,1.326841
2018-05-15,2.370526,2.023709,1.362028,1.216505,1.188948,1.00976,0.84054,0.823418,0.625269,0.60434,0.459931,0.446949,0.338751,0.312103,0.044454,0.024307,1.668195e-08,4.018655,1.289808


## Plot a stacked bar chart showing how storage is split over time

In [6]:
# <!-- collapse=True -->
n_columns = len(storage_data.columns)
# Category20 only provides palettes for 3 to 20 colours, so indexing it with
# n_columns directly raises KeyError outside that range. Take the full
# 20-colour palette and cycle through it instead; the smaller Category20
# palettes are prefixes of the 20-colour one, so for 3..20 columns the colours
# are unchanged.
palette = Category20[20]
colors = [palette[i % len(palette)] for i in range(n_columns)]
p = figure(x_axis_type='datetime', plot_width=800, plot_height=500,
           toolbar_location='left', toolbar_sticky=False)
source = ColumnDataSource(data=storage_data)
# Bar width is in datetime-axis units (milliseconds): 85,000,000 ms is just
# under one day, leaving a thin gap between consecutive daily bars.
bar_width_ms = 1000000 * 85
p.vbar_stack(storage_data.columns, x='date', width=bar_width_ms,
             source=source, color=colors)
# vbar_stack adds one renderer per stacked column, in column order; pick
# those up from the end of the figure's renderer list.
renderers = p.renderers[-n_columns:]
items_list = []
for r, label in zip(renderers, storage_data.columns):
    # One hover tool per renderer so the tooltip names the segment under the
    # cursor. The @{...} form keeps the column reference valid even when the
    # name contains characters such as '-', '.' or spaces.
    p.add_tools(
        HoverTool(
            renderers=[r],
            tooltips=[('Label', label), ('Memory', '@{' + label + '} TB')]
        )
    )
    items_list.append((label, [r]))
# Reverse so the legend reads top-to-bottom in the same order as the stack
legend = Legend(items=list(reversed(items_list)))
p.add_layout(legend, 'right')

save_file_and_show(p, 'stacked-bar-chart.html')

## Plot the data as line plots: one line per user, plus one each for 'remaining' and 'other'

In [7]:
# Build plot-local frames carrying the extra 'date_str' hover column rather
# than adding it to storage_data / rsync_data in place. Mutating the shared
# frames would make earlier cells non-repeatable: the stacked bar cell stacks
# every column of storage_data and would pick up the string column on re-run.
date_label_format = '%m/%d'
storage_plot_data = storage_data[['other', 'remaining']].assign(
    date_str=storage_data.index.strftime(date_label_format)
)
rsync_plot_data = rsync_data.assign(
    date_str=rsync_data['date'].dt.strftime(date_label_format)
)
p = figure(x_axis_type='datetime', plot_width=800, plot_height=500,
           tools="pan,wheel_zoom,box_zoom,save,reset",
           toolbar_location='left', toolbar_sticky=False)
items_list = []

# Thick lines for the two aggregate series, drawn from the daily frame
source = ColumnDataSource(data=storage_plot_data)
for i, col in enumerate(['other', 'remaining']):
    r_line = p.line('date', col, source=source, color=colors[i], line_width=4)
    col_titlecase = col.title()
    items_list.append((col_titlecase, [r_line]))
    p.add_tools(
        HoverTool(
            renderers=[r_line],
            tooltips=[
                (col_titlecase, '@{} TB'.format(col)),
                ('Date', '@date_str'),
            ],
        )
    )

# One thinner line per user with a marker at each recorded backup. The
# colours are offset by two because the first two are taken by the aggregate
# series above.
for i, user in enumerate(users_high_to_low):
    source = ColumnDataSource(
        data=rsync_plot_data[rsync_plot_data['user'] == user]
    )
    r_line = p.line('date', 'total_size', source=source,
                    color=colors[i + 2], line_width=2)
    r_circle = p.circle('date', 'total_size', source=source,
                        color=colors[i + 2], size=10)
    items_list.append((user, [r_line, r_circle]))
    p.add_tools(
        HoverTool(
            renderers=[r_line, r_circle],
            tooltips=[
                ('User', user),
                ('Memory', '@total_size TB'),
                ('Date', '@date_str')
            ]
        )
    )
legend = Legend(items=items_list)
p.add_layout(legend, 'right')
save_file_and_show(p, 'line-plot.html')

## Now that the plots have been generated, email them to myself

In [8]:
# IPython shell escape: run the email_plots.sh script found in the notebook's
# current working directory.
# NOTE(review): presumably the script sends the HTML files saved by the
# plotting cells -- confirm against the script itself.
!./email_plots.sh