## Plot reddylab cumulative disk usage

Configuration

In [1]:
# --- Input locations ---
# Directory that is listed for date-stamped disk usage report files
data_dir = '/data/reddylab/Alex/reddylab_disk_usage/'
# Index label of the root aggregate row, which is dropped from the table
data_rootdir = '/data/reddylab'

# --- Reporting window ---
# How many months back from today the reports are retrieved
number_of_last_months = 12

# --- Filtering ---
# When True, the 'total' row is dropped before plotting
remove_total_disk_usage = True
# Only rows whose largest value (in TBs) exceeds this threshold are kept
min_num_tbs = 0.5

Imports

In [2]:
from datetime import datetime
import pandas as pd
import seaborn as sns
import os
import scipy.stats as stats
from matplotlib import pyplot as plt
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.express as px
# Seaborn theme first: these calls rewrite matplotlib rcParams, so the
# explicit overrides below have to come afterwards.
sns.set_context('talk')
sns.set_style('ticks')
plt.rcParams.update({
    'svg.fonttype': 'none',   # keep text as text in SVG output
    'pdf.fonttype': 42,       # font type used for text in PDF output
    'lines.markersize': 10,
})

  import pandas.util.testing as tm


Auxiliary functions


In [3]:
def monthdelta(date, delta):
    """Add/subtract a delta number of months from a date.

    Args:
        date: a ``datetime.date`` or ``datetime.datetime`` instance.
        delta: number of months to add (negative values subtract).

    Returns:
        A copy of ``date`` moved by ``delta`` months. When the original day
        does not exist in the target month it is clamped to that month's
        last day (e.g. March 31 minus one month gives the last day of
        February).
    """
    # Imported locally so the function does not depend on file-level imports.
    from calendar import monthrange

    # Count months from zero so divmod yields the year carry and the
    # month index in one step (floor division handles negative deltas).
    year_carry, month_index = divmod(date.month - 1 + delta, 12)
    y, m = date.year + year_carry, month_index + 1
    # monthrange applies the complete Gregorian leap-year rule
    # (divisible by 4, except centuries not divisible by 400).
    d = min(date.day, monthrange(y, m)[1])
    return date.replace(day=d, month=m, year=y)

def human_readable_to_mb(h):
    """Translate a human readable size (e.g. '1.5T', '200G') to TBs.

    Despite the ``_to_mb`` suffix in the name, the returned value is
    expressed in terabytes, using 1024-based steps between units.

    Args:
        h: a size string ending in one of the unit suffixes K, M, G, T or
            P, a plain numeric string, or a number. Values without a unit
            suffix are taken to be TBs already.

    Returns:
        The size in TBs as a float. A numeric zero is returned unchanged.

    Raises:
        ValueError: if the text cannot be parsed as a number.
    """
    # Numeric input (such as the 0 used to fill missing entries) carries
    # no suffix to parse.
    if not isinstance(h, str):
        return h if h == 0 else float(h)

    tb_per_unit = {
        'K': 1./(1024*1024*1024),
        'M': 1./(1024*1024),
        'G': 1./1024,
        'T': 1,
        'P': 1024,
    }
    text = h.strip()
    # text[-1:] is '' for an empty string, which falls through to float()
    # and raises ValueError.
    factor = tb_per_unit.get(text[-1:])
    if factor is None:
        return float(text)
    # Strip only the trailing suffix character before converting.
    return float(text[:-1]) * factor

In [4]:
# Oldest report date to retrieve: today's date moved back by the
# configured number of months
today = datetime.now().date()
date_threshold = monthdelta(today, -number_of_last_months)

In [5]:
# resulting dataframe: each report file contributes one column, named
# after the file
res = pd.DataFrame()
for ii in sorted(os.listdir(data_dir)):
    # Make sure we are grabbing a disk usage file (YYYY-MM-DD.txt):
    # the name must be exactly 14 characters long...
    if len(ii) != 14:
        continue
    # ...and must start with an ISO date. Only the parse is guarded, and
    # only against ValueError, so unrelated failures are not silently hidden.
    try:
        report_date = datetime.fromisoformat(ii[:10]).date()
    except ValueError:
        continue
    # Keep only reports within the last number of months specified above
    if report_date < date_threshold:
        continue
    res = res.join(pd.read_csv(os.path.join(data_dir, ii), sep='\t', names = [ii], index_col=1), how='outer')

# Remove the rootdir aggregate usage, redundant with total when using "du -ch"
res.drop([data_rootdir], inplace=True)

# Should we remove the "total" disk space usage from the plot? Sum of all individual folders
if remove_total_disk_usage:
    res.drop(['total'], inplace=True)

# Fill NA with zeros, for new/defunct folders
res.fillna(0, inplace=True)

In [6]:
# Translate usage to TBs
usage_tb = res.applymap(human_readable_to_mb)
# Filter results to improve readability: keep only the rows whose largest
# value across all columns exceeds min_num_tbs
peak_usage = usage_tb.max(axis=1)
# Add folders in index as column
res = usage_tb[peak_usage > min_num_tbs].reset_index()

In [7]:
# Reshape to long format for plotting: 'index' is kept as the identifier
# column and every other column is unpivoted into 'variable'/'value'
res_melted = res.melt(id_vars=['index'])

### HARDAC disk usage over time

In [8]:
# Plot with plotly: one colour per 'index' entry, one point per report
plot_title = 'Reddylab disk usage (past %d months)' % number_of_last_months
fig = px.scatter(
    res_melted,
    x="variable",
    y="value",
    color='index',
    title=plot_title,
)
fig.update_layout(xaxis_title = '', yaxis_title = 'TB')
# Draw every trace with markers joined by lines
for trace in fig.data:
    trace.update(mode='markers+lines')
fig.show()