# Measuring and logging CPU usage on `hpc05`

A measurement is taken **every 15 minutes**, after which this website is updated.

Want to know something? Ask/e-mail Bas at [basnijholt@gmail.com](mailto:basnijholt@gmail.com).

_You can see the code by clicking on this button:_

In [None]:
import datetime
# Stamp the page with the time of this run, expressed at a fixed UTC+2 offset.
tz = datetime.timezone(offset=datetime.timedelta(hours=2))
print('Last time this script ran is at {}'.format(datetime.datetime.now(tz=tz)))

# Current usage on `hpc05`

In [None]:
# Run stat.py on hpc05 over ssh, using the Python of the `dev` conda environment.
# NOTE(review): presumably stat.py prints the current usage shown in this
# cell's output -- the script itself is not visible here, confirm.
!ssh hpc05 'bash -c "~/miniconda3/envs/dev/bin/python /home/basnijholt/Work/cluster_log/stat.py"'

In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

def open_json(fname):
    """Load a job log and flatten it into one record per measurement.

    The file must hold a JSON object whose keys look like
    ``"<ID> <user> <process>"`` and whose values are lists of
    ``[current_time, actual_usage_time, num_cores]`` triples.

    Parameters
    ----------
    fname : str
        Path of the JSON file to read.

    Returns
    -------
    list of dict
        One dict per triple, with the keys ``ID``, ``user``, ``process``,
        ``current_time``, ``actual_usage_time`` and ``num_cores``.

    Raises
    ------
    ValueError
        If a key has fewer than three whitespace-separated fields, or a
        measurement does not unpack into exactly three values.
    """
    with open(fname, 'r') as f:
        jobs = json.load(f)

    data = []
    for key, measurements in jobs.items():
        # Split at most twice so that a process name which itself contains
        # whitespace stays in one piece instead of breaking the unpacking.
        # The strip keeps surrounding whitespace out of the last field.
        ID, user, process = key.strip().split(None, 2)
        data.extend(
            dict(ID=ID, user=user, process=process, current_time=current_time,
                 actual_usage_time=actual_usage_time, num_cores=num_cores)
            for current_time, actual_usage_time, num_cores in measurements
        )
    return data


def load_file_sort_by_ID(data, today=False):
    """Turn the flat measurement records into a time-indexed DataFrame.

    Parameters
    ----------
    data : list of dict
        Records with the keys ``ID``, ``user``, ``process``,
        ``current_time`` (seconds since the epoch), ``actual_usage_time``
        and ``num_cores``.
    today : bool, optional
        If True, keep only the rows selected by today's date string.

    Returns
    -------
    df : pandas.DataFrame
        One row per measurement, indexed by the UTC timestamp and sorted
        by it.  Added columns: ``reserved_time`` (seconds since the job's
        first measurement), ``actual_usage_time`` (usage since the job's
        smallest recorded usage, divided by ``num_cores``) and
        ``activity`` (usage as a percentage of ``reserved_time``).
    lasts : pandas.DataFrame
        The chronologically last entry of every job, indexed by ``user``.
    """
    df = pd.DataFrame(data)
    by_ID = df.groupby('ID')
    # Both clocks are measured relative to the per-job minimum.
    df['reserved_time'] = df['current_time'] - by_ID['current_time'].transform('min')
    usage = df['actual_usage_time'] - by_ID['actual_usage_time'].transform('min')
    df['actual_usage_time'] = usage / df['num_cores']
    # The first measurement of a job has reserved_time == 0, so its activity
    # is not a finite number.
    df['activity'] = df['actual_usage_time'] / df['reserved_time'] * 100
    df.index = pd.to_datetime(df.pop('current_time'), unit='s', utc=True)
    # Stable sort: rows sharing a timestamp keep their input order.
    df = df.sort_index(kind='mergesort')
    if today:
        # select only today
        # NOTE(review): the date string is the local calendar date while the
        # index is UTC; pandas presumably raises KeyError when no row falls
        # on that date -- confirm both are acceptable.
        df = df.loc[str(datetime.date.today())]
    # Group the *sorted* frame so that "last" really is the latest
    # measurement of each job, whatever the order of the input records.
    lasts = df.groupby('ID').last().set_index('user')
    return df, lasts


def get_user_df(lasts):
    """Summarise the per-job totals in *lasts* as CPU time per user.

    Parameters
    ----------
    lasts : pandas.DataFrame
        Indexed by ``user`` with the columns ``reserved_time``,
        ``actual_usage_time`` and ``num_cores``.

    Returns
    -------
    pandas.DataFrame
        One row per user with ``reserved_days``, ``actual_usage_days``,
        ``idle_days`` (reserved minus actual) and ``activity`` (actual as
        a percentage of reserved).
    """
    seconds_per_day = 86400

    def core_days(column):
        # Per user: sum of column * num_cores over the jobs, in days.
        return lasts.groupby(level='user').apply(
            lambda jobs: (jobs[column] * jobs['num_cores']).sum() / seconds_per_day)

    user_df = core_days('reserved_time').to_frame('reserved_days')
    user_df['actual_usage_days'] = core_days('actual_usage_time')
    user_df['idle_days'] = user_df['reserved_days'] - user_df['actual_usage_days']
    user_df['activity'] = user_df['actual_usage_days'] / user_df['reserved_days'] * 100
    return user_df

# Date stamps used in the plot titles and to locate this month's log file.
today = datetime.date.today().isoformat()
month = datetime.datetime.now().strftime("%B")
# Dropping the trailing "-DD" leaves "YYYY-MM" for the file name.
data = open_json('job_log_{}.json'.format(today[:-3]))

# This month's data

* `reserved_days` means the total amount of CPU time (in days) that the user *reserved* the cores.
* `actual_usage_days` means the total amount of CPU time (in days) that the user was *actually* using the cores.
* `idle_days = reserved_days - actual_usage_days`

In [None]:
# Statistics over the whole log file; the table lists the users with the
# most idle days first.
df, lasts = load_file_sort_by_ID(data, today=False)
user_df = get_user_df(lasts)
user_df.sort_values(by='idle_days', ascending=False)

In [None]:
# Reserved versus actually used days per user, ordered by increasing activity.
ax = user_df.sort_values(by='activity').plot(
    kind='bar', y=['reserved_days', 'actual_usage_days'])
ax.set_ylabel('CPU time in days')
ax.set_title('CPU time used per user in {}'.format(month))

In [None]:
# Sum of actual_usage_time per weekday name, converted from seconds to years.
# `DatetimeIndex.weekday_name` was removed in pandas 1.0 in favour of
# `day_name()`; the attribute is only used as a fallback on older pandas.
if hasattr(df.index, 'day_name'):
    weekday = df.index.day_name()
else:
    weekday = df.index.weekday_name
# sort=False keeps the weekdays in order of first appearance instead of
# sorting the names alphabetically.
ax = df.groupby(weekday, sort=False).actual_usage_time.sum().divide(86400 * 365).plot.bar()
ax.set_ylabel('CPU time in years')
ax.set_title('CPU time per weekday in {}'.format(month))

In [None]:
# Sum of actual_usage_time per hour of the day, converted from seconds to
# years.  sort=False leaves the hours in order of first appearance.
ax = (
    df.groupby(df.index.hour, sort=False)['actual_usage_time']
    .sum()
    .div(86400 * 365)
    .plot(kind='bar')
)
ax.set_ylabel('CPU time in years')
ax.set_title('CPU time per hour in {}'.format(month))

# Only today

In [None]:
# Same table as for the month, restricted to today's measurements.
# Note that this rebinds `df` and `lasts` to the today-only frames.
df, lasts = load_file_sort_by_ID(data, today=True)
user_df_today = get_user_df(lasts)
user_df_today.sort_values(by='idle_days', ascending=False)

In [None]:
# Reserved versus actually used days per user for today, ordered by
# increasing activity.
ax = user_df_today.sort_values(by='activity').plot(
    kind='bar', y=['reserved_days', 'actual_usage_days'])
ax.set_ylabel('CPU time in days')
ax.set_title('CPU time per user today ({})'.format(today))