# Measuring CPU usage on hpc05

In [None]:
import datetime
import json
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
def open_json(fname):
    """Read a job-log JSON file and flatten it into a list of sample records.

    The file must hold a JSON object. Each key is a whitespace-separated
    triple ``"<ID> <user> <process>"`` and each value is a list of
    ``[current_time, running_time, num_cores]`` samples.

    Parameters
    ----------
    fname : str
        Path of the JSON file to read.

    Returns
    -------
    list of dict
        One dict per sample with the keys ``ID``, ``user``, ``process``,
        ``current_time``, ``running_time`` and ``num_cores``.

    Raises
    ------
    ValueError
        If a key does not split into exactly three fields, or a sample does
        not hold exactly three values.
    """
    with open(fname, 'r') as handle:
        jobs = json.load(handle)

    records = []
    for job_key, samples in jobs.items():
        # The job identity is encoded in the key itself.
        job_id, owner, proc = job_key.split()
        records.extend(
            {
                'ID': job_id,
                'user': owner,
                'process': proc,
                'current_time': stamp,
                'running_time': cpu_time,
                'num_cores': cores,
            }
            for stamp, cpu_time, cores in samples
        )
    return records


def load_file_sort_by_ID(today=False, records=None):
    """Build a time-indexed table of job samples and the last sample per job.

    For every job (grouped by ``ID``) the time stamps and running times are
    shifted so that they start at zero at the job's first sample. The running
    time is then divided by the number of cores, and ``activity`` is the
    resulting running time as a percentage of the elapsed time.

    Parameters
    ----------
    today : bool, optional
        If True, keep only the samples whose UTC time stamp falls on
        ``datetime.date.today()``. Elapsed and running times are still
        measured from each job's first sample in ``records``.
    records : list of dict, optional
        Sample records with the keys ``ID``, ``user``, ``process``,
        ``current_time`` (epoch seconds), ``running_time`` and ``num_cores``.
        When omitted, the notebook-level variable ``data`` is used, which is
        what this function originally relied on.

    Returns
    -------
    df : pandas.DataFrame
        All selected samples, indexed by their UTC time stamp and sorted
        chronologically.
    lasts : pandas.DataFrame
        The last selected sample of every job, indexed by ``user``.
    """
    if records is None:
        # Backward compatible: fall back to the notebook global.
        records = data
    df = pd.DataFrame(records)

    # Compute both per-job baselines before adding or replacing columns.
    by_ID = df.groupby('ID')
    first_seen = by_ID['current_time'].transform('min')
    first_running = by_ID['running_time'].transform('min')

    df['elapsed_time'] = df['current_time'] - first_seen
    df['running_time'] = (df['running_time'] - first_running) / df['num_cores']
    # The first sample of a job has zero elapsed time, so its activity is NaN.
    df['activity'] = df['running_time'] / df['elapsed_time'] * 100

    df.index = pd.to_datetime(df.pop('current_time'), unit='s', utc=True)
    # A stable sort keeps the input order of samples with equal time stamps.
    df = df.sort_index(kind='mergesort')
    if today:
        # ``DataFrame.ix`` was removed in pandas 1.0. A partial-string slice
        # with ``.loc`` selects the whole day and yields an empty frame
        # (instead of raising) when nothing was logged today.
        day = str(datetime.date.today())
        df = df.loc[day:day]

    # Group the final frame so ``lasts`` always matches the returned ``df``.
    lasts = df.groupby('ID').last().set_index('user')
    return df, lasts


def get_user_df(lasts):
    """Aggregate the last sample of every job into per-user totals.

    Parameters
    ----------
    lasts : pandas.DataFrame
        Frame indexed by ``user`` with the columns ``elapsed_time``,
        ``running_time`` and ``num_cores``.

    Returns
    -------
    pandas.DataFrame
        Indexed by user, with the columns ``elapsed_days`` and
        ``running_days`` (the core-weighted sums divided by 86400) and
        ``activity`` (``running_days`` as a percentage of ``elapsed_days``).
    """
    def core_days(column):
        # Weight the column by the core count, sum per user, divide by 86400.
        return lasts.groupby(level='user').apply(
            lambda jobs: (jobs[column] * jobs['num_cores']).sum() / 86400)

    user_df = core_days('elapsed_time').to_frame('elapsed_days')
    user_df['running_days'] = core_days('running_time')
    user_df['activity'] = user_df['running_days'] / user_df['elapsed_days'] * 100
    return user_df

In [None]:
# Report the current time at a fixed UTC+2 offset.
tz = datetime.timezone(datetime.timedelta(hours=2))
now = datetime.datetime.now(tz)
print('Now it is {}'.format(str(now)))

# Log files are named per month: strip the trailing "-DD" from today's
# ISO date (YYYY-MM-DD) to get the YYYY-MM part of the file name.
today = str(datetime.date.today())
month = today[:-3]
data = open_json('job_log_{}.json'.format(month))

# This month's data

In [None]:
# Monthly overview: use every sample in the loaded log. Filtering with
# today=True here would just duplicate the "Only today" section further down.
df, lasts = load_file_sort_by_ID(today=False)
user_df = get_user_df(lasts)
user_df.sort_values('elapsed_days', ascending=False)

### Barplot with elapsed days and running days

In [None]:
# One pair of bars per user, ordered by increasing activity.
user_df.sort_values(by='activity').plot(kind='bar', y=['elapsed_days', 'running_days'])

### Sorted on activity

In [None]:
# One summary line per user, least active first.
by_activity = user_df.sort_values(by='activity')
for user, row in by_activity.iterrows():
    summary = '{} has {:.0f} days of computing and {:.1f}% time activity'.format(
        user, row['elapsed_days'], row['activity'])
    print(summary)

### Sorted on days

In [None]:
# One summary line per user, largest number of computing days first.
by_days = user_df.sort_values(by='elapsed_days', ascending=False)
for user, row in by_days.iterrows():
    summary = '{} has {:.0f} days of computing and {:.1f}% time activity'.format(
        user, row['elapsed_days'], row['activity'])
    print(summary)

# Only today

In [None]:
# Restrict the samples to today's date, then aggregate per user.
df, lasts = load_file_sort_by_ID(today=True)
user_df_today = get_user_df(lasts)
user_df_today.sort_values(by='elapsed_days', ascending=False)

In [None]:
# Today's elapsed and running days per user, ordered by increasing activity.
user_df_today.sort_values(by='activity').plot(kind='bar', y=['elapsed_days', 'running_days'])