In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import json

with open('jobs.json', 'r') as file:
    raw_json = json.load(file)

print(f'Raw json loaded. High-level keys: {raw_json.keys()}')

Raw json loaded. High-level keys: dict_keys(['meta', 'errors', 'jobs'])


In [2]:
jobs = raw_json['jobs']
print(f'Jobs in file: {len(jobs)}')
print(f'Each job is of length: {len(jobs[0])}')

Jobs in file: 12824
Each job is of length: 30


In [3]:
jobs_df = pd.DataFrame(jobs)
jobs_df.head(3)

Unnamed: 0,account,comment,allocation_nodes,array,association,cluster,constraints,container,derived_exit_code,time,...,qos,required,kill_request_user,reservation,state,steps,tres,user,wckey,working_directory
0,68,"{'administrator': None, 'job': None, 'system':...",1,"{'job_id': 0, 'limits': {'max': {'running': {'...","{'account': '68', 'cluster': 'eagle', 'partiti...",eagle,,,"{'status': 'SUCCESS', 'return_code': 0}","{'elapsed': 9482, 'eligible': 1672509600, 'end...",...,normal,"{'CPUs': 24, 'memory': 2048}",,"{'id': 0, 'name': 0}","{'current': 'COMPLETED', 'reason': 'BeginTime'}","[{'nodes': {'list': ['e1788'], 'count': 1, 'ra...","{'allocated': [{'type': 'cpu', 'name': None, '...",kulka,"{'wckey': '', 'flags': []}",/mnt/storage_2/scratch/grant_68/kulka/test/dev...
1,68,"{'administrator': None, 'job': None, 'system':...",1,"{'job_id': 0, 'limits': {'max': {'running': {'...","{'account': '68', 'cluster': 'eagle', 'partiti...",eagle,,,"{'status': 'SUCCESS', 'return_code': 0}","{'elapsed': 9932, 'eligible': 1672531200, 'end...",...,normal,"{'CPUs': 24, 'memory': 2048}",,"{'id': 0, 'name': 0}","{'current': 'COMPLETED', 'reason': 'BeginTime'}","[{'nodes': {'list': ['e2281'], 'count': 1, 'ra...","{'allocated': [{'type': 'cpu', 'name': None, '...",kulka,"{'wckey': '', 'flags': []}",/mnt/storage_2/scratch/grant_68/kulka/test/dev...
2,68,"{'administrator': None, 'job': None, 'system':...",1,"{'job_id': 0, 'limits': {'max': {'running': {'...","{'account': '68', 'cluster': 'eagle', 'partiti...",eagle,,,"{'status': 'SUCCESS', 'return_code': 0}","{'elapsed': 10601, 'eligible': 1672552800, 'en...",...,normal,"{'CPUs': 24, 'memory': 2048}",,"{'id': 0, 'name': 0}","{'current': 'COMPLETED', 'reason': 'BeginTime'}","[{'nodes': {'list': ['e1901'], 'count': 1, 'ra...","{'allocated': [{'type': 'cpu', 'name': None, '...",kulka,"{'wckey': '', 'flags': []}",/mnt/storage_2/scratch/grant_68/kulka/test/dev...


In [4]:
print(f'Columns: {jobs_df.columns.to_list()}')

Columns: ['account', 'comment', 'allocation_nodes', 'array', 'association', 'cluster', 'constraints', 'container', 'derived_exit_code', 'time', 'exit_code', 'flags', 'group', 'het', 'job_id', 'name', 'mcs', 'nodes', 'partition', 'priority', 'qos', 'required', 'kill_request_user', 'reservation', 'state', 'steps', 'tres', 'user', 'wckey', 'working_directory']


In [375]:
from collections import defaultdict
dic = defaultdict(lambda: 0)
for k in jobs_df['steps']:
    if len(k) > 0:
        dic[k[0]['tres']['allocated'][0]['id']] += 1
    else:
        #print(k)
        continue
print(f'Unique keys: {len(dic.keys())}')
if len(dic) < 100:
    print(dic)



Unique keys: 1
defaultdict(<function <lambda> at 0x0000024D1127E950>, {1: 12823})


In [377]:
col = 'steps'
print(jobs_df[col][0][1]['tres'].keys())
#print(jobs_df[col].value_counts())
#jobs_df[col].keys()
#['nodes', 'tres', 'time', 'exit_code', 'tasks', 'pid', 'CPU', 'kill_request_user', 'state', 'statistics', 'step', 'task']

dict_keys(['requested', 'consumed', 'allocated'])


In [None]:
none_columns = [
    'comment', # All nested columns are None,
    'constraints', # All values are None,
    'container', # All values are None
    ]

redundant_info_columns = [
    'derived_exit_code', # All nested values have one unique value status: SUCCESS, return_code: 0,
    'het', # All nested cols are 0 or None
    'mcs', # Nested column (only one: label) have ''
]

info_columns = [
    'account', # Account ids [int, str]
    'array', # Job id, limits and task_id. Most (95%) records are the same: job_id: 0, task: None, task_id: None, limits_max_running_tasks: 0
    'association', # account, cluster, partition, user.
    'group', # group name,
    'name', # some string as job name
]

nested_columns = [
    'time', # nested times. Probably important: elapsed, eligible, end, start, submission, limit. The rest features are always 0
]

continous_columns = [
    'allocation_nodes', # Number of allocated nodes [int]
]

In [25]:
df = jobs_df.copy()
#for (columnName, columnData) in df.iteritems():
columnName = 'comment'
columnData = df[columnName]
print('Column Name : ', columnName)
# print('Column Data : ', columnData)
cd = columnData.apply(pd.Series)
#columnData.columns = [columnName + '_' + colname for colname in columnData.columns]
df = pd.concat([df, cd], axis=1)
df[cd.columns]

Column Name :  comment


Unnamed: 0,administrator,job,system
0,,,
1,,,
2,,,
3,,,
4,,,
...,...,...,...
12819,,,
12820,,,
12821,,,
12822,,,


In [395]:
jobs_df['time'][0]['start']

1672509600

In [389]:
import datetime




datetime.datetime(2022, 12, 31, 18, 0, tzinfo=datetime.timezone.utc)

In [396]:
_max = 0
_min = 2e20

for t in jobs_df['time']:
    _max = max(_max, t['start']) 
    _min = min(_min, t['start']) 
print(_max)
print(_min)
print(datetime.datetime.fromtimestamp(_min, datetime.timezone.utc))
print(datetime.datetime.fromtimestamp(_max, datetime.timezone.utc))

1674832232
1671961140
2022-12-25 09:39:00+00:00
2023-01-27 15:10:32+00:00
