In [142]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import glob
from datetime import datetime, timedelta

In [121]:
def analyzeDataset(path, timeColumn = 'SubmitTime', delimiter = ',', timeUnit = 's'):
    """Print summary statistics and a LaTeX table row for one CSV dataset.

    Prints, in order: the dataframe shape, the total number of missing
    values, the column names, and a LaTeX table row containing the dataset
    name, the row count in millions and the time span covered by
    ``timeColumn`` in years.

    Parameters
    ----------
    path : str
        Path to the CSV file. The dataset name is the file name up to its
        first dot.
    timeColumn : str
        Name of the column holding numeric Unix epoch timestamps.
    delimiter : str
        Field delimiter passed to ``pandas.read_csv``.
    timeUnit : str
        Unit of the epoch values in ``timeColumn`` (for example 's' or 'ms'),
        passed to ``pandas.to_datetime``. Defaults to seconds.

    Raises
    ------
    ValueError
        If ``timeColumn`` is not a column of the file.
    """
    # os.path.basename also copes with the platform's own path separator.
    datasetname = os.path.basename(path).split('.')[0]

    # Read the dataset into a dataframe.
    dataset = pd.read_csv(path, delimiter=delimiter)
    if timeColumn not in dataset.columns:
        raise ValueError(f"Time column '{timeColumn}' not found in {path}")

    # Convert the epoch values in one vectorised call. This replaces the
    # deprecated `date_parser` hook (which parsed row by row in Python) and
    # does not depend on the timezone of the machine running the notebook.
    dataset[timeColumn] = pd.to_datetime(dataset[timeColumn], unit=timeUnit)

    # Print the number of rows and columns in the dataframe.
    print("Dataset Shape:")
    print(dataset.shape)

    # Print the number of missing values in dataframe.
    print("Number of missing values:")
    print(dataset.isnull().sum().sum())

    # Print the names of the columns in the dataframe.
    print("Column Names:")
    print(dataset.columns)

    # LaTeX table row: name, job count in millions, covered time span in years.
    jobsInMillions = round(dataset.shape[0] / 1000000, 2)
    timeSpan = dataset[timeColumn].max() - dataset[timeColumn].min()
    submitTimesInYears = round(timeSpan.days / 365, 2)
    print("Latex Row:")
    # The backslash before "hline" is escaped explicitly; a bare "\h" is an
    # invalid escape sequence and triggers a warning on current Python.
    print(f" \\hline \n {datasetname} & {jobsInMillions} M & {submitTimesInYears} yrs & \\\\")

    

In [122]:
# Print shape, missing-value count, column names and the LaTeX row for das2.
analyzeDataset('./datasets/raw/das2.csv')


Dataset Shape:
(1124772, 29)
Number of missing values:
0
Column Names:
Index(['JobID', 'SubmitTime', 'WaitTime', 'RunTime', 'NProc', 'UsedCPUTime',
       'UsedMemory', 'ReqNProcs', 'ReqTime', 'ReqMemory', 'Status', 'UserID',
       'GroupID', 'ExecutableID', 'QueueID', 'PartitionID', 'OrigSiteID',
       'LastRunSiteID', 'JobStructure', 'JobStructureParams', 'UsedNetwork',
       'UsedLocalDiskSpace', 'UsedResources', 'ReqPlatform', 'ReqNetwork',
       'ReqLocalDiskSpace', 'ReqResources', 'VOID', 'ProjectID'],
      dtype='object')
Latex Row:
 \hline 
 das2 & 1.12 M & 1.8 yrs & \\


In [123]:
# Print shape, missing-value count, column names and the LaTeX row for grid5000.
analyzeDataset('./datasets/raw/grid5000.csv')

Dataset Shape:
(1020195, 29)
Number of missing values:
0
Column Names:
Index(['JobID', 'SubmitTime', 'WaitTime', 'RunTime', 'NProc', 'UsedCPUTime',
       'UsedMemory', 'ReqNProcs', 'ReqTime', 'ReqMemory', 'Status', 'UserID',
       'GroupID', 'ExecutableID', 'QueueID', 'PartitionID', 'OrigSiteID',
       'LastRunSiteID', 'JobStructure', 'JobStructureParams', 'UsedNetwork',
       'UsedLocalDiskSpace', 'UsedResources', 'ReqPlatform', 'ReqNetwork',
       'ReqLocalDiskSpace', 'ReqResources', 'VOID', 'ProjectID'],
      dtype='object')
Latex Row:
 \hline 
 grid5000 & 1.02 M & 2.52 yrs & \\


In [22]:
# Print shape, missing-value count, column names and the LaTeX row for nordugrid.
analyzeDataset('./datasets/raw/nordugrid.csv')

Dataset Shape:
(781370, 29)
Number of missing values:
0
Column Names:
Index(['JobID', 'SubmitTime', 'WaitTime', 'RunTime', 'NProc', 'UsedCPUTime',
       'UsedMemory', 'ReqNProcs', 'ReqTime', 'ReqMemory', 'Status', 'UserID',
       'GroupID', 'ExecutableID', 'QueueID', 'PartitionID', 'OrigSiteID',
       'LastRunSiteID', 'JobStructure', 'JobStructureParams', 'UsedNetwork',
       'UsedLocalDiskSpace', 'UsedResources', 'ReqPlatform', 'ReqNetwork',
       'ReqLocalDiskSpace', 'ReqResources', 'VOID', 'ProjectID'],
      dtype='object')
Latex Row:
 \hline 
 nordugrid & 0.78 M & 3.15 yrs & \\


In [23]:
# Print shape, missing-value count, column names and the LaTeX row for auvergrid.
analyzeDataset('./datasets/raw/auvergrid.csv')

Dataset Shape:
(404176, 29)
Number of missing values:
0
Column Names:
Index(['JobID', 'SubmitTime', 'WaitTime', 'RunTime', 'NProc', 'UsedCPUTime',
       'UsedMemory', 'ReqNProcs', 'ReqTime', 'ReqMemory', 'Status', 'UserID',
       'GroupID', 'ExecutableID', 'QueueID', 'PartitionID', 'OrigSiteID',
       'LastRunSiteID', 'JobStructure', 'JobStructureParams', 'UsedNetwork',
       'UsedLocalDiskSpace', 'UsedResources', 'ReqPlatform', 'ReqNetwork',
       'ReqLocalDiskSpace', 'ReqResources', 'VOID', 'ProjectID'],
      dtype='object')
Latex Row:
 \hline 
 auvergrid & 0.4 M & 1.0 yrs & \\


In [24]:
# Print shape, missing-value count, column names and the LaTeX row for sharcnet.
analyzeDataset('./datasets/raw/sharcnet.csv')

Dataset Shape:
(1195242, 29)
Number of missing values:
0
Column Names:
Index(['JobID', 'SubmitTime', 'WaitTime', 'RunTime', 'NProc', 'UsedCPUTime',
       'UsedMemory', 'ReqNProcs', 'ReqTime', 'ReqMemory', 'Status', 'UserID',
       'GroupID', 'ExecutableID', 'QueueID', 'PartitionID', 'OrigSiteID',
       'LastRunSiteID', 'JobStructure', 'JobStructureParams', 'UsedNetwork',
       'UsedLocalDiskSpace', 'UsedResources', 'ReqPlatform', 'ReqNetwork',
       'ReqLocalDiskSpace', 'ReqResources', 'VOID', 'ProjectID'],
      dtype='object')
Latex Row:
 \hline 
 sharcnet & 1.2 M & 1.07 yrs & \\


In [25]:
# Print shape, missing-value count, column names and the LaTeX row for lcg.
analyzeDataset('./datasets/raw/lcg.csv')

Dataset Shape:
(188041, 29)
Number of missing values:
0
Column Names:
Index(['JobID', 'SubmitTime', 'WaitTime', 'RunTime', 'NProc', 'UsedCPUTime',
       'UsedMemory', 'ReqNProcs', 'ReqTime', 'ReqMemory', 'Status', 'UserID',
       'GroupID', 'ExecutableID', 'QueueID', 'PartitionID', 'OrigSiteID',
       'LastRunSiteID', 'JobStructure', 'JobStructureParams', 'UsedNetwork',
       'UsedLocalDiskSpace', 'UsedResources', 'ReqPlatform', 'ReqNetwork',
       'ReqLocalDiskSpace', 'ReqResources', 'VOID', 'ProjectID'],
      dtype='object')
Latex Row:
 \hline 
 lcg & 0.19 M & 0.03 yrs & \\


In [26]:
# Merge 500 CSVs into one.
def mergeCSVs(path):
    """Merge every CSV below ./datasets/raw/<path>/*/ into one CSV file.

    The input files are read with a semicolon-plus-tab field separator,
    concatenated in sorted path order, and the 'Timestamp [ms]' column is
    renamed to 'SubmitTime'. The result is written, without the index, to
    ./datasets/<path>.csv.

    Parameters
    ----------
    path : str
        Name of the dataset directory under ./datasets/raw/; also used as
        the base name of the merged output file.

    Raises
    ------
    ValueError
        If no CSV file matches the input pattern.
    """
    # Sorted so the row order of the merged file does not depend on the
    # arbitrary order in which glob returns its matches.
    csvPaths = sorted(glob.glob(f"./datasets/raw/{path}/*/*.csv"))
    if not csvPaths:
        raise ValueError(f"No CSV files found matching ./datasets/raw/{path}/*/*.csv")

    # A separator longer than one character is treated as a regular
    # expression and needs the Python engine; requesting it explicitly
    # avoids the ParserWarning that the C-engine fallback emits for each file.
    frames = [
        pd.read_csv(csvPath, delimiter=';\t', engine='python')
        for csvPath in csvPaths
    ]
    merged = pd.concat(frames).rename(columns={'Timestamp [ms]': 'SubmitTime'})

    # NOTE(review): the output goes to ./datasets/, not ./datasets/raw/ --
    # confirm that downstream cells read the merged file from this location.
    merged.to_csv(f"./datasets/{path}.csv", index=False)

In [27]:
# import warnings
# warnings.filterwarnings('ignore')
# mergeCSVs('fastStorage')

In [28]:
# Print shape, missing-value count, column names and the LaTeX row for fastStorage.
# NOTE(review): mergeCSVs writes its result to ./datasets/<name>.csv, while this
# cell reads from ./datasets/raw/ -- confirm the merged file was placed there.
analyzeDataset('./datasets/raw/fastStorage.csv')

Dataset Shape:
(11221800, 11)
Number of missing values:
0
Column Names:
Index(['SubmitTime', 'CPU cores', 'CPU capacity provisioned [MHZ]',
       'CPU usage [MHZ]', 'CPU usage [%]', 'Memory capacity provisioned [KB]',
       'Memory usage [KB]', 'Disk read throughput [KB/s]',
       'Disk write throughput [KB/s]', 'Network received throughput [KB/s]',
       'Network transmitted throughput [KB/s]'],
      dtype='object')
Latex Row:
 \hline 
 fastStorage & 11.22 M & 0.08 yrs & \\


In [29]:
# import warnings
# warnings.filterwarnings('ignore')
# mergeCSVs('rnd')

In [30]:
# Print shape, missing-value count, column names and the LaTeX row for rnd.
# NOTE(review): mergeCSVs writes its result to ./datasets/<name>.csv, while this
# cell reads from ./datasets/raw/ -- confirm the merged file was placed there.
analyzeDataset('./datasets/raw/rnd.csv')

Dataset Shape:
(12496728, 11)
Number of missing values:
0
Column Names:
Index(['SubmitTime', 'CPU cores', 'CPU capacity provisioned [MHZ]',
       'CPU usage [MHZ]', 'CPU usage [%]', 'Memory capacity provisioned [KB]',
       'Memory usage [KB]', 'Disk read throughput [KB/s]',
       'Disk write throughput [KB/s]', 'Network received throughput [KB/s]',
       'Network transmitted throughput [KB/s]'],
      dtype='object')
Latex Row:
 \hline 
 rnd & 12.5 M & 0.25 yrs & \\
