In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import glob

In [None]:
def analyzeDataset(path, timeColumn = 'SubmitTime', delimiter = ','):

    datasetname = path.split('/')[-1].split('.')[0]

    #Lambda function to convert seconds from 1970 to timestamp
    convert_to_timestamp = lambda x: pd.to_datetime(x, unit='s')

    # Read the dataset into a dataframe and set the timestamp as the index.
    dataset = pd.read_csv(path, parse_dates=[timeColumn], date_parser=convert_to_timestamp, delimiter=delimiter)

    # Print the number of rows and columns in the dataframe.
    print("Dataset Shape:")
    print(dataset.shape)

    # Print the number of missing values in dataframe.
    print("Number of missing values:")
    print(dataset.isnull().sum().sum())

    #Print the names of the columns in the dataframe.
    print("Column Names:")
    print(dataset.columns)

    # tex row
    jobsInMillions = round(dataset.shape[0] / 1000000, 2)
    submitTimesInYears = round((dataset[timeColumn].max() - dataset[timeColumn].min()).days / 365, 2)
    print("Latex Row:")
    print(f" \hline \n {datasetname} & {jobsInMillions} M & {submitTimesInYears} yrs & \\\\")

    

    

    

In [None]:
analyzeDataset('./datasets/das2.csv')


  dataset = pd.read_csv(path, index_col=0, parse_dates=[timeColumn], date_parser=convert_to_timestamp)


Dataset Shape:
(1124772, 28)
Number of missing values:
0
Column Names:
Index(['SubmitTime', 'WaitTime', 'RunTime', 'NProc', 'UsedCPUTime',
       'UsedMemory', 'ReqNProcs', 'ReqTime', 'ReqMemory', 'Status', 'UserID',
       'GroupID', 'ExecutableID', 'QueueID', 'PartitionID', 'OrigSiteID',
       'LastRunSiteID', 'JobStructure', 'JobStructureParams', 'UsedNetwork',
       'UsedLocalDiskSpace', 'UsedResources', 'ReqPlatform', 'ReqNetwork',
       'ReqLocalDiskSpace', 'ReqResources', 'VOID', 'ProjectID'],
      dtype='object')
Latex Row:
 \hline 
 das2 & 1.12 M & 1.8 yrs & \\


In [None]:
analyzeDataset('./datasets/grid5000.csv')

Dataset Shape:
(1020195, 28)
Number of missing values:
0
Column Names:
Index(['SubmitTime', 'WaitTime', 'RunTime', 'NProc', 'UsedCPUTime',
       'UsedMemory', 'ReqNProcs', 'ReqTime', 'ReqMemory', 'Status', 'UserID',
       'GroupID', 'ExecutableID', 'QueueID', 'PartitionID', 'OrigSiteID',
       'LastRunSiteID', 'JobStructure', 'JobStructureParams', 'UsedNetwork',
       'UsedLocalDiskSpace', 'UsedResources', 'ReqPlatform', 'ReqNetwork',
       'ReqLocalDiskSpace', 'ReqResources', 'VOID', 'ProjectID'],
      dtype='object')
Timeframe:
920 days 05:16:15
Latex Row:
 \hline 
 grid5000 & 1.02 M & 2.52 yrs & \\


In [None]:
analyzeDataset('./datasets/nordugrid.csv')

Dataset Shape:
(781370, 28)
Number of missing values:
0
Column Names:
Index(['SubmitTime', 'WaitTime', 'RunTime', 'NProc', 'UsedCPUTime',
       'UsedMemory', 'ReqNProcs', 'ReqTime', 'ReqMemory', 'Status', 'UserID',
       'GroupID', 'ExecutableID', 'QueueID', 'PartitionID', 'OrigSiteID',
       'LastRunSiteID', 'JobStructure', 'JobStructureParams', 'UsedNetwork',
       'UsedLocalDiskSpace', 'UsedResources', 'ReqPlatform', 'ReqNetwork',
       'ReqLocalDiskSpace', 'ReqResources', 'VOID', 'ProjectID'],
      dtype='object')
Timeframe:
1151 days 06:47:06
Latex Row:
 \hline 
 nordugrid & 0.78 M & 3.15 yrs & \\


In [None]:
analyzeDataset('./datasets/auvergrid.csv')

Dataset Shape:
(404176, 28)
Number of missing values:
0
Column Names:
Index(['SubmitTime', 'WaitTime', 'RunTime', 'NProc', 'UsedCPUTime',
       'UsedMemory', 'ReqNProcs', 'ReqTime', 'ReqMemory', 'Status', 'UserID',
       'GroupID', 'ExecutableID', 'QueueID', 'PartitionID', 'OrigSiteID',
       'LastRunSiteID', 'JobStructure', 'JobStructureParams', 'UsedNetwork',
       'UsedLocalDiskSpace', 'UsedResources', 'ReqPlatform', 'ReqNetwork',
       'ReqLocalDiskSpace', 'ReqResources', 'VOID', 'ProjectID'],
      dtype='object')
Timeframe:
364 days 23:56:06
Latex Row:
 \hline 
 auvergrid & 0.4 M & 1.0 yrs & \\


In [None]:
analyzeDataset('./datasets/sharcnet.csv')

Dataset Shape:
(1195242, 28)
Number of missing values:
0
Column Names:
Index(['SubmitTime', 'WaitTime', 'RunTime', 'NProc', 'UsedCPUTime',
       'UsedMemory', 'ReqNProcs', 'ReqTime', 'ReqMemory', 'Status', 'UserID',
       'GroupID', 'ExecutableID', 'QueueID', 'PartitionID', 'OrigSiteID',
       'LastRunSiteID', 'JobStructure', 'JobStructureParams', 'UsedNetwork',
       'UsedLocalDiskSpace', 'UsedResources', 'ReqPlatform', 'ReqNetwork',
       'ReqLocalDiskSpace', 'ReqResources', 'VOID', 'ProjectID'],
      dtype='object')
Timeframe:
390 days 22:45:48
Latex Row:
 \hline 
 sharcnet & 1.2 M & 1.07 yrs & \\


In [None]:
analyzeDataset('./datasets/lcg.csv')

Dataset Shape:
(188041, 28)
Number of missing values:
0
Column Names:
Index(['SubmitTime', 'WaitTime', 'RunTime', 'NProc', 'UsedCPUTime',
       'UsedMemory', 'ReqNProcs', 'ReqTime', 'ReqMemory', 'Status', 'UserID',
       'GroupID', 'ExecutableID', 'QueueID', 'PartitionID', 'OrigSiteID',
       'LastRunSiteID', 'JobStructure', 'JobStructureParams', 'UsedNetwork',
       'UsedLocalDiskSpace', 'UsedResources', 'ReqPlatform', 'ReqNetwork',
       'ReqLocalDiskSpace', 'ReqResources', 'VOID', 'ProjectID'],
      dtype='object')
Timeframe:
10 days 23:59:52
Latex Row:
 \hline 
 lcg & 0.19 M & 0.03 yrs & \\


In [None]:
# merge 500 csv's into one
def mergeCSVs(path):
    csvList = []
    for csv in glob.glob(f"./datasets/{path}/*/*.csv"):
        pd.read_csv(csv, delimiter=';\t')
        csvList.append(csv)
    df = pd.concat(csvList)
    df.rename(columns={'Timestamp [ms]': 'SubmitTime'}, inplace=True)
    df.to_csv( "./datasets/{path}.csv", index=False)

In [98]:
mergeCSVs('fastStorage')

Index(['Timestamp [ms];\tCPU cores;\tCPU capacity provisioned [MHZ];\tCPU usage [MHZ];\tCPU usage [%];\tMemory capacity provisioned [KB];\tMemory usage [KB];\tDisk read throughput [KB/s];\tDisk write throughput [KB/s];\tNetwork received throughput [KB/s];\tNetwork transmitted throughput [KB/s]'], dtype='object')

In [100]:
analyzeDataset('./datasets/fastStorage.csv', 'Timestamp [ms]', ';\t')

  dataset = pd.read_csv(path, parse_dates=[timeColumn], date_parser=convert_to_timestamp, delimiter=delimiter)


KeyboardInterrupt: 

In [79]:
len(glob.glob("./datasets/fastStorage/*/*.csv"))

1250