In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import glob
from datetime import datetime, timedelta

In [58]:
# function to add the seconds of the row to the timestamp of the row 

def addSecondsToTimestamp(timestamp, secondsToAdd):
    return timestamp + timedelta(seconds=secondsToAdd)


def formDataset(path):
    convert_to_timestamp = lambda x: datetime.fromtimestamp(int(x))

    dataset = pd.read_csv(path, delimiter=',', parse_dates=['SubmitTime'], date_parser=convert_to_timestamp)

    dataset['StopTime'] = dataset[['SubmitTime', 'RunTime']].apply(lambda x: addSecondsToTimestamp(*x), axis=1)

    #drop columns with 0 variance
    dataset.drop(["Status", "PartitionID"], inplace=True, axis=1)
    dataset = dataset.loc[:, dataset.apply(pd.Series.nunique) != 1]
    #drop columns with type object
    dataset.drop(dataset.select_dtypes(['object']), inplace=True, axis=1)

    try:
        cancelled_jobs = dataset[dataset["ReqTime"] < dataset["RunTime"]]
        dataset = dataset[ ~ (dataset["ReqTime"] < dataset["RunTime"])]

        #drop columns with 'ReqTime' == -1
        cancelled_jobs.drop(cancelled_jobs["ReqTime"] == -1, inplace=True)
        cancelled_jobs.to_csv(f"./datasets/dropped/dropped-{path.split('/')[-1]}", index=False)
    except:
        pass



    dataset.to_csv( f"./datasets/formed/formed-{path.split('/')[-1]}", index=False)

In [59]:
for csv in glob.glob(pathname="./datasets/raw/*.csv"):
    print(csv)
    if csv in ['./datasets/raw/fastStorage.csv', './datasets/raw/rnd.csv']:
        continue
    formDataset(csv)

./datasets/raw/grid5000.csv
./datasets/raw/das2.csv


  dataset = pd.read_csv(path, delimiter=',', parse_dates=['SubmitTime'], date_parser=convert_to_timestamp)


./datasets/raw/rnd.csv
./datasets/raw/nordugrid.csv
./datasets/raw/auvergrid.csv
./datasets/raw/fastStorage.csv
./datasets/raw/sharcnet.csv


In [64]:
df = pd.read_csv("./datasets/raw/sharcnet.csv")
df.head(10)

Unnamed: 0,JobID,SubmitTime,WaitTime,RunTime,NProc,UsedCPUTime,UsedMemory,ReqNProcs,ReqTime,ReqMemory,...,JobStructureParams,UsedNetwork,UsedLocalDiskSpace,UsedResources,ReqPlatform,ReqNetwork,ReqLocalDiskSpace,ReqResources,VOID,ProjectID
0,1,1135130133,0,6,1,0.1,-1.0,-1,-1.0,-1.0,...,-1,-1.0,-1.0,-1,-1,-1.0,-1.0,-1,-1,-1
1,2,1135130401,1,13,1,0.13,-1.0,-1,-1.0,-1.0,...,-1,-1.0,-1.0,-1,-1,-1.0,-1.0,-1,-1,-1
2,3,1135130415,0,17,20,0.01,-1.0,-1,-1.0,-1.0,...,-1,-1.0,-1.0,-1,-1,-1.0,-1.0,-1,-1,-1
3,4,1135130438,0,17,1,0.16,-1.0,-1,-1.0,-1.0,...,-1,-1.0,-1.0,-1,-1,-1.0,-1.0,-1,-1,-1
4,5,1135130471,0,5,4,0.03,256.0,-1,-1.0,-1.0,...,-1,-1.0,-1.0,-1,-1,-1.0,-1.0,-1,-1,-1
5,6,1135130835,0,16,1,0.13,-1.0,-1,-1.0,-1.0,...,-1,-1.0,-1.0,-1,-1,-1.0,-1.0,-1,-1,-1
6,7,1138120593,0,6,1,0.11,-1.0,-1,-1.0,-1.0,...,-1,-1.0,-1.0,-1,-1,-1.0,-1.0,-1,-1,-1
7,8,1138120603,0,5,1,0.1,1024.0,-1,-1.0,-1.0,...,-1,-1.0,-1.0,-1,-1,-1.0,-1.0,-1,-1,-1
8,9,1138120664,0,5,1,0.13,1024.0,-1,-1.0,-1.0,...,-1,-1.0,-1.0,-1,-1,-1.0,-1.0,-1,-1,-1
9,10,1138120938,0,6,80,0.0,-1.0,-1,-1.0,-1.0,...,-1,-1.0,-1.0,-1,-1,-1.0,-1.0,-1,-1,-1


In [65]:
df.dtypes

JobID                   int64
SubmitTime              int64
WaitTime                int64
RunTime                 int64
NProc                   int64
UsedCPUTime           float64
UsedMemory            float64
ReqNProcs               int64
ReqTime               float64
ReqMemory             float64
Status                  int64
UserID                 object
GroupID                 int64
ExecutableID           object
QueueID                 int64
PartitionID             int64
OrigSiteID             object
LastRunSiteID          object
JobStructure            int64
JobStructureParams      int64
UsedNetwork           float64
UsedLocalDiskSpace    float64
UsedResources           int64
ReqPlatform             int64
ReqNetwork            float64
ReqLocalDiskSpace     float64
ReqResources            int64
VOID                    int64
ProjectID               int64
dtype: object

In [66]:
df.describe()

Unnamed: 0,JobID,SubmitTime,WaitTime,RunTime,NProc,UsedCPUTime,UsedMemory,ReqNProcs,ReqTime,ReqMemory,...,JobStructureParams,UsedNetwork,UsedLocalDiskSpace,UsedResources,ReqPlatform,ReqNetwork,ReqLocalDiskSpace,ReqResources,VOID,ProjectID
count,1195242.0,1195242.0,1195242.0,1195242.0,1195242.0,1195242.0,1195242.0,1195242.0,1195242.0,1195242.0,...,1195242.0,1195242.0,1195242.0,1195242.0,1195242.0,1195242.0,1195242.0,1195242.0,1195242.0,1195242.0
mean,597621.5,1157647000.0,28666.88,31654.3,2.993179,20757.24,80496.21,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
std,345036.8,7924207.0,82916.87,116561.4,24.55266,5154719.0,463979.1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,1.0,1135130000.0,-1.0,-1.0,-1.0,-2124628000.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
25%,298811.2,1151855000.0,8.0,99.0,1.0,0.19,4096.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
50%,597621.5,1159679000.0,1119.0,2789.0,1.0,211.45,7168.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
75%,896431.8,1164127000.0,19135.0,21910.0,1.0,9718.0,23552.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
max,1195242.0,1168908000.0,1916568.0,13908400.0,3000.0,2087029000.0,32021500.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


In [61]:
df1 = df[df["ReqTime"]< df["RunTime"]]

In [62]:
df1["Diff"] = - df["ReqTime"] + df["RunTime"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1["Diff"] = - df["ReqTime"] + df["RunTime"]


In [63]:
df1.describe()

Unnamed: 0,JobID,SubmitTime,WaitTime,RunTime,NProc,UsedCPUTime,UsedMemory,ReqNProcs,ReqTime,ReqMemory,...,UsedNetwork,UsedLocalDiskSpace,UsedResources,ReqPlatform,ReqNetwork,ReqLocalDiskSpace,ReqResources,VOID,ProjectID,Diff
count,1878.0,1878.0,1878.0,1878.0,1878.0,1878.0,1878.0,1878.0,1878.0,1878.0,...,1878.0,1878.0,1878.0,1878.0,1878.0,1878.0,1878.0,1878.0,1878.0,1878.0
mean,198466.585197,1153133000.0,11011.666667,205719.5,1.0,26474.198616,365245.8,1.0,192311.789137,24154.09,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,13407.75
std,125734.421505,9161548.0,29556.968513,159496.1,0.0,37192.910675,312288.5,0.0,107357.919293,136852.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,114426.3
min,35.0,1136284000.0,0.0,604.0,1.0,0.0,0.0,1.0,600.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0
25%,84733.5,1144955000.0,1.0,80086.0,1.0,24.0,79522.0,1.0,79980.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,34.0
50%,166247.0,1155411000.0,246.5,259244.0,1.0,3087.0,352040.0,1.0,259200.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,67.0
75%,328341.25,1160560000.0,7186.75,259290.8,1.0,54109.25,590076.0,1.0,259200.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,101.0
max,402277.0,1167325000.0,442804.0,1575814.0,1.0,238553.0,2523348.0,1.0,345600.0,1126400.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1425234.0
