In [1]:
import pandas as pd
import numpy as np
import random

In [2]:
# Generate hourly timestamps starting at 2018-01-01 00:00.
# NOTE: `end_date` is a date-only string, so it is interpreted as midnight; the
# range therefore stops at 2021-12-31 00:00 and the remaining hours of that
# final day are not part of the index.
start_date = '2018-01-01'
end_date = '2021-12-31'
# Lowercase 'h' is the hourly alias; the uppercase 'H' spelling is deprecated
# in recent pandas releases and emits a FutureWarning.
timestamps = pd.date_range(start=start_date, end=end_date, freq='h')
# Seed NumPy's global RNG so that every np.random.* draw below is reproducible
# when the notebook is run top to bottom.
np.random.seed(42)

In [9]:
# Generate CPUTime: 1,000,000 baseline plus seasonal dips, a 24-sample cycle, Gaussian noise and random spikes
def generate_cputime(timestamps):
    """Build a synthetic CPU-time series, one value per entry of ``timestamps``.

    The series is the sum of a constant baseline (1,000,000), a seasonal
    offset, capped log-normal spikes, a sinusoidal cycle and Gaussian noise,
    floored at zero.

    Parameters
    ----------
    timestamps : object exposing ``.month`` and ``len()``
        For example a pandas ``DatetimeIndex``.

    Returns
    -------
    numpy.ndarray
        Non-negative values, same length as ``timestamps``.

    Notes
    -----
    Values are drawn from NumPy's global RNG: first the log-normal spikes,
    then the normal noise.
    """
    n_points = len(timestamps)

    # Months with reduced load (per the original intent: Christmas holidays,
    # exams and summer) are shifted down by 90,000; all other months by 0.
    low_load_months = [6, 7, 8, 9, 12]
    seasonal_offset = np.isin(timestamps.month, low_load_months) * -90000

    # Heavy-tailed spikes, capped from above at 5e6.
    raw_spikes = np.random.lognormal(mean=5, sigma=3, size=n_points)
    capped_spikes = np.clip(raw_spikes, None, 5e6)

    # Zero-mean Gaussian jitter.
    jitter = np.random.normal(loc=0, scale=50000, size=n_points)

    # Sine wave with a period of 24 samples, phased from the first sample
    # (index 0) rather than from the clock hour of each timestamp.
    phase = np.arange(0, n_points) * 2 * np.pi / 24
    daily_wave = np.sin(phase) * 50000

    total = 1000000 + seasonal_offset + capped_spikes + daily_wave + jitter

    # Negative totals are replaced with 0.
    return np.clip(total, 0, None)

In [10]:
# Generate MemoryUsed: derived from CPUTime through a non-linear multiplier plus noise
def generate_memoryused(cputime):
    """Derive a synthetic memory-usage series from a CPU-time series.

    Each CPU-time value is multiplied by a ratio that oscillates between 50
    and 150 (``100 + 50 * sin(cputime / 800000)``); Gaussian noise is added and
    the result is floored at zero.

    Parameters
    ----------
    cputime : numpy.ndarray
        CPU-time values.

    Returns
    -------
    numpy.ndarray
        Non-negative values, same length as ``cputime``.
    """
    # The noise is drawn from NumPy's global RNG before anything else is computed.
    jitter = np.random.normal(loc=0, scale=20000, size=len(cputime))

    # The multiplier depends on cputime itself, which makes the mapping non-linear.
    ratio = 100 + 50 * np.sin(cputime / 800000)

    memory = cputime * ratio + jitter
    return np.clip(memory, 0, None)

In [11]:
# Generate NJobs: non-linear function of CPUTime and MemoryUsed plus noise, bounded to [1, 500]
def generate_njobs(cputime, memoryused):
    """Derive a synthetic job-count series from CPU time and memory usage.

    The result is the sum of a logarithmic CPU term, a memory term
    (``sqrt(m) * tanh(m / 1e9) / 10``) and Gaussian noise centred on 15,
    clipped to the interval [1, 500]. Values are floats, not integers.

    Parameters
    ----------
    cputime : numpy.ndarray
        CPU-time values.
    memoryused : numpy.ndarray
        Memory-usage values, same length as ``cputime``.

    Returns
    -------
    numpy.ndarray
        Values in [1, 500], same length as ``cputime``.
    """
    # Floor the CPU time at a tiny positive value so the logarithm is defined,
    # then floor the logarithm itself at -10.
    positive_cpu = np.clip(cputime, 1e-8, None)
    cpu_term = np.clip(np.log(positive_cpu), -10, None)

    memory_term = np.sqrt(memoryused) * np.tanh(memoryused / 1e9) / 10

    # Gaussian fluctuation (mean 15, std 10) drawn from NumPy's global RNG.
    jitter = np.random.normal(loc=15, scale=10, size=len(positive_cpu))

    total = cpu_term + memory_term + jitter

    # Bound the result: at least 1 and at most 500.
    return np.clip(total, 1, 500)


In [12]:
def generate_nprocs(njobs):
    """Derive a synthetic process-count series from a job-count series.

    Each job count is multiplied by a random factor drawn from a normal
    distribution (mean 6, std 3) that is floored at 1. The result is not
    rounded, so it is generally non-integer.

    Parameters
    ----------
    njobs : numpy.ndarray
        Job counts.

    Returns
    -------
    numpy.ndarray
        ``njobs`` scaled element-wise by the floored random factor.
    """
    procs_per_job = np.random.normal(loc=6, scale=3, size=len(njobs))
    # Flooring the factor at 1 means every element is multiplied by at least 1.
    procs_per_job = np.clip(procs_per_job, 1, None)
    return njobs * procs_per_job

In [13]:
def generate_jobs(timestamps):
    """Draw synthetic per-timestamp counts of arrived and completed jobs.

    Both series are sampled independently from Poisson distributions
    (rate 10 for arrivals, rate 8 for completions); only the length of
    ``timestamps`` is used.

    Parameters
    ----------
    timestamps : sized sequence
        Determines how many samples are drawn.

    Returns
    -------
    tuple of numpy.ndarray
        ``(arrived_jobs, completed_jobs)``, each with ``len(timestamps)`` entries.
    """
    n_samples = len(timestamps)
    # Arrivals are drawn before completions from NumPy's global RNG.
    arrivals = np.random.poisson(lam=10, size=n_samples)
    completions = np.random.poisson(lam=8, size=n_samples)
    return arrivals, completions

In [14]:
# Generate the synthetic dataset.
# Each series is derived from the previous ones: CPUTime -> Memory -> NJobs -> NProcs;
# the arrived/completed job counts depend only on the number of timestamps.
# Every generator draws from NumPy's global RNG, so the order of these calls
# determines which random values each series receives; reordering them changes
# the output even with the same seed.
cputime = generate_cputime(timestamps)
memoryused = generate_memoryused(cputime)
njobs = generate_njobs(cputime, memoryused)
nprocs = generate_nprocs(njobs)
arrived_jobs, completed_jobs = generate_jobs(timestamps)

In [15]:
# Combine all features into a DataFrame
data = {
    'Timestamp': timestamps,
    'CPUTime': cputime,
    'Memory': memoryused,
    'NJobs': njobs,
    'NProcs': nprocs,
    'ArrivedJobs': arrived_jobs,
    'CompletedJobs': completed_jobs,
}

# Append cyclical (cos/sin) encodings of hour-of-day, day-of-year and month.
# The vectorised `.dt` accessors replace per-row Python lambdas; the resulting
# columns, their order and their values are the same.
# NOTE: day-of-year is divided by 365 for every year, so in a leap year the
# 366th day maps slightly past one full cycle.
dataset = pd.DataFrame(data).assign(
    cosHour=lambda df: np.cos(2 * np.pi * df["Timestamp"].dt.hour / 24),
    sinHour=lambda df: np.sin(2 * np.pi * df["Timestamp"].dt.hour / 24),
    cosDay=lambda df: np.cos(2 * np.pi * df["Timestamp"].dt.dayofyear / 365),
    sinDay=lambda df: np.sin(2 * np.pi * df["Timestamp"].dt.dayofyear / 365),
    cosMonth=lambda df: np.cos(2 * np.pi * df["Timestamp"].dt.month / 12),
    sinMonth=lambda df: np.sin(2 * np.pi * df["Timestamp"].dt.month / 12),
)


# dataset.to_csv("./datasets/final/final-formed-synthetic.csv", index=False)

In [16]:
# Preview the first five rows of the assembled dataset.
dataset.head(5)

Unnamed: 0,Timestamp,CPUTime,Memory,NJobs,NProcs,ArrivedJobs,CompletedJobs,cosHour,sinHour,cosDay,sinDay,cosMonth,sinMonth
0,2018-01-01 00:00:00,1039633.0,154086100.0,228.026135,1682.249177,5,6,1.0,0.0,0.999852,0.017213,0.866025,0.5
1,2018-01-01 01:00:00,1047930.0,155411500.0,232.836734,875.465388,4,10,0.965926,0.258819,0.999852,0.017213,0.866025,0.5
2,2018-01-01 02:00:00,1034680.0,153227300.0,223.195183,1280.366604,6,8,0.866025,0.5,0.999852,0.017213,0.866025,0.5
3,2018-01-01 03:00:00,1030387.0,152470200.0,213.0183,2675.856184,18,6,0.707107,0.707107,0.999852,0.017213,0.866025,0.5
4,2018-01-01 04:00:00,1029641.0,152401200.0,217.920878,217.920878,15,11,0.5,0.866025,0.999852,0.017213,0.866025,0.5


In [17]:
# Summary statistics, transposed so that each feature is a row.
dataset.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
CPUTime,35041.0,973436.7,150031.1,691527.7,913877.3,968628.8,1022015.0,6115376.0
Memory,35041.0,142551100.0,20473730.0,95475450.0,132946400.0,142187700.0,151111500.0,910604000.0
NJobs,35041.0,197.9696,28.55392,100.7669,179.8426,196.9787,214.4729,500.0
NProcs,35041.0,1200.469,596.8052,118.6046,769.7162,1166.458,1583.651,6073.83
ArrivedJobs,35041.0,9.97814,3.150375,0.0,8.0,10.0,12.0,25.0
CompletedJobs,35041.0,8.018378,2.83855,0.0,6.0,8.0,10.0,22.0
cosHour,35041.0,2.8538e-05,0.707127,-1.0,-0.7071068,6.123234000000001e-17,0.7071068,1.0
sinHour,35041.0,-1.1634200000000001e-17,0.7071068,-1.0,-0.7071068,0.0,0.7071068,1.0
cosDay,35041.0,2.843652e-05,0.7071268,-0.999963,-0.7086267,0.004303538,0.7025275,1.0
sinDay,35041.0,1.178963e-05,0.7071069,-0.9999907,-0.7055836,6.432491e-16,0.7055836,0.9999907
