In [97]:
import pandas as pd
import numpy as np
import random

In [98]:
# Generate hourly timestamps from 2018-01-01 00:00 through 2021-12-31 00:00.
# Note: `end` is midnight of the last day, so the remaining 23 hours of
# 2021-12-31 are not part of the range.
start_date = '2018-01-01'
end_date = '2021-12-31'
# Lowercase 'h' is the current hourly alias; the uppercase 'H' spelling is
# deprecated in pandas >= 2.2.
timestamps = pd.date_range(start=start_date, end=end_date, freq='h')
# Seed NumPy's global random state so subsequent np.random draws are reproducible.
np.random.seed(42)

In [407]:
# Generate CPUTime: baseline of 1,000,000 with fluctuations based on time of day, holiday/exam/summer months, a 24-hour cycle, random spikes, and noise
def generate_cputime(timestamps):
    """Build a synthetic, non-negative CPU-time series for ``timestamps``.

    The series is the sum of a constant baseline of 1,000,000 and five terms:

    * a time-of-day term ``50000 * sin(pi * hour / 24)`` (zero at hour 0,
      largest at hour 12),
    * a seasonal offset of -90000 for months 6, 7, 8, 9 and 12,
    * log-normal spikes (mean=5, sigma=3) capped at 5e6,
    * a sinusoid over the sample position with a period of 24 samples and
      amplitude 50000,
    * Gaussian noise with standard deviation 50000.

    Parameters
    ----------
    timestamps : sequence exposing ``.hour`` and ``.month`` and supporting
        ``len()`` (the notebook passes a ``pd.DatetimeIndex``).

    Returns
    -------
    Array-like of ``len(timestamps)`` values, clipped below at 0.

    Notes
    -----
    Draws from NumPy's global random state: the log-normal spikes are sampled
    first, then the Gaussian noise.
    """
    n_samples = len(timestamps)

    # Random spikes: heavy-tailed log-normal draws, capped at 5e6.
    # (Sampled before the noise so the global RNG stream is consumed in the
    # same order on every run.)
    spike_term = np.clip(
        np.random.lognormal(mean=5, sigma=3, size=n_samples), None, 5e6
    )

    # Gaussian noise centred on zero.
    noise_term = np.random.normal(loc=0, scale=50000, size=n_samples)

    # Time-of-day term: half a sine wave across the 24 hours of a day.
    diurnal_term = 50000 * np.sin(np.pi * timestamps.hour / 24)

    # Seasonal drop for December, June, September, July and August.
    low_season = np.isin(timestamps.month, [6, 7, 8, 9, 12])
    seasonal_term = np.where(low_season, -90000, 0)

    # Sinusoid over the sample position (period of 24 samples).
    cycle_term = 50000 * np.sin(np.arange(n_samples) * 2 * np.pi / 24)

    # The terms are added in a fixed order so floating-point results are stable.
    total = 1000000 + diurnal_term + seasonal_term + spike_term + cycle_term + noise_term

    # CPU time cannot be negative.
    return np.clip(total, 0, None)

In [408]:
# Generate MemoryUsed: High correlation with CPUTime but non-linear relationship
def generate_memoryused(cputime):
    """Derive a synthetic memory-usage series from a CPU-time series.

    Each CPU-time value is scaled by a ratio that oscillates between 50 and
    150 as a sine function of the CPU time itself (``100 + 50 * sin(cputime /
    800000)``), which makes the relationship non-linear. Gaussian noise with
    standard deviation 20000 is then added.

    Parameters
    ----------
    cputime : array-like of numbers supporting ``len()`` and elementwise
        arithmetic.

    Returns
    -------
    Array-like of the same length, clipped below at 0.

    Notes
    -----
    Consumes one ``np.random.normal`` draw of ``len(cputime)`` values from
    NumPy's global random state.
    """
    jitter = np.random.normal(loc=0, scale=20000, size=len(cputime))

    # Ratio swings between 50 and 150 depending on the CPU-time level.
    ratio = 100 + 50 * np.sin(cputime / 800000)

    scaled = cputime * ratio + jitter
    return np.clip(scaled, 0, None)

In [409]:
# Generate NJobs: Non-linear relationship with CPUTime and MemoryUsed
def generate_njobs(cputime, memoryused):
    """Derive a synthetic job-count series from CPU time and memory usage.

    The result is the sum of three terms, clipped to the range [1, 500]:

    * ``log(cputime)``, with ``cputime`` floored at 1e-8 before the logarithm
      and the logarithm itself floored at -10,
    * ``sqrt(memoryused) * tanh(memoryused / 1e9) / 10``,
    * Gaussian noise with mean 15 and standard deviation 10.

    Parameters
    ----------
    cputime : array-like of numbers supporting ``len()``.
    memoryused : array-like of non-negative numbers, same length as ``cputime``.

    Returns
    -------
    Array-like of values between 1 and 500 (inclusive).

    Notes
    -----
    Consumes one ``np.random.normal`` draw of ``len(cputime)`` values from
    NumPy's global random state.
    """
    # Keep the logarithm finite for zero or negative CPU times.
    cpu_floor = np.clip(cputime, 1e-8, None)
    log_cpu_term = np.clip(np.log(cpu_floor), -10, None)

    # Square-root growth in memory, damped by tanh for small memory values.
    memory_term = np.sqrt(memoryused) * np.tanh(memoryused / 1e9) / 10

    # Random fluctuation centred on 15 jobs.
    jitter = np.random.normal(loc=15, scale=10, size=len(cpu_floor))

    total_jobs = log_cpu_term + memory_term + jitter

    # Clip to a minimum of 1 and a maximum of 500 jobs.
    return np.clip(total_jobs, 1, 500)


In [410]:
def generate_nprocs(njobs):
    """Derive a synthetic process-count series from a job-count series.

    Each job count is multiplied by a random factor drawn from a normal
    distribution (mean 6, standard deviation 3) and floored at 1.

    Parameters
    ----------
    njobs : array-like of numbers supporting ``len()``.

    Returns
    -------
    Array-like: ``njobs`` multiplied elementwise by the floored random factor.

    Notes
    -----
    Consumes one ``np.random.normal`` draw of ``len(njobs)`` values from
    NumPy's global random state.
    """
    # Floor the multiplier at 1 so the factor never shrinks or negates a job count.
    procs_per_job = np.clip(np.random.normal(6, 3, len(njobs)), 1, None)
    return njobs * procs_per_job

In [411]:
def generate_jobs(timestamps):
    """Draw Poisson-distributed arrived and completed job counts.

    Only the length of ``timestamps`` is used; one count of each kind is
    generated per entry.

    Parameters
    ----------
    timestamps : any object supporting ``len()``.

    Returns
    -------
    tuple of two integer arrays ``(arrived_jobs, completed_jobs)``, drawn from
    Poisson distributions with rates 10 and 8 respectively.

    Notes
    -----
    Uses NumPy's global random state; arrivals are sampled before completions.
    """
    n_samples = len(timestamps)
    # Rates are listed as (arrived, completed); the tuple is built in that order.
    return tuple(np.random.poisson(lam=rate, size=n_samples) for rate in (10, 8))

In [412]:
# Generate the synthetic dataset
# Every generator below draws from NumPy's global random state, so the values
# produced depend on the order in which these calls run.
cputime = generate_cputime(timestamps)
# Memory is derived from the CPU-time series.
memoryused = generate_memoryused(cputime)
# Job counts are derived from both CPU time and memory.
njobs = generate_njobs(cputime, memoryused)
# Process counts are derived from the job counts.
nprocs = generate_nprocs(njobs)
# Arrived/completed counts only use len(timestamps); they do not depend on the series above.
arrived_jobs, completed_jobs = generate_jobs(timestamps)

In [413]:
# Combine all features into a DataFrame
data = {
    'Timestamp': timestamps,
    'CPUTime': cputime,
    'Memory': memoryused,
    'NJobs': njobs,
    'NProcs': nprocs,
    'ArrivedJobs': arrived_jobs,
    'CompletedJobs': completed_jobs,
}
dataset = pd.DataFrame(data)

# Cyclical (cos/sin) encodings of the timestamp components. The angles are
# computed on whole columns through the `.dt` accessor instead of mapping a
# Python lambda over every row.
hour_angle = 2 * np.pi * dataset["Timestamp"].dt.hour / 24
dataset["cosHour"] = np.cos(hour_angle)
dataset["sinHour"] = np.sin(hour_angle)

# NOTE: day-of-year is divided by 365, so day 366 of a leap year gets the
# same encoding as day 1.
day_angle = 2 * np.pi * dataset["Timestamp"].dt.dayofyear / 365
dataset["cosDay"] = np.cos(day_angle)
dataset["sinDay"] = np.sin(day_angle)

month_angle = 2 * np.pi * dataset["Timestamp"].dt.month / 12
dataset["cosMonth"] = np.cos(month_angle)
dataset["sinMonth"] = np.sin(month_angle)


# Persist the dataset without the integer row index.
dataset.to_csv("./datasets/final/final-formed-synthetic.csv", index=False)

In [414]:
dataset.head()

Unnamed: 0,Timestamp,CPUTime,Memory,NJobs,NProcs,ArrivedJobs,CompletedJobs,cosHour,sinHour,cosDay,sinDay,cosMonth,sinMonth
0,2018-01-01 00:00:00,1060517.0,157496300.0,237.883862,1663.992864,15,8,1.0,0.0,0.999852,0.017213,0.866025,0.5
1,2018-01-01 01:00:00,1125058.0,167965000.0,245.436361,1704.2371,7,3,0.965926,0.258819,0.999852,0.017213,0.866025,0.5
2,2018-01-01 02:00:00,1031287.0,152661500.0,215.93792,2302.502089,6,8,0.866025,0.5,0.999852,0.017213,0.866025,0.5
3,2018-01-01 03:00:00,1019123.0,150622400.0,198.998278,579.824822,11,5,0.707107,0.707107,0.999852,0.017213,0.866025,0.5
4,2018-01-01 04:00:00,1178008.0,176389700.0,250.351872,1862.300267,9,4,0.5,0.866025,0.999852,0.017213,0.866025,0.5


In [415]:
dataset.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
CPUTime,35041.0,1005094.0,150669.6,734193.0,944356.7,998791.1,1053667.0,6062456.0
Memory,35041.0,147778600.0,20542170.0,102566400.0,138097500.0,147237300.0,156364400.0,897908100.0
NJobs,35041.0,207.2502,29.3232,119.811,188.5065,206.1356,224.0386,500.0
NProcs,35041.0,1260.231,630.0473,131.0698,800.8751,1224.397,1664.309,6284.498
ArrivedJobs,35041.0,10.01435,3.142977,1.0,8.0,10.0,12.0,25.0
CompletedJobs,35041.0,8.03162,2.839519,0.0,6.0,8.0,10.0,22.0
cosHour,35041.0,2.8538e-05,0.707127,-1.0,-0.7071068,6.123234000000001e-17,0.7071068,1.0
sinHour,35041.0,-1.1634200000000001e-17,0.7071068,-1.0,-0.7071068,0.0,0.7071068,1.0
cosDay,35041.0,2.843652e-05,0.7071268,-0.999963,-0.7086267,0.004303538,0.7025275,1.0
sinDay,35041.0,1.178963e-05,0.7071069,-0.9999907,-0.7055836,6.432491e-16,0.7055836,0.9999907
