In [42]:
import pandas as pd
import numpy as np
import random

In [43]:
# Generate Timestamps from January 2018 to December 2021 with hourly frequency
start_date = '2018-01-01'
end_date = '2021-12-31'
timestamps = pd.date_range(start=start_date, end=end_date, freq='H')
np.random.seed(42)

In [80]:
# Generate CPUTime: Mean = 350000, with fluctuations based on time of day, holidays, and random spikes
def generate_cputime(timestamps):
    # Seasonal pattern with drops during Christmas holidays, exams, and summer months
    holiday_pattern =1 - np.isin(timestamps.month, [12, 6, 7 ,8, 9])*0.1

    # Random spikes
    spike_interval = 24 * 7
    
    spikes = 2.5 - 1.5*np.abs((timestamps.hour % 6 ) / spike_interval)

    # Noise
    noise = 1 + np.random.normal(loc=0, scale=0.01, size=len(timestamps))

    #cycle
    cycle = np.sin(np.arange(0, len(timestamps)) * 2 * np.pi / 24) * 0.1 + 1

    series = 1000000 * noise * holiday_pattern * cycle * spikes

    return np.clip(series, 0, None)

In [105]:
# Generate MemoryUsed: High correlation with CPUTime but non-linear relationship
def generate_memoryused(cputime):
    # Add some noise and non-linear relationship to CPUTime
    noise = np.random.normal(loc=0, scale=2000, size=len(cputime))

    memory_ratio = (np.sin(cputime / 250000) * 20) + 100

    return np.clip(cputime * memory_ratio + noise, 0, None)

In [106]:
# Generate NJobs: Non-linear relationship with CPUTime and MemoryUsed
def generate_nprocs(cputime, memoryused):
    # Non-linear relationship with CPUTime and MemoryUsed
    cputime = np.clip(cputime, 1e-8, None)
    cputime_factor = np.clip(np.log(cputime), -10, None) 
    memoryused_factor = np.sqrt(memoryused)*np.tanh(memoryused / 1e9) / 10

    # Random fluctuations with some correlation to CPUTime and MemoryUsed
    njobs_noise = np.random.normal(loc=15, scale=10, size=len(cputime))
    njobs = cputime_factor + memoryused_factor + njobs_noise

    return np.clip(njobs, 1, 600)  # Clip to a minimum of 1 and maximum of 100 jobs


In [107]:
def generate_njobs(nprocs):
    njobs_random_factor = np.random.normal(loc=6, scale=0.5, size=len(nprocs))

    njobs = nprocs / np.clip(njobs_random_factor, 1, None)
    return njobs 

In [108]:
def generate_jobs(timestamps):
    arrived_jobs = np.random.poisson(lam=10, size=len(timestamps))
    completed_jobs = np.random.poisson(lam=8, size=len(timestamps))
    return arrived_jobs, completed_jobs

In [109]:
# Generate the synthetic dataset
cputime = generate_cputime(timestamps)
memoryused = generate_memoryused(cputime)
nprocs = generate_nprocs(cputime, memoryused)
njobs = generate_njobs(nprocs)
arrived_jobs, completed_jobs = generate_jobs(timestamps)

In [110]:
# Combine all features into a DataFrame
data = {
    'Timestamp': timestamps,
    'CPUTime': cputime,
    'Memory': memoryused,
    'NJobs': njobs,
    'NProcs': nprocs,
    'ArrivedJobs': arrived_jobs,
    'CompletedJobs': completed_jobs,
}
dataset = pd.DataFrame(data)

dataset["cosHour"] = dataset["Timestamp"].map(lambda timestamp: np.cos(2*np.pi*timestamp.hour/24))
dataset["sinHour"] = dataset["Timestamp"].map(lambda timestamp: np.sin(2*np.pi*timestamp.hour/24))

dataset["cosDay"] = dataset["Timestamp"].map(lambda timestamp: np.cos(2*np.pi*timestamp.timetuple().tm_yday/365))
dataset["sinDay"] = dataset["Timestamp"].map(lambda timestamp: np.sin(2*np.pi*timestamp.timetuple().tm_yday/365))

dataset["cosMonth"] = dataset["Timestamp"].map(lambda timestamp: np.cos(2*np.pi*timestamp.month/12))
dataset["sinMonth"] = dataset["Timestamp"].map(lambda timestamp: np.sin(2*np.pi*timestamp.month/12))


dataset.to_csv("./datasets/final/final-formed-synthetic.csv", index=False)

In [111]:
dataset.head()

Unnamed: 0,Timestamp,CPUTime,Memory,NJobs,NProcs,ArrivedJobs,CompletedJobs,cosHour,sinHour,cosDay,sinDay,cosMonth,sinMonth
0,2018-01-01 00:00:00,2537043.0,220117300.0,55.213717,345.328568,10,13,1.0,0.0,0.999852,0.017213,0.866025,0.5
1,2018-01-01 01:00:00,2504147.0,222475500.0,66.37979,357.240027,8,5,0.965926,0.258819,0.999852,0.017213,0.866025,0.5
2,2018-01-01 02:00:00,2627551.0,216275200.0,59.001732,354.975088,8,10,0.866025,0.5,0.999852,0.017213,0.866025,0.5
3,2018-01-01 03:00:00,2660781.0,216133800.0,52.598819,327.892378,10,6,0.707107,0.707107,0.999852,0.017213,0.866025,0.5
4,2018-01-01 04:00:00,2696776.0,216910400.0,52.523992,338.741574,11,9,0.5,0.866025,0.999852,0.017213,0.866025,0.5


In [112]:
dataset.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
CPUTime,35041.0,2373944.0,209586.3,1927694.0,2239128.0,2360283.0,2532418.0,2831284.0
Memory,35041.0,232627300.0,10825260.0,216086900.0,221194400.0,235233200.0,242773800.0,246586700.0
NJobs,35041.0,63.56813,6.91145,42.94669,58.6946,63.20315,68.00503,98.71066
NProcs,35041.0,378.5226,25.68877,309.2486,355.7978,382.6396,399.9177,443.9747
ArrivedJobs,35041.0,10.02152,3.168426,0.0,8.0,10.0,12.0,24.0
CompletedJobs,35041.0,8.014697,2.826395,0.0,6.0,8.0,10.0,25.0
cosHour,35041.0,2.8538e-05,0.707127,-1.0,-0.7071068,6.123234000000001e-17,0.7071068,1.0
sinHour,35041.0,-1.1634200000000001e-17,0.7071068,-1.0,-0.7071068,0.0,0.7071068,1.0
cosDay,35041.0,2.843652e-05,0.7071268,-0.999963,-0.7086267,0.004303538,0.7025275,1.0
sinDay,35041.0,1.178963e-05,0.7071069,-0.9999907,-0.7055836,6.432491e-16,0.7055836,0.9999907
