In [1]:
import random
import numpy as np
import pandas as pd
import math, random
from scipy import special
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from datetime import datetime, timedelta

In [2]:
def generate_time_of_event(n=100, _lambda=5):
  arrival_time = 0
  event_times = []

  for i in range(n):
    p = random.random()
    inter_arrival_time = -math.log(1.0 - p) / _lambda

    arrival_time += inter_arrival_time
    event_times.append(arrival_time)
  
  return event_times

In [3]:
def load_synt(filepath):
  df = pd.read_csv(filepath)

  random.seed(10)
  res = [round(random.uniform(5,100) * 1000) for i in range(50)]

  id_size = {}
  for i, size in enumerate(res):
    id_size[i + 1] = size
    
  df['size'] = df['object_ID'].apply(lambda _id: id_size[_id])
  
  return df

In [4]:
df = load_synt("datasets/syntheticDataset_O50.csv")

In [5]:
display(df.head())

display(df.tail())

Unnamed: 0,object_ID,request_time,size
0,3,0.972871,59919
1,1,1.207572,59283
2,1,1.496262,59283
3,20,2.023976,48399
4,29,2.777488,5385


Unnamed: 0,object_ID,request_time,size
287658,42,304950.049238,26308
287659,42,305015.143156,26308
287660,42,305060.330213,26308
287661,42,305065.952156,26308
287662,42,305176.878168,26308


In [6]:
min_request_time = df['request_time'].min()

df['current_day'] = (df['request_time'] - min_request_time) / 86400
df["current_day"] = df["current_day"].apply(int)
df["current_day"] = df["current_day"] + 1

In [7]:
START_DATETIME = datetime(year=2018, month=8, day=1, hour=0, minute=0, second=0)

df['time'] = df['request_time'].apply(lambda r: START_DATETIME + timedelta(seconds=r))
df = df.set_index('time')

In [8]:
BIN_SECONDS = 3600

df.sort_values(by=['request_time'], inplace=True)
bins = np.arange(np.ceil(df.request_time.min()), np.ceil(df.request_time.max()), BIN_SECONDS)
df['hour'] = pd.cut(df['request_time'], bins, labels=np.arange(0, len(bins)-1))
df.dropna(inplace=True)

In [9]:
df['x'] = 1
df = df.groupby(['object_ID', 'hour', 'size']).rolling('1H')['x'].count().reset_index().sort_values('time').set_index('time')
df.rename({'x': 'total_requests_1H'}, axis=1, inplace=True)

In [10]:
tmp_df = df.groupby(['object_ID', 'hour']).rolling('2H')['total_requests_1H'].mean().reset_index().sort_values('time').set_index('time')
df['mean_requests_2H'] = tmp_df['total_requests_1H']

In [11]:
tmp_df = df.copy()
tmp_df['x'] = 1
tmp_df = tmp_df.groupby(['object_ID']).rolling('30D')['x'].sum().reset_index().sort_values('time').set_index('time')

df['count'] = tmp_df['x']

In [12]:
def items_to_keep_in_cache(df, window=15):
  keep_in_cache = []
  for i, r in df.reset_index().iterrows():
    current_id = r['object_ID']
    rolling_df = df.iloc[i + 1:i+window]['object_ID'].values

    keep_in_cache.append(current_id in rolling_df)
  
  return keep_in_cache

In [13]:
df['cache'] = items_to_keep_in_cache(df, 15)

In [14]:
df.head(18)

Unnamed: 0_level_0,object_ID,hour,size,total_requests_1H,mean_requests_2H,count,cache
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-08-01 00:00:01.207572,1,0,59283,1.0,1.0,1.0,True
2018-08-01 00:00:01.496262,1,0,59283,2.0,1.5,2.0,True
2018-08-01 00:00:02.023976,20,0,48399,1.0,1.0,1.0,False
2018-08-01 00:00:02.777488,29,0,5385,1.0,1.0,1.0,False
2018-08-01 00:00:03.251327,37,0,46423,1.0,1.0,1.0,False
2018-08-01 00:00:03.683805,43,0,44223,1.0,1.0,1.0,False
2018-08-01 00:00:04.136985,1,0,59283,3.0,2.0,3.0,True
2018-08-01 00:00:04.188425,22,0,67875,1.0,1.0,1.0,True
2018-08-01 00:00:05.998892,5,0,82266,1.0,1.0,1.0,True
2018-08-01 00:00:06.490449,14,0,9233,1.0,1.0,1.0,True


In [None]:
df.to_csv("final_datasets/teste.csv")