## import module

In [66]:
import pandas as pd
import numpy as np

import plotly.express as px
# plotly.offline.init_notebook_mode(connected=True)

import pickle

from dateutil.parser import *

import warnings
warnings.filterwarnings('ignore')

## Generate days function

In [67]:
def generate_time_series(
    start_d: str, end_d: str,
    start_t: int, end_t: int,
    period: str,
) -> pd.DataFrame:
    """Build a synthetic, evenly spaced event-count series.

    A daily date range from ``start_d`` to ``end_d`` is resampled to
    ``period`` to obtain the timestamps. Every timestamp receives a
    pseudo-random integer count in ``[0, 10)``; counts that fall on a
    Saturday or Sunday are then forced to 0.

    The counts come from a private generator seeded with 42, so the output
    is reproducible and the global NumPy random state is left untouched.

    Parameters
    ----------
    start_d, end_d : str
        First and last day of the range, passed to ``pd.date_range``.
    start_t, end_t : int
        Currently unused. They are kept in the signature so existing callers
        keep working; no hour-of-day filtering is applied.
    period : str
        pandas offset alias used for resampling (spacing between rows).

    Returns
    -------
    pd.DataFrame
        Columns ``ds`` (timestamp), ``y`` (count), ``days`` (weekday name)
        and ``hour`` (hour of day).
    """
    days = pd.date_range(start=start_d, end=end_d, freq='D')

    # Resampling the daily frame is only used to produce one row per
    # `period`; the counted values are overwritten right below.
    tick = (
        pd.DataFrame({'ds': days, 'y': 0})
        .set_index('ds')
        .resample(period)
        .count()
        .reset_index()
    )

    # Same sequence as np.random.seed(42) + np.random.randint, but without
    # resetting the process-wide RNG as a side effect.
    rng = np.random.RandomState(42)
    tick['y'] = rng.randint(0, 10, size=len(tick))

    tick['days'] = tick['ds'].dt.day_name()
    tick['hour'] = tick['ds'].dt.hour

    # No activity on weekends.
    tick.loc[tick['days'].isin(['Saturday', 'Sunday']), 'y'] = 0

    return tick

In [68]:
# Base data set: per-minute counts for 2023-05-02 (up to the closing midnight
# timestamp of 2023-05-03), built in one chain so re-running the cell always
# starts from a fresh frame instead of mutating a filtered slice.
# '1min' is the same frequency as the deprecated '1T' alias.
gen_df = (
    generate_time_series(
        start_d='2023-05-02', end_d='2023-05-03',
        start_t=9, end_t=18,
        period='1min',
    )
    # The weekday / hour helper columns are not needed downstream.
    .drop(columns=['days', 'hour'])
    # Keep only minutes with at least one event.
    .loc[lambda frame: frame['y'] > 0]
    # Ground-truth label: 0 = normal; anomalous rows are set to 1 later.
    .assign(true=0)
)
gen_df

Unnamed: 0,ds,y,true
0,2023-05-02 00:00:00,6,0
1,2023-05-02 00:01:00,3,0
2,2023-05-02 00:02:00,7,0
3,2023-05-02 00:03:00,4,0
4,2023-05-02 00:04:00,6,0
...,...,...,...
1436,2023-05-02 23:56:00,7,0
1437,2023-05-02 23:57:00,8,0
1438,2023-05-02 23:58:00,4,0
1439,2023-05-02 23:59:00,1,0


## 근무시간내 80개 추출

In [69]:
# Draw 80 rows from working hours (09:00 through 17:59 on 2023-05-02),
# kept in their original row order.
day_working_time = (
    gen_df
    .loc[lambda frame: frame['ds'].between('2023-05-02 09:00:00', '2023-05-02 17:59:00')]
    .sample(n=80, random_state=42)
    .sort_index()
)

print(
    f'''
    max : {day_working_time['y'].max()}
    min : {day_working_time['y'].min()}      
    ''')

fig = px.scatter(day_working_time, x='ds', y='y', title='Day working time')
fig.show()

day_working_time



    max : 9
    min : 1      
    


Unnamed: 0,ds,y,true
540,2023-05-02 09:00:00,5,0
549,2023-05-02 09:09:00,4,0
551,2023-05-02 09:11:00,2,0
557,2023-05-02 09:17:00,6,0
560,2023-05-02 09:20:00,3,0
...,...,...,...
1059,2023-05-02 17:39:00,7,0
1065,2023-05-02 17:45:00,4,0
1067,2023-05-02 17:47:00,6,0
1072,2023-05-02 17:52:00,3,0


## anomaly data 생성

### 근무 외 시간(9시 이전)

In [70]:
# Off-hours anomalies, early side: 5 rows drawn from before 08:00, all
# labelled as anomalies (true = 1).
# NOTE(review): the section heading says "before 9" but the cutoff used here
# is 08:00 — confirm the one-hour gap to the working window is intended.
off_time_before_9 = (
    gen_df
    .loc[lambda frame: frame['ds'] < '2023-05-02 08:00:00']
    .sample(n=5, random_state=42)
    .assign(true=1)
)

print(off_time_before_9.shape)

print(
    f'''
    max : {off_time_before_9['y'].max()}
    min : {off_time_before_9['y'].min()}      
    ''')

fig = px.scatter(off_time_before_9, x='ds', y='y', title='Off time before 9 am')
fig.show()
off_time_before_9

(5, 3)

    max : 7
    min : 1      
    


Unnamed: 0,ds,y,true
159,2023-05-02 02:39:00,7,1
382,2023-05-02 06:22:00,2,1
198,2023-05-02 03:18:00,1,1
422,2023-05-02 07:02:00,4,1
476,2023-05-02 07:56:00,3,1


### 근무 외 시간(18시 이후)

In [71]:
# Off-hours anomalies, late side: 5 rows drawn from 19:00 onwards, all
# labelled as anomalies (true = 1).
# NOTE(review): the section heading and plot title say 18:00 / 6 pm but the
# cutoff used here is 19:00 — confirm the one-hour gap is intended.
off_time_after_18 = (
    gen_df
    .loc[lambda frame: frame['ds'] >= '2023-05-02 19:00:00']
    .sample(n=5, random_state=42)
    .assign(true=1)
)

print(off_time_after_18.shape)

print(
    f'''
    max : {off_time_after_18['y'].max()}
    min : {off_time_after_18['y'].min()}      
    ''')

fig = px.scatter(off_time_after_18, x='ds', y='y', title='Off time after 6 pm')
fig.show()
off_time_after_18

(5, 3)

    max : 9
    min : 2      
    


Unnamed: 0,ds,y,true
1174,2023-05-02 19:34:00,2,1
1269,2023-05-02 21:09:00,4,1
1227,2023-05-02 20:27:00,9,1
1283,2023-05-02 21:23:00,7,1
1358,2023-05-02 22:38:00,4,1


### 근무시간 내 10개 과다 접속 적용

In [72]:
# Inject "excessive access" anomalies: 10 of the working-hours rows get
# their count multiplied by 10 and are labelled as anomalies (true = 1).
indexes = day_working_time.sample(n=10, random_state=42).index

# The fixed random_state selects the same 10 rows every time this cell runs,
# so only rows that are not flagged yet are scaled. Without this guard a
# second run would multiply the already-inflated counts by 10 again.
not_yet_scaled = indexes[(day_working_time.loc[indexes, 'true'] == 0).to_numpy()]
day_working_time.loc[not_yet_scaled, 'y'] = day_working_time.loc[not_yet_scaled, 'y'] * 10
day_working_time.loc[not_yet_scaled, 'true'] = 1

day_working_time.loc[indexes, ['y',]]

Unnamed: 0,y
741,70
540,50
656,60
746,90
628,50
738,30
610,40
1035,90
560,30
619,80


In [73]:
# Display the working-hours sample to inspect the rows whose count was scaled
# and flagged as anomalies.
day_working_time

Unnamed: 0,ds,y,true
540,2023-05-02 09:00:00,50,1
549,2023-05-02 09:09:00,4,0
551,2023-05-02 09:11:00,2,0
557,2023-05-02 09:17:00,6,0
560,2023-05-02 09:20:00,30,1
...,...,...,...
1059,2023-05-02 17:39:00,7,0
1065,2023-05-02 17:45:00,4,0
1067,2023-05-02 17:47:00,6,0
1072,2023-05-02 17:52:00,3,0


## make total data

In [74]:
# Merge the three samples into a single test set. Sorting on the original
# row index restores the order of the generated series before the rows are
# renumbered from 0.
data_test = (
    pd.concat([off_time_before_9, day_working_time, off_time_after_18])
    .sort_index()
    .reset_index(drop=True)
)
print(data_test.shape)

print(
    f'''
    max : {data_test['y'].max()}
    min : {data_test['y'].min()}      
    ''')

fig = px.line(data_test, x='ds', y='y', title='Total test data')
fig.show()

(90, 3)

    max : 90
    min : 1      
    


## test data binary 로 저장

In [75]:
# Persist the assembled test set as a binary pickle file.
filename = '../ai-plus-test-data/test_data.pickle'
with open(filename, mode='wb') as f:
    # DEFAULT_PROTOCOL is what pickle.dump uses when no protocol is given.
    pickle.dump(data_test, f, protocol=pickle.DEFAULT_PROTOCOL)

## Result

In [76]:
def generate_test_data(filename='../ai-plus-test-data/test_data.pickle'):
    """Build the labelled anomaly-detection test set, pickle it and return it.

    The function repeats the step-by-step cells of this notebook with the
    same, non-overlapping time windows on 2023-05-02:

    * 80 rows sampled from working hours (09:00 through 17:59); 10 of them
      get their count multiplied by 10 and are labelled as anomalies;
    * 5 rows sampled before 08:00, labelled as anomalies;
    * 5 rows sampled from 19:00 onwards, labelled as anomalies.

    Column ``true`` holds the label: 1 for an anomaly, 0 otherwise. The
    result has 80 + 5 + 5 = 90 rows.

    Parameters
    ----------
    filename : str, optional
        Path of the pickle file to write. Defaults to the location used
        elsewhere in this notebook.

    Returns
    -------
    pd.DataFrame
        The test set with columns ``ds``, ``y`` and ``true``.
    """
    # '1min' is the same frequency as the deprecated '1T' alias.
    gen_df = generate_time_series(
        start_d='2023-05-02', end_d='2023-05-03',
        start_t=9, end_t=18,
        period='1min',
    ).drop(columns=['days', 'hour'])

    # Keep only minutes with at least one event. The copy gives us a frame we
    # own, and every row starts out labelled as normal.
    gen_df = gen_df[gen_df['y'] > 0].copy()
    gen_df['true'] = 0

    ds = gen_df['ds']

    # Normal traffic: 80 rows from working hours.
    day_working_time = (
        gen_df[(ds >= '2023-05-02 09:00:00') & (ds <= '2023-05-02 17:59:00')]
        .sample(n=80, random_state=42)
        .sort_index()
    )

    # Excessive-access anomalies: 10 of those rows get a 10x count.
    indexes = day_working_time.sample(n=10, random_state=42).index
    day_working_time.loc[indexes, 'y'] = day_working_time.loc[indexes, 'y'] * 10
    day_working_time.loc[indexes, 'true'] = 1

    # Off-hours anomalies. The windows stay clear of the working-hours window,
    # so no row can be sampled as both normal and anomalous.
    off_time_before_9 = (
        gen_df[ds < '2023-05-02 08:00:00']
        .sample(n=5, random_state=42)
        .assign(true=1)
    )
    off_time_after_18 = (
        gen_df[ds >= '2023-05-02 19:00:00']
        .sample(n=5, random_state=42)
        .assign(true=1)
    )

    # Full test set (90 rows), ordered by original row position.
    data_test = (
        pd.concat([off_time_before_9, day_working_time, off_time_after_18])
        .sort_index()
        .reset_index(drop=True)
    )

    # Save the full test set.
    with open(filename, 'wb') as f:
        pickle.dump(data_test, f)

    return data_test
