In [1]:
from time import time

import numpy as np
import torch
from sklearn.metrics import accuracy_score

from process import process_data
from utils import read_dataset


### Initialize training parameters

In [2]:
# Set all parameters for the experiment
# Maximum dataset size to be considered as keeping track of previous drifts slows the system down considerably
# Default: 50000
max_dataset_size = 50000 
# Training window size. Default: 100
training_window_size = 100
# Training epochs. Default: 150
epochs = 150
# Set repeat factor. 1/factor will be the number of instances from previous instances that are considered for training
# Default: 25. This means 4% data from previous identical drift windows will be added to the current training data
repeat_factor = 25
# Equalize the number of training instances across different drifts. Default: True
equalize = True


In [3]:
# GAN parameters

# Sequence length for the generator. Default: 10
sequence_length = 10

# For the collate function to split the rows accordingly
seq_len = sequence_length

# Training steps. default: 100
steps_generator = 100

# Set the batch_size of the discriminator. Default: 8
batch_size = 8

# Batch size for training the generator
generator_batch_size = 8

# Number of instances that should have the same label for a drift to be confirmed. Default: 4
test_batch_size = 4

# Set the learning rate. Default: 0.001
lr = 0.001  

# Set the weight decay rate. Default: 0.00005
weight_decay = 0.00005

In [4]:
# Seed for the experiment; it is handed to process_data further down.
# The value is drawn at random for every run, so it is printed here: to
# replay a run deterministically, replace the draw with the printed value.
seed = int(np.random.randint(65536))
print('Random seed for this run: %d' % seed)

In [5]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [6]:
# Name of the dataset to load; read_dataset returns its features and labels.
# The data files can be obtained from
# https://github.com/ogozuacik/concept-drift-datasets-scikit-multiflow
# Dataset names: airlines, chess, covtrype, electricity, ludata, outdoor, phishing, poker, rialto, spam
# NOTE(review): 'covtrype' looks like a typo of 'covtype' -- confirm against
# the names read_dataset actually accepts before relying on this list.
dataset_name = 'outdoor'
loaded_dataset = read_dataset(dataset_name)
features, labels = loaded_dataset

In [7]:
# Keep at most the first max_dataset_size entries of features and labels;
# a dataset that is already shorter is left unchanged by the slice.
features, labels = features[:max_dataset_size], labels[:max_dataset_size]

In [8]:
# Standardize the dataset for the GAN.
# The statistics are taken along axis=1, so every row is shifted and scaled
# by its own mean and standard deviation (keepdims keeps them as columns
# that broadcast against the feature matrix).
features = np.array(features)
mean = np.mean(features, axis=1, keepdims=True)
std = np.std(features, axis=1, keepdims=True)

# Small constant added to the denominator so a zero standard deviation
# does not cause a division by zero.
epsilon = 0.000001

# Reference to the unstandardized array, taken before `features` is rebound.
concatenated_features = features

# Compute the standardized values once; `features` gets its own copy so the
# two names still refer to distinct arrays, as they did before.
standardized_features = (features - mean) / (std + epsilon)
features = standardized_features.copy()

In [9]:
# Run the experiment, taking a timestamp right before and right after the
# call so the elapsed time can be reported afterwards.
t1 = time()
y_pred, y_true, drifts_detected = process_data(
    # Data: standardized features, their labels, and the unstandardized
    # array that was captured before standardization.
    features=features,
    labels=labels,
    training_features=concatenated_features,
    # Runtime setup
    device=device,
    seed=seed,
    # Training parameters
    epochs=epochs,
    training_window_size=training_window_size,
    repeat_factor=repeat_factor,
    equalize=equalize,
    # GAN parameters
    sequence_length=sequence_length,
    steps_generator=steps_generator,
    batch_size=batch_size,
    generator_batch_size=generator_batch_size,
    test_batch_size=test_batch_size,
    lr=lr,
    weight_decay=weight_decay,
)
t2 = time()

120
220
320
420
520
620
720
820
920
1020
1124
1224
1324
1424
1532
1632
1732
1832
1932
2032
2132
2232
2332
2440
2540
2640
2740
2852
2952
3052
3156
3256
3356
3456
3556
3656
3756
3856


In [10]:
# Report the elapsed time between the two timestamps taken around the run
# (%d shows it as whole seconds)
exec_time = t2 - t1
print('Execution time is %d seconds' % (exec_time,))

Execution time is 14726 seconds


In [11]:
# Compute the accuracy of the predictions against the true labels.
# accuracy_score is imported together with the other dependencies in the
# notebook's first cell rather than in the middle of the analysis.
accuracy_value = accuracy_score(y_true=y_true, y_pred=y_pred)
print(' Accuracy value is %f for dataset %s' % (accuracy_value, dataset_name))

 Accuracy value is 0.589750 for dataset outdoor


In [12]:
# Report how many drifts were detected during the run
drift_count = len(drifts_detected)
print('No. of drifts is %d' % drift_count)

No. of drifts is 38
