In [45]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from flowprintOptimal.sekigo.core.flowConfig import FlowConfig
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
from flowprintOptimal.sekigo.flowUtils.commons import loadFlows, saveFlows
from flowprintOptimal.sekigo.flowUtils.sampler import FixedLengthSampler, FixedLengthSimpleSampler
from flowprintOptimal.sekigo.core.flowRepresentation import FlowRepresentation
from typing import List
from flowprintOptimal.sekigo.flowUtils.flowDatasets import MaxNormalizedDataset,ActivityDataset,BaseFlowDataset,DDQNActivityDataset
from flowprintOptimal.sekigo.dataAnalysis.vNATDataFrameProcessor import VNATDataFrameProcessor
from flowprintOptimal.sekigo.flowUtils.commons import getValidInvalidStartingPointsForSubFlowStart, getActivityArrayFromFlow
from flowprintOptimal.sekigo.modeling.neuralNetworks import LSTMNetwork,TransformerGenerator,CNNNetwork1D
from flowprintOptimal.sekigo.modeling.loggers import Logger
from sklearn.ensemble import RandomForestClassifier
from torch.utils.data import Dataset, DataLoader,WeightedRandomSampler
from tqdm import tqdm
from copy import deepcopy
import matplotlib.pyplot as plt
from joblib import delayed, Parallel
from typing import List
from flowprintOptimal.sekigo.earlyClassification.DQL.memoryFiller import MemoryFiller
from flowprintOptimal.sekigo.earlyClassification.DQL.core import MemoryElement,Rewarder,State
from flowprintOptimal.sekigo.earlyClassification.DQL.datasets import MemoryDataset
from torch.nn.utils.rnn import pack_sequence, unpack_sequence
from flowprintOptimal.sekigo.dataAnalysis.dataFrameProcessor import SoftwareUpdateDataProcessor,GamingDownloadDataFrameProcessor
from flowprintOptimal.sekigo.dataAnalysis.dataFrameExtractor import DataFrameExtractor
from sklearn.metrics import precision_recall_fscore_support
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix,  ConfusionMatrixDisplay
from flowprintOptimal.sekigo.earlyClassification.DQL.trainers import EarlyClassificationtrainer
from flowprintOptimal.sekigo.modeling.neuralNetworks import LinearPredictor
import warnings
from flowprintOptimal.sekigo.utils.commons import augmentData
warnings.filterwarnings('ignore')
import torch.nn.functional as F
import random
from flowprintOptimal.sekigo.modeling.trainers import NNClassificationTrainer

In [None]:
flow_config = FlowConfig(grain= 1, band_thresholds= [1250])
processor1 = SoftwareUpdateDataProcessor(parquet_path= "data/software-update-1w.parquet")
processor2 = GamingDownloadDataFrameProcessor(parquet_path= "data/2023-12-01--2023-12-31--flowprint.parquet")
flows = DataFrameExtractor.getData(data_frame_processors= [processor2,processor1],needed_flow_config= flow_config)
class_type_filtered_flows = flows
FT_filtered_flows = list(filter(lambda x : x.class_type != "Download",class_type_filtered_flows))
FT_flows = list(filter(lambda x : x.class_type == "Download",class_type_filtered_flows))
len(FT_filtered_flows) + len(FT_flows) == len(class_type_filtered_flows)

In [46]:
flows = loadFlows(path= "data/VNAT/flowStore/vnatflowshalfsecond_2_bands.json")
class_type_filtered_flows = VNATDataFrameProcessor.convertLabelsToTopLevel(flows)
FT_filtered_flows = list(filter(lambda x : x.class_type != "FT",class_type_filtered_flows))
FT_flows = list(filter(lambda x : x.class_type == "FT",class_type_filtered_flows))
len(FT_filtered_flows) + len(FT_flows) == len(class_type_filtered_flows)

True

In [63]:
sampler = FixedLengthSampler(flow_config= FT_filtered_flows[0].flow_config,required_length_in_seconds = 30,ratio_of_median_to_sample= .1,min_activity_for_start_point= 25,sample_wise_train_ratio= .8, temporal_train_ratio= 1)
FT_filtered_split_flows = sampler.sampleAndCutToLength(data= FT_filtered_flows)
ft_sampler = FixedLengthSampler(flow_config= FT_filtered_flows[0].flow_config,required_length_in_seconds = 30,ratio_of_median_to_sample= .5,min_activity_for_start_point= 25,sample_wise_train_ratio= 1, temporal_train_ratio= 1)
FT_split_flows = ft_sampler.sampleAndCutToLength(data= FT_flows)
all_sampler = FixedLengthSampler(flow_config= class_type_filtered_flows[0].flow_config,required_length_in_seconds = 30,ratio_of_median_to_sample= .1,min_activity_for_start_point= 25,sample_wise_train_ratio= .8, temporal_train_ratio= 1)
all_flows = all_sampler.sampleAndCutToLength(data= class_type_filtered_flows)

{'chat': 0.009890938699667779, 'control': 0.1760825125441368, 'streaming': 0.09997625893587275}
overlapping points = 0
{'FT': 0.4996028594122319}
overlapping points = 0
{'FT': 1, 'chat': 0.054945599837151825, 'control': 0.9781639100538934, 'streaming': 0.5553826268168509}
overlapping points = 0


In [64]:
print(pd.Series(map(lambda x : x.class_type,FT_filtered_split_flows["train_flows"])).value_counts())
print(pd.Series(map(lambda x : x.class_type,FT_filtered_split_flows["test_flows"])).value_counts())
print(pd.Series(map(lambda x : x.class_type,all_flows["train_flows"])).value_counts())
print(pd.Series(map(lambda x : x.class_type,all_flows["test_flows"])).value_counts())
print(pd.Series(map(lambda x : x.class_type,FT_split_flows["train_flows"])).value_counts())
#print(pd.Series(map(lambda x : x.class_type,FT_split_flows["test_flows"])).value_counts())

streaming    3451
chat         3274
control      2633
Name: count, dtype: int64
control      1367
streaming     630
chat          573
Name: count, dtype: int64
streaming    17451
chat         16007
control      14749
FT            1066
Name: count, dtype: int64
control      6420
chat         5113
streaming    3805
FT            193
Name: count, dtype: int64
FT    634
Name: count, dtype: int64


In [65]:
class DDQNActivityDataset(BaseFlowDataset):
    def __init__(self, flows: List[FlowRepresentation], label_to_index: dict,do_aug = False,balance = False):
        super().__init__(flows = flows, label_to_index= label_to_index)
        self.do_aug = do_aug
        self.labels = list(map(lambda x : self.label_to_index[x.class_type],self.flows))
        self.flows = list(map(lambda x : getActivityArrayFromFlow(x), self.flows))

        if balance == True:
            print("balancing")
            self.cureImbalance()
    
    def __getitem__(self, index):
        return dict(data = self.flows[index] if (self.do_aug == False) else augmentData(self.flows[index],fraction_range= [0,.1]), label  = self.labels[index])
    
    def cureImbalance(self):
        
        label_to_indices = dict()
        for i,label in enumerate(self.labels):
            if label not in label_to_indices:
                label_to_indices[label] = []
            label_to_indices[label].append(i)
        
        
        counts = [len(x) for _,x in label_to_indices.items()]
        max_counts = max(counts)
        print(counts)
        added_flows,added_labels = [],[]
        for label,indices in label_to_indices.items():
            to_add = max_counts - len(indices)
            if to_add > 0:
                replication_indices = random.choices(population= indices,k= to_add)
            
                for replication_index in replication_indices:
                    added_flows.append(self.flows[replication_index].copy())
                    added_labels.append(self.labels[replication_index])
        


        self.flows.extend(added_flows)
        self.labels.extend(added_labels)

In [66]:
all_train_dataset = DDQNActivityDataset(flows= all_flows["train_flows"],label_to_index= None,do_aug= False,balance= False)
all_test_dataset = DDQNActivityDataset(flows = all_flows["test_flows"], label_to_index= all_train_dataset.label_to_index)
train_dataset = DDQNActivityDataset(flows= FT_filtered_split_flows["train_flows"],label_to_index= None,do_aug= False)
test_dataset = DDQNActivityDataset(flows= FT_filtered_split_flows["test_flows"],label_to_index= train_dataset.label_to_index)
ood_dataset = DDQNActivityDataset(flows= FT_split_flows["train_flows"],label_to_index= None)
#ood_support_dataset = DDQNActivityDataset(flows= FT_split_flows["train_flows"],label_to_index= ood_dataset.label_to_index)

In [67]:
classifier_ = CNNNetwork1D(in_channels= 6,num_filters= 32,output_dims= [len(all_train_dataset.label_to_index)],num_layers= 3)#LSTMNetwork(lstm_hidden_size= 64,lstm_input_size=6,output_dim = len(all_train_dataset.label_to_index))
logger= Logger(name= "classification",verbose= True)
logger.default_step_size = 500
classification_trainer = NNClassificationTrainer(classifier = classifier_,device= device,logger= logger)
classification_trainer.train(train_dataset= all_train_dataset,test_dataset= all_test_dataset,epochs=20,batch_size= 64,lr= .0001)

classification ---- 1 metric test_f1 = 0.006137115237852963
classification ---- 1 metric train_f1 = 0.010588211923160968
classification ---- 10 metric train_loss = 1.3990599036216735
classification ---- 20 metric train_loss = 1.38609139919281
classification ---- 30 metric train_loss = 1.377616274356842
classification ---- 40 metric train_loss = 1.3585224151611328
classification ---- 50 metric train_loss = 1.2981319785118104
classification ---- 60 metric train_loss = 1.1943374276161194
classification ---- 70 metric train_loss = 1.1328577995300293
classification ---- 80 metric train_loss = 1.1128541469573974
classification ---- 90 metric train_loss = 1.0744221568107606
classification ---- 100 metric train_loss = 1.0778009116649627
classification ---- 110 metric train_loss = 1.0252083241939545
classification ---- 120 metric train_loss = 0.989182960987091
classification ---- 130 metric train_loss = 0.9135378539562226
classification ---- 140 metric train_loss = 0.8773638248443604
classifica

In [60]:
labels = list(map(lambda x : x["label"], all_test_dataset))

In [68]:
preds,labels = classification_trainer.predictOnDataset(dataset= all_test_dataset)

In [71]:
precision_recall_fscore_support(labels,preds,zero_division= 0)

(array([0.80310881, 0.99765488, 0.9995311 , 0.99084489]),
 array([0.80310881, 0.99843536, 0.99610592, 0.99553219]),
 array([0.80310881, 0.99804497, 0.99781557, 0.99318301]),
 array([ 193, 5113, 6420, 3805]))

In [44]:
sum([0.6978022 , 0.99075786, 0.98024583, 0.98798674])/4

0.9141981575