In [25]:
import copy
import datetime

import numpy as np
import pandas as pd
from joblib import Parallel, delayed

from flowprintOptimal.sekigo.core.flowRepresentation import PacketFlowRepressentation
from flowprintOptimal.sekigo.flowUtils.commons import getIATFromTimeStamps , getTimeStampsFromIAT
from flowprintOptimal.sekigo.flowUtils.commons import saveFlows
from flowprintOptimal.sekigo.flowUtils.conversions import convertPacketRepToTimeslotRepEffecient

In [26]:

class DeploymentProcessor:
    """
    Builds packet-level flow representations from deployment capture CSVs.

    Every CSV must provide the columns read below: FlowId, Timestamp,
    Direction, Length, Type and Provider.
    """

    # Packet lengths are divided by LENGTH_UNIT and anything above MAX_LENGTH
    # is clamped to MAX_LENGTH / LENGTH_UNIT.
    LENGTH_UNIT = 1500
    MAX_LENGTH = 3000
    # Upper bound on the number of joblib workers used to read the CSVs.
    MAX_JOBS = 10

    def __init__(self, csv_paths, filter_out_classes=None):
        """
        csv_paths          : sized iterable of CSV file paths to load.
        filter_out_classes : class labels (values of the Type column) whose
                             flows are skipped; None means "skip nothing".
        """
        self.csv_paths = csv_paths
        # A fresh list per instance: a mutable default in the signature would
        # be shared (and could be mutated) across all instances.
        self.filter_out_classes = [] if filter_out_classes is None else filter_out_classes

    @staticmethod
    def processLength(length: float):
        """
        Normalise a packet length.

        Original rationale: there are a lot of fragmented packets, so tcpdump
        reports lengths above 1500; those are thresholded here.

        Lengths up to MAX_LENGTH are returned divided by LENGTH_UNIT; anything
        else maps to the cap MAX_LENGTH / LENGTH_UNIT (2.0). The cap is a float
        so the result type does not depend on which branch was taken.
        """
        if length <= DeploymentProcessor.MAX_LENGTH:
            return length / DeploymentProcessor.LENGTH_UNIT
        return DeploymentProcessor.MAX_LENGTH / DeploymentProcessor.LENGTH_UNIT

    def getPacketRepsFromDf(self, csv_path):
        """
        Read one CSV and return a list with one PacketFlowRepressentation per
        FlowId group. Flows whose Type is in self.filter_out_classes are skipped.
        Each returned flow also carries a `provider_type` attribute taken from
        the Provider column.
        """
        df = pd.read_csv(csv_path)
        df["Timestamp"] = pd.to_datetime(df["Timestamp"])

        packet_reps = []

        for _, flow_df in df.groupby(by="FlowId"):
            # Rebind instead of sorting in place: the group is a slice of df,
            # and in-place edits on it trigger SettingWithCopy warnings.
            flow_df = flow_df.sort_values(by="Timestamp")

            # Labels are read from the earliest packet of the flow.
            class_type = flow_df["Type"].iloc[0]
            if class_type in self.filter_out_classes:
                # Skip before doing any per-packet work for this flow.
                continue
            provider_type = flow_df["Provider"].iloc[0]

            IAT = getIATFromTimeStamps(timestamps=flow_df["Timestamp"].values)
            directions = flow_df["Direction"].values.astype(np.int8).tolist()
            lengths = flow_df["Length"].apply(DeploymentProcessor.processLength).values.tolist()

            packet_rep = PacketFlowRepressentation(
                lengths=lengths,
                directions=directions,
                inter_arrival_times=IAT,
                class_type=class_type,
            )
            packet_rep.provider_type = provider_type
            packet_reps.append(packet_rep)

        return packet_reps

    def getPacketReps(self):
        """
        Load every CSV in self.csv_paths (in parallel via joblib) and return
        the concatenation of their flows, in csv_paths order.
        Returns an empty list when there are no CSV paths.
        """
        if len(self.csv_paths) == 0:
            # joblib rejects n_jobs=0, so there is nothing to dispatch.
            return []

        n_jobs = min(len(self.csv_paths), self.MAX_JOBS)
        reps_per_csv = Parallel(n_jobs=n_jobs)(
            delayed(self.getPacketRepsFromDf)(csv_path) for csv_path in self.csv_paths
        )
        return [rep for reps in reps_per_csv for rep in reps]


In [27]:
# Processor over the six deployment capture CSVs (order kept as originally listed).
deployment_processor = DeploymentProcessor(
    csv_paths=[
        "data/deployment/final_flows2.csv",
        "data/deployment/final_flows1.csv",
        "data/deployment/final_flows3.csv",
        "data/deployment/final_flows4.csv",
        "data/deployment/final_flows_conf.csv",
        "data/deployment/final_flows_conf2.csv",
    ]
)

In [28]:
# Load all CSVs and collect every flow into one flat list of packet representations.
packet_reps = deployment_processor.getPacketReps()

In [29]:
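# Optional: drop unwanted classes after loading (alternatively pass filter_out_classes to DeploymentProcessor).
# filtered_packet_reps = list(filter(lambda x : x.class_type not in ["Music", "Conferencing"], packet_reps))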

In [30]:
# Number of loaded flows per class_type label.
pd.Series(
    [flow.class_type for flow in packet_reps]
).value_counts()

_unknown           32242
Conferencing       26972
Video              19186
Software Update    15150
Download            9358
File Storage        8651
Mail                4764
Social Media        2178
Music               1019
VPN                  485
Speed Test           275
Live Video             3
Name: count, dtype: int64

In [31]:
# Split the loaded flows into per-class lists (the lists reference the same flow objects).
video_flows = [flow for flow in packet_reps if flow.class_type == "Video"]
conf_flows = [flow for flow in packet_reps if flow.class_type == "Conferencing"]
su_flows = [flow for flow in packet_reps if flow.class_type == "Software Update"]
sm_flows = [flow for flow in packet_reps if flow.class_type == "Social Media"]
fs_flows = [flow for flow in packet_reps if flow.class_type == "File Storage"]

In [32]:
def _relabelled_by_provider(flows):
    """
    Return shallow copies of `flows` whose class_type is their provider_type.

    Copies are used so the flow objects still referenced by `packet_reps`
    keep their original class labels; relabelling them in place would change
    what a later save of `packet_reps` writes and would make the per-class
    split cell return empty lists when re-run.
    """
    relabelled = []
    for flow in flows:
        flow_copy = copy.copy(flow)
        flow_copy.class_type = flow_copy.provider_type
        relabelled.append(flow_copy)
    return relabelled


# Within each class, the provider becomes the label to predict.
video_flows = _relabelled_by_provider(video_flows)
su_flows = _relabelled_by_provider(su_flows)
sm_flows = _relabelled_by_provider(sm_flows)
conf_flows = _relabelled_by_provider(conf_flows)
fs_flows = _relabelled_by_provider(fs_flows)

In [33]:
# Flows per label among the file-storage flows, as a plain dict.
pd.Series(
    [flow.class_type for flow in fs_flows]
).value_counts().to_dict()

{'Microsoft Sharepoint': 6158,
 'Google Drive': 683,
 'Dropbox': 629,
 'Apple iCloud': 524,
 'Google Docs': 433,
 'OneDrive': 181,
 'Google Photos': 15,
 'BOXcloud': 15,
 'Samsung Cloud': 5,
 'qCloud': 4,
 'YouTube': 4}

In [12]:
# Keep only video flows whose label is one of the selected providers.
video_flows = [
    flow
    for flow in video_flows
    if flow.class_type in ("Microsoft", "Fastly", "YouTube", "QQ", "WeChat")
]

In [17]:
# Keep only software-update flows whose label is one of the selected providers.
su_flows = [
    flow
    for flow in su_flows
    if flow.class_type in ("Adobe", "Windows Update", "Apple", "Ubuntu")
]

In [30]:
# Keep only social-media flows whose label is one of the selected providers.
sm_flows = [
    flow
    for flow in sm_flows
    if flow.class_type in ("Facebook", "Reddit", "LinkedIn", "Instagram", "TikTok", "Twitter")
]

In [11]:
# Keep only conferencing flows whose label is one of the selected providers.
conf_flows = [
    flow
    for flow in conf_flows
    if flow.class_type in ("Discord", "WhatsAppVoice", "Microsoft Teams", "GoogleMeet", "Facetime", "Zoom")
]

In [34]:
# Keep only file-storage flows whose label is one of the selected providers.
fs_flows = [
    flow
    for flow in fs_flows
    if flow.class_type in ("Microsoft Sharepoint", "Google Drive", "Dropbox", "Apple iCloud", "OneDrive")
]

In [36]:
# Flows per label among the file-storage flows after the provider filter.
pd.Series(
    [flow.class_type for flow in fs_flows]
).value_counts().to_dict()

{'Microsoft Sharepoint': 6158,
 'Google Drive': 683,
 'Dropbox': 629,
 'Apple iCloud': 524,
 'OneDrive': 181}

In [18]:
# Hand the software-update flows to saveFlows with their target path.
saveFlows(
    flows=su_flows,
    path="data/deployment/deployment_su_packet_rep.json",
)

In [13]:
# Hand the video flows to saveFlows with their target path.
saveFlows(
    flows=video_flows,
    path="data/deployment/deployment_video_packet_rep.json",
)

In [11]:
# Hand every loaded flow to saveFlows with the target path.
# NOTE(review): the per-class lists are filtered from packet_reps and hold the
# same flow objects, so any in-place relabelling done on them before this cell
# runs is written out here as well — confirm which labels this file should carry.
saveFlows(
    flows=packet_reps,
    path="data/deployment/deployment_packet_rep.json",
)

In [31]:
# Hand the social-media flows to saveFlows with their target path.
saveFlows(
    flows=sm_flows,
    path="data/deployment/deployment_sm_packet_rep.json",
)

In [13]:
# Hand the conferencing flows to saveFlows with their target path.
saveFlows(
    flows=conf_flows,
    path="data/deployment/deployment_conf_packet_rep.json",
)

In [37]:
# Hand the file-storage flows to saveFlows with their target path.
saveFlows(
    flows=fs_flows,
    path="data/deployment/deployment_fs_packet_rep.json",
)

In [None]:
# Take the first loaded flow as the sample used by the inspection cells below.
p_rep = packet_reps[0]
iat = p_rep.inter_arrival_times
# Convert the inter-arrival times back to timestamps with the project helper.
# NOTE(review): `ts` is rebound to a timeslot representation in the next cell,
# so this value is only available until that cell runs.
ts = getTimeStampsFromIAT(iat)

In [91]:
# Convert the sub-flow getSubFlow(0, 30) of the sample flow to a timeslot representation.
# NOTE(review): flow lengths are normalised by 1500 (capped at 2.0) in
# DeploymentProcessor.processLength, while the band threshold here is 1250 —
# confirm which unit the converter expects.
ts = convertPacketRepToTimeslotRepEffecient(
    packet_flow_rep=p_rep.getSubFlow(0, 30),
    grain=0.001,
    band_thresholds=[1250],
)

In [92]:
# Add the down and up packet counts element-wise, then sum along axis 0.
np.add(ts.down_packets, ts.up_packets).sum(axis=0)

array([ 2,  6,  1,  1,  2,  3, 15])

In [93]:
# Display every packet length of the sample flow (values as produced by
# DeploymentProcessor.processLength: length / 1500, capped at 2.0).
p_rep.lengths

[0.09733333333333333,
 0.044,
 0.262,
 0.08533333333333333,
 0.04,
 0.09733333333333333,
 0.04,
 0.056666666666666664,
 0.044,
 0.04,
 0.056666666666666664,
 0.04,
 1.0093333333333334,
 1.9826666666666666,
 2.0,
 2.0,
 2.0,
 2.0,
 1.3093333333333332,
 1.0093333333333334,
 2.0,
 2.0,
 0.04,
 0.04,
 0.04,
 0.04,
 0.04,
 0.04,
 0.04,
 0.04,
 0.04,
 0.04,
 0.04,
 0.04,
 0.04,
 0.04,
 1.9826666666666666,
 2.0,
 0.04,
 1.0093333333333334,
 0.04,
 2.0,
 0.04,
 2.0,
 2.0,
 2.0,
 1.0093333333333334,
 1.0093333333333334,
 2.0,
 2.0,
 0.04,
 2.0,
 1.0093333333333334,
 2.0,
 0.04,
 0.04,
 1.0093333333333334,
 2.0,
 2.0,
 0.04,
 1.9826666666666666,
 0.04,
 0.04,
 1.0093333333333334,
 2.0,
 2.0,
 1.9826666666666666,
 0.04,
 0.04,
 1.0093333333333334,
 0.04,
 0.04,
 2.0,
 0.04,
 0.04,
 0.04,
 0.04,
 0.04,
 1.0093333333333334,
 0.04,
 2.0,
 2.0,
 0.04,
 0.04,
 0.04,
 1.0093333333333334,
 2.0,
 2.0,
 0.04,
 0.04,
 0.04,
 0.04,
 0.04,
 1.0093333333333334,
 0.04,
 2.0,
 2.0,
 0.04,
 0.04,
 0.04,
 0.04,
 