In [44]:
import pandas as pd
import ipaddress
import json
import os
from fastFlow.flowprintOptimal.sekigo.flowUtils.commons import getIATFromTimeStamps, saveFlows
from fastFlow.flowprintOptimal.sekigo.core.flowRepresentation import PacketFlowRepressentation


In [86]:
def getIpToProv(file_path = "data/conf_prov_to_ips.json"):
    """
    Invert the provider -> entries JSON mapping stored at `file_path`.

    The JSON document is loaded as a mapping whose keys are provider names and
    whose values are iterated entry by entry; every entry becomes a key of the
    returned dict, pointing at its provider.

    When an entry appears more than once, the first provider that listed it is
    kept and the marker "Fllo" is printed for each later occurrence.

    Returns:
        dict mapping each entry (IP) to the provider it was first listed under.
    """
    with open(file_path, "r") as handle:
        ips_by_provider = json.load(handle)

    provider_by_ip = {}
    for provider, addresses in ips_by_provider.items():
        for address in addresses:
            if address not in provider_by_ip:
                provider_by_ip[address] = provider
            else:
                # Duplicate entry: the earlier assignment wins, only the marker is emitted.
                print("Fllo")
    return provider_by_ip


def getDfFromCSV(csv_path, timestamp_columns = []):
    """
    Read a CSV into a DataFrame, drop rows containing missing values and parse
    the requested timestamp columns.

    Args:
        csv_path: path handed to `pd.read_csv`.
        timestamp_columns: names of columns holding timestamp strings. Each value is
            reduced to its first two space-separated tokens (anything after them is
            discarded), padded with ".000000" when the second token has no fractional
            part, and then converted with `pd.to_datetime`. The default list is only
            iterated, never mutated.

    Returns:
        The cleaned DataFrame with the listed columns converted to datetimes.
    """
    def _normalise(raw):
        # Keep only the first two tokens; the second one is the time-of-day part.
        tokens = raw.split(" ")[:2]
        normalised = " ".join(tokens)
        # Pad times without a fractional part so every value has the same layout.
        if "." not in tokens[1]:
            normalised += ".000000"
        return normalised

    frame = pd.read_csv(csv_path).dropna()

    for timestamp_column in timestamp_columns:
        key = f"{timestamp_column}"
        frame[key] = pd.to_datetime(frame[key].apply(_normalise))

    return frame

def processFiveTuple(df,ip_to_prov_path):
    """
    Expand the `FiveTuple` column of `df` into separate columns and label each row
    with a provider.

    Each `FiveTuple` string is split on "->" into exactly three parts. The first two
    are split on ":" into exactly an address and a port (ports stay strings). The
    protocol is parsed from the third part: the text after its first ":" with the
    final character removed and surrounding whitespace stripped, converted to int.

    The provider is looked up in the mapping returned by `getIpToProv` — source
    address first, then destination address, otherwise "UNK".

    Two boolean columns, `src_unsw` and `dst_unsw`, flag addresses starting with
    "129.94".

    `df` is modified in place (new columns added, `FiveTuple` dropped) and the same
    object is returned.
    """
    provider_by_ip = getIpToProv(file_path= ip_to_prov_path)

    def _has_129_94_prefix(address):
        return address.startswith("129.94")

    # Collected per-row values, keyed by the column they will be written to
    # (insertion order fixes the column order).
    parsed = {name: [] for name in ("src_ip", "src_port", "dst_ip", "dst_port", "protocol", "provider")}

    for five_tuple in df["FiveTuple"]:
        source, destination, protocol_field = five_tuple.split("->")
        source_ip, source_port = source.split(":")
        destination_ip, destination_port = destination.split(":")
        # Text after the first ":", minus its last character, is the protocol number.
        protocol_number = int(protocol_field.split(":")[1][:-1].strip())

        parsed["src_ip"].append(source_ip)
        parsed["src_port"].append(source_port)
        parsed["dst_ip"].append(destination_ip)
        parsed["dst_port"].append(destination_port)
        parsed["protocol"].append(protocol_number)
        # Source address takes precedence over destination when both are known.
        parsed["provider"].append(
            provider_by_ip.get(source_ip, provider_by_ip.get(destination_ip, "UNK"))
        )

    for column, values in parsed.items():
        df[column] = values

    df["src_unsw"] = df["src_ip"].apply(_has_129_94_prefix)
    df["dst_unsw"] = df["dst_ip"].apply(_has_129_94_prefix)

    df.drop(columns= ["FiveTuple"], inplace= True)
    return df

def getFlowIDToProv(df):
    """
    Build a dict from each row's `FlowId` to that row's `provider`.

    Rows are visited in positional order. A repeated `FlowId` trips an assertion,
    so the returned mapping is only produced when every flow id is unique.
    """
    provider_by_flow = {}
    for position in range(len(df)):
        record = df.iloc[position]
        key = record.FlowId

        # Each flow id must appear exactly once.
        assert key not in provider_by_flow

        provider_by_flow[key] = record.provider

    return provider_by_flow
    

In [87]:
def extractRepresentationFeatures(df):
    """
    Pull the per-packet feature sequences out of `df`, in row order.

    Reads the `Direction`, `Timestamp` and `PacketLength` columns. Directions and
    lengths are converted to `int`; the timestamps are handed to
    `getIATFromTimeStamps` and its result is returned in their place.

    Returns:
        (directions, IATs, lengths)
    """
    directions = [int(value) for value in df["Direction"]]
    lengths = [int(value) for value in df["PacketLength"]]
    arrival_times = list(df["Timestamp"])

    return directions, getIATFromTimeStamps(arrival_times), lengths

    

In [88]:
def getFlowReps(packets_df : pd.DataFrame,flow_id_to_prov):
    """
    Build one `PacketFlowRepressentation` per labelled flow in `packets_df`.

    Packets are grouped by their `FlowID` column. Groups whose id is not a key of
    `flow_id_to_prov` are skipped. For the rest, packets are ordered by `Timestamp`,
    the feature sequences are extracted with `extractRepresentationFeatures`, and
    the provider label is passed as both `class_type` and `provider_type`.

    Args:
        packets_df: per-packet DataFrame; it is not modified.
        flow_id_to_prov: mapping from flow id to provider label.

    Returns:
        List of flow representations, in the order the groups are yielded by groupby.
    """
    flow_reps = []
    # Group by the column *name*, not a one-element list: with a list, pandas >= 2.0
    # yields 1-tuples as keys while older versions yield scalars, so indexing the key
    # with [0] is version dependent (and silently truncates string ids on old pandas).
    for flow_id, flow_packets in packets_df.groupby("FlowID"):
        if flow_id not in flow_id_to_prov:
            continue
        prov = flow_id_to_prov[flow_id]
        # Sort into a new frame; sorting a group slice in place can trigger
        # SettingWithCopyWarning.
        ordered_packets = flow_packets.sort_values(by= "Timestamp")
        directions,IATs,lengths  = extractRepresentationFeatures(ordered_packets)
        flow_rep = PacketFlowRepressentation(lengths= lengths, directions= directions, inter_arrival_times= IATs,class_type=prov, provider_type= prov)
        flow_reps.append(flow_rep)
    return flow_reps


In [89]:
def getRepresentation(flows_path, packets_path, ip_to_prov_path = "data/conf_prov_to_ips.json"):
    """
    Turn one flows CSV / packets CSV pair into a list of labelled flow representations.

    Steps:
      1. Load the packets CSV (parsing `Timestamp`) and the flows CSV (parsing
         `StartTime` and `EndTime`) with `getDfFromCSV`.
      2. Expand the flows' five-tuples and attach provider labels with `processFiveTuple`.
      3. Discard flows whose provider is "UNK".
      4. Map flow ids to providers and build the representations with `getFlowReps`.

    Args:
        flows_path: path of the flows CSV.
        packets_path: path of the packets CSV.
        ip_to_prov_path: path of the provider -> IPs JSON file forwarded to
            `processFiveTuple`. Defaults to the location that used to be hard-coded here.

    Returns:
        The list returned by `getFlowReps`.
    """
    # Packets are loaded before flows, matching the original evaluation order.
    packets_df = getDfFromCSV(csv_path= packets_path, timestamp_columns= ["Timestamp"])
    flow_df = getDfFromCSV(csv_path= flows_path, timestamp_columns= ["StartTime", "EndTime"])

    flow_df = processFiveTuple(flow_df, ip_to_prov_path= ip_to_prov_path)
    # Only flows that could be attributed to a provider are kept.
    flow_df  = flow_df[flow_df.provider != "UNK"]

    flow_id_to_prov = getFlowIDToProv(flow_df)
    packet_flow_reps = getFlowReps(packets_df=packets_df ,flow_id_to_prov= flow_id_to_prov)
    return packet_flow_reps

    
    

In [90]:
class DataPath:
    """Holds the path of a flows file and the path of its matching packets file."""

    def __init__(self, flows_path, packets_path):
        # Store both paths, then assert that each one exists on disk
        # (flows path is checked first; the check short-circuits on the first miss).
        self.flows_path, self.packets_path = flows_path, packets_path
        assert all(os.path.exists(path) for path in (self.flows_path, self.packets_path))

In [91]:
# Input captures to process: each entry pairs a flows CSV with its packets CSV.
data_paths = [
    DataPath(
        flows_path= "data/output/flows.csv",
        packets_path= "data/output/packets.csv",
    ),
    DataPath(
        flows_path= "data/output/flows24.csv",
        packets_path= "data/output/packets24.csv",
    ),
]

In [92]:
# Accumulate the flow representations of every capture into one flat list.
representations = []
for data_path in data_paths:
    representations.extend(
        getRepresentation(flows_path= data_path.flows_path, packets_path= data_path.packets_path)
    )

Fllo
Fllo
Fllo
Fllo


  df = pd.read_csv(csv_path).dropna()


Fllo
Fllo
Fllo
Fllo


In [93]:
# Class label of every collected flow representation, in collection order.
types = [representation.class_type for representation in representations]

In [94]:
# Number of collected flow representations per class label.
pd.Series(types).value_counts()

WhatsAppVoice      11655
Facebook             589
GoogleMeet           586
Signal               472
Microsoft Teams      191
Discord               29
SkypeCall             14
Telegram              10
Name: count, dtype: int64

In [None]:
# Persist every collected flow representation. `packet_flow_reps` only exists as a
# local inside getRepresentation, so the notebook-level list `representations`
# is what must be saved here.
saveFlows("data/dummy/dummy.json",representations)

In [95]:
# Inspection cell: display the full IP -> provider mapping built from the default JSON path.
getIpToProv()

Fllo
Fllo
Fllo
Fllo


{'35.213.200.132': 'Discord',
 '66.22.225.63': 'Discord',
 '35.213.245.119': 'Discord',
 '35.213.176.48': 'Discord',
 '66.22.231.165': 'Discord',
 '35.213.247.225': 'Discord',
 '35.213.213.225': 'Discord',
 '66.22.231.51': 'Discord',
 '35.213.194.70': 'Discord',
 '35.213.202.9': 'Discord',
 '35.213.214.28': 'Discord',
 '35.213.240.212': 'Discord',
 '35.213.244.146': 'Discord',
 '35.213.231.80': 'Discord',
 '35.215.167.173': 'Discord',
 '35.215.185.106': 'Discord',
 '35.215.161.239': 'Discord',
 '35.213.217.53': 'Discord',
 '35.213.215.255': 'Discord',
 '35.215.163.248': 'Discord',
 '35.213.199.106': 'Discord',
 '35.213.253.155': 'Discord',
 '35.213.211.177': 'Discord',
 '35.213.217.88': 'Discord',
 '35.213.205.170': 'Discord',
 '35.213.225.253': 'Discord',
 '35.213.218.248': 'Discord',
 '35.213.219.46': 'Discord',
 '35.213.223.182': 'Discord',
 '35.215.160.212': 'Discord',
 '35.213.236.67': 'Discord',
 '35.213.195.178': 'Discord',
 '35.213.249.79': 'Discord',
 '35.213.221.85': 'Discord