In [3]:
"""
@author: Yuqiang (Ethan) Heng
"""
import numpy as np
import pandas as pd
from tqdm import tqdm
from os import listdir
from os.path import isfile, join

scenario = 'random' #deterministic, random or wild
if scenario == 'random':
    mypath = 'data/UTMobileNet2021/Randomized Automated Data'
elif scenario == 'deterministic':
    mypath = 'data/UTMobileNet2021/Deterministic Automated Data'
elif scenario == 'wild':
    mypath = 'data/UTMobileNet2021/Wild Test Data'
else:
    raise NameError('Dataset Not Supported')

onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
apps = np.unique([f.split('_')[0] for f in onlyfiles])
print(apps, len(apps), len(onlyfiles))
app_actions = np.unique(['_'.join(f.split('_')[:2]) for f in onlyfiles])
print(app_actions, len(app_actions))

sel_apps = apps
sel_app_files = {i:[] for i in sel_apps}

for fname in onlyfiles:
    app_name = fname.split('_')[0]
    if app_name in sel_apps:
        sel_app_files[app_name].append(fname)

['dropbox' 'facebook' 'gmail' 'google-drive' 'hulu' 'instagram'
 'messenger' 'netflix' 'pandora' 'pinterest' 'reddit' 'spotify' 'twitter'
 'youtube'] 14 288
['dropbox_download' 'dropbox_upload' 'facebook_scroll-newsfeed'
 'facebook_search-page' 'gmail_open-email' 'gmail_send-email'
 'google-drive_download' 'google-drive_upload' 'hulu_scroll-home'
 'hulu_watch-video' 'instagram_IgSearchBrowse' 'instagram_send-message'
 'messenger_send-message' 'netflix_browse-home' 'netflix_watch-video'
 'pandora_play-music' 'pandora_search-music' 'pinterest_tap-board'
 'reddit_browse' 'reddit_post' 'spotify_play-music' 'spotify_search-music'
 'twitter_post-tweet' 'twitter_scroll-feed' 'twitter_send-message'
 'youtube_play-video' 'youtube_search'] 27


In [16]:
flow_columns = ['ip.src', 'srcport', 'ip.dst', 'dstport', 'protocal']

def get_protocal(row):
    if not pd.isnull(row['tcp.len']):
        return 'TCP'
    elif not pd.isnull(row['udp.length']):
        return 'UDP'
    else:
        return 'Unknown'
    
def get_srt_port(row):
    if not pd.isnull(row['tcp.len']):
        return row['tcp.srcport']
    elif not pd.isnull(row['udp.length']):
        return row['udp.srcport']
    else:
        return 'Unknown'
    
def get_dst_port(row):
    if not pd.isnull(row['tcp.len']):
        return row['tcp.dstport']
    elif not pd.isnull(row['udp.length']):
        return row['udp.dstport']
    else:
        return 'Unknown'
    
columns = ['frame.number','frame.time','frame.len','frame.cap_len','ip.hdr_len',
           'ip.dsfield.ecn','ip.len','ip.frag_offset','ip.ttl','ip.proto','ip.src',
           'ip.dst','tcp.hdr_len','tcp.len','tcp.srcport','tcp.dstport','tcp.flags.ns',
           'tcp.flags.fin','tcp.window_size_value','tcp.urgent_pointer','tcp.option_kind',
           'tcp.option_len','udp.srcport','udp.dstport','udp.length']

def compute_flow_features(df):
    flow_features = {}
    flow_features['total_num_pkts'] = len(df)
    pkt_size = df['ip.len'].astype(float)
    flow_features['total_num_bytes'] = pkt_size.sum()
    flow_features['min_pkt_size'] = pkt_size.min()
    flow_features['max_pkt_size'] = pkt_size.max()
    flow_features['mean_pkt_size'] = pkt_size.mean()
    flow_features['std_pkt_size'] = pkt_size.std()
    # pd.to_datetime(df["frame.time"].apply(lambda x : x.replace("CDT","").replace("CST","").strip()))
    iat = pd.to_datetime(df['frame.time'].apply(lambda x : x.replace("CDT","").replace("CST","").strip())).diff(1).dt.total_seconds().iloc[1:]
    flow_features['min_iat'] = iat.min()
    flow_features['max_iat'] = iat.max()
    flow_features['mean_iat'] = iat.mean()
    flow_features['std_iat'] = iat.std()
    flow_features['dur'] = iat.sum()
    return flow_features

def process_df_by_flow(df):
    df['protocal'] = df.apply(lambda row: get_protocal(row), axis=1)
    df['srcport'] = df.apply(lambda row: get_srt_port(row), axis=1)
    df['dstport'] = df.apply(lambda row: get_dst_port(row), axis=1)  
    df_flow = pd.DataFrame()
    flow_columns = ['ip.src', 'srcport', 'ip.dst', 'dstport', 'protocal']
    ul_flows = {}
    dl_flows = {}
    for flow, flow_df in df.groupby(by=flow_columns):
        print(flow)
        print(flow_df)
        if flow[0].split('.')[0] == '10':
            ul_flows[flow] = compute_flow_features(flow_df)
        else:
            dl_flows[flow] = compute_flow_features(flow_df)
    for ul_flow, ul_flow_features in ul_flows.items():
        for dl_flow, dl_flow_features in dl_flows.items():
            if (ul_flow[0] == dl_flow[2]) & (ul_flow[2] == dl_flow[0]) & (ul_flow[1] == dl_flow[3]) & (ul_flow[3] == dl_flow[1]) & (ul_flow[4] == dl_flow[4]):
                ul_flow_features = {'ul_'+feature_name:feature for feature_name,feature in ul_flow_features.items()}
                dl_flow_features = {'dl_'+feature_name:feature for feature_name,feature in dl_flow_features.items()}
                bi_flow_features = {**ul_flow_features,**dl_flow_features}
                bi_flow_features['ip_A'] = ul_flow[0]
                bi_flow_features['port_A'] = ul_flow[1]
                bi_flow_features['ip_B'] = ul_flow[2]
                bi_flow_features['port_B'] = ul_flow[3]
                bi_flow_features['protocal'] = ul_flow[4]
                print(bi_flow_features)
                df_flow = pd.concat([df_flow,pd.DataFrame(bi_flow_features)],ignore_index= True)#df_flow.append(bi_flow_features, ignore_index=True)
    return df_flow

def clean_up_duplicate(row):
    if len(str(row['ip.hdr_len']).split(','))>1:
        row['ip.hdr_len'] = str(row['ip.hdr_len']).split(',')[1]
    if len(str(row['ip.len']).split(','))>1:
        row['ip.len'] = str(row['ip.len']).split(',')[1]
    else:
        row['ip.len'] = str(row['ip.len']).split(',')[0]
    if len(row['ip.src'].split(','))>1:
        row['ip.src'] = row['ip.src'].split(',')[1]
    if len(row['ip.dst'].split(','))>1:
        row['ip.dst'] = row['ip.dst'].split(',')[1]
    return row

In [17]:
df_all = pd.DataFrame()
for app in sel_apps:
    integrity = True
    df_app = pd.DataFrame()
    for fname in sel_app_files[app]:
        action = fname.split('_')[1]
        df = pd.read_csv(join(mypath,fname),usecols = columns,low_memory=False)
        df = df[df['ip.src'].notna()]
        
        df = df.apply(lambda row:clean_up_duplicate(row),axis=1)
        
        # Remove self loop pkts
        df = df[(df['ip.src']!='127.0.0.1') & (df['ip.dst']!='127.0.0.1')]
        try:
            df_flow = process_df_by_flow(df)
            df_flow['action'] = action
            df_app = pd.concat([df_app,df_flow])#df_app.append(df_flow)
            assert False
        except Exception as e:
            print(e)
            integrity = False
            print('\n Error while processing {}. \n'.format(fname))
            assert False

    df_app['app'] = app
    
    if integrity:
        df_all = df_all.append(df_app)
        
#df_all.to_csv('./Processed Data/{}_scenario_bi_flow_features.csv'.format(scenario))
print('Finished processing {} scenario data.'.format(scenario))

('10.146.66.8', 37879.0, '162.125.18.133', 443.0, 'TCP')
     frame.number                           frame.time  frame.len  \
172           173  Apr 30, 2019 08:20:34.125882000 CDT         76   
178           179  Apr 30, 2019 08:20:34.168755000 CDT         68   
179           180  Apr 30, 2019 08:20:34.169692000 CDT        237   
188           189  Apr 30, 2019 08:20:34.213768000 CDT         68   
190           191  Apr 30, 2019 08:20:34.213866000 CDT         68   
192           193  Apr 30, 2019 08:20:34.213913000 CDT         68   
196           197  Apr 30, 2019 08:20:34.272388000 CDT        161   
198           199  Apr 30, 2019 08:20:34.316007000 CDT        207   
206           207  Apr 30, 2019 08:20:34.354660000 CDT         68   
208           209  Apr 30, 2019 08:20:34.359010000 CDT        846   
212           213  Apr 30, 2019 08:20:34.441134000 CDT         68   
233           234  Apr 30, 2019 08:20:41.945830000 CDT        110   

     frame.cap_len  ip.hdr_len  ip.dsfield.ec

AssertionError: 

0