In [18]:
import pyshark as ps
import matplotlib.pyplot as plt
from scipy.signal import argrelextrema, find_peaks_cwt
from scipy.stats import stats
import numpy as np
import re
import pprint as pp
from datetime import datetime
from collections import OrderedDict, defaultdict, deque
from scalogram import *

In [19]:
base_pcap = "../../../shared/"
cap = ps.FileCapture(base_pcap + "netflix4.pcapng")
#cap.load_packets()

In [3]:
def get_info(up_pkts, down_pkts):
    

    up_ordered = OrderedDict(sorted(up_pkts.items(), key=lambda t: t[0]))
    down_ordered= OrderedDict(sorted(down_pkts.items(), key=lambda t: t[0]))    

    result = {}
    up = defaultdict(list)
    down = defaultdict(list)
    byte_count = defaultdict(list)

    for pkts in up_ordered.values():
        up['byte_count'].append(sum(int(pkt.captured_length) for pkt in pkts))
        up['packet_count'].append(len(pkts))
    for pkts in down_ordered.values():
        down['byte_count'].append(sum(int(pkt.captured_length) for pkt in pkts))
        down['packet_count'].append(len(pkts))
    result['up'] = up
    result['down'] = down
    return result

In [4]:
private_ip_pattern = re.compile("(^127\.)|(^10\.)|(^172\.1[6-9]\.)|(^172\.2[0-9]\.)|(^172\.3[0-1]\.)|(^192\.168\.)")
step = 30 # batches of 30 seconds
info = []
window = 1 # 1 second
window = 1/window

batch_time = int(cap[0].sniff_time.timestamp() * window)
last_timestamp_up = int(cap[0].sniff_time.timestamp() * window)
last_timestamp_down = int(cap[0].sniff_time.timestamp() * window)

download = defaultdict(list)
upload = defaultdict(list)
download[last_timestamp_up] = []
upload[last_timestamp_up] = []
count = 0

In [5]:
def redirect_packets(pkt):
    if hasattr(pkt, 'ip'):
        global batch_time
        global last_timestamp_up
        global last_timestamp_down
        global download, upload
        if (int(pkt.sniff_time.timestamp() * window) - batch_time) >= step:
            info.append(get_info(upload, download))
            batch_time = int(pkt.sniff_time.timestamp() * window)
            download = defaultdict(list)
            upload = defaultdict(list)
            download[last_timestamp_up] = []
            upload[last_timestamp_up] = []
            last_timestamp_up = int(pkt.sniff_time.timestamp() * window)
            last_timestamp_down = int(pkt.sniff_time.timestamp() * window)
        if private_ip_pattern.match(pkt.ip.src.get_default_value()):
            time_diff = int(pkt.sniff_time.timestamp() * window) - last_timestamp_up
            if time_diff > 1:
                for i in range(1, time_diff):
                    upload[last_timestamp_up + i] = []
            last_timestamp_up = int(pkt.sniff_time.timestamp() * window)
            upload[int(pkt.sniff_time.timestamp() * window)].append(pkt)
        elif private_ip_pattern.match(pkt.ip.dst.get_default_value()):
            time_diff = int(pkt.sniff_time.timestamp() * window) - last_timestamp_down
            if time_diff > 1:
                for i in range(1, time_diff):
                    download[last_timestamp_down + i] = []
            last_timestamp_down = int(pkt.sniff_time.timestamp() * window)
            download[int(pkt.sniff_time.timestamp() * window)].append(pkt)
        else:
            print("Curious!\n", pkt)
    elif hasattr(pkt, 'ipv6'):
        print("not yet implemented")
        # TODO
    global count
    print(count, end="\r")
    count += 1

cap.apply_on_packets(redirect_packets)
pp.pprint(info)

[{'down': defaultdict(<class 'list'>,
                      {'byte_count': [5252,
                                      451342,
                                      184643,
                                      0,
                                      3138,
                                      2073,
                                      6426,
                                      3120,
                                      9697,
                                      215240,
                                      124472,
                                      340027,
                                      1184358,
                                      1404540,
                                      1999708,
                                      2510869,
                                      2526701,
                                      3033052,
                                      3473196,
                                      3781537,
                                      3837407,
    

                                      0,
                                      60,
                                      303911,
                                      0,
                                      60,
                                      0,
                                      251966,
                                      0,
                                      0,
                                      120,
                                      303140,
                                      0,
                                      0,
                                      0,
                                      482408,
                                      0,
                                      0,
                                      0,
                                      261929,
                                      8214,
                                      0,
                                      0,
                                      244623],
                   

                                        0,
                                        16,
                                        0,
                                        0,
                                        0,
                                        27,
                                        0,
                                        0,
                                        0,
                                        42,
                                        0,
                                        0,
                                        0,
                                        37,
                                        0,
                                        0,
                                        0,
                                        34,
                                        1,
                                        0,
                                        0,
                                        25]}),
  'up': defaultdict(<class 'list'>,
         

                    {'byte_count': [0,
                                    214,
                                    214,
                                    2798,
                                    0,
                                    0,
                                    0,
                                    2258,
                                    0,
                                    0,
                                    0,
                                    1826,
                                    0,
                                    0,
                                    0,
                                    2042,
                                    0,
                                    0,
                                    0,
                                    3325,
                                    108,
                                    154,
                                    383,
                                    1826,
                                    

In [6]:
#N=len(data)

def calc_scalogram(data, scales):
    
    S,scales= scalogramCWT(data,scales)
    return S


In [7]:
def show_scalo(data, scales, colors):
    for i in range (0, len(data)):
        plt.plot(scales, data[i], colors[i], lw=3)
    plt.show()

In [8]:
scalos_up = []
scalos_down = []

N = step
dj=1/128
s0=2
J=1/dj * np.log2(0.5*N/s0)
scales=s0*2**(np.arange(J)*dj)

for idx, sample in enumerate(info):
    scalos_up.append(
        (calc_scalogram(np.asarray(sample['up']['byte_count']), scales),
         calc_scalogram(np.asarray(sample['up']['packet_count']), scales))
    )
    scalos_down.append(
        (calc_scalogram(np.asarray(sample['down']['byte_count']), scales),
         calc_scalogram(np.asarray(sample['down']['packet_count']), scales))
    )
#    show_scalo([scalos_down[idx], scalos_up[idx]], scales, ['r', 'b'])
#smooth_down = np.convolve(scalo_down, np.ones(len(scalo_down)), mode='same')
#smooth_up = np.convolve(scalo_up, np.ones(len(scalo_up)), mode='same')
#show_scalo([smooth_down, smooth_up], scales, ['r', 'b'])

#scalo, scales = calc_and_show(np.asarray(stats['down']['packet_count']), 'r')
#scalo, scales = calc_and_show(np.asarray(stats['up']['packet_count']), 'b')

In [9]:
# Get top X spikes from scalogram, sorted by value
def get_spikes(scalo, comparator):
    len(scalo)
    spikes = deque([(-1,-1)] * 5, maxlen=5)
    #aux = argrelextrema(scalo, comparator, order=int(len(scalo)/10))
    aux = argrelextrema(scalo, comparator)
    if aux[0].size:
        for x in np.nditer(aux) or []:
            spikes.append((scalo[x], scales[x]))
    ordered = sorted(spikes, key=lambda x: x[1], reverse=True)
    values = np.hstack(zip(*ordered))
    return values

In [10]:
local_max_up_bytes = []
local_min_up_bytes = []
local_max_up_packet = []
local_min_up_packet = []
local_max_down_bytes = []
local_min_down_bytes = []
local_max_down_packet = []
local_min_down_packet = []


for scalo in scalos_up:
    local_max_up_bytes.append(get_spikes(scalo[0], np.greater))
    local_min_up_bytes.append(get_spikes(scalo[0], np.less))
    local_max_up_packet.append(get_spikes(scalo[1], np.greater))
    local_min_up_packet.append(get_spikes(scalo[1], np.less))

for scalo in scalos_down:
    local_max_down_bytes.append(get_spikes(scalo[0], np.greater))
    local_min_down_bytes.append(get_spikes(scalo[0], np.less))
    local_max_down_packet.append(get_spikes(scalo[1], np.greater))
    local_min_down_packet.append(get_spikes(scalo[1], np.less))

In [11]:
from itertools import groupby
def get_stats_numpy(data):
    mean = np.mean(data)
    median = np.median(data)
    std = np.std(data)
    var = np.var(data)
    skew = stats.skew(data)
    kurt = stats.kurtosis(data)
    pc = [25,50,75]
    percentiles = np.array(np.percentile(data, pc))
    silences = np.count_nonzero(np.asarray(data)==0.0)
    longest_silence = max(sum(1 for _ in g) for k, g in groupby(data) if k==0) if silences > 0 else 0
    shortest_silence = min(sum(1 for _ in g) for k, g in groupby(data) if k==0) if silences > 0 else 0
    
    #print("Mean: " + str(mean))
    #print("Media: " + str(median))
    #print("StdDev: " + str(std))
    #print("Variance: " + str(var))
    #print("Skewness: " + str(skew))
    #print("Kurtosis: " + str(kurt))
    #print("Pc25: " + str(percentiles[0]))
    #print("Pc50: " + str(percentiles[1]))
    #print("Pc75: " + str(percentiles[2]))
    
    features = np.hstack((mean, median, std, var, skew, kurt, percentiles, silences, longest_silence, shortest_silence))
    return features

In [12]:
def get_stats_json(data):
    mean = np.mean(data)
    median = np.median(data)
    std = np.std(data)
    var = np.var(data)
    skew = stats.skew(data)
    kurt = stats.kurtosis(data)
    pc = [25,50,75]
    percentiles = np.array(np.percentile(data, pc))
    silences = np.count_nonzero(np.asarray(data)==0.0)
    longest_silence = max(sum(1 for _ in g) for k, g in groupby(data) if k==0) if silences > 0 else 0
    shortest_silence = min(sum(1 for _ in g) for k, g in groupby(data) if k==0) if silences > 0 else 0
    #print("Mean: " + str(mean))
    #print("Media: " + str(median))
    #print("StdDev: " + str(std))
    #print("Variance: " + str(var))
    #print("Skewness: " + str(skew))
    #print("Kurtosis: " + str(kurt))
    #print("Pc25: " + str(percentiles[0]))
    #print("Pc50: " + str(percentiles[1]))
    #print("Pc75: " + str(percentiles[2]))
    
    statistiscs = {
        'mean': mean,
        'median': median,
        'std': std,
        'var': var,
        'skew': skew,
        'kurt': kurt,
        'pc25': percentiles[0],
        'pc50': percentiles[1],
        'pc75': percentiles[2],
    }
    
    return statistiscs

In [13]:
# Put it all on a numpy array
def get_features_numpy(info):
    np.set_printoptions(suppress=True)

    result = []

    for idx in range(0, len(info)):
        result.append(
            np.hstack(
                (
                 get_stats_numpy(info[idx]['up']['byte_count']),
                 get_stats_numpy(info[idx]['up']['packet_count']),
                 local_max_up_bytes[idx], local_min_up_bytes[idx],
                 local_max_up_packet[idx], local_min_up_packet[idx],
                 get_stats_numpy(info[idx]['down']['byte_count']),
                 get_stats_numpy(info[idx]['down']['packet_count']),
                 local_max_down_bytes[idx], local_min_down_bytes[idx],
                 local_max_down_packet[idx], local_min_down_packet[idx],

            ))
        )
    return result


In [14]:
# Put it all on a json
def get_features_json(info):
    stat = {
        'down': defaultdict(list),
        'up': defaultdict(list)
    }

    result = []
    for idx in range(0, len(info)):
        stat['down']['byte_count'] = get_stats_json(info[idx]['down']['byte_count'])
        stat['down']['packet_count'] = get_stats_json(info[idx]['down']['packet_count'])
        stat['down']['byte_count']['scalo_spikes_max'] =  local_max_down_bytes[idx]
        stat['down']['byte_count']['scalo_spikes_min'] =  local_min_down_bytes[idx]
        stat['down']['packet_count']['scalo_spikes_max'] =  local_max_down_packet[idx]
        stat['down']['packet_count']['scalo_spikes_min'] =  local_min_down_packet[idx]

        stat['up']['byte_count'] = get_stats_json(info[idx]['up']['byte_count'])
        stat['up']['packet_count'] = get_stats_json(info[idx]['up']['packet_count'])
        stat['up']['byte_count']['scalo_spikes_max'] =  local_max_up_bytes[idx]
        stat['up']['byte_count']['scalo_spikes_min'] =  local_min_up_packet[idx]
        stat['up']['packet_count']['scalo_spikes_max'] =  local_max_up_packet[idx]
        stat['up']['packet_count']['scalo_spikes_min'] =  local_min_up_packet[idx]


        result.append(stat)
    return result

In [15]:
import pandas as pd

samples = get_features_numpy(info)

names = [
    'up_bytes_mean', 'up_bytes_median', 'up_bytes_std', 'up_bytes_var', 'up_bytes_skew', 'up_bytes_kurt',
    'up_bytes_perc25', 'up_bytes_perc50', 'up_bytes_perc75',
    'up_bytes_silences', 'up_bytes_longest_silence', 'up_bytes_shortest_silence',
    'up_packet_mean', 'up_packet_median', 'up_packet_std', 'up_packet_var', 'up_packet_skew', 'up_packet_kurt',
    'up_packet_perc25', 'up_packet_perc50', 'up_packet_perc75',
    'up_packet_silences', 'up_packet_longest_silence', 'up_packet_shortest_silence',
    'up_bytes_1max_y', 'up_bytes_2max_y', 'up_bytes_3max_y', 'up_bytes_4max_y', 'up_bytes_5max_y',
    'up_bytes_1max_x', 'up_bytes_2max_x', 'up_bytes_3max_x', 'up_bytes_4max_x', 'up_bytes_5max_x',
    'up_bytes_1min_y', 'up_bytes_2min_y', 'up_bytes_3min_y', 'up_bytes_4min_y', 'up_bytes_5min_y',
    'up_bytes_1min_x', 'up_bytes_2min_x', 'up_bytes_3min_x', 'up_bytes_4min_x', 'up_bytes_5min_x',
    'up_packet_1max_y', 'up_packet_2max_y', 'up_packet_3max_y', 'up_packet_4max_y', 'up_packet_5max_y',
    'up_packet_1max_x', 'up_packet_2max_x', 'up_packet_3max_x', 'up_packet_4max_x', 'up_packet_5max_x',
    'up_packet_1min_y', 'up_packet_2min_y', 'up_packet_2min_y', 'up_packet_4min_y', 'up_packet_5min_y',
    'up_packet_1min_x', 'up_packet_2min_x', 'up_packet_3min_x', 'up_packet_4min_x', 'up_packet_5min_x',

    'down_bytes_mean', 'down_bytes_median', 'down_bytes_std', 'down_bytes_var', 'down_bytes_skew', 'down_bytes_kurt',
    'down_bytes_perc25', 'down_bytes_perc50', 'down_bytes_perc75',
    'down_bytes_silences', 'down_bytes_longest_silence', 'down_bytes_shortest_silence',
    'down_packet_mean', 'down_packet_median', 'down_packet_std', 'down_packet_var', 'down_packet_skew', 'down_packet_kurt',
    'down_packet_perc25', 'down_packet_perc50', 'down_packet_perc75',  
    'down_packet_silences', 'down_packet_longest_silence', 'down_packet_shortest_silence',
    'down_bytes_1max_y', 'down_bytes_2max_y', 'down_bytes_3max_y', 'down_bytes_4max_y', 'down_bytes_5max_y',
    'down_bytes_1max_x', 'down_bytes_2max_x', 'down_bytes_3max_x', 'down_bytes_4max_x', 'down_bytes_5max_x',
    'down_bytes_1min_y', 'down_bytes_2min_y', 'down_bytes_3min_y', 'down_bytes_4min_y', 'down_bytes_5min_y',
    'down_bytes_1min_x', 'down_bytes_2min_x', 'down_bytes_3min_x', 'down_bytes_4min_x', 'down_bytes_5min_x',
    'down_packet_1max_y', 'down_packet_2max_y', 'down_packet_3max_y', 'down_packet_4max_y', 'down_packet_5max_y',
    'down_packet_1max_x', 'down_packet_2max_x', 'down_packet_3max_x', 'down_packet_4max_x', 'down_packet_5max_x',
    'down_packet_1min_y', 'down_packet_2min_y', 'down_packet_2min_y', 'down_packet_4min_y', 'down_packet_5min_y',
    'down_packet_1min_x', 'down_packet_2min_x', 'down_packet_3min_x', 'down_packet_4min_x', 'down_packet_5min_x'
]


df = pd.DataFrame(samples, columns=names)
df

Unnamed: 0,up_bytes_mean,up_bytes_median,up_bytes_std,up_bytes_var,up_bytes_skew,up_bytes_kurt,up_bytes_perc25,up_bytes_perc50,up_bytes_perc75,up_bytes_silences,...,down_packet_1min_y,down_packet_2min_y,down_packet_2min_y.1,down_packet_4min_y,down_packet_5min_y,down_packet_1min_x,down_packet_2min_x,down_packet_3min_x,down_packet_4min_x,down_packet_5min_x
0,19790.517241,18403.0,17693.808858,313070900.0,1.738496,4.673223,3499.0,18403.0,27974.0,2.0,...,0.012942,0.009683,0.006452,0.000497,-1.0,11.436907,8.819061,6.004056,2.723218,-1.0
1,1365.516129,322.0,2713.155464,7361213.0,3.355087,12.177423,27.0,322.0,1425.0,8.0,...,0.001239,0.00744,0.007985,0.00567,-1.0,13.527398,8.771433,7.217817,5.215525,-1.0
2,1971.666667,126.0,4752.565646,22586880.0,3.23976,9.638549,0.0,126.0,1689.0,9.0,...,0.000534,0.001177,0.005169,-1.0,-1.0,13.674702,7.913824,5.187358,-1.0,-1.0
3,5310.83871,383.0,14876.559308,221312000.0,3.472531,10.275353,0.0,383.0,3279.0,9.0,...,0.008523,0.011114,-1.0,-1.0,-1.0,12.337687,8.724062,-1.0,-1.0,-1.0
4,2135.032258,216.0,3549.238887,12597100.0,2.386976,5.370788,0.0,216.0,2884.0,10.0,...,0.001549,0.00107,-1.0,-1.0,-1.0,9.462278,6.269879,-1.0,-1.0,-1.0
5,764.129032,54.0,1312.757962,1723333.0,1.514918,0.867099,0.0,54.0,885.0,13.0,...,0.000479,0.001257,0.001979,-1.0,-1.0,12.745137,9.462278,5.718453,-1.0,-1.0
6,1014.933333,0.0,2252.551412,5073988.0,3.5722,13.754641,0.0,0.0,1594.5,16.0,...,0.001949,0.004661,0.004265,0.003386,-1.0,12.676306,9.209514,5.387767,2.551884,-1.0
7,605.482759,54.0,1022.635764,1045784.0,1.349016,0.086275,0.0,54.0,270.0,13.0,...,0.000107,0.003018,0.002345,-1.0,-1.0,14.672065,6.98707,5.47601,-1.0,-1.0
8,1878.451613,756.0,3469.814273,12039610.0,3.117419,9.279937,0.0,756.0,2121.0,10.0,...,0.000504,0.001583,0.00516,0.003528,-1.0,13.674702,8.0,5.159343,2.607763,-1.0
9,744.258065,54.0,1111.147299,1234648.0,1.614826,1.571731,0.0,54.0,1074.0,14.0,...,0.00048,0.003002,0.003364,-1.0,-1.0,10.207534,6.441961,2.430495,-1.0,-1.0


In [16]:
# Not necessary to have silences in both 'bytes' and 'packet'
df.drop(columns=['down_packet_silences', 'up_packet_silences', 'up_packet_longest_silence', 'up_packet_shortest_silence'], inplace=True)
#df.describe()

In [17]:
df['label'] = 'Netflix'
filename = 'csv/30s1s/netflix4.csv'
df.to_csv(filename, sep=',', encoding='utf-8')