In [1]:
from scipy.signal import argrelextrema

def calc_scalogram(data, scales):
    """Return the scalogram of ``data`` evaluated at ``scales``.

    Delegates to ``scalogramCWT`` and keeps only the first element of the
    pair it returns; the second element is discarded.
    """
    scalogram, _returned_scales = scalogramCWT(data, scales)
    return scalogram
def show_scalo(data, scales, colors):
    """Plot every series in ``data`` against ``scales`` and show the figure.

    ``colors[k]`` is handed to ``plt.plot`` as the format argument of the
    k-th series; every line is drawn with a width of 3.
    """
    n_series = len(data)
    for k in range(n_series):
        series, fmt = data[k], colors[k]
        plt.plot(scales, series, fmt, lw=3)
    plt.show()
    
# Collect (up to) five local extrema of a scalogram as a flat feature vector.
def get_spikes(scalo, comparator, scale_values=None):
    """Return five local extrema of ``scalo`` as ``[values..., scales...]``.

    Parameters
    ----------
    scalo : array-like
        Scalogram to search. Assumed to be 1-D (one value per scale) -- TODO
        confirm against ``scalogramCWT``.
    comparator : callable
        Comparison passed to ``scipy.signal.argrelextrema`` (the notebook
        uses ``np.greater`` for maxima and ``np.less`` for minima).
    scale_values : array-like, optional
        Scale axis aligned with ``scalo``. When omitted, the module-level
        ``scales`` variable is used, which is what the original code did
        implicitly.

    Returns
    -------
    numpy.ndarray
        Ten numbers: the five extremum values followed by their five scales.

    Notes
    -----
    Extrema are pushed through a ``deque`` of length 5, so when more than
    five exist only the five found last (highest indices) are kept. The
    kept pairs are ordered by *scale*, descending. Missing slots are padded
    with ``(-1, -1)``.
    NOTE(review): the original comment promised "top X spikes sorted by
    value"; the code never did that. Behaviour is left unchanged here --
    confirm which one is intended.
    """
    if scale_values is None:
        # Backward-compatible fallback to the notebook-level global.
        scale_values = scales

    # Fixed-size window pre-filled with the (-1, -1) padding entries.
    spikes = deque([(-1, -1)] * 5, maxlen=5)

    # argrelextrema returns one index array per dimension; take the first.
    extrema_idx = argrelextrema(scalo, comparator)[0]
    for i in extrema_idx:
        spikes.append((scalo[i], scale_values[i]))

    ordered = sorted(spikes, key=lambda pair: pair[1], reverse=True)

    # np.hstack needs a real sequence: a bare zip iterator is rejected by
    # current NumPy releases, so materialise it first.
    return np.hstack(list(zip(*ordered)))

In [2]:
from itertools import groupby
from scipy import stats
def get_stats_numpy(data, zero):
    """Summarise one series as a flat vector of 14 statistics.

    Parameters
    ----------
    data : sequence of numbers
        The series to summarise (iterated element by element for the
        silence statistics).
    zero : number
        The value that counts as "silence". Elements are compared to it
        with ``==``.

    Returns
    -------
    numpy.ndarray
        ``[mean, median, std, var, skew, kurtosis, p25, p50, p75, p90,
        silences, silence_mean, longest_silence, shortest_silence]`` where
        ``silences`` is the number of elements equal to ``zero`` and the
        last three describe the lengths of consecutive runs of ``zero``.
        All four silence fields are 0 when ``zero`` never occurs.
    """
    mean = np.mean(data)
    median = np.median(data)
    std = np.std(data)
    var = np.var(data)
    skew = stats.skew(data)
    kurt = stats.kurtosis(data)
    percentiles = np.percentile(data, [25, 50, 75, 90])

    # Length of every consecutive run of `zero`, computed in a single pass.
    silence_runs = [sum(1 for _ in run) for value, run in groupby(data) if value == zero]
    silences = sum(silence_runs)

    if silence_runs:
        silence_mean = np.mean(silence_runs)
        longest_silence = max(silence_runs)
        shortest_silence = min(silence_runs)
    else:
        # Without this guard np.mean([]) would yield NaN (plus a warning),
        # which is inconsistent with the 0 used for longest/shortest.
        silence_mean = longest_silence = shortest_silence = 0

    features = np.hstack((mean, median, std, var, skew, kurt, percentiles,
                          silences, silence_mean, longest_silence, shortest_silence))
    return features

In [3]:
# Put it all on a numpy array
def get_features_numpy(info, local_max_up_bytes, local_min_up_bytes, local_max_up_packet, local_min_up_packet, 
                       local_max_down_bytes, local_min_down_bytes, local_max_down_packet, local_min_down_packet, zeros):
    """Build one flat feature row per sample.

    For sample ``i`` the row is the concatenation, in this order, of:
    upstream byte stats, upstream packet stats, upstream byte max/min
    spikes, upstream packet max/min spikes, then the same four groups for
    the downstream direction.

    ``zeros`` supplies the "silence" value per column: index 0/1 are used
    for upstream packets/bytes and index 2/3 for downstream packets/bytes.

    Side effect: switches NumPy printing to ``suppress=True``.

    Returns a list of 1-D arrays, one per entry of ``info``.
    """
    np.set_printoptions(suppress=True)

    rows = []
    for i in range(len(info)):
        upstream = info[i]['up']
        downstream = info[i]['down']
        pieces = (
            get_stats_numpy(upstream['byte_count'], zeros[1]),
            get_stats_numpy(upstream['packet_count'], zeros[0]),
            local_max_up_bytes[i], local_min_up_bytes[i],
            local_max_up_packet[i], local_min_up_packet[i],
            get_stats_numpy(downstream['byte_count'], zeros[3]),
            get_stats_numpy(downstream['packet_count'], zeros[2]),
            local_max_down_bytes[i], local_min_down_bytes[i],
            local_max_down_packet[i], local_min_down_packet[i],
        )
        rows.append(np.hstack(pieces))
    return rows


In [4]:
def get_info(data, size=None):
    """Cut ``data`` into consecutive fixed-length samples.

    Parameters
    ----------
    data : iterable of rows
        Each row is indexed as ``[up packets, up bytes, down packets,
        down bytes]`` (positions 0 to 3).
    size : int, optional
        Number of rows per sample. Defaults to the module-level
        ``sample_size`` so existing ``get_info(data)`` calls behave as
        before.

    Returns
    -------
    list of dict
        One ``{'up': ..., 'down': ...}`` entry per complete sample; each
        side is a ``defaultdict(list)`` with ``'packet_count'`` and
        ``'byte_count'`` lists. Trailing rows that do not fill a whole
        sample are dropped.
    """
    if size is None:
        # Backward-compatible fallback to the notebook-level global.
        size = sample_size

    info = []
    up, down = defaultdict(list), defaultdict(list)
    for row in data:
        up['packet_count'].append(row[0])
        up['byte_count'].append(row[1])
        down['packet_count'].append(row[2])
        down['byte_count'].append(row[3])
        # All four lists grow in lock-step, so one length is enough.
        if len(up['packet_count']) >= size:
            info.append({'up': up, 'down': down})
            up, down = defaultdict(list), defaultdict(list)
    return info

In [5]:
import re

def split_number(s):
    """Split ``s`` into alternating non-digit and digit chunks.

    Runs of digits are kept as their own items; empty pieces produced by
    the split are dropped.
    """
    pieces = re.split(r'(\d+)', s)
    return [piece for piece in pieces if piece]

In [1]:
import os
import numpy as np
from collections import defaultdict, deque
from scalogram import *
import pandas as pd

# Imported once here instead of on every loop iteration.
from sklearn.preprocessing import StandardScaler

base_url = '../../../shared/'
sample_size = 30 # number of intervals for each sample

# Scalogram scale axis. It depends only on sample_size, so it is built once
# up front rather than once per input file.
N = sample_size
dj=1/128
s0=2
J=1/dj * np.log2(0.5*N/s0)
scales=s0*2**(np.arange(J)*dj)

# Column names of the output table, generated in the exact order in which
# get_features_numpy concatenates the features: per direction, 14 stats for
# bytes, 14 for packets, then for bytes and for packets the five spike
# values (_y) and five spike scales (_x) of the maxima followed by the minima.
stat_suffixes = [
    'mean', 'median', 'std', 'var', 'skew', 'kurt',
    'perc25', 'perc50', 'perc75', 'perc90',
    'silences', 'silence_mean', 'longest_silence', 'shortest_silence',
]
names = []
for direction in ('up', 'down'):
    for kind in ('bytes', 'packet'):
        names.extend('%s_%s_%s' % (direction, kind, stat) for stat in stat_suffixes)
    for kind in ('bytes', 'packet'):
        for extremum in ('max', 'min'):
            for axis in ('y', 'x'):
                names.extend('%s_%s_%d%s_%s' % (direction, kind, rank, extremum, axis)
                             for rank in range(1, 6))

for path, subdirs, files in os.walk(base_url):
    for name in files:
        print(name)
        # Only files that sit directly inside a directory called 'dat' are processed.
        if os.path.basename(path) != 'dat':
            continue

        full_path = os.path.join(path, name)
        print(full_path)

        #[up_n_packets, up_n_bytes, up_flag, down_n_packets, down_n_bytes, down_flag]
        data = np.loadtxt(full_path)
        data = np.delete(data, [2,5], 1)  # drop the two flag columns

        # Normalize. An all-zero row is appended before scaling so that the
        # scaled representation of "zero" can be read back afterwards.
        # NOTE(review): that extra row also takes part in the fit.
        data = np.vstack([data,[0,0,0,0]])
        scaler = StandardScaler()
        data = scaler.fit_transform(data)

        zeros = data[-1]
        data = data[:-1]

        # Split data into fixed-length samples
        info = get_info(data)

        # Scalograms: one (bytes, packets) pair per sample and direction
        scalos_up = [
            (calc_scalogram(np.asarray(sample['up']['byte_count']), scales),
             calc_scalogram(np.asarray(sample['up']['packet_count']), scales))
            for sample in info
        ]
        scalos_down = [
            (calc_scalogram(np.asarray(sample['down']['byte_count']), scales),
             calc_scalogram(np.asarray(sample['down']['packet_count']), scales))
            for sample in info
        ]

        # Local maxima / minima of every scalogram
        local_max_up_bytes = [get_spikes(scalo[0], np.greater) for scalo in scalos_up]
        local_min_up_bytes = [get_spikes(scalo[0], np.less) for scalo in scalos_up]
        local_max_up_packet = [get_spikes(scalo[1], np.greater) for scalo in scalos_up]
        local_min_up_packet = [get_spikes(scalo[1], np.less) for scalo in scalos_up]

        local_max_down_bytes = [get_spikes(scalo[0], np.greater) for scalo in scalos_down]
        local_min_down_bytes = [get_spikes(scalo[0], np.less) for scalo in scalos_down]
        local_max_down_packet = [get_spikes(scalo[1], np.greater) for scalo in scalos_down]
        local_min_down_packet = [get_spikes(scalo[1], np.less) for scalo in scalos_down]

        # Put it in a pandas Dataframe and save it to csv
        samples = get_features_numpy(info, local_max_up_bytes, local_min_up_bytes, local_max_up_packet, local_min_up_packet,
                                    local_max_down_bytes, local_min_down_bytes, local_max_down_packet, local_min_down_packet,
                                    zeros)

        df = pd.DataFrame(samples, columns=names)

        # NOTE(review): the output path assumes the file name contains an
        # underscore (name.split('_')[1]); a name without one raises IndexError.
        outdir = 'csv/' + str(sample_size) + 's' + '.'.join(name.split('_')[1].split('.')[:-1]) + '/' + split_number(name)[0] + '/'
        outname =  name.split('.')[0] + '.csv'
        # outdir is nested, so os.mkdir would fail whenever the parent
        # directory is missing; makedirs creates the whole chain.
        os.makedirs(outdir, exist_ok=True)

        label = name.split('/')[-1].split('.')[0]
        df['label'] = label
        print(label)
        df.to_csv(os.path.join(outdir, outname), sep=',', encoding='utf-8')


SyntaxError: invalid syntax (<ipython-input-1-2acb7edf8205>, line 50)

In [7]:
df

NameError: name 'df' is not defined