In [2]:
import time

import numpy as np
import pandas as pd

In [24]:
def store_smallest(reservoir, tags, new_ip, new_tag, k):
    """Add one (item, tag) pair and keep the k pairs with the smallest tags.

    reservoir, tags: parallel arrays holding the items kept so far and the
        tag that was assigned to each of them.
    new_ip, new_tag: the item to add and its tag.
    k: maximum number of pairs to keep.

    Returns a (reservoir, tags) tuple of new arrays ordered by ascending tag;
    the arrays passed in are not modified.
    """
    candidates = np.append(reservoir, new_ip)
    candidate_tags = np.append(tags, new_tag)

    # Positions of the (at most) k smallest tags, in ascending tag order.
    keep = np.argsort(candidate_tags)[:k]
    return candidates[keep], candidate_tags[keep]

In [25]:
# Input: path of the labeled netflow capture read by the cells below.
data_file = '../data/capture20110811.pcap.netflow_43_.labeled'

# Only flows whose source IP equals this address are analysed.
infected = '147.32.84.165'

# Reservoir sizes to compare, and the length of the reported top list.
k_s = [100, 1000, 5000]
n = 10

In [26]:
# Read the capture into memory once so every sampling run below replays
# exactly the same sequence of records.
with open(data_file, "r") as ins:
    lines = list(ins)

In [46]:
# Min-wise sampling benchmark: for every reservoir size k, replay the records
# in `lines`, give each flow sent by `infected` a uniform random tag and keep
# the k flows with the smallest tags. The destination IPs that survive are
# used to estimate the top-n destination frequencies.

# Number of repetitions over which the run time per k is averaged.
iterations = 10
# Fixed seed so the random tags (and therefore the samples) are reproducible.
seed = 42
np.random.seed(seed)
run_times = np.zeros([iterations, len(k_s)])

for it in range(iterations):
    # `estimates` is rebuilt on every repetition, so the cells below only see
    # the estimates of the last one; earlier repetitions feed the timing only.
    estimates = []
    for k_i, k in enumerate(k_s):
        reservoir = np.array([])
        tags = np.array([])

        # perf_counter is a monotonic, high-resolution clock meant for
        # measuring intervals; the parsing below is part of the timed work.
        start = time.perf_counter()
        for line in lines:
            parts = line.split()
            # parts[4] and parts[6] are read as "ip:port" source/destination
            # fields; skip blank or truncated records instead of raising
            # IndexError on them.
            if len(parts) < 7:
                continue
            ip_src = parts[4].split(':')[0]

            if ip_src == infected:
                ip_dst = parts[6].split(':')[0]

                r = np.random.rand()
                (reservoir, tags) = store_smallest(reservoir, tags, ip_dst, r, k)

        stop = time.perf_counter()
        run_times[it, k_i] = stop - start

        ips_estimated, counts = np.unique(reservoir, return_counts=True)
        ind = np.argsort(-counts)[:n]
        # Normalise by the number of sampled flows, which is smaller than k
        # when the stream holds fewer than k matching flows (dividing by k
        # would underestimate every frequency). max(..., 1) guards the case
        # of an empty sample.
        sample_size = max(len(reservoir), 1)
        estimates.append({
            'ips': ips_estimated[ind],
            'freqs': counts[ind] / sample_size
        })

array([16.74789014, 19.55251765, 32.33119361])

## Figure out true top 10

In [22]:
# Exact reference counts: for every flow whose source is `infected`, count
# how often each destination IP occurs, plus the total number of such flows.
ips = {}
infected_flow_count = 0
# Use the shared `data_file` constant so this cell always reads the same
# capture as the sampling cells.
with open(data_file, "r") as ins:
    for line in ins:
        parts = line.split()
        # parts[4] and parts[6] are read as "ip:port" source/destination
        # fields; skip blank or truncated records instead of raising
        # IndexError on them.
        if len(parts) < 7:
            continue
        ip_src = parts[4].split(':')[0]
        if ip_src == infected:
            ip_dst = parts[6].split(':')[0]
            ips[ip_dst] = ips.get(ip_dst, 0) + 1
            infected_flow_count += 1

In [34]:
# Turn the exact counts into parallel arrays, then keep the n destinations
# with the highest counts together with their relative frequencies.
ips_ip = np.array(list(ips))
ips_count = np.array([ips[ip] for ip in ips])

# Negating the counts makes argsort order them from largest to smallest.
ind = np.argsort(-ips_count)[:n]
true = {
    'ips': ips_ip[ind],
    'freqs': ips_count[ind] / infected_flow_count,
}

## Build output table

In [38]:
# Print one tab-separated row per rank: rank, true IP and frequency, followed
# by the estimated IP and frequency for every reservoir size in k_s.
# The row count is bounded by the number of true entries, and ranks missing
# from an estimate are shown as '-', so a sample with fewer than n distinct
# IPs no longer raises IndexError.
n_rows = min(n, len(true['ips']))
for rank in range(n_rows):
    row = [rank + 1, true['ips'][rank], round(true['freqs'][rank], 3)]
    for j, _ in enumerate(k_s):
        estimate = estimates[j]
        if rank < len(estimate['ips']):
            row.append(estimate['ips'][rank])
            row.append(round(estimate['freqs'][rank], 3))
        else:
            row.extend(['-', '-'])
    print("\t".join([str(cell) for cell in row]))

1	193.23.181.44	0.136	193.23.181.44	0.13	193.23.181.44	0.146	193.23.181.44	0.125
2	174.128.246.102	0.076	174.128.246.102	0.1	174.37.196.55	0.081	174.128.246.102	0.083
3	174.37.196.55	0.074	67.19.72.206	0.08	67.19.72.206	0.069	174.37.196.55	0.078
4	67.19.72.206	0.069	174.37.196.55	0.08	174.128.246.102	0.061	67.19.72.206	0.076
5	72.20.15.61	0.066	72.20.15.61	0.06	72.20.15.61	0.052	72.20.15.61	0.065
6	173.236.31.226	0.038	46.4.36.120	0.06	173.236.31.226	0.044	173.236.31.226	0.039
7	184.154.89.154	0.037	173.236.31.226	0.05	46.4.36.120	0.041	184.154.89.154	0.036
8	46.4.36.120	0.036	184.154.89.154	0.04	184.154.89.154	0.033	46.4.36.120	0.034
9	147.32.80.9	0.017	184.82.147.252	0.03	217.163.21.37	0.019	147.32.80.9	0.018
10	217.163.21.37	0.015	217.163.21.41	0.02	217.163.21.36	0.017	217.163.21.37	0.016


## Recall

In [41]:
def freq_distance(true, estimate):
    '''
    Measure the distance between the frequencies in the ground-truth list
    and the frequencies in an estimated list.

    true, estimate: mappings with two parallel sequences under the keys
    'ips' and 'freqs'.

    For every ground-truth IP the absolute difference between its true and
    estimated frequency is added; an IP that is missing from the estimate
    contributes its full true frequency. Returns the summed distance.
    '''
    estimated_freq = {
        ip: estimate['freqs'][pos] for pos, ip in enumerate(estimate['ips'])
    }

    return sum(
        abs(true['freqs'][pos] - estimated_freq[ip])
        if ip in estimated_freq
        else true['freqs'][pos]
        for pos, ip in enumerate(true['ips'])
    )

In [52]:
# Summary per reservoir size: recall (share of the true top-n IPs that also
# appear in the estimated top-n), frequency distance and mean run time.
header = ['k','recall','freq distance','runtime']
print("\t".join(header))

# Average over the repetitions once; one entry per reservoir size.
mean_run_times = np.mean(run_times, axis=0)
for j, k in enumerate(k_s):
    shared_ips = np.intersect1d(true['ips'], estimates[j]['ips'])
    recall = len(shared_ips) / float(n)
    freq_score = round(freq_distance(true, estimates[j]), 4)
    run_time = round(mean_run_times[j], 3)
    print("{}\t{}\t{}\t{} s".format(k, recall, freq_score, run_time))

k	recall	freq distance	runtime
100	0.8	0.1798	16.748 s
1000	0.8	0.0802	19.553 s
5000	1.0	0.0279	32.331 s
