# Sampling

## Imports

In [1]:
import pandas as pd
import numpy as np

## Reservoir update function

In [2]:
def store_smallest(reservoir, tags, new_ip, new_tag, k):
    """Add one (ip, tag) pair to the reservoir and keep the k smallest tags.

    reservoir: array of the ips currently held in the sample
    tags:      array of the tags belonging to those ips, position by position
    new_ip:    candidate ip to insert
    new_tag:   tag drawn for the candidate
    k:         maximum number of entries to keep

    Returns a tuple (reservoir, tags) holding at most k entries, ordered by
    ascending tag. The input arrays are not modified; new arrays are returned.
    """
    # Tentatively admit the candidate, then let the sort decide who stays.
    candidate_tags = np.append(tags, new_tag)
    candidate_ips = np.append(reservoir, new_ip)

    # Positions of the (at most) k smallest tags, smallest first.
    keep = candidate_tags.argsort()[:k]
    return (candidate_ips[keep], candidate_tags[keep])

## MIN-WISE sampling

### Parameters

In [3]:
# Length of the "top n" list of destination ips that is reported.
n = 10
# Reservoir sizes (k) for which an estimate is computed.
k_s = [100, 1000, 5000]
# Source ip whose outgoing flows are sampled.
infected = '147.32.84.165'
# Path of the input file that is read line by line.
data_file = '../data/capture20110811.pcap.netflow_43_.labeled'

### Read data file

In [4]:
# Load the whole file into memory once; the cells below iterate over
# `lines` repeatedly.
with open(data_file, "r") as capture:
    lines = list(capture)

### Compute estimates for different k

In [12]:
import time

# Repeat the whole experiment several times so that the run-time reported
# per k is an average rather than a single measurement.
iterations = 10
run_times = np.zeros([iterations, len(k_s)])

for it in range(iterations):
    # Rebuilt on every iteration: the estimates that remain after the loop
    # are the ones produced by the final iteration.
    estimates = []
    for k_i, k in enumerate(k_s):
        reservoir = np.array([])
        tags = np.array([])

        # perf_counter() is a monotonic, high-resolution clock intended for
        # measuring intervals; time.time() can jump if the system clock is
        # adjusted while the loop runs.
        start = time.perf_counter()
        for line in lines:
            parts = line.split()
            # Skip blank or truncated lines that do not have the columns
            # indexed below instead of failing with an IndexError.
            if len(parts) < 7:
                continue

            ip_src = parts[4].split(':')[0]
            # Only flows sent by the infected host are sampled.
            if ip_src != infected:
                continue

            ip_dst = parts[6].split(':')[0]

            # Draw a uniform random tag; the reservoir keeps the
            # destinations with the k smallest tags seen so far.
            r = np.random.rand()
            reservoir, tags = store_smallest(reservoir, tags, ip_dst, r, k)

        stop = time.perf_counter()
        run_times[it][k_i] = stop - start

        # Keep only the n most frequent destinations in the sample.
        ips_estimated, counts = np.unique(reservoir, return_counts=True)
        ind = np.argsort(-counts)[:n]

        # Normalise by the number of flows actually sampled. This equals k
        # once the reservoir is full, but is smaller when fewer than k
        # matching flows exist; dividing by k would then underestimate
        # every frequency. max(..., 1) avoids a zero divisor when the
        # reservoir is empty.
        sample_size = max(len(reservoir), 1)
        estimates.append({
            'ips': ips_estimated[ind],
            'freqs': counts[ind] / sample_size,
        })

## Obtain ground-truth

In [6]:
# Exact counts over the full file: how many flows the infected host sent to
# each destination ip, plus the total number of such flows.
ips = {}
infected_flow_count = 0
for line in lines:
    parts = line.split()
    # Skip blank or truncated lines that do not have the columns indexed
    # below instead of failing with an IndexError.
    if len(parts) < 7:
        continue

    ip_src = parts[4].split(':')[0]
    # Only flows sent by the infected host are counted.
    if ip_src != infected:
        continue

    ip_dst = parts[6].split(':')[0]

    # dict.get supplies 0 the first time a destination is seen.
    ips[ip_dst] = ips.get(ip_dst, 0) + 1
    infected_flow_count += 1

In [13]:
# Parallel arrays: destination ips and their exact flow counts
# (a dict yields its keys and values in the same order).
ips_ip = np.array(list(ips))
ips_count = np.array(list(ips.values()))

# Positions of the n largest counts, largest first.
ind = (-ips_count).argsort()[:n]

# Ground-truth ranking in the same {'ips', 'freqs'} layout as the estimates;
# frequencies are relative to all flows sent by the infected host.
true = {
    'ips': ips_ip[ind],
    'freqs': ips_count[ind] / infected_flow_count,
}

## Build estimate table

In [14]:
# One row per rank: rank, then an (ip, frequency) pair for the ground truth
# followed by one pair per reservoir size in k_s.
rankings = [true] + [estimates[j] for j in range(len(k_s))]
for i in range(n):
    row = [i + 1]
    for ranking in rankings:
        if i < len(ranking['ips']):
            row.append(ranking['ips'][i])
            row.append(round(ranking['freqs'][i], 3))
        else:
            # A ranking can hold fewer than n distinct ips (e.g. a small
            # reservoir); print placeholders instead of raising IndexError.
            row.extend(['-', '-'])
    print("\t".join([str(x) for x in row]))

1	193.23.181.44	0.136	193.23.181.44	0.12	193.23.181.44	0.142	193.23.181.44	0.145
2	174.128.246.102	0.076	72.20.15.61	0.1	174.37.196.55	0.082	174.37.196.55	0.073
3	174.37.196.55	0.074	174.128.246.102	0.09	67.19.72.206	0.067	174.128.246.102	0.073
4	67.19.72.206	0.069	174.37.196.55	0.09	174.128.246.102	0.066	72.20.15.61	0.066
5	72.20.15.61	0.066	173.236.31.226	0.07	72.20.15.61	0.061	67.19.72.206	0.06
6	173.236.31.226	0.038	67.19.72.206	0.06	46.4.36.120	0.043	184.154.89.154	0.04
7	184.154.89.154	0.037	46.4.36.120	0.05	173.236.31.226	0.041	46.4.36.120	0.035
8	46.4.36.120	0.036	212.117.171.138	0.03	184.154.89.154	0.038	173.236.31.226	0.035
9	147.32.80.9	0.017	209.85.227.27	0.03	147.32.80.9	0.019	217.163.21.37	0.016
10	217.163.21.37	0.015	217.163.21.34	0.02	217.163.21.37	0.019	147.32.80.9	0.016


## Performance per k

### Frequency distance function

In [9]:
def freq_distance(true, estimate):
    """Sum of frequency errors of an estimate over the ground-truth ips.

    true, estimate: dicts with two parallel sequences under the keys
    'ips' and 'freqs'.

    For every ip in true['ips'] the absolute difference between its true
    and estimated frequency is added; an ip that is absent from the
    estimate contributes its full true frequency. Ips that occur only in
    the estimate do not contribute.
    """
    # ip -> estimated frequency lookup.
    estimated_freq = {
        ip: estimate['freqs'][pos] for pos, ip in enumerate(estimate['ips'])
    }
    return sum(
        abs(true['freqs'][pos] - estimated_freq[ip])
        if ip in estimated_freq
        else true['freqs'][pos]
        for pos, ip in enumerate(true['ips'])
    )

### Print recall, freq_distance and runtime

In [10]:
# Header row of the summary table.
print("\t".join(['k','recall','freq distance','runtime']))

# Average run-time per reservoir size over all timing iterations.
mean_run_times = np.mean(run_times, axis=0)

for j, k in enumerate(k_s):
    estimate = estimates[j]
    # Recall: share of the true top-n ips that also appear in the estimate.
    shared_ips = np.intersect1d(true['ips'], estimate['ips'])
    recall = float(len(shared_ips)) / float(n)
    freq_score = round(freq_distance(true, estimate), 4)
    run_time = round(mean_run_times[j], 3)
    print("{}\t{}\t{}\t{} s".format(k, recall, freq_score, run_time))

k	recall	freq distance	runtime
100	0.7	0.0	7.976 s
1000	0.9	0.0	8.808 s
5000	1.0	0.0	13.139 s
