## Count-Min Sketching

In [1]:
import pandas as pd
import numpy as np
import time
import random

## Data preparation

In [35]:
# load data
names = ['date', 'time', 'duration', 'protocol', 'src', 'direction', 'dst', 'flags', 'tos', 'packets', 'bytes', 'flows', 'label']
df = pd.read_csv('../data/capture20110811.pcap.netflow_43_.labeled',skiprows=1,header=0,sep='\s+',names=names)

# split ip:port columns
df['src_ip'], df['src_port'] = df['src'].str.split(':', 1).str
df['dst_ip'], df['dst_port'] = df['dst'].str.split(':', 1).str
# get only the flows of the main host
infected_ip = '147.32.84.165'
infected = df[df['src_ip'] == infected_ip]

## Perform sketching

In [40]:
# define various values for the number of hash functions and their range
ds = [50,200,25]
ws = [500,1000,700]

estimates = []
dst_ips = infected['dst_ip']
total = len(dst_ips)

for i in range(len(ds)):
    d = ds[i]
    w = ws[i]

    # dictionaries to facilitate IP to id translation and vice versa
    ip_dict = {}
    rev_ip_dict = {}

    # initialize count min sketch as an array of zeros
    cm = np.zeros((d,w))
    k = 0

    coefs = []
    consts = []

    # use the hash function from the slides
    def hash_func(coef,const,base,value):
        return (coef*value + const)%base

    # start time recording
    start_time = time.time()

    # create independent hash functions by using a different coefficient and bias term for each
    for j in range(d):
        temp = random.randint(1,d)
        while temp in coefs:
            temp = random.randint(1,d)
        coefs.append(temp)
        temp = random.randint(1, d)
        while temp in consts:
            temp = random.randint(1,d)
        consts.append(temp)
        

    for dst_ip in dst_ips:
        # find the id of the IP
        if dst_ip not in ip_dict.keys():
            ip_dict[dst_ip] = k
            rev_ip_dict[k] = dst_ip
            k += 1
        temp = ip_dict[dst_ip]
        # use the id to get a hash from each hash function and update the sketch
        for j in range(d):
            col = hash_func(coefs[j], consts[j], w, temp)
            cm[j, col] = cm[j, col] + 1

    # find the minimum value for each IP
    A = np.zeros((k - 1, 1), dtype=np.int32)
    for i in range(k - 1):
        minimum = len(dst_ips)
        for j in range(d):
            temp = cm[j][hash_func(coefs[j], consts[j], w, i)]
            if temp < minimum:
                A[i] = temp
                minimum = temp

    # find the 10 most frequent IPs
    out = A.flatten()
    res = np.argsort(out)
    ips_estimated = []
    counts = []
  
    print((w,d))
    for j in res[-10:]:
        ips_estimated.append(rev_ip_dict[j])
        counts.append(round(out[i]/total,3))
        print(rev_ip_dict[j] + ' :' + str(round(out[j]/total,3)))
        
    
    # stop time recording
    elapsed_time = time.time() - start_time
    print(elapsed_time)
    # store results for evaluation
    estimates.append({
        'ips': np.array(ips_estimated),
        'freqs': np.array(counts),
        'time': elapsed_time,
    })

(500, 50)
67.19.72.206 :0.079
85.158.228.111 :0.084
174.37.196.55 :0.084
67.69.240.18 :0.084
204.11.209.99 :0.086
174.128.246.102 :0.086
64.182.71.51 :0.086
200.143.5.118 :0.155
193.23.181.44 :0.155
209.59.172.38 :0.155
1.6311008930206299
(1000, 200)
209.223.88.11 :0.075
72.20.15.61 :0.075
63.209.10.244 :0.079
67.19.72.206 :0.079
67.69.240.18 :0.084
174.37.196.55 :0.084
174.128.246.102 :0.086
204.11.209.99 :0.086
209.59.172.38 :0.155
193.23.181.44 :0.155
6.456085443496704
(700, 25)
72.20.15.61 :0.075
68.67.179.211 :0.075
67.19.72.206 :0.079
74.125.232.199 :0.079
174.37.196.55 :0.084
95.172.94.59 :0.084
83.238.197.86 :0.086
174.128.246.102 :0.086
193.23.181.44 :0.155
168.61.70.66 :0.155
0.8308825492858887


## Figure out true top 10

In [37]:
# calculate the actual counts of each IP
ips = {}
infected_flow_count = 0
n = 10
with open("../data/capture20110811.pcap.netflow_43_.labeled", "r") as ins:
    for line in ins:
        parts = line.split()
        ip_port_src = parts[4].split(':')    
        ip_src = ip_port_src[0]
        if ip_src == infected_ip:
            ip_port_dst = parts[6].split(':')
            ip_dst = ip_port_dst[0]
            if not ip_dst in ips:
                ips[ip_dst] = 0
            ips[ip_dst] += 1
            infected_flow_count += 1

ips_ip = np.array(list(ips.keys()))
ips_count = np.array(list(ips.values()))

# keep the top 10 IPs
ind = np.argsort(-ips_count)[:n]
true = {}
true['ips'] = ips_ip[ind]
true['freqs'] = ips_count[ind] / infected_flow_count

## Recall

In [38]:
'''
freq_distance measures the distance between the frequencies in the 
ground-truth list and the frequencies in an estimated list.

true, estimate: objects with two list .ips and .freqs
'''
def freq_distance(true, estimate):
    estimate_map = {}
    for i, ip in enumerate(estimate['ips']):
        estimate_map[ip] = estimate['freqs'][i]
    
    score = 0
    for i, ip in enumerate(true['ips']):
        if ip in estimate_map:
            score += abs(true['freqs'][i] - estimate_map[ip])
        else:
            score += true['freqs'][i]
    return score   

In [39]:
# create output for recall table
for j in range(len(estimates)):
    recall = float(len(np.intersect1d(true['ips'], estimates[j]['ips']))) / float(n)
    freq_score = freq_distance(true, estimates[j])
    print("{}\t{}\t{}\t{}".format((ws[j],ds[j]), recall, round(freq_score,4),estimates[j]['time']))

(500, 50)	0.4	0.6152	1.6472406387329102
(1000, 200)	0.5	0.6152	6.3459014892578125
(700, 25)	0.5	0.6102	0.8260455131530762
