In [None]:
import pandas as pd
import numpy as np
import time
import random

In [None]:
# load data
# Column names applied to the whitespace-separated flow file (they replace the file's own header).
names = ['date', 'time', 'duration', 'protocol', 'src', 'direction', 'dst', 'flags', 'tos', 'packets', 'bytes', 'flows', 'label']
# NOTE(review): skiprows=1 skips one line and header=0 then consumes the next line as a
# header that `names` overrides, so two physical lines are discarded in total — confirm
# that the second line of the file is not a data row.
# The separator is a regex, hence the raw string (a plain '\s+' is an invalid escape sequence).
df = pd.read_csv('../data/capture20110818.pcap.netflow.labeled', skiprows=1, header=0, sep=r'\s+', names=names)

# drop Background flows
# .copy() makes the filtered frame independent, so the column assignments below do not
# write into a slice of the original frame (avoids SettingWithCopyWarning).
df = df[df['label'] != 'Background'].copy()

# split ip:port columns
# Split on the first ':' only; element 0 is the address, element 1 the port.
# .str.get(1) yields NaN for rows that carry no port.
src_parts = df['src'].str.split(':', n=1)
dst_parts = df['dst'].str.split(':', n=1)
df['src_ip'] = src_parts.str.get(0)
df['src_port'] = src_parts.str.get(1)
df['dst_ip'] = dst_parts.str.get(0)
df['dst_port'] = dst_parts.str.get(1)
# integer code per distinct address, numbered in order of first appearance
df['src_ip_num'] = pd.Categorical(df['src_ip'], categories=df['src_ip'].unique()).codes
df['dst_ip_num'] = pd.Categorical(df['dst_ip'], categories=df['dst_ip'].unique()).codes
df['src_port'] = pd.to_numeric(df['src_port'])
df['dst_port'] = pd.to_numeric(df['dst_port'])

# convert categorical data
df['protocol_num'] = pd.Categorical(df['protocol'], categories=df['protocol'].unique()).codes

# merge date and time columns
# vectorised string concatenation instead of a row-by-row apply
df['date_time'] = pd.to_datetime(df['date'] + ' ' + df['time'])

In [None]:
# keep only the flows whose source address is the main infected host
infected_ip = '147.32.84.165'
is_from_infected = df['src_ip'] == infected_ip
infected = df[is_from_infected]

In [None]:
# sketch dimensions: d hash functions (one sketch row each), each hashing into w columns
d = 10
w = 20

# the count-min sketch itself: a d x w table of counters, all starting at zero
cm = np.zeros(shape=(d, w))

# lookup tables: IP -> numeric id, and numeric id -> IP
ip_dict = dict()
rev_ip_dict = dict()

# next id to hand out to a previously unseen IP
k = 0

# per-hash-function coefficient and constant term (filled in further down)
coefs, consts = [], []

# linear hash function (the one from the slides)
def hash_func(coef,const,base,value):
    """Return ``(coef * value + const) mod base``.

    coef  -- multiplier applied to the value
    const -- constant added after the multiplication
    base  -- modulus; the result is reduced modulo this number
    value -- the number to hash
    """
    scaled = coef * value
    return (scaled + const) % base

# start time recording
start_time = time.time()

# create independent hash functions by using a different coefficient and bias term for each
# Each list gets d distinct integers from 1..d in random order (a random permutation of
# 1..d). random.sample draws them in a single call, so there is no retry loop that could
# spin forever when every candidate value is already taken.
# Slice assignment refills the existing list objects, which keeps this step re-runnable.
# NOTE(review): the RNG is not seeded, so the sketch layout differs from run to run.
coefs[:] = random.sample(range(1, d + 1), d)
consts[:] = random.sample(range(1, d + 1), d)

In [None]:
# feed every destination IP contacted by the infected host into the sketch
dst_ips = infected['dst_ip']
for dst_ip in dst_ips:
    # find the id of the IP, handing out the next free id the first time it is seen
    if dst_ip not in ip_dict:
        ip_dict[dst_ip] = k
        rev_ip_dict[k] = dst_ip
        k += 1
    ip_id = ip_dict[dst_ip]
    # use the id to get a hash from each hash function and update the sketch;
    # this must run for every flow (not only on first sight) so counters reflect frequency
    for i in range(d):
        col = hash_func(coefs[i], consts[i], w, ip_id)
        cm[i, col] = cm[i, col] + 1

In [None]:
# find the minimum value for each IP
# ids run from 0 to k - 1, so the estimate vector needs one row per id (k rows in total)
A = np.zeros((k, 1), dtype=np.int32)
for i in range(k):
    # the estimate for an id is the smallest counter it maps to across the d sketch rows
    A[i] = min(cm[j][hash_func(coefs[j], consts[j], w, i)] for j in range(d))

# report the 10 ids with the largest estimated counts (printed in ascending order of estimate)
out = A.flatten()
res = np.argsort(out)
top_ids = res[-10:]
for i in top_ids:
    estimate = str(out[i])
    print(rev_ip_dict[i] + ' :' + estimate)

# stop time recording and show the seconds elapsed since start_time
stop_time = time.time()
elapsed_time = stop_time - start_time
print(elapsed_time)