In [22]:
import pandas as pd
import random
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import math 
import mmh3 
sns.set()

In [6]:
DATA_PATH_ORIG = r'../data/capture20110812.pcap.netflow.labeled.csv'
DATA_PATH_NEW = r'../data/capture20110812.pcap.netflow.labeled_processed.csv'

## IMPORTANT     
The code below reads the file and erases the '->' characters.    
It takes quite a few minutes, so run it only if you don't have the processed file yet :(

In [7]:
#text = open(DATA_PATH_ORIG, "r")
#text = ''.join([i for i in text]) \
#     .replace("->", "")
#x = open(DATA_PATH_NEW,"w")
#x.writelines(text)
#x.close()

In [8]:
column_names = ['_', '_2', 'duration', 'protocol', 'src_addr', 'dst_addr', 'flags', 'tos', 'packets', 'bytes', 'flows', 'label']

df = pd.read_csv(
    DATA_PATH_NEW,
    delimiter='\s+',
    skiprows=[0],
    header=None, 
    names=column_names,
    index_col=False,
    parse_dates={"start_date": [0,1]}
)
df.head()

Unnamed: 0,start_date,duration,protocol,src_addr,dst_addr,flags,tos,packets,bytes,flows,label
0,2011-08-12 15:24:01.105,4.677,TCP,74.125.108.243:80,147.32.86.187:55707,PA_,0,452,682688,1,Background
1,2011-08-12 15:24:01.105,3.11,TCP,147.32.84.164:22,83.208.193.123:15173,PA_,0,5,506,1,LEGITIMATE
2,2011-08-12 15:24:01.105,4.678,TCP,147.32.86.187:55707,74.125.108.243:80,A_,0,106,6510,1,LEGITIMATE
3,2011-08-12 15:24:01.106,4.989,TCP,217.168.211.184:3953,147.32.84.14:22,PA_,0,1393,95812,1,Background
4,2011-08-12 15:24:01.113,4.988,UDP,173.9.132.155:54369,147.32.84.59:15046,INT,0,811,680898,1,Background


We obtained the IP of the infected host from the documentation of the scenario 44.

In [9]:
host_ip = "147.32.84.165" # IP obtained from documentation

Let's keep all the other IPs the host connects with

In [10]:
# let's build an iterable with all the other IPs this guy connects with
connections = df[(df['src_addr'] == host_ip) | (df['dst_addr'] == host_ip)][['src_addr', 'dst_addr']]

other_ips = connections.apply(lambda row: row['dst_addr'] if row['src_addr'] == host_ip else row['src_addr'], axis=1)
print(f"{other_ips.size} connections with the host found.")

3652 connections with the host found.


Just a sanity check to assert that we do not have our host_ip somewhere in the list of IPs

In [11]:
print(len([ip for ip in other_ips if ip != host_ip]))
print(len(other_ips))

3652
3652


# Count Min Sketch

Resource: https://olivif.github.io/2017/count-min-sketch/

Resource for independent hash functions: https://www.geeksforgeeks.org/bloom-filters-introduction-and-python-implementation/

Let's starts by implementing the function that computes the INDEPEDENT hashings.

In [19]:
def get_hash_indexes(item, hash_count, bit_array_size):
    digests = [] 
    for i in range(hash_count):  
        digest = mmh3.hash(item,i) % bit_array_size 
        digests.append(digest) 
  
    return digests
    

In [26]:
height = 7
width = 100 
buckets = np.zeros((height, width))
unique_ips = set()

for ip in other_ips:
    digests = get_hash_indexes(ip, height, width)
    unique_ips.add(ip)
    
    for id_height, id_width in enumerate(digests):
        buckets[id_height, id_width] += 1
  
print(buckets)


#Loop over the unique ips, hash them again, get the counts from the indexes and choose the min one as frequency

    

[[ 14.  21.  36.  24.  21.  23.  18.  28.  16.  31.  34.  22.   8.  23.
   12.  38.  67.  20. 107.  23.  22.  21.  25.  21.  37.  33.  24. 747.
   34.  18.  25.  34.  24.  56.  25.  15.  27.  15.  15.  23.  34.  26.
   19.  18.  20.  33.  65.  27.  23.  30.  25.  26.  36. 209.  31.  24.
   28.  23.  16.  28.  35.  26.  25.  21.  30.  21.  17.  23.  21.  37.
   19.  23.  37.  24.  11.  25.  25.  27.  15. 103.  21.  31.  33.  16.
   19.  22.  34.  27.  21.  27.  14.  59.  18.  28.  25.  27.  20.  37.
   21.  19.]
 [ 35.  28.  37.  21.  22.  21.  28.  16.  28.  15.  20.  40.  25.  32.
   41.  21.  17.  28.  11.  27.  23.  13.  34.  12.  22.  68.  21.  26.
   33.  17.   8.  33.  16.  19.  22.  22.  19.  24.  21.  19.  14.  22.
   28.  15.  25.  22.  10.  24.   9.  56.  30.  19.  32. 103.  11.  16.
   25.  31.  27.  16.  19.  19.  41.  16.  26.  13.  27. 212.  28.  16.
  770.  36.  16.  18.  22.  24.  22.  24.  29.  27.  30.  46.  25.  12.
   76.  33.  24.  40.  25.  23.  17.  30. 112.  31.