# Sampling & Sketching tasks

We use the **CTU-Malware-Capture-Botnet-54** dataset for this assignment.
We complete the sampling and sketching tasks step by step as follows:
1. Data preprocessing
2. Find the 10 most frequent IP addresses by traversing the data
3. Apply Min-Wise hashing to find the 10 most frequent IP addresses and analyse the result
4. Apply a Count-Min sketch to find the 10 most frequent IP addresses and analyse the result

In [1]:
import matplotlib.pyplot as plt
import math
import numpy as np
import seaborn as sns
import pandas as pd
import random
import collections
import string
import time
from io import StringIO
import csv
from pandas import read_csv

1. Data preprocessing

In [None]:
## Preprocessing data
# Convert the raw labelled netflow capture into a tab-separated CSV file
# ('Botnet_data.csv') that the later cells load with pandas.
column_name = ['Dateflowstart', 'Durat', 'Prot', 'SrcIPAddr:Port', 'DstIPAddr:Port', 'Flags', 'Tos', 'PacketsBytes', 'Flows', 'Label', 'Labels']


def _split_record(text, separator):
    """Split one capture line on ``separator``, dropping '->' markers and empty fields."""
    fields = [x for x in text.split(separator) if x != '->' and x != '']
    if fields:
        # The last field still carries the line terminator.
        fields[-1] = fields[-1].split('\n')[0]
    return fields


output = StringIO()
csv_writer = csv.writer(output)
# Write the header row first: without it read_csv would consume the first
# flow record as the column names.
csv_writer.writerow(column_name)
with open('capture20110818.pcap.netflow.labeled', 'rb') as f:
    next(f)  # skip the capture's own header line
    for line in f:
        parser = line.decode("utf-8")
        if not parser.strip():
            continue  # blank line: nothing to record
        temp2 = _split_record(parser, '\t')
        if len(temp2) == 1:
            # The line is not tab-separated: fall back to splitting on spaces.
            # That also splits the first two tokens apart, so glue them back
            # together into a single leading field.
            temp2 = _split_record(parser, ' ')
            if len(temp2) > 1:
                x = temp2.pop(1)
                temp2[0] = temp2[0] + ' ' + x
        csv_writer.writerow(temp2)
output.seek(0)  # rewind the StringIO buffer so read_csv starts at the header
Botnet_data = read_csv(output)
#print(Botnet_data)
Botnet_data.to_csv('Botnet_data.csv', sep='\t')

2. Find the 10 most frequent IP addresses by traversing the data

In [2]:
#Load preprocessed data
# Read the tab-separated 'Botnet_data.csv' file into a DataFrame.
data=pd.read_csv('Botnet_data.csv',sep='\t')
# Show the table dimensions as (rows, columns).
data.shape
    

(1299089, 12)

In [6]:
# Summarise the loaded table: column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1299089 entries, 0 to 1299088
Data columns (total 12 columns):
Unnamed: 0        1299089 non-null int64
Dateflowstart     1299089 non-null object
Durat             1299089 non-null float64
Prot              1299089 non-null object
SrcIPAddr:Port    1299089 non-null object
DstIPAddr:Port    1299089 non-null object
Flags             1299089 non-null object
Tos               1299089 non-null int64
PacketsBytes      1299089 non-null int64
Flows             1299089 non-null int64
Label             1299089 non-null int64
Labels            1299089 non-null object
dtypes: float64(1), int64(5), object(6)
memory usage: 118.9+ MB


In [36]:
# Iterating a DataFrame yields its column labels, so this prints the column names.
print(list(data))

['Unnamed: 0', 'Dateflowstart', 'Durat', 'Prot', 'SrcIPAddr:Port', 'DstIPAddr:Port', 'Flags', 'Tos', 'PacketsBytes', 'Flows', 'Label', 'Labels']


In [142]:
# Concatenate the source-endpoint and destination-endpoint columns into one
# list, so every row of `data` contributes two entries.
source=list(data['SrcIPAddr:Port'])+list(data['DstIPAddr:Port'])
print(len(source))

2598178


In [42]:
# Number of entries in the source-endpoint column alone (one per row of `data`).
len(list(data['SrcIPAddr:Port']))

1299089

### Host IPs listed on the dataset's website

In [47]:
# Host addresses taken from the dataset's website; later cells exclude exactly
# these addresses when counting ("curr_ip not in host").
host=['147.32.84.165','147.32.84.170','147.32.84.134','147.32.84.164','147.32.87.36','147.32.80.9','147.32.87.11']

2. Find the 10 most frequent IP addresses by traversing the data

In [143]:
# Rule 1: treat every endpoint whose first octet is '147' as a host address
# and leave it out; count the occurrences of all remaining addresses.
other_ip1 = {}
for ip in source:
    curr_ip = ip.split(':')[0]  # drop the ':port' suffix
    if ip.split('.')[0] == '147':
        continue
    other_ip1[curr_ip] = other_ip1.get(curr_ip, 0) + 1
            
        
#print(other_ip)        

In [144]:
# Rule 2: leave out only the addresses listed in `host`; count the
# occurrences of every other address.
other_ip2 = {}
for ip in source:
    curr_ip = ip.split(':')[0]  # drop the ':port' suffix
    if curr_ip in host:
        continue
    other_ip2[curr_ip] = other_ip2.get(curr_ip, 0) + 1
            

In [145]:
# Total number of counted endpoint occurrences under each filtering rule.
# Both totals are printed explicitly: as bare expressions only the last one
# would be displayed and the first would be silently discarded.
print(sum(other_ip1.values()))
print(sum(other_ip2.values()))

2294423

In [146]:
start_time = time.time()
# Rule 1: regard every endpoint whose first octet is '147' as a host address
# and count the occurrences of all remaining addresses.
other_ip1 = {}
for ip in source:
    curr_ip = ip.split(':')[0]  # drop the ':port' suffix
    if ip.split('.')[0] != '147':
        other_ip1[curr_ip] = other_ip1.get(curr_ip, 0) + 1
# Keep the ranking in a variable and print it; a bare expression that is not
# the last statement of the cell would be computed and thrown away.
top10_ip1 = sorted(other_ip1.items(), key=lambda item: item[1], reverse=True)[0:10]
print(top10_ip1)
print("--- %s seconds ---" % (time.time() - start_time))


[('76.13.114.90', 6723),
 ('74.125.232.213', 6469),
 ('66.194.55.249', 5964),
 ('68.233.5.80', 5521),
 ('188.138.84.239', 5190),
 ('74.125.39.125', 4989),
 ('88.86.102.50', 4055),
 ('209.85.148.104', 3622),
 ('87.98.230.229', 3196),
 ('74.125.232.215', 3084)]

In [147]:
start_time = time.time()
# Rule 2: leave out only the addresses listed in `host` and count the
# occurrences of every other address.
other_ip2 = {}
for ip in source:
    curr_ip = ip.split(':')[0]  # drop the ':port' suffix
    if curr_ip not in host:
        other_ip2[curr_ip] = other_ip2.get(curr_ip, 0) + 1
# Keep the ranking in a variable and print it; a bare expression that is not
# the last statement of the cell would be computed and thrown away.
top10_ip2 = sorted(other_ip2.items(), key=lambda item: item[1], reverse=True)[0:10]
print(top10_ip2)
print("--- %s seconds ---" % (time.time() - start_time))


[('147.32.84.229', 590455),
 ('147.32.84.59', 126102),
 ('147.32.84.138', 117141),
 ('147.32.85.25', 31955),
 ('147.32.85.34', 23387),
 ('147.32.86.179', 23360),
 ('147.32.86.20', 21996),
 ('147.32.80.13', 21886),
 ('147.32.85.7', 17097),
 ('147.32.84.171', 13831)]

### MIN-WISE Hashing

In [151]:
def min_wise_sample(streamdata, k):
    """Draw a random sample of at most ``k`` items from ``streamdata``.

    Every item is tagged with an independent ``random.random()`` value, drawn
    in stream order, and the ``k`` items with the smallest tags are kept.

    Args:
        streamdata: Iterable of items (for example IP-address strings). It is
            consumed once, front to back, so generators are accepted too.
        k: Maximum number of items to keep.

    Returns:
        A list holding ``min(k, number of items)`` sampled items. The order
        of the returned items carries no meaning.

    Raises:
        ValueError: If ``k`` is negative.
    """
    import heapq  # local import: only this function needs it

    if k < 0:
        raise ValueError("k must be non-negative")
    if k == 0:
        return []

    # Max-heap (tags are negated) of the k smallest tags seen so far, so the
    # largest kept tag is available in O(1) and replaced in O(log k) instead
    # of rescanning all k tags for every stream element. The arrival position
    # breaks ties, which means the items themselves are never compared.
    kept = []
    for position, ip in enumerate(streamdata):
        r = random.random()
        if len(kept) < k:
            heapq.heappush(kept, (-r, position, ip))
        elif r < -kept[0][0]:
            heapq.heapreplace(kept, (-r, position, ip))
    return [ip for _, _, ip in kept]
            
        

In [150]:
# Preprocessing needed before sampling and sketching: strip the ':port'
# suffix from every endpoint and build the two filtered address streams.
#   source1 - all addresses except those whose first octet is '147'
#   source2 - all addresses except those listed in `host`
source1 = []
source2 = []
for ip in source:
    curr_ip = ip.split(':')[0]
    if ip.split('.')[0] != '147':
        source1.append(curr_ip)
    if curr_ip not in host:
        source2.append(curr_ip)




['147.32.84.144', '198.36.38.132', '41.103.64.21', '147.32.86.183', '147.32.80.13', '147.32.84.162', '83.163.158.159', '147.32.85.84', '147.32.84.171', '123.219.75.41']


In [159]:
# Sample size passed to min_wise_sample for both streams.
num=30000
# Time the sampling of stream 1 together with the top-10 count over the sample.
start_time=time.time()
sample1=min_wise_sample (source1, num)
# The printed counts are occurrences within the sample, not within the full stream.
print(collections.Counter(sample1).most_common(10))
print("--- %s seconds ---" % (time.time() - start_time))


# Same measurement for stream 2.
start_time=time.time()
sample2=min_wise_sample (source2, num)
print(collections.Counter(sample2).most_common(10))
print("--- %s seconds ---" % (time.time() - start_time))

[('76.13.114.90', 215), ('74.125.232.213', 194), ('66.194.55.249', 193), ('68.233.5.80', 155), ('74.125.39.125', 143), ('188.138.84.239', 136), ('88.86.102.50', 119), ('209.85.148.104', 116), ('86.49.87.107', 98), ('109.80.225.83', 97)]
[('147.32.84.229', 7714), ('147.32.84.59', 1613), ('147.32.84.138', 1511), ('147.32.85.25', 446), ('147.32.85.34', 326), ('147.32.86.179', 308), ('147.32.86.20', 290), ('147.32.80.13', 286), ('147.32.85.7', 240), ('147.32.84.171', 180)]


In [160]:
# Print the built-in hash of an int, of a str, and of the same int again.
# NOTE(review): CPython salts str hashes per interpreter process unless
# PYTHONHASHSEED is fixed, so the middle value presumably changes between
# runs - confirm before relying on hash() values across sessions.
print(hash(1),hash('1'),hash(1))

1 3624858007723412619 1


In [198]:
def count_min_sketch(data,height,weight):
    """Build a Count-Min sketch of the items in ``data``.

    Args:
        data: Iterable of items; each item is converted with ``str`` before
            it is hashed.
        height: Number of rows, i.e. number of salted hash functions.
        weight: Number of columns (counters) per row.

    Returns:
        A ``(sketchmatrix, randomstring)`` pair: the ``height`` x ``weight``
        float array of counters and the list of per-row salt strings that is
        needed to query the sketch afterwards.
    """
    sketchmatrix = np.zeros((height, weight))

    # One random 8-character salt per row: prefixing the salt to the item
    # makes the single built-in hash behave like a different function per row.
    alphabet = string.ascii_letters + string.digits
    randomstring = [''.join(random.sample(alphabet, 8)) for _ in range(height)]

    for ip in data:
        for row, salt in enumerate(randomstring):
            column = hash(salt + str(ip)) % weight
            sketchmatrix[row, column] += 1
    return sketchmatrix, randomstring

def get_sketch_frequency(data,sketchmatrix,randomstring,num):
    """Return the ``num`` items of ``data`` with the largest sketch estimates.

    Args:
        data: Iterable of candidate items to look up in the sketch.
        sketchmatrix: Counter array as returned by ``count_min_sketch``.
        randomstring: Per-row salt strings as returned by ``count_min_sketch``.
        num: Number of top entries to return.

    Returns:
        A list of ``(item, estimate)`` pairs sorted by descending estimate,
        truncated to ``num`` entries. An item's estimate is the minimum of
        its counters across all rows.
    """
    estimates = {}
    for ip in data:
        # Read the counter this item maps to in every row of the sketch.
        cells = [
            sketchmatrix[row, hash(salt + str(ip)) % sketchmatrix.shape[1]]
            for row, salt in enumerate(randomstring)
        ]
        estimates[ip] = min(cells)

    ranked = sorted(estimates.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:num]
            
    

In [218]:
# Sketch dimensions: `height` rows (hash functions) by `weight` columns (counters per row).
height=20
weight=1000
# Number of top entries to report.
num=10
# Time building the sketch over stream 1 plus querying it.
start_time=time.time()
sketchmatrix1,randomstring1=count_min_sketch(source1,height,weight)
# The candidates looked up in the sketch are the keys of the exact count dict other_ip1.
print(get_sketch_frequency(other_ip1.keys(),sketchmatrix1,randomstring1,num))

print("--- %s seconds ---" % (time.time() - start_time))

[('76.13.114.90', 7122.0), ('74.125.232.213', 6946.0), ('66.194.55.249', 6461.0), ('68.233.5.80', 6001.0), ('188.138.84.239', 5594.0), ('74.125.39.125', 5433.0), ('88.86.102.50', 4574.0), ('209.85.148.104', 4082.0), ('87.98.230.229', 3614.0), ('74.125.232.215', 3516.0)]
--- 21.70082974433899 seconds ---


In [219]:
# Time building the sketch over stream 2 plus querying it, reusing the
# height, weight and num values set in an earlier cell.
start_time=time.time()
sketchmatrix2,randomstring2=count_min_sketch(source2,height,weight)
# The candidates looked up in the sketch are the keys of the exact count dict other_ip2.
print(get_sketch_frequency(other_ip2.keys(),sketchmatrix2,randomstring2,num))
print("--- %s seconds ---" % (time.time() - start_time))

[('147.32.84.229', 590780.0), ('147.32.84.59', 126660.0), ('147.32.84.138', 117664.0), ('147.32.85.25', 32367.0), ('147.32.85.34', 23871.0), ('147.32.86.179', 23764.0), ('147.32.86.20', 22489.0), ('147.32.80.13', 22299.0), ('147.32.85.7', 17478.0), ('147.32.84.171', 14263.0)]
--- 45.90023970603943 seconds ---


0
1
2
3
4
5
6
7
8
9


[[0. 7. 0. 1.]
 [1. 2. 2. 3.]
 [1. 3. 2. 2.]]


[(1, 3.0), ('123', 2.0)]

3