# Sampling & Sketching tasks

We choose **CTU-Malware-Capture-Botnet-54** dataset to implement in this assignment.
We finish the sampling and sketching tasks step by step as follows:
1. Data preprocessing
2. Find out the 10 most frequent IP address by traversing the data
3. Apply Min-Wise sampling to find out 10 most frequent IP-addresses and analysis
4. Apply Count-Min sketch to find out 10 most frequent IP-addresses and analysis.

In [1]:
import matplotlib.pyplot as plt
import math
import numpy as np
import seaborn as sns
import pandas as pd
import random
import collections
import string
import time
from io import StringIO
import csv
from pandas import read_csv

1. Data preprocessing

In [None]:
## Preprocessing data
output = StringIO()
csv_writer = csv.writer(output)
column_name = ['Dateflowstart', 'Durat', 'Prot', 'SrcIPAddr:Port', 'DstIPAddr:Port', 'Flags', 'Tos', 'PacketsBytes', 'Flows', 'Label', 'Labels']
Botnet_data = pd.DataFrame(columns = column_name)
with open('capture20110818.pcap.netflow.labeled', 'rb') as f:
    next(f)
    for index, line in enumerate(f):
        parser = line.decode("utf-8")
        split_by_space = parser.split('\t')
        temp1 =  [x for x in split_by_space if x != '->']
        temp2 =  [x for x in temp1 if x != '']
        temp2[-1] = temp2[-1].split('\n')[0]
        #print(temp2)
        if(len(temp2)==1):
            split_by_space = parser.split(' ')
            temp1 =  [x for x in split_by_space if x != '->']
            temp2 =  [x for x in temp1 if x != '']
            temp2[-1] = temp2[-1].split('\n')[0]
            #print(temp2)
            x=temp2.pop(1)
            temp2[0]=temp2[0]+' '+x
        csv_writer.writerow(temp2)
    output.seek(0) # we need to get back to the start of the BytesIO
    Botnet_data = read_csv(output)
#print(Botnet_data)
Botnet_data.to_csv('Botnet_data.csv', sep='\t')

2. Find out the 10 most frequent IP address by traversing the data

In [2]:
#Load preprocessed data
data=pd.read_csv('Botnet_data.csv',sep='\t')
data.shape
    

(1299089, 12)

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1299089 entries, 0 to 1299088
Data columns (total 12 columns):
Unnamed: 0        1299089 non-null int64
Dateflowstart     1299089 non-null object
Durat             1299089 non-null float64
Prot              1299089 non-null object
SrcIPAddr:Port    1299089 non-null object
DstIPAddr:Port    1299089 non-null object
Flags             1299089 non-null object
Tos               1299089 non-null int64
PacketsBytes      1299089 non-null int64
Flows             1299089 non-null int64
Label             1299089 non-null int64
Labels            1299089 non-null object
dtypes: float64(1), int64(5), object(6)
memory usage: 118.9+ MB


In [36]:
print(list(data))

['Unnamed: 0', 'Dateflowstart', 'Durat', 'Prot', 'SrcIPAddr:Port', 'DstIPAddr:Port', 'Flags', 'Tos', 'PacketsBytes', 'Flows', 'Label', 'Labels']


In [30]:
source=list(data['SrcIPAddr:Port'])+list(data['DstIPAddr:Port'])
print(len(source))

2598178


### Host ip from the website of the dataset
The host ip starts like '147.'

2.Find out the 10 most frequent IP address by traversing the data

In [29]:
sum(other_ip1.values())
#sum(other_ip2.values())

980067

In [21]:
start_time = time.time()
#Regard ip starting with '147' as host ip
other_ip1={}
for ip in source:
    curr_ip=ip.split(':')[0]
    #print(ip.split(',')[0])
    if ip.split('.')[0]!='147':
    #if curr_ip not in host:
        if curr_ip in other_ip1:
            other_ip1[curr_ip]+=1
        else:
            other_ip1[curr_ip]=1
print("--- %s seconds ---" % (time.time() - start_time))

sorted(other_ip1.items(),key=lambda item:item[1],reverse=True)[0:10]


--- 2.256657361984253 seconds ---


[('76.13.114.90', 6723),
 ('74.125.232.213', 6469),
 ('66.194.55.249', 5964),
 ('68.233.5.80', 5521),
 ('188.138.84.239', 5190),
 ('74.125.39.125', 4989),
 ('88.86.102.50', 4055),
 ('209.85.148.104', 3622),
 ('87.98.230.229', 3196),
 ('74.125.232.215', 3084)]

In [16]:
start_time = time.time()         
other_ip2={}
for ip in source:
    curr_ip=ip.split(':')[0]
    #print(ip.split(',')[0])
    if curr_ip not in host:
        if curr_ip in other_ip2:
            other_ip2[curr_ip]+=+1
        else:
            other_ip2[curr_ip]=1

print("--- %s seconds ---" % (time.time() - start_time))
sorted(other_ip2.items(),key=lambda item:item[1],reverse=True)[0:10]

--- 0.962348222732544 seconds ---


[('147.32.84.229', 301762),
 ('147.32.84.59', 71886),
 ('147.32.84.138', 58564),
 ('147.32.86.20', 16483),
 ('147.32.85.25', 15972),
 ('147.32.86.179', 12115),
 ('147.32.85.34', 11631),
 ('147.32.80.13', 9735),
 ('147.32.85.7', 9432),
 ('147.32.84.171', 7298)]

### MIN-WISE Sampling
Apply Min-Wise sampling to find out 10 most frequent IP-addresses and analysis

In [22]:
def min_wise_sample (streamdata, k):
    sampledata=[]
    sampledata_r=[]
    for ip in streamdata[:k]:
        #curr_ip=ip.split(':')[0]
        r=random.random()
        sampledata.append(ip)
        sampledata_r.append(r)
    
    for ip in streamdata[k:]:
        #curr_ip=ip.split(':')[0]
        r=random.random()
        if r<max(sampledata_r):
            index=sampledata_r.index(max(sampledata_r))
            sampledata.pop(index)
            sampledata_r.pop(index)
            sampledata.append(ip)
            sampledata_r.append(r)
    return sampledata
            
        

In [23]:
#get all so-called 'other' IP address
source1=[]
for ip in source:
    curr_ip=ip.split(':')[0]
    if ip.split('.')[0]!='147':
    #if curr_ip not in host:
        source1.append(curr_ip)

In [36]:
num=[500,3000,10000,30000,50000,100000]
for n in num:
    start_time=time.time()
    sample1=min_wise_sample(source1, n)
    print(collections.Counter(sample1).most_common(10))
    print("--- %s seconds ---" % (time.time() - start_time))
    print('####################################')
    print('####################################')


[('74.125.39.125', 7), ('68.233.5.80', 6), ('74.125.232.213', 4), ('74.125.232.216', 4), ('76.13.114.90', 4), ('74.125.232.215', 3), ('85.160.62.18', 3), ('74.125.232.220', 3), ('66.194.55.249', 3), ('209.85.148.106', 3)]
--- 11.10586953163147 seconds ---
####################################
####################################
[('76.13.114.90', 35), ('74.125.232.213', 28), ('66.194.55.249', 24), ('68.233.5.80', 23), ('88.86.102.50', 17), ('209.85.148.104', 16), ('188.138.84.239', 16), ('62.168.92.250', 14), ('74.125.39.125', 13), ('109.80.225.83', 12)]
--- 64.90811848640442 seconds ---
####################################
####################################
[('68.233.5.80', 69), ('76.13.114.90', 69), ('188.138.84.239', 61), ('74.125.232.213', 59), ('74.125.39.125', 55), ('66.194.55.249', 52), ('88.86.102.50', 48), ('78.45.43.209', 42), ('78.80.14.227', 39), ('109.80.225.83', 37)]
--- 224.85903024673462 seconds ---
####################################
#################################

### Count-Min Sketch
Apply Count-Min sketch to find out 10 most frequent IP-addresses and analysis.

In [39]:
#define functions
def count_min_sketch(data,height,weight):
    sketchmatrix=np.zeros((height,weight))
    #random strings to execute different hash function
    randomstring=[]
    for i in range(height):
        randomstring.append(''.join(random.sample(string.ascii_letters + string.digits, 8)))
    
    for ip in data:
        for i in range(height):
            temp=hash(randomstring[i]+str(ip))%weight
            #print(temp)
            sketchmatrix[i,temp]+=1
    return sketchmatrix,randomstring

def get_sketch_frequency(data,sketchmatrix,randomstring,num):
    datafrequency={}
    for ip in data:
        tempnumber=[]
        for i in range(len(randomstring)):
            temp=hash(randomstring[i]+str(ip))%sketchmatrix.shape[1]
            #print(temp)
            tempnumber.append(sketchmatrix[i,temp])
        datafrequency[ip]=min(tempnumber)
        
    return sorted(datafrequency.items(),key=lambda item:item[1],reverse=True)[0:num]
            
    

In [41]:
height=[1,5,10,30]
width=[500,3000,10000,30000]
num=10
#Here only outputs the 10 most frequent IP
for h in height:
    for w in width:
        start_time=time.time()
        sketchmatrix1,randomstring1=count_min_sketch(source1,h,w)
        print('height=',h,' width=',w)
        print(get_sketch_frequency(other_ip1.keys(),sketchmatrix1,randomstring1,num))
        print("--- %s seconds ---" % (time.time() - start_time))
        print('####################################')
        print('####################################')

height= 1  width= 500
[('99.227.246.143', 8513.0), ('76.235.44.111', 8513.0), ('114.158.231.170', 8513.0), ('95.107.251.197', 8513.0), ('95.26.223.181', 8513.0), ('89.113.228.222', 8513.0), ('79.97.111.2', 8513.0), ('188.242.130.40', 8513.0), ('84.54.184.132', 8513.0), ('79.18.33.19', 8513.0)]
--- 1.852426290512085 seconds ---
####################################
####################################
height= 1  width= 3000
[('209.85.148.106', 7044.0), ('201.83.55.2', 7044.0), ('111.254.193.68', 7044.0), ('76.68.241.9', 7044.0), ('68.147.48.123', 7044.0), ('84.73.158.96', 7044.0), ('207.245.236.60', 7044.0), ('173.180.138.102', 7044.0), ('216.106.105.234', 7044.0), ('130.209.232.117', 7044.0)]
--- 1.7594540119171143 seconds ---
####################################
####################################
height= 1  width= 10000
[('76.13.114.90', 6739.0), ('46.33.232.151', 6739.0), ('197.194.76.48', 6739.0), ('66.87.94.140', 6739.0), ('108.82.237.43', 6739.0), ('77.75.72.11', 6739.0), ('74.12

In [42]:
#Testing when the width is small
height=[1,5,10,30]
width=[50,100]
num=10
#Here only outputs the 10 most frequent IP
for h in height:
    for w in width:
        start_time=time.time()
        sketchmatrix1,randomstring1=count_min_sketch(source1,h,w)
        print('height=',h,' width=',w)
        print(get_sketch_frequency(other_ip1.keys(),sketchmatrix1,randomstring1,num))
        print("--- %s seconds ---" % (time.time() - start_time))
        print('####################################')
        print('####################################')

height= 1  width= 50
[('161.111.235.112', 29384.0), ('68.233.5.80', 29384.0), ('114.24.172.183', 29384.0), ('86.49.87.107', 29384.0), ('82.113.106.224', 29384.0), ('70.88.254.65', 29384.0), ('211.23.1.198', 29384.0), ('81.95.98.8', 29384.0), ('83.166.167.22', 29384.0), ('87.255.19.94', 29384.0)]
--- 1.8674647808074951 seconds ---
####################################
####################################
height= 1  width= 100
[('83.163.158.159', 16873.0), ('94.209.77.164', 16873.0), ('82.243.23.121', 16873.0), ('205.188.10.233', 16873.0), ('78.80.14.227', 16873.0), ('193.219.76.93', 16873.0), ('94.245.121.251', 16873.0), ('74.125.232.204', 16873.0), ('2.47.251.165', 16873.0), ('46.56.228.96', 16873.0)]
--- 1.515679121017456 seconds ---
####################################
####################################
height= 5  width= 50
[('95.57.239.2', 23380.0), ('95.43.236.214', 23380.0), ('88.86.102.50', 23317.0), ('187.5.68.58', 23181.0), ('50.80.11.83', 23144.0), ('64.134.176.91', 23123.0),