In [39]:
import pandas as pd
import os
import pickle
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold




# Botnet	fingerprinting task

First, we create a function to read our data

In [40]:
# columns = StartTime, Dur, Proto, SrcAddr, Sport, Dir, DstAddr, Dport, State, sTos, dTos, TotPkts, TotBytes, SrcBytes, Label

scenario_size = [(1,368), (2,235), (3,610), (4,146), (5,17), (6,73), (7,15), (8,385), (9,272), (11,14), (12,43), (13,250)]
scenarios = [t[0] for t in sorted(scenario_size, key=lambda x: x[1])]
print(scenarios)

def read_from_file(scenario):
    print("Reading from file. Scenario: %s" % scenario)

    # Get the path of the file
    dir_path = os.path.join( "data", "CTU-13-Dataset", str(scenario))
    file_name = filter(lambda x: x.endswith(".binetflow"), os.listdir(dir_path))[0]
    file_path = os.path.join(dir_path, file_name)

    # Read the csv file in a pandas dataframe
    # Convert label: "flow=From-Botnet" to 1, label: "flow=From-Normal" to 0 and the rest to 2
    converters = {"Label": lambda x: 1 if x.startswith("flow=From-Botnet") else (0 if x.startswith("flow=From-Normal") else 2)}
    df = pd.read_csv(file_path, skip_blank_lines=True, delimiter=",", converters=converters)

    # Drop rows that contain null values for atleast one of these rows: "DstAddr", "SrcAddr", "Dport", "Sport", "Label"
    df.dropna(subset=["DstAddr", "SrcAddr", "Dport", "Sport", "Label"], inplace=True, how="any")

    # remove the background flows
    df = df[df.Label != 2]

    print("\tDone!!")
    return df

[11, 7, 5, 12, 6, 4, 2, 13, 9, 1, 8, 3]


In [41]:
# Read a scenario

scenario = 1

df = read_from_file(scenario)


Reading from file. Scenario: 1
	Done!!


In [63]:
# Remove the columns which will not affect the clustering
# sTos, dTos (Nan and 0 values)

df_map =df.drop(["sTos","dTos"], 1)

print list(df_data)
#or i in list(df_data):
#   print df_data[i].unique()

['StartTime', 'Dur', 'Proto', 'SrcAddr', 'Sport', 'Dir', 'DstAddr', 'Dport', 'State', 'TotPkts', 'TotBytes', 'SrcBytes', 'Label']


### First	cluster	the	NetFlows	using	a	clustering	method	of	your	choice	in	order	to	discretize	the	data

The method we chose to cluster the NetFlows is the same which is described in *Pellegrino,	Gaetano,	et	al.	"Learning	Behavioral	Fingerprints	From	Netflows	Using	Timed Automata."*

First, we have to create the mapping described in the paper. The columns which contrain non numerical values and need to be mapped are: "Proto" {udp,tcp,icmp}, "State" {various states} and "direction" {'   ->' '  <->' '  <?>' '   ?>'}.

## Remember to check what happens with the directions!!! 4 different cases???/

As the paper suggests we will use hot encoding for those three columns. In addition, we will use the srcAddr to separate each flow and the we will store the "StartTime" to obtain the time values of each event.

In [64]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

# Transforming the "Proto" column
le.fit(df_map["Proto"])
df_map["Proto"] = le.transform(df_map["Proto"])

# Transforming the "State" column
le.fit(df_data["State"])
df_map["State"] = le.transform(df_map["State"])

# Transforming the "Dir" column
le.fit(df_map["Dir"])
df_map["Dir"] = le.transform(df_map["Dir"])

#print df_map.groupby("Dir").count()

Next, we have to create the mapping descibed in the paper. Thus, we will create a function which takes as input the dataset, the number of desired clusters and a  specific feature value and outputs on which percentille of the dataset this value exists (once again this method is thoroughly described in paper [5]).

In [300]:
def mapping_function(data, feature, value, bins):
    data.sort(columns=feature)
    percentilles = 100/bins
    N = len(data[feature])
    r = {} # collection values
    
    for p in range(percentilles, 100, percentilles):
        r[p] = int((p/100.0) * N)
    
    result = 0
    for p in range(percentilles, 100, percentilles):
        #print p, value, data[feature].iloc[r[p]]
        limit = int(data[feature].iloc[r[p]])
        if value <= limit:
            #print ("P = %s, %s <= %s " %(p, value, limit))
            return result
        result+=1
    #print"asdasdsdasdsd"
    return result
        
    

In [294]:
for p in range(33, 100, 33):
    print (p/100.0)*1000
    #r[p] = int((p/100.0) * N)

330.0
660.0
990.0


In [314]:
from types import *

def clusterNetFlows(data, netFlow, features, nUniqueValues, bins, spaceSize):
    code =  0

    #print spaceSize
    #print spaceSize, nUniqueValues
    
    for f in features:
        if f == "Proto" or f =="State" or f == "Dir":
            size = nUniqueValues[f]
        else:
            size = bins
        if type(netFlow[f]) is StringType:
            print f
            netFlow[f] = int(netFlow[f])
        
        mapped = mapping_function(data, f, netFlow[f],bins)
        #print mapped
        #print spaceSize/nUniqueValues[f]
        code += mapped * (spaceSize/size)
        spaceSize = spaceSize/size
        
    return netFlow["StartTime"], netFlow["SrcAddr"], netFlow["DstAddr"], code, netFlow["Label"]

In [315]:
# Define the netFlow features which we will use
features = list(df_map)
features.remove("StartTime")
features.remove("Label")
features.remove("SrcAddr")
features.remove("DstAddr")

                
nUniqueValues= {}
for f in features:
    nUniqueValues[f] = df_map[f].nunique()

bins = 5
cluster_labels = []

spaceSize = 1
for f in features:
    if f == "Proto" or f =="State" or f == "Dir":
        spaceSize *= nUniqueValues[f]
    else:
        spaceSize *= bins
        
clustered_data = {}

for index, row in df_map.iterrows():
    startTime, src, dst, code, label = clusterNetFlows(df_map, row, features, nUniqueValues, bins, spaceSize)
    clustered_data[startTime] = {}
    clustered_data[startTime]["src"] = src
    clustered_data[startTime]["dst"] = dst
    clustered_data[startTime]["cluster"] = code
    clustered_data[startTime]["label"] = label

    #print startTime, src, dst, code, label

  from ipykernel import kernelapp as app


Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Sport
Dport
Spor

ValueError: invalid literal for int() with base 10: '0x0303'

In [325]:
#le.fit(cluster_labels)
#c = le.transform(cluster_labels)
#print c
#print clustered_data
unique_clusters = {}
for i in clustered_data:
    if clustered_data[i]["cluster"] ==9309624
        print i ,clustered_data[i]
    #unique_clusters[clustered_data[i]["cluster"]] = 1

#print len(unique_clusters)
    

2011/08/10 09:52:56.784382 {'src': '147.32.84.134', 'dst': '88.86.100.176', 'label': 0, 'cluster': 9309624}
2011/08/10 09:52:29.627734 {'src': '147.32.84.134', 'dst': '88.86.100.176', 'label': 0, 'cluster': 9309624}
2011/08/10 09:52:38.685167 {'src': '147.32.84.134', 'dst': '88.86.100.176', 'label': 0, 'cluster': 9309624}
2011/08/10 09:53:26.944245 {'src': '147.32.84.134', 'dst': '88.86.100.176', 'label': 0, 'cluster': 9309624}
2011/08/10 09:52:11.535812 {'src': '147.32.84.134', 'dst': '88.86.100.176', 'label': 0, 'cluster': 9309624}
2011/08/10 09:53:45.053401 {'src': '147.32.84.134', 'dst': '88.86.100.176', 'label': 0, 'cluster': 9309624}
2011/08/10 09:52:05.506420 {'src': '147.32.84.134', 'dst': '88.86.100.176', 'label': 0, 'cluster': 9309624}
2011/08/10 09:51:20.254043 {'src': '147.32.84.134', 'dst': '88.86.100.176', 'label': 0, 'cluster': 9309624}
2011/08/10 09:46:54.780268 {'src': '147.32.84.134', 'dst': '88.86.100.176', 'label': 0, 'cluster': 9309624}
2011/08/10 09:48:37.321285 {