# Botnet Profiling

In [1]:
import pandas as pd
import numpy as np
from hmmlearn.hmm import GaussianHMM


# classes and functions used for the discretization of flow data
class M_num:
    '''
    Compute the mapping for numerical features
    '''

    # paramater to change number of percentiles
    p = 5

    # percentile vals contains the feature values corresponding to the different percentiles
    percentile_vals = []
    # s is the size of the feature (i.e. |M_i|)
    s = 0
    # feature that we're working with
    feat = ''
    # the full dataframe
    df = None

    def __init__(self, df, feat):
        percentiles = np.arange(.1, 1, 1 / self.p)
        self.percentile_vals = map(lambda p: np.percentile(df[feat], p), percentiles)
        self.s = len(percentiles)
        self.feat = feat
        self.df = df

    # return the size |M_i|
    def size(self):
        return self.s

    # return the value M_i
    def val(self, row):
        val = row[self.feat]
        prev = 0
        for i, cur in enumerate(self.percentile_vals):
            if val >= prev and val < cur:
                return i
            prev = cur
        return 0


class M_cat:
    '''
    Compute the mapping for categorical features
    '''

    # s is the size of the feature (i.e. |M_i|)
    s = 0
    # feature that we're working with
    feat = ''
    # the full dataframe
    df = None

    def __init__(self, df, feat):
        self.s = len(df[feat].unique())
        self.feat = feat
        self.df = df

    # return the size |M_i|
    def size(self):
        return self.s

    # return the value M_i
    def val(self, row):
        return row[self.feat]


# encode a single flow
def encode_flow(feats, row):
    code = 0
    space_size = 1
    for feat in feats:
        space_size *= feats[feat].size()
    for feat in feats:
        s = feats[feat].size()
        v = feats[feat].val(row)
        code += v * (space_size / s)
        space_size = space_size / s
    return code


# encode an entire series of flows
def encode_series(df, feats):
    encoded = []
    for index, row in df.iterrows():
        encoded.append(encode_flow(feats, row))
    return encoded

## Data preparation

In [2]:
# load data
names = ['date', 'time', 'duration', 'protocol', 'src', 'direction', 'dst', 'flags', 'tos', 'packets', 'bytes', 'flows', 'label']
df = pd.read_csv('capture20110818.pcap.netflow.labeled',skiprows=1,header=0,sep='\s+',names=names)
df = df[df['label'] != 'Background']
# split ip:port columns
df['src_ip'], df['src_port'] = df['src'].str.split(':', 1).str
df['dst_ip'], df['dst_port'] = df['dst'].str.split(':', 1).str
df['src_ip_num'] = pd.Categorical(df['src_ip'], categories=df['src_ip'].unique()).codes
df['dst_ip_num'] = pd.Categorical(df['dst_ip'], categories=df['dst_ip'].unique()).codes
df['src_port'] = pd.to_numeric(df['src_port'])
df['dst_port'] = pd.to_numeric(df['dst_port'])

# convert categorical data
df['protocol_num'] = pd.Categorical(df['protocol'], categories=df['protocol'].unique()).codes

# merge date and time columns
df['date_time'] = pd.to_datetime(df[['date', 'time']].apply(lambda x: ' '.join(x), axis=1))

## Infected host data discretization and profiling

In [3]:
# the infected host flows that we will profile
chosen = df[df['src_ip'] == '147.32.84.204']
# rest of the hosts split between benign and malicious for testing purposes
normal =  ['147.32.84.170', '147.32.84.134', '147.32.84.164', '147.32.87.36', '147.32.80.9', '147.32.87.11']
infected = ['147.32.84.165','147.32.84.191','147.32.84.192','147.32.84.193','147.32.84.205','147.32.84.206','147.32.84.207','147.32.84.208','147.32.84.209']

# discretization based on the important features found in the previous task
feats = {'protocol_num': M_cat(chosen, 'protocol_num'), 'bytes': M_num(chosen, 'bytes')}
chosen_discrete = encode_series(chosen,feats)
# define sliding window size
win = 10
size = len(chosen_discrete) - win
# create sliding window data
data = np.zeros((size,win),dtype=np.int32)
for i in range(size):
    data[i] = np.array([flow for flow in chosen_discrete[i:i+win]])

# learn a Gaussian Hidden Markov Model with 4 states from the infected host data
hmm = GaussianHMM(n_components=4)
hmm.fit(data)
# these are the states that the HMM found
profile = hmm.decode(data)[1]

  np.log(self.startprob_),
  np.log(self.transmat_),
  np.log(self.startprob_),
  np.log(self.transmat_),
  np.log(self.transmat_),
  n_samples, n_components, np.log(self.startprob_),
  np.log(self.transmat_), framelogprob)


## Compare learned profile with other hosts data

In [5]:
scores = {}

# create the sliding window data for each host, train the HMM and compare log-likelihoods
for ip in normal:
    # discretize host data
    host = df[df['src_ip'] == ip]
    host_discrete = encode_series(host,feats)
    size = len(host_discrete) - win
    # if host has enough data for at least one window, perform matching
    if size > 0:
        data = np.zeros((size,win),dtype=np.int32)
        for i in range(size):
            data[i] = np.array([flow for flow in host_discrete[i:i+win]])
        #hmm = GaussianHMM(n_components=4)
        #hmm.fit(data)
        # store log-likelihood
        scores[ip] = hmm.decode(data)[0]
    else:
        scores[ip] = 0

# repeat the same process for malicious hosts
for ip in infected:
    host = df[df['src_ip'] == ip]
    host_discrete = encode_series(host,feats)
    size = len(host_discrete) - win
    if size > 0:
        data = np.zeros((size,win),dtype=np.int32)
        for i in range(size):
            data[i] = np.array([flow for flow in host_discrete[i:i+win]])
        #hmm = GaussianHMM(n_components=4)
        #hmm.fit(data)
        scores[ip] = hmm.decode(data)[0]
    else:
        scores[ip] = 0

  n_samples, n_components, np.log(self.startprob_),
  np.log(self.transmat_), framelogprob)


In [6]:
scores

{'147.32.84.170': -371771.3303285913,
 '147.32.84.134': 92355.66723097724,
 '147.32.84.164': 41055.06748533943,
 '147.32.87.36': 38146.9991537319,
 '147.32.80.9': 10389.412691994119,
 '147.32.87.11': 0,
 '147.32.84.165': 1201683.9407368088,
 '147.32.84.191': 1308862.7992681281,
 '147.32.84.192': 1311087.7355581948,
 '147.32.84.193': 1253031.158489874,
 '147.32.84.205': 1501728.1689783407,
 '147.32.84.206': 1419974.469399241,
 '147.32.84.207': 1332073.5384093663,
 '147.32.84.208': 1395245.0028217982,
 '147.32.84.209': 1225366.6078121033}