# Markov Model and Markov Chains - T-Pot - Cowrie Honeypot
author: Austin Hogan
contact: a1hogan@ryerson.ca

Hidden Markov models (HMM)

In [44]:
import numpy
print(numpy.__path__)

%matplotlib inline
import time
import pandas as pd
import random
import numpy
import matplotlib.pyplot as plt
import seaborn; seaborn.set_style('whitegrid')
import itertools

from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search
import pomegranate
from hmmviz import TransGraph
from hmmlearn import hmm

random.seed(0)
numpy.random.seed(0)
numpy.set_printoptions(suppress=True)

%load_ext watermark
%watermark -m -n -p numpy,scipy,pomegranate,pandas,hmmlearn

['/Users/aceinthesleeve/miniforge3/lib/python3.9/site-packages/numpy']
The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
numpy      : 1.22.4
scipy      : 1.8.1
pomegranate: 0.14.8
pandas     : 1.4.3
hmmlearn   : 0.2.7

Compiler    : Clang 13.0.1 
OS          : Darwin
Release     : 21.5.0
Machine     : arm64
Processor   : arm
CPU cores   : 8
Architecture: 64bit



In [45]:
def query_es_return_dataframe(index, query): # query es instance and return dataframe
    search_context = Search(using=es, index=index)
    s = search_context.query("query_string", query=query)
    response = s.execute()
    if response.success():
        df = pd.DataFrame((d.to_dict() for d in s.scan()))
    return df

In [46]:
# connect with local T-Pot instance, contains backup from honeypot cluster
es = Elasticsearch("http://localhost:9200")
indice_search_tag = "logstash-20*"

In [47]:
# query database for Cowrie honeypot data
df_tor = query_es_return_dataframe(indice_search_tag, "type.keyword: \"Cowrie\" AND ip_rep.keyword: \"tor exit node\"")
# none tor exit node data
df = query_es_return_dataframe(indice_search_tag, "type.keyword: \"Cowrie\" AND NOT ip_rep.keyword: \"tor exit node\"")
# show none tor dataframe
df.info()
# show tor dataframe
df_tor.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 340366 entries, 0 to 340365
Data columns (total 54 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   message          340299 non-null  object 
 1   ip_rep           75145 non-null   object 
 2   @timestamp       340366 non-null  object 
 3   keyAlgs          23677 non-null   object 
 4   type             340366 non-null  object 
 5   encCS            23677 non-null   object 
 6   host             340366 non-null  object 
 7   macCS            23677 non-null   object 
 8   compCS           23677 non-null   object 
 9   session          340366 non-null  object 
 10  tags             259515 non-null  object 
 11  timestamp        340366 non-null  object 
 12  kexAlgs          23677 non-null   object 
 13  @version         340366 non-null  object 
 14  geoip_ext        340366 non-null  object 
 15  langCS           23677 non-null   object 
 16  src_ip           340366 non-null  obje

In [52]:
# def process_dataframe(df):
    # sort unique event ids and drop unused values
    event_ids = df['eventid'].unique()
    unique_event_ids = []
    for ei in event_ids:
        if isinstance(ei, str):
            unique_event_ids.append(ei)
    unique_event_ids.sort()
    # pretty print unique event ids
    event_id_from_int = {}
    int_from_event_id = {}
    for i, unique_event_id in enumerate(unique_event_ids):
        print("{:02d} {}".format(i, unique_event_id))
        event_id_from_int[i] = unique_event_id
        int_from_event_id[unique_event_id] = i
    # resolve timestamp to real date
    df['@timestamp'] = pd.to_datetime(df['@timestamp'])
    # sort by date
    df = df.sort_values(['@timestamp'])
    # verify dates are sorted
    df['@timestamp']
    # group dataframe by session and aggregate eventids
    df_grouped = df.groupby("session")['eventid'].agg(list)
    # checking one example to verify sort and command order makes sense
    df_grouped[1]
    # calculate and show number of interactions per session
    number_of_interactions_per_session = len(df) / len(df_grouped)
    print("Average Number of Interactions per session {}".format(number_of_interactions_per_session))
    # calculate markov probabalities
    # note total transitions
    total_transitions = 0
    # transition dictionary collection
    transition_dict = {}
    # transition matrix (N X N) where N is the number of states
    for event_id in unique_event_ids:
        transition_dict[event_id] = [0] * len(unique_event_ids)
    # prepate data for fit
    np_fit_data = []
    np_fit_data_hmm = []
    np_fit_data_lengths = []
    # iterate over grouped session and state
    # populate transition dictionary
    for session, states in df_grouped.items():
        for i in range(len(states) - 1):
            transition_dict[states[i]][int_from_event_id[states[i+1]]] += 1
        # prepare data for fit
        states_as_ints = []
        states_as_ints_hmm = []
        for state in states:
            states_as_ints.append(int_from_event_id[state])
            states_as_ints_hmm.append([int_from_event_id[state]])
        np_fit_data.append(states_as_ints)
        np_fit_data_hmm += states_as_ints_hmm
    np_fit_data_lengths = list(map(len, np_fit_data))
    # print(np_fit_data)
    #
    # pomegranate models
    # model_pom = pomegranate.HiddenMarkovModel.from_samples(NormalDistribution, n_components=len(unique_event_ids), X=np_fit_data)
    # model_pom.bake()
    # model_pom.plot()
    # hmm learn models
    model_hmm = hmm.GaussianHMM(n_components=len(unique_event_ids)).fit(np_fit_data_hmm, np_fit_data_lengths)
    # create graph of system
    print(model_hmm.transmat_)
    graph = TransGraph(numpy.array(model_hmm.transmat))
    fig = plt.figure(figsize=(75, 75))
    graph.draw(
        nodefontsize=25,
    )
    plt.show()

In [53]:
process_dataframe(df)

00 cowrie.client.kex
01 cowrie.client.size
02 cowrie.client.var
03 cowrie.client.version
04 cowrie.command.failed
05 cowrie.command.input
06 cowrie.command.success
07 cowrie.direct-tcpip.data
08 cowrie.direct-tcpip.request
09 cowrie.log.closed
10 cowrie.login.failed
11 cowrie.login.success
12 cowrie.session.closed
13 cowrie.session.connect
14 cowrie.session.file_download
15 cowrie.session.file_download.failed
16 cowrie.session.file_upload
17 cowrie.session.params
Average Number of Interactions per session 3.3519395723979004
[[0.         0.         0.         0.         0.         0.
  0.9996611  0.         0.         0.00011185 0.00010896 0.
  0.00011572 0.         0.         0.         0.         0.00000237]
 [0.00090494 0.         0.8504053  0.00133872 0.13103688 0.
  0.         0.         0.00181633 0.0144773  0.0000001  0.
  0.00001212 0.         0.         0.         0.00000831 0.        ]
 [0.00616119 0.         0.19589951 0.         0.78482433 0.
  0.         0.         0.001003

NameError: name 'np' is not defined