In [2]:
from elasticsearch import Elasticsearch
from elasticsearch_dsl import connections
from elasticsearch_dsl import Search
from elasticsearch_dsl import Q
from elasticsearch_dsl.query import MultiMatch, Match
from collections import Counter, deque
from itertools import count
from uuid import uuid4

import distill
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
import json
import itertools
import networkx as nx
import hashlib, base64
import plotly.graph_objects as go

In [None]:
def createDiGraph(nodes, edges, *, drop_recursions: bool = False):
    """
    Build a NetworkX directed graph from a node collection and an edge list.

    :param nodes: Series or list of node labels (events, elements)
    :param edges: Series or list of (source, target) pairs
    :param drop_recursions: when True, edges whose source equals their target
        (self:self pairs) are left out of the graph
    :return: the populated NetworkX DiGraph
    """
    graph = nx.DiGraph()
    graph.add_nodes_from(nodes)

    # Guard clause: without the flag every edge is added untouched.
    if not (drop_recursions == True):
        graph.add_edges_from(edges)
        return graph

    # Flag set: keep only the pairs that connect two different nodes.
    graph.add_edges_from([pair for pair in edges if pair[0] != pair[1]])
    return graph


In [None]:
def pairwiseStag(iterable, *, split: bool = False):
    '''
    Creates sequence of staggered (non-overlapping) pairwise tuples for edge-lists:
    "s -> (s0, s1), (s2, s3), (s4, s5), ...". A trailing element without a partner is dropped.
    :param iterable: a series or list
    :param split: split=True returns the first and the second elements of the pairs as two
        separate tuples of the same length (default=False)
    :return: a list of 2-tuples, or a pair ``(firsts, seconds)`` when split=True; the pair is
        ``((), ())`` when the input holds fewer than two elements
    '''
    a = iter(iterable)
    # Zipping one iterator with itself consumes two items per pair.
    pairs = list(zip(a, a))
    if not split:
        return pairs
    if not pairs:
        # zip(*[]) yields nothing, so unpacking it into two names would raise ValueError.
        return (), ()
    list1, list2 = zip(*pairs)
    return list1, list2

In [None]:
def pairwiseSeq(iterable, *, split: bool = False):
    """
    Creates sequence of overlapping pairwise tuples that can be used as edge-lists:
    "s -> (s0, s1), (s1, s2), (s2, s3), ..."
    :param iterable: a series or list
    :param split: split=True returns the first and the second elements of the pairs as two
        separate tuples of the same length (default=False)
    :return: a list of 2-tuples, or a pair ``(firsts, seconds)`` when split=True; the pair is
        ``((), ())`` when the input holds fewer than two elements
    """
    a, b = itertools.tee(iterable, 2)
    # Advance the second copy by one so zip pairs each element with its successor.
    next(b, None)
    pairs = list(zip(a, b))
    if not split:
        return pairs
    if not pairs:
        # zip(*[]) yields nothing, so unpacking it into two names would raise ValueError.
        return (), ()
    list1, list2 = zip(*pairs)
    return list1, list2

# Define Search

Create a new connection to the test instance and give it the alias 'flagonTest' for later reference.
Note: connections are simple enough that examples suffice; no further abstraction is needed.

In [3]:
# Register a connection under the alias 'flagonTest'; the Search object defined later
# refers to it by that alias (using='flagonTest').
flagonClient = connections.create_connection('flagonTest', hosts=['localhost:9200'], timeout=60)

#TODO describe connections

#hello world test: print the client object returned by create_connection
print(flagonClient)

<Elasticsearch([{'host': 'localhost', 'port': 9200}])>


In [4]:
# Base Search object bound to the 'flagonTest' connection alias and the "userale" index;
# the queries and filters defined below are chained onto it.
AleS = Search(using='flagonTest', index="userale")

# Define Queries

## Simple Queries

In [5]:
# Match logs whose logType is "raw" OR "custom": `|` combines the two match queries
# into a single bool/should query (see the printed repr).
qLogType = Q("match", logType="raw") | Q("match", logType="custom")
print(qLogType)

Bool(should=[Match(logType='raw'), Match(logType='custom')])


In [6]:
# Match logs recorded for the user "superset-user".
qUserId = Q("match", userId="superset-user")
print(qUserId)

Match(userId='superset-user')


In [7]:
# Template for a session-based query: fill in the two empty sessionID strings before use.
# NOTE(review): despite the name, `&` builds a bool/must (AND) of two matches, i.e. it
# requires these sessionIDs rather than excluding them; an exclusion would negate the
# queries with `~`. This query is not part of the chained search in this notebook.
qExcludeSession = Q("match", sessionID="") & Q("match", sessionID="")
print(qExcludeSession)

Bool(must=[Match(sessionID=''), Match(sessionID='')])


## Not-As-Simple Queries

In [8]:
# Wildcard query built from a raw dict: matches logs whose pageUrl fits the pattern
# "*/superset/dashboard*".
qUrl = Q({"wildcard": {
    "pageUrl": {
        "value": "*/superset/dashboard*"
    }
}})
print(qUrl)

Wildcard(pageUrl={'value': '*/superset/dashboard*'})


# Define Filters

In [9]:
# Bool filter that drops noisy event types: `~` negates the terms query, so logs whose
# type is mouseover, wheel, keydown or scroll are excluded (must_not in the printed repr).
filterEvents = Q('bool', filter=[~Q('terms', type=['mouseover','wheel','keydown','scroll'])])
print(filterEvents)

Bool(filter=[Bool(must_not=[Terms(type=['mouseover', 'wheel', 'keydown', 'scroll'])])])


# Chained Searches

In [10]:
# Chain the queries and the filter onto the base search; successive .query() calls are
# combined, so a log must satisfy all of them to be returned.
elk_search = AleS \
    .query(qUrl) \
    .query(qLogType) \
    .query(qUserId) \
    .query(filterEvents) \
    .extra(track_total_hits=True) # report the exact total hit count instead of capping the count at 10000; does not change how many hits a request returns

NOTE: `.execute()` will only retrieve the first 10 hits unless additional terms (e.g., a size) are embedded in the query. Use `.scan()` instead if you want to retrieve all the hits; we use `.scan()` below.

In [11]:
# Pull every matching log with scan() and store the logs under running keys 1..N.
ale_dict = {}
elk_response = elk_search.scan()
for ctr, hit in enumerate(elk_response, start=1):
    logEntry = hit.to_dict()
    # The uid is derived from the log before clientTime is converted.
    logEntry['uid'] = distill.getUUID(logEntry)
    logEntry['clientTime'] = distill.epoch_to_datetime(logEntry['clientTime'])
    ale_dict[ctr] = logEntry

print(len(ale_dict))

545


# Data Forensics

In [12]:
# Order the logs chronologically by clientTime. dict() preserves the sorted insertion
# order, so iterating sorted_data yields the logs oldest-first under their original keys.
sorted_data = dict(sorted(ale_dict.items(), key = lambda kv: kv[1]['clientTime']))

In [13]:
# Collect the values of the 'sessionID' field across the logs (judging by the output
# below, each distinct value appears once — confirm against distill.find_meta_values).
sessions = distill.find_meta_values('sessionID', sorted_data)
sessions

['session_1640029398947',
 'session_1640200820004',
 'session_1640118177195',
 'session_1641844965430',
 'session_1641502434428',
 'session_1641584276813']

In [14]:
# Same lookup for the 'userId' field; only one user is expected because the search
# was restricted to "superset-user".
users = distill.find_meta_values('userId', sorted_data)
users 

['superset-user']

# Segmentation

In [18]:
# Keep only the logs that carry at least one of the listed field names (here: 'path').
values = ['path']
sorted_data_paths = {
    key: log
    for key, log in sorted_data.items()
    if any(field in values for field in log)
}
len(sorted_data_paths)

544

In [16]:
# Keep the logs in which at least one field value equals 'click'
# (every field of a log is inspected, not one particular field).
values = ['click']
sorted_data_paths_clicks = {
    key: log
    for key, log in sorted_data_paths.items()
    if any(entry in values for entry in log.values())
}
len(sorted_data_paths_clicks)

103

In [None]:
# Timestamps of the clicks whose path contains the world-map chart element, paired off
# as staggered (start, end) tuples: (t0, t1), (t2, t3), ...
value = 'div.superset-legacy-chart-world-map'
segment_times = pairwiseStag(
    [log['clientTime'] for log in sorted_data_paths_clicks.values() if value in log['path']]
)
# Number of (start, end) pairs found; the original referenced an undefined name here.
len(segment_times)

In [None]:
# One generated name per (start, end) pair: "segment0", "segment1", ...
segment_names = ["segment" + str(i) for i in range(len(segment_times))]

In [None]:
# Build one candidate segment per (name, time-pair) and list each segment's name,
# start/end values and number of logs.
# NOTE(review): `segment` is not imported in the import cell at the top of this notebook,
# so this cell raises NameError on a fresh kernel — add the appropriate import.
testSegment = segment.Segment.create_segment(sorted_data_paths_clicks, segment_names, segment_times)
for d in testSegment.values():
    print(d.segment_name, d.start_end_val, d.num_logs)

In [None]:
# Keep only the candidate segments that contain more than 20 logs.
large_segments = [d for d in testSegment.values() if d.num_logs > 20]
segment_names_2 = [d.segment_name for d in large_segments]
segment_times_2 = [d.start_end_val for d in large_segments]

In [None]:
# Write out the retained segments and show their keys; the graph cells below index
# finalSegments by key.
# NOTE(review): `segment` is not imported in the import cell at the top of this notebook,
# so this cell raises NameError on a fresh kernel — add the appropriate import.
finalSegments = segment.Segment.write_segment(sorted_data_paths_clicks, segment_names_2, segment_times_2)
finalSegments.keys()

# Graphs and Stats

In [None]:
# Consecutive (from, to) transitions between '|'-joined element paths within one segment.
# NOTE(review): '...' looks like a placeholder — substitute a key from finalSegments.keys().
# The statement was duplicated verbatim in the original; computing it once is equivalent.
edges_segmentN = pairwiseSeq(['|'.join(log['path']) for log in finalSegments['...'].values()])

In [None]:
# Distinct '|'-joined element paths of the segment; these become the graph nodes.
# NOTE(review): '...' looks like a placeholder — substitute a key from finalSegments.keys().
# The statement was duplicated verbatim in the original; computing it once is equivalent.
nodes_segmentN = {'|'.join(log['path']) for log in finalSegments['...'].values()}

In [None]:
# Directed graph of the segment: nodes are element paths, edges are the consecutive
# transitions; self-loops are kept (drop_recursions=False).
# The original passed nodes_semiAuto5 / nodes_semiAuto3, which are never defined in this
# notebook; the node set built above is nodes_segmentN.
G_segmentN = createDiGraph(nodes_segmentN, edges_segmentN, drop_recursions=False)

In [None]:
# Quick-look drawing of the segment graph; node labels are suppressed.
nx.draw(G_segmentN, with_labels=False)

In [None]:
# NOTE(review): identical to the previous cell — presumably meant to draw a second
# segment's graph for comparison; point it at that graph or remove the cell.
nx.draw(G_segmentN, with_labels=False)

In [None]:
# Summary statistic for the segment graph: NetworkX's average node connectivity.
nx.average_node_connectivity(G_segmentN)

# Enhanced Visualization

In [None]:
# Drop self-transitions, then count how often each distinct (source, target) edge occurs.
edge_list = [row for row in edges_segmentN if row[0] != row[1]]
edge_list_counter = Counter(edge_list)

source_list = [edge[0] for edge in edge_list_counter]
target_list = [edge[1] for edge in edge_list_counter]
value_list = list(edge_list_counter.values())

# Unique node labels in first-seen order; dict.fromkeys de-duplicates in a single pass
# instead of scanning a growing list for every edge endpoint.
nodes = list(dict.fromkeys(col for row in edge_list for col in row))
# Label -> position lookup, replacing one linear nodes.index() scan per edge.
node_position = {label: position for position, label in enumerate(nodes)}

# Sankey links refer to nodes by their position in `nodes`.
sources = [node_position[label] for label in source_list]
targets = [node_position[label] for label in target_list]
values = value_list

fig = go.Figure(data=[go.Sankey(
    node=dict(
        # Display only the first element of each '|'-joined path as the node label.
        label=[label.split("|")[0] for label in nodes],
    ),
    link=dict(
        source=sources,
        target=targets,
        value=values,
    ),
)])

fig.show()

# WIP

In [None]:
# Hash each log's '_'-joined path with MD5 and intersect the two sets of digests.
# NOTE(review): x and y are built from the same expression and the same '...' key, so the
# intersection is simply set(x); presumably two different segment keys were intended.
x = [hashlib.md5('_'.join(log['path']).encode('utf-8')).digest() for log in finalSegments['...'].values()]
y = [hashlib.md5('_'.join(log['path']).encode('utf-8')).digest() for log in finalSegments['...'].values()]
set(x) & set (y)

In [None]:
# Same overlap check on the plain '_'-joined path strings (no hashing).
# NOTE(review): x and y are built from the same expression and the same '...' key, so the
# intersection is simply set(x); presumably two different segment keys were intended.
x = ['_'.join(log['path']) for log in finalSegments['...'].values()]
y = ['_'.join(log['path']) for log in finalSegments['...'].values()]
set(x) & set(y)

In [None]:
# Graph edit distance between two segment graphs.
# NOTE(review): both arguments are the same graph here, so this compares G_segmentN with
# itself; presumably two different segment graphs were intended.
nx.graph_edit_distance(G_segmentN, G_segmentN)

In [None]:
# Exhaust the generator and keep the last value it yielded in minv, then display it.
# NOTE(review): both arguments are the same graph, and minv stays unbound (NameError on
# the last line) if the generator yields nothing.
for v in nx.optimize_graph_edit_distance(G_segmentN, G_segmentN):
    minv = v
minv