# Distill Example

### License

In [1]:
#
# Copyright 2022 The Applied Research Laboratory for Intelligence and Security (ARLIS)
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

### Imports Used in this Example

In [2]:
from elasticsearch import Elasticsearch
from elasticsearch_dsl import connections
from elasticsearch_dsl import Search
from elasticsearch_dsl import Q
from elasticsearch_dsl.query import MultiMatch, Match
from collections import Counter, deque
from itertools import count
from uuid import uuid4

import distill
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
import json
import itertools
import networkx as nx
import hashlib, base64
import plotly.graph_objects as go

## Define Search into Logging Database

Using Elasticsearch as a backend, we can create a new connection to a test instance and define a search object based on that instance and a specific index to search.

In [3]:
# Register an Elasticsearch connection under the alias 'flagonTest'. Later cells
# refer to this connection by alias (via `using='flagonTest'`), not by the
# returned client object.
flagonClient = connections.create_connection('flagonTest', hosts=['localhost:9200'], timeout=60)

# TODO: describe connections

# Smoke test: print the client repr to confirm the connection object was created.
print(flagonClient)

<Elasticsearch([{'host': 'localhost', 'port': 9200}])>


In [4]:
# Base Search object bound to the 'flagonTest' connection alias and the
# "userale" index; the queries defined below are chained onto it.
AleS = Search(using='flagonTest', index="userale")

## Define Queries against Log Data

### Simple Queries

In [5]:
# Match logs whose logType is either "raw" or "custom". Combining two queries
# with `|` produces a Bool query with a `should` clause (see printed repr).
qLogType = Q("match", logType="raw") | Q("match", logType="custom")
print(qLogType)

Bool(should=[Match(logType='raw'), Match(logType='custom')])


In [6]:
# Match logs written for a single user id.
qUserId = Q("match", userId="superset-user")
print(qUserId)

Match(userId='superset-user')


In [7]:
# Template for a session-based query: both operands are identical and use an
# empty sessionID, so fill in real session ids before using it.
# NOTE(review): despite the "Exclude" name, `&` builds a Bool `must` clause
# (see printed repr); excluding sessions would presumably need `~` -- confirm intent.
# This query is not part of the chained search defined later in this notebook.
qExcludeSession = Q("match", sessionID="") & Q("match", sessionID="")
print(qExcludeSession)

Bool(must=[Match(sessionID=''), Match(sessionID='')])


### Not-As-Simple Queries

In [8]:
# Build a query from a raw dict: a wildcard match on `pageUrl` that keeps any
# log whose URL contains "/superset/dashboard".
qUrl = Q({"wildcard": {
    "pageUrl": {
        "value": "*/superset/dashboard*"
    }
}})
print(qUrl)

Wildcard(pageUrl={'value': '*/superset/dashboard*'})


### Define Filters

In [9]:
# Filter that drops noisy low-level events: `~` negates the terms query, which
# shows up as a `must_not` inside the Bool filter (see printed repr).
filterEvents = Q('bool', filter=[~Q('terms', type=['keydown', 'mousedown', 'mouseup'])])
print(filterEvents)

Bool(filter=[Bool(must_not=[Terms(type=['keydown', 'mousedown', 'mouseup'])])])


## Chained Searches

In [10]:
# Chain every query/filter onto the base Search object. Each `.query()` call
# returns a new Search, so `AleS` itself is left untouched.
elk_search = (
    AleS
    .query(qUrl)
    .query(qLogType)
    .query(qUserId)
    .query(filterEvents)
    # Ask Elasticsearch for an exact total hit count; by default the reported
    # total stops counting at 10,000 hits.
    .extra(track_total_hits=True)
)

NOTE: `.execute()` will only retrieve the first 10 hits with additional terms embedded in queries. Use `.scan()` instead if you want to retrieve all the hits. We use `.scan()` below so that every matching log is returned.

In [11]:
# Pull every matching hit from Elasticsearch and store it in a dict keyed by a
# 1-based running index.
ale_dict = {}
elk_response = elk_search.scan()
for position, hit in enumerate(elk_response, start=1):
    logEntry = hit.to_dict()
    # Attach a Distill-generated unique id and convert the client timestamp
    # with Distill's epoch_to_datetime helper.
    logEntry['uid'] = distill.getUUID(logEntry)
    logEntry['clientTime'] = distill.epoch_to_datetime(logEntry['clientTime'])
    ale_dict[position] = logEntry

print(len(ale_dict))

11761


## Data Forensics
Data Forensics refers to ascertaining what is in our data. We may decide that we filtered too much or too little, and want to re-run our scan through ELK. Or, we may decide just how to apply filters as we go and "carve" out new dictionaries with less data, but more of the data we want. The following examples illustrate how to work with UserALE data in a dictionary format to perform data forensics.

### Sorting
Getting User logs into a logical sequence can aid in a number of operations down the line.

A simple lambda function helps in sorting our user log dict by `clientTime` (when logs were written by the client).

In [12]:
# Order the log keys by each log's clientTime, then rebuild the dict in that
# order (dicts preserve insertion order, and the sort is stable for ties).
ordered_keys = sorted(ale_dict, key=lambda key: ale_dict[key]['clientTime'])
sorted_data = {key: ale_dict[key] for key in ordered_keys}
len(sorted_data)

11761

### Searching
Before we can filter out what we don't want in our data, we have to be able to describe what we do and don't want.  Dictionaries are a fast and efficient way to search through data and Distill provides some supporting libraries for finding the information you want from your user logs.

Distill's `find_meta_values` function uses list comprehensions to quickly provide all the unique values for a specific key (e.g., `sessionID`, `userId`).

In [13]:
# Collect the unique `sessionID` values found across the time-sorted logs.
sessions = distill.find_meta_values('sessionID', sorted_data)
sessions

['session_1642561069785',
 'session_1642012917325',
 'session_1640118177195',
 'session_1642013755036',
 'session_1641502434428',
 'session_1641584276813',
 'session_1642562635205',
 'session_1641844965430',
 'session_1640029398947',
 'session_1642004982781',
 'session_1640200820004']

In [14]:
# Collect the unique `userId` values; only one is expected here because the
# search was already restricted to a single user.
users = distill.find_meta_values('userId', sorted_data)
users 

['superset-user']

Relying on the dictionary format, we can quickly create new dictionaries with certain characteristics using simple dictionary comprehensions (e.g., a dictionary with all logs that contain the key: `path`; a dictionary with all logs where `type` == `click`).

In [15]:
# Keep only the logs that carry at least one of the required keys.
values = ['path']
sorted_data_paths = {
    uid: log
    for uid, log in sorted_data.items()
    if any(field in values for field in log)
}
len(sorted_data_paths)

11758

In [17]:
# Keep only the logs in which some field's value equals one of the wanted
# values (here: any field equal to 'click').
values = ['click']
sorted_data_paths_clicks = {
    uid: log
    for uid, log in sorted_data_paths.items()
    if any(field_value in values for field_value in log.values())
}
len(sorted_data_paths_clicks)

390

Using the same methods, we can find all logs that refer to a specific DOM element in the field `path`. 

In [19]:
# Keep only the logs whose `path` contains one of the target DOM elements.
ele = ['div.superset-legacy-chart-world-map']
sorted_data_pathele = {
    uid: log
    for uid, log in sorted_data_paths.items()
    if any(node in ele for node in log['path'])
}
len(sorted_data_pathele)

1883

## Segmentation
User data is always nested in time--the things they do, explore, and select are bound to time. Segmentation is the practice of slicing a series of data into a set of epochs (time-bound bins of logs) defined by some characteristic. They can be very general (e.g., 30 second, non-overlapping intervals starting from the beginning of a user session), or they can be very specific (e.g., an epoch when users were interacting with a specific UI element with filters set). Segmentation is generally very challenging and in the realm of 'advanced user analytics'--Distill makes segmentation much easier. 

We want to be able to create and curate segments without having to rewrite new datasets every time. We're going to start by creating a Master Dictionary for all our interesting segments:

In [20]:
# Master dictionary that accumulates every segment of interest; it starts empty
# and is filled by later cells via `.update()`.
superSegments = {}
superSegments

{}

### Finding Deadspace
One of the simplest ways of defining an epoch is that nothing is in it! We call this "deadspace"--a user might have started doing something else AFK, but we have no user behavior to indicate they've switched tasks (e.g., a `blur` event). Deadspace can be useful to identify; we can omit it from other segments if we need to. Distill's `detect_deadspace` function is helpful for finding deadspace:

In [56]:
# Detect deadspace segments in the path-bearing logs.
# NOTE(review): the second argument (60) is presumably the inactivity threshold
# in seconds and the trailing zeros padding before/after -- confirm against the
# Distill detect_deadspace documentation.
deadSpaceSegments = distill.detect_deadspace(sorted_data_paths, 60, 0, 0)
for idx, segment in enumerate(deadSpaceSegments.values(), start=1):
    # Give each segment a sequential, human-readable name.
    segment.segment_name = f"deadSpace{idx}"
    # Segment duration: (end - start) / 1000 (assumes start_end_val is in
    # epoch milliseconds, as the 13-digit printed values suggest).
    begin = segment.start_end_val[0]
    finish = segment.start_end_val[1]
    segment.segment_length_sec = (finish - begin) / 1000
    print(segment.segment_name, segment.segment_type, segment.start_end_val, segment.segment_length_sec, segment.num_logs, segment.uids)

deadSpace1 Segment_Type.DEADSPACE (1640029869288, 1640030063271) 193.983 4 [350, 301, 346, 347]
deadSpace2 Segment_Type.DEADSPACE (1640030219069, 1640098958897) 68739.828 2 [840, 924]
deadSpace3 Segment_Type.DEADSPACE (1640098969598, 1640099715182) 745.584 2 [991, 990]
deadSpace4 Segment_Type.DEADSPACE (1640099716927, 1640118186776) 18469.849 3 [1007, 674, 1126]
deadSpace5 Segment_Type.DEADSPACE (1640118296632, 1640182567282) 64270.65 6 [782, 981, 1121, 1222, 1227, 1241]
deadSpace6 Segment_Type.DEADSPACE (1640182573419, 1640201030250) 18456.831 3 [1231, 1426, 1589]
deadSpace7 Segment_Type.DEADSPACE (1640201075969, 1640201157009) 81.04 2 [1651, 1819]
deadSpace8 Segment_Type.DEADSPACE (1640201172007, 1640201332691) 160.684 2 [1812, 1869]
deadSpace9 Segment_Type.DEADSPACE (1640201346787, 1640201425011) 78.224 4 [1974, 2102, 2118, 2127]
deadSpace10 Segment_Type.DEADSPACE (1640201494535, 1640201568436) 73.901 4 [2183, 2184, 2185, 2186]
deadSpace11 Segment_Type.DEADSPACE (1640201578551, 1640

Seems like deadspace is everywhere; could be that what we thought might be deadspace is really normal use. We can quickly regenerate segments with modified time parameters (real deadspace is a longer absence of behavior):

In [58]:
# Re-run deadspace detection with a larger second argument (180 instead of 60),
# which yields fewer short segments in the printed output.
# NOTE(review): presumably the inactivity threshold in seconds -- confirm
# against the Distill detect_deadspace documentation.
deadSpaceSegments = distill.detect_deadspace(sorted_data_paths, 180, 0, 0)
for idx, segment in enumerate(deadSpaceSegments.values(), start=1):
    # Give each segment a sequential, human-readable name.
    segment.segment_name = f"deadSpace{idx}"
    # Segment duration: (end - start) / 1000 (assumes start_end_val is in
    # epoch milliseconds, as the 13-digit printed values suggest).
    begin = segment.start_end_val[0]
    finish = segment.start_end_val[1]
    segment.segment_length_sec = (finish - begin) / 1000
    print(segment.segment_name, segment.segment_type, segment.start_end_val, segment.segment_length_sec, segment.num_logs, segment.uids)

deadSpace1 Segment_Type.DEADSPACE (1640029869288, 1640030063271) 193.983 4 [350, 301, 346, 347]
deadSpace2 Segment_Type.DEADSPACE (1640030219069, 1640098958897) 68739.828 2 [840, 924]
deadSpace3 Segment_Type.DEADSPACE (1640098969598, 1640099715182) 745.584 2 [991, 990]
deadSpace4 Segment_Type.DEADSPACE (1640099716927, 1640118186776) 18469.849 3 [1007, 674, 1126]
deadSpace5 Segment_Type.DEADSPACE (1640118296632, 1640182567282) 64270.65 6 [782, 981, 1121, 1222, 1227, 1241]
deadSpace6 Segment_Type.DEADSPACE (1640182573419, 1640201030250) 18456.831 3 [1231, 1426, 1589]
deadSpace7 Segment_Type.DEADSPACE (1640202073587, 1641502449931) 1300376.344 3 [3040, 3049, 3061]
deadSpace8 Segment_Type.DEADSPACE (1641502510070, 1641584293776) 81783.706 3 [3018, 4019, 4020]
deadSpace9 Segment_Type.DEADSPACE (1641584821749, 1641585078332) 256.583 2 [3315, 3316]
deadSpace10 Segment_Type.DEADSPACE (1641585237896, 1641585458459) 220.563 2 [3580, 3408]
deadSpace11 Segment_Type.DEADSPACE (1641585467013, 164184

Add the deadspace segments that seem more reasonable to the Master Dictionary of segments:

In [None]:
# Merge the deadspace segments into the master dictionary (mutates
# superSegments in place), then list everything it now holds.
superSegments.update(deadSpaceSegments)
for segment in superSegments.values():
    print(
        segment.segment_name,
        segment.segment_type,
        segment.start_end_val,
        segment.num_logs,
        segment.uids,
    )

### Simple Segments - Toggles

In [None]:
# DOM element selectors for the toggle controls of interest.
# NOTE(review): not referenced by any other cell shown in this notebook.
toggleEle = ['button.ant-btn superset-button css-1mljg09', 'div#chart-id-515.filter_box']

In [None]:
# Generate segments around logs whose `path` field contains the world-map chart
# element or `window`.
# NOTE(review): the trailing 0 and 30 are presumably the time windows (seconds)
# before/after each matching log -- confirm against the Distill
# generate_segments documentation.
mapSegments = distill.generate_segments(
    sorted_data_paths,
    'path',
    ['div.superset-legacy-chart-world-map', 'window'],
    0,
    30,
)
for idx, segment in enumerate(mapSegments.values(), start=1):
    # Give each segment a sequential, human-readable name.
    segment.segment_name = f"map_{idx}"
    print(segment.segment_name, segment.start_end_val, segment.num_logs, segment.uids)

In [None]:

# Python code to merge dict using update() method
def Merge(dict1, dict2):
    """Merge ``dict1`` into ``dict2`` in place and return the merged dict.

    Keys present in both dictionaries take their value from ``dict1``.

    Args:
        dict1: Mapping whose items are copied into ``dict2``; not modified.
        dict2: Dictionary that receives the items; modified in place.

    Returns:
        ``dict2`` itself, now containing the items of both dictionaries.
    """
    # dict.update() mutates in place and returns None, so its result must not
    # be returned directly; return the updated dictionary instead.
    dict2.update(dict1)
    return dict2

In [None]:
# Collect the names and time bounds of the map segments that contain more than
# 50 logs; smaller segments are skipped.
mapSegments_list = []
mapSegment_times = []
for segment in mapSegments.values():
    if not segment.num_logs > 50:
        continue
    mapSegments_list.append(segment.segment_name)
    mapSegment_times.append(segment.start_end_val)
    print(segment.segment_name, segment.start_end_val, segment.num_logs, segment.uids)

In [None]:
# Keep only the logs in which some field's value equals one of the wanted
# values. The filter list defined in this cell is used directly, so the result
# does not depend on whatever the global `values` currently holds.
filterDataBy = ['click']
sorted_data_paths_clicks = {
    k: v
    for k, v in sorted_data_paths.items()
    if any(item in filterDataBy for item in v.values())
}
len(sorted_data_paths_clicks)

In [None]:
# Write the click logs into the selected map segments, pairing each segment
# name with its time bounds (the two lists were built in parallel).
# The result is indexed by segment name in later cells (e.g. 'map_1').
mapSegments_data = distill.write_segment(sorted_data_paths_clicks, mapSegments_list, mapSegment_times)
list(mapSegments_data.keys())

## Graphs and Stats

In [None]:
# Represent each log by its DOM path joined with '|', then pair consecutive
# logs to obtain the edges (transitions) of each segment.
# NOTE(review): if pairwiseSeq returns a one-shot iterator, list() below would
# consume it and later uses of edges_map_1/edges_map_2 would see nothing --
# confirm that it returns a list.
map_1_paths = ['|'.join(log['path']) for log in mapSegments_data['map_1'].values()]
edges_map_1 = distill.pairwiseSeq(map_1_paths)
edges_list_map_1 = list(edges_map_1)

map_2_paths = ['|'.join(log['path']) for log in mapSegments_data['map_2'].values()]
edges_map_2 = distill.pairwiseSeq(map_2_paths)
edges_list_map_2 = list(edges_map_2)

In [None]:
# The nodes of each segment graph are the distinct '|'-joined DOM paths.
nodes_map_1 = {'|'.join(log['path']) for log in mapSegments_data['map_1'].values()}
nodes_list_map_1 = list(nodes_map_1)
nodes_map_2 = {'|'.join(log['path']) for log in mapSegments_data['map_2'].values()}
nodes_list_map_2 = list(nodes_map_2)

In [None]:
# Build one directed graph per segment from its node and edge lists.
# NOTE(review): drop_recursions=False presumably keeps self-loops (an element
# followed by itself) -- confirm against the Distill createDiGraph documentation.
G_map1 = distill.createDiGraph(nodes_list_map_1, edges_list_map_1, drop_recursions = False)
G_map2 = distill.createDiGraph(nodes_list_map_2, edges_list_map_2, drop_recursions = False)

In [None]:
# Draw the map_1 graph; labels are turned off in this call.
nx.draw(G_map1, with_labels=False)

In [None]:
# Draw the map_2 graph; labels are turned off in this call.
nx.draw(G_map2, with_labels=False)

In [None]:
# Summary statistic for the map_2 graph: NetworkX average node connectivity.
nx.average_node_connectivity(G_map2)

## Exploratory Visualization

In [None]:
# Sankey diagram of the map_2 transitions; each node is labelled with the first
# element of its '|'-joined path.
distill.sankey(edges_map_2, [node.split("|")[0] for node in nodes_list_map_2])

In [None]:
# Placeholder: this only references the `funnel` attribute without calling it,
# so the cell displays the object rather than producing a funnel chart.
distill.funnel

In [None]:
#clickRate

In [None]:
# Build a Sankey diagram by hand from a list of (source, target) edges.
# NOTE(review): `edges_segmentN` is a placeholder that is not defined in the
# cells shown in this notebook -- substitute a real edge list (e.g. one of the
# edges_list_map_* lists) before running.

# Drop self-transitions (edges whose source and target are the same element).
edge_list = [row for row in edges_segmentN if row[0] != row[1]]

# Count how many times each distinct edge occurs; the count is the link width.
edge_list_counter = Counter(edge_list)

source_list = [edge[0] for edge in edge_list_counter]
target_list = [edge[1] for edge in edge_list_counter]
value_list = list(edge_list_counter.values())

# Distinct nodes in first-seen order. dict.fromkeys de-duplicates in one pass
# instead of re-scanning the list for every element.
nodes = list(dict.fromkeys(col for row in edge_list for col in row))
# Position of each node in `nodes`, so each lookup is constant-time rather
# than a list.index() scan.
node_index = {node: position for position, node in enumerate(nodes)}

sources = [node_index[name] for name in source_list]
targets = [node_index[name] for name in target_list]
# NOTE: this rebinds the global `values`, which earlier cells used as a filter list.
values = value_list

fig = go.Figure(data=[go.Sankey(
    node=dict(
        # Label each node with the first element of its '|'-joined path.
        label=[node.split("|")[0] for node in nodes],
    ),
    link=dict(
        source=sources,
        target=targets,
        value=values,
    ),
)])

fig.show()

# WIP

In [None]:
# WIP: compare two segments by the MD5 digests of their '_'-joined paths and
# show the digests they have in common.
# NOTE(review): `finalSegments` is not defined in the cells shown here, and
# both lines use the same '...' placeholder key, so x and y are identical as
# written -- fill in two different segment names.
x = [hashlib.md5('_'.join(log['path']).encode('utf-8')).digest() for log in finalSegments['...'].values()]
y = [hashlib.md5('_'.join(log['path']).encode('utf-8')).digest() for log in finalSegments['...'].values()]
set(x) & set (y)

In [None]:
# WIP: same comparison on the raw '_'-joined paths instead of their digests.
# NOTE(review): `finalSegments` is not defined in the cells shown here, and
# both lines use the same '...' placeholder key, so x and y are identical as
# written -- fill in two different segment names.
x = ['_'.join(log['path']) for log in finalSegments['...'].values()]
y = ['_'.join(log['path']) for log in finalSegments['...'].values()]
set(x) & set(y)

In [None]:
# WIP: graph edit distance between two segment graphs.
# NOTE(review): `G_segmentN` is a placeholder not defined in the cells shown
# here, and the same graph is passed twice -- substitute two real graphs.
nx.graph_edit_distance(G_segmentN, G_segmentN)

In [None]:
# WIP: iterate over the values yielded by optimize_graph_edit_distance and keep
# the last one. Per the NetworkX documentation the generator yields
# successively improved approximations, so the last value is the best estimate.
# NOTE(review): `G_segmentN` is a placeholder not defined in the cells shown
# here, and the same graph is passed twice -- substitute two real graphs.
for v in nx.optimize_graph_edit_distance(G_segmentN, G_segmentN):
    minv = v
minv