In [1]:
import plotly.express as px

In [2]:
import plotly as plt

In [3]:
import pandas as pd

In [4]:
import plotly.graph_objects as go

In [5]:
import numpy as np

In [6]:
import re

# Language spec
- Запрос задаёт последовательность ивентов
- Нужны:
    - Произвольный ивент (`*`)
    - Интервал времени между ивентами, например `(10 s)`
    - Один ивент из списка альтернатив, например `right|left`

In [None]:
up -> left -> right|left -> * -> (10 s) -> 

In [7]:
# Mock recording: uniform noise in [0, 1), transposed to (3, 1500).
# A fixed seed makes every Restart & Run All reproduce the same data.
timeseries = np.random.default_rng(42).random((1500, 3)).T

In [12]:
# Integer index 0..n-1 along the last axis of `timeseries`.
time = np.arange(timeseries.shape[-1])

In [13]:
# Load the mock event log and expose its `*_sec` columns under the shorter
# 'start' / 'end' names that the search helpers below read.
events_df = pd.read_csv('mock-session-events.csv').assign(
    start=lambda frame: frame['start_sec'],
    end=lambda frame: frame['end_sec'],
)


In [14]:
events_df.head()

Unnamed: 0.1,Unnamed: 0,start_sec,end_sec,type,start,end
0,0,0.5333,0.6333,down,0.5333,0.6333
1,1,0.5333,0.6333,left,0.5333,0.6333
2,2,0.93797,1.03797,down,0.93797,1.03797
3,3,1.410773,1.510773,right,1.410773,1.510773
4,4,1.684613,1.784613,up,1.684613,1.784613


# Grammar variant

In [46]:
# %pip installs into the environment of the running kernel (a bare `! pip`
# may hit a different interpreter). Pinned to the release recorded in this
# cell's output so the grammar cells behave the same on re-run.
%pip install lark==1.0.0


Collecting lark
  Downloading lark-1.0.0-py2.py3-none-any.whl (109 kB)
[K     |████████████████████████████████| 109 kB 1.4 MB/s eta 0:00:01
[?25hInstalling collected packages: lark
Successfully installed lark-1.0.0


In [47]:
from lark import Lark

Tree(Token('RULE', 'start'), [Token('WORD', 'Hello'), Token('WORD', 'World')])


In [117]:
# Lark grammar for the event query language.
#
# * start      - exactly one bracketed query_part, optionally preceded and/or
#                followed by another query_part joined with "->".
# * query_part - one node, or a right-recursive chain of nodes joined by "->".
# * node       - wildcard "*", interval "(<digits>s)", rejection "!<event>",
#                or a bare event name.
# * spaces are ignored everywhere.
#
# Raw string: the regex terminals contain \d and \w, which are invalid escape
# sequences in a regular string literal. The resulting text is unchanged.
grammar = r"""
start: query_part "->" "[" query_part "]" "->" query_part | "[" query_part "]" "->" query_part | query_part "->" "[" query_part "]" | "[" query_part "]"

query_part: connection | node

connection: node "->" query_part

node: wildcard | interval | rejection | EVENT

rejection: "!" EVENT

interval: "(" NUMBER "s" ")"

wildcard: "*"

NUMBER : /\d+/
EVENT: /\w+/

%ignore " "
"""

In [118]:
l = Lark(grammar)

In [119]:
tree = l.parse("up -> [left -> (10s) -> up] -> * -> !left") 

In [121]:
help(tree)

Help on Tree in module lark.tree object:

class Tree(builtins.object)
 |  Tree(data: str, children: 'List[Union[str, Tree]]', meta: Union[lark.tree.Meta, NoneType] = None) -> None
 |  
 |  The main tree class.
 |  
 |  Creates a new tree, and stores "data" and "children" in attributes of the same name.
 |  Trees can be hashed and compared.
 |  
 |  Parameters:
 |      data: The name of the rule or alias
 |      children: List of matched sub-rules and terminals
 |      meta: Line & Column numbers (if ``propagate_positions`` is enabled).
 |          meta attributes: line, column, start_pos, end_line, end_column, end_pos
 |  
 |  Methods defined here:
 |  
 |  __deepcopy__(self, memo)
 |  
 |  __eq__(self, other)
 |      Return self==value.
 |  
 |  __hash__(self) -> int
 |      Return hash(self).
 |  
 |  __init__(self, data: str, children: 'List[Union[str, Tree]]', meta: Union[lark.tree.Meta, NoneType] = None) -> None
 |      Initialize self.  See help(type(self)) for accurate signature

In [122]:
# Refer to the parsed tree by name: `_` (last cell output) depends on which
# cell happened to run last and breaks on Restart & Run All.
print(tree.pretty())

start
  query_part
    node	up
  query_part
    connection
      node	left
      query_part
        connection
          node
            interval	10
          query_part
            node	up
  query_part
    connection
      node
        wildcard
      query_part
        node
          rejection	left



# DL

In [15]:
def create_events_dictionary(events):
    """Assign every event name a unique one-character code.

    For each event its characters are tried left to right; the first one that
    no earlier event has claimed becomes that event's code.

    Parameters
    ----------
    events : iterable of str
        Event names. Repeated names are ignored after their first occurrence.

    Returns
    -------
    dict
        Mapping ``event name -> single-character code`` in input order.

    Raises
    ------
    AssertionError
        If no character of an event name is still free (this includes an
        empty name).
    """
    events_dict = {}
    used_letters = set()  # O(1) membership instead of scanning dict values

    for event in events:
        if event in events_dict:
            continue  # already coded; do not burn a second letter on it
        for letter in event:
            if letter not in used_letters:
                events_dict[event] = letter
                used_letters.add(letter)
                break
        else:
            # Only reached when *every* letter was tried and none was free.
            raise AssertionError("No such letter")

    return events_dict

In [16]:
unique_events = events_df['type'].unique()

In [17]:
events_dict = create_events_dictionary(unique_events)

In [18]:
def get_eventql_source_string(events_df):
    """Encode the event sequence as a string of one-character codes.

    Each value of ``events_df['type']`` is looked up in the module-level
    ``events_dict`` and the codes are concatenated in row order.
    """
    codes = [events_dict.get(event_type) for event_type in events_df['type']]
    return "".join(codes)

In [19]:
source_string = get_eventql_source_string(events_df)
source_string

'dldruldruldluurdluldrdlurlrldruldrludrlrlurlurlrlurlruldruldruldruldrlrlrdrldruldruldrulrldurdlurdlrlurld'

In [None]:
def extract_event_regex(event):
    """Translate one query node into a regex fragment over event codes.

    Supported forms (surrounding whitespace is ignored):

    * ``*``       -> character class matching any single known event code
    * ``a|b|...`` -> alternation of the translated alternatives
    * ``name``    -> the code stored for ``name`` in the module-level
                     ``events_dict``

    Raises
    ------
    KeyError
        If an event name is not present in ``events_dict``.
    """
    event = event.strip()
    if event == "*":
        # The encoded source string is made of the codes (dict values), so
        # the wildcard class must be built from them, not from the names.
        letters = "".join(events_dict.values())
        return f"[{letters}]"
    if "|" in event:
        alternatives = [extract_event_regex(part) for part in event.split('|')]
        joined = "|".join(alternatives)
        # Non-capturing, so it does not shift the numbering of other groups.
        return f"(?:{joined})"
    return events_dict[event]

In [20]:
def extract_sub_regex(eventql_string):
    """Convert an arrow-separated chain of event names into their codes.

    The input is split on ``->``; empty pieces (from leading/trailing arrows
    or an empty input) are skipped, and each remaining name is looked up in
    the module-level ``events_dict`` (unknown names raise ``KeyError``).
    """
    return "".join(
        events_dict[name]
        for name in eventql_string.split('->')
        if name
    )

In [21]:
def extract_regex(eventql_string):
    """Build the search regex for a query of the form ``head [body] tail``.

    Spaces are removed, then the query is cut at ``[`` and ``]`` (exactly one
    of each is expected, otherwise the tuple unpacking raises ``ValueError``).
    Each part is translated with ``extract_sub_regex``; the result matches the
    body as a group, with head as a lookbehind and tail as a lookahead.
    """
    compact = eventql_string.replace(' ', '')
    prefix, remainder = compact.split('[')
    target, suffix = remainder.split(']')

    before = extract_sub_regex(prefix)
    inside = extract_sub_regex(target)
    after = extract_sub_regex(suffix)

    return "(?<=" + before + ")(" + inside + ")(?=" + after + ")"

In [39]:
eventql_regex = extract_regex('[up -> right] -> left')

In [40]:
eventql_regex

'(?<=)(ur)(?=l)'

In [41]:
source_string

'dldruldruldluurdluldrdlurlrldruldrludrlrlurlurlrlurlruldruldruldruldrlrlrdrldruldruldrulrldurdlurdlrlurld'

In [42]:
def search_regex_indices(source_string, regex, events=None):
    """Find regex matches in the encoded event string and map them to times.

    Character ``i`` of ``source_string`` corresponds to row ``i`` of the
    events table. Each match becomes one fragment spanning from the ``start``
    of its first event to the ``end`` of its last event.

    Parameters
    ----------
    source_string : str
        Event sequence encoded one character per event.
    regex : str
        Pattern to search for.
    events : pandas.DataFrame, optional
        Table with ``start`` and ``end`` columns, one row per character of
        ``source_string``. Defaults to the module-level ``events_df``.

    Returns
    -------
    pandas.DataFrame
        Columns ``start``, ``end`` and an empty-string ``type``; zero rows
        when nothing matches.

    Notes
    -----
    The match positions are printed, as before.
    """
    if events is None:
        events = events_df

    # Zero-width matches cover no event; keeping them would give end < start
    # (and index -1 at position 0), so they are dropped.
    spans = [
        (m.start(0), m.end(0) - 1)
        for m in re.finditer(regex, source_string)
        if m.end(0) > m.start(0)
    ]
    # Explicit integer dtype keeps positional indexing valid with no matches.
    first_idx = np.array([first for first, _ in spans], dtype=np.intp)
    last_idx = np.array([last for _, last in spans], dtype=np.intp)

    positions_df = pd.DataFrame({'start': first_idx, 'end': last_idx})

    print(positions_df)

    fragments_df = pd.DataFrame({
        'start': events['start'].to_numpy()[first_idx],
        'end': events['end'].to_numpy()[last_idx],
    })
    fragments_df['type'] = ''

    return fragments_df

In [43]:
fragments_df = search_regex_indices(source_string, eventql_regex)

   start  end
0     23   24
1     41   42
2     44   45
3     49   50
4    101  102


In [45]:
fragments_df

Unnamed: 0,start,end,type
0,6.301649,6.709698,
1,11.680421,11.918487,
2,12.480304,12.805356,
3,13.525066,13.898259,
4,27.722666,28.176576,


# Подсчет статистик по набору фрагментов

In [29]:
from scipy import stats

In [34]:
def calculate_simple_statistics(timeseries, fragments_df, sampling_rate=1.0):
    """Summarise a set of fragments and score each channel over them.

    Parameters
    ----------
    timeseries : numpy.ndarray
        2-D array indexed as ``[channel, sample]``.
    fragments_df : pandas.DataFrame
        One row per fragment with ``start`` and ``end`` columns.
    sampling_rate : float, optional
        Factor converting ``start``/``end`` into sample indices
        (``int(value * sampling_rate)``). The default of 1.0 uses the values
        directly as indices.
        NOTE(review): the event table stores seconds - confirm the real rate.

    Returns
    -------
    dict
        ``count``  - number of fragments;
        ``length`` - mean of ``end - start`` (0 when there are no fragments);
        ``scores`` - per channel, ``norm.cdf(mean / std * sqrt(n))`` over the
        samples pooled from all fragments. ``[]`` when there are no
        fragments, all-NaN when the fragments select no samples.
    """
    fragments_count = fragments_df.shape[0]
    if fragments_count == 0:
        return {
            'count': 0,
            'length': 0,
            'scores': []
        }

    # Duration is end minus start (the reverse order yields negative lengths).
    mean_fragment_length = (fragments_df['end'] - fragments_df['start']).mean()

    timeseries_parts = [
        timeseries[:, int(start * sampling_rate):int(end * sampling_rate)]
        for start, end in zip(fragments_df['start'], fragments_df['end'])
    ]
    timeseries_parts_concat = np.hstack(timeseries_parts)

    # The statistic is computed on the pooled samples, so n must be the pooled
    # sample count, not the size of whichever fragment came last.
    n_points = timeseries_parts_concat.shape[-1]

    if n_points == 0:
        # No samples selected: undefined score per channel, without triggering
        # numpy's empty-slice warnings.
        channel_scores = np.full(timeseries_parts_concat.shape[0], np.nan)
    else:
        channel_means = timeseries_parts_concat.mean(axis=1)
        channel_stds = timeseries_parts_concat.std(axis=1)
        channel_scores = stats.norm.cdf(
            channel_means / channel_stds * np.sqrt(n_points)
        )

    return {
        'count': fragments_count,
        'length': mean_fragment_length,
        'scores': channel_scores
    }

In [35]:
calculate_simple_statistics(timeseries, fragments_df)

{'count': 0, 'length': 0, 'scores': []}

# Параметры:
- Показывать/не показывать ивенты
- Текущий regex (регулярное выражение) событий
- 