# core

> Read and query chronicle parquet files.

In [None]:
#| default_exp core

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
import polars as pl
import pyarrow.parquet as pq
import pyarrow.dataset as ds
from s3fs import S3FileSystem
import pandas as pd
import plotly.express as px
from fastcore.basics import patch
import re

## Read chronicle parquet files

Chronicle collects and stores logs and metrics in a series of parquet files.

Use `read_chronicle()` to read either logs or metrics, by specifying the path to the parquet set you need.

The file tree looks like this, with `logs` and `metrics` in separate folders inside `v1`.

``` bash
.
└── v1/
    ├── logs/
    └── metrics/
```

Inside both `logs` and `metrics` the data is stored by date, separated by year, month and day.

``` bash
.
└── v1/
    ├── logs/
    │   └── 2023/
    │       ├── 02/
    │       │   ├── 01
    │       │   ├── 02
    │       │   ├── 03
    │       │   ├── 04
    │       │   ├── 05
    │       │   └── ...
    │       ├── 03
    │       ├── 04
    │       └── ...
    └── metrics/
        └── 2023/
            ├── 02/
            │   ├── 01
            │   ├── 02
            │   ├── 03
            │   ├── 04
            │   ├── 05
            │   └── ...
            ├── 03
            ├── 04
            └── ...
```

## Using the read interface

In [None]:
#| hide
#| export
def read_chronicle(
        path: str, # Path to dataset
        type: str = "", # must be `metrics` or `logs`
        date: str = "", # date in format `YYYY/MM/DD` (`YYYY-MM-DD` is also accepted)
        version: str = "v1" # currently must be `v1`
    ) -> pl.DataFrame:
    "Read a chronicle parquet file into a polars dataframe."
    # Data is stored in nested year/month/day folders, so accept `-` as a
    # date separator as well (as `scan_chronicle` does) by mapping it to `/`.
    date = date.replace("-", "/")
    dataset_path = f"{path}/{version}/{type}/{date}"
    # pyarrow reads the parquet data; polars then wraps the arrow table.
    return pl.from_arrow(pq.read_table(dataset_path))



In [None]:
#| export
def read_chronicle_metrics(
        path: str, # Path to dataset
        date: str = "", # date in format `YYYY/MM/DD`
        version: str = "v1" # currently must be `v1`
) -> pl.DataFrame:
    "Read the chronicle `metrics` parquet data into a polars dataframe."
    # Thin wrapper: delegate to `read_chronicle` with the type fixed to metrics.
    return read_chronicle(path=path, type="metrics", date=date, version=version)

def read_chronicle_logs(
        path: str, # Path to dataset
        date: str = "", # date in format `YYYY/MM/DD`
        version: str = "v1" # currently must be `v1`
) -> pl.DataFrame:
    "Read the chronicle `logs` parquet data into a polars dataframe."
    # Thin wrapper: delegate to `read_chronicle` with the type fixed to logs.
    return read_chronicle(path=path, type="logs", date=date, version=version)


In [None]:
# Smoke test: one day of metrics loads eagerly and has the expected schema.
expected_metric_columns = [
    'service', 'host', 'os', 'attributes',
    'name', 'description', 'unit', 'type', 'timestamp',
    'value_float', 'value_int', 'value_uint', 'value_column',
]
z = read_chronicle_metrics("./data", "2023/04/03")
assert type(z) == pl.DataFrame
assert z.columns == expected_metric_columns

In [None]:

# Smoke test: one day of logs loads eagerly and has the expected schema.
expected_log_columns = ['service', 'host', 'os', 'attributes', 'body', 'timestamp']
z = read_chronicle_logs("./data", "2023/04/03")
assert type(z) == pl.dataframe.frame.DataFrame
assert z.columns == expected_log_columns

## Using the scan interface

In [None]:
#| export
def scan_chronicle(
        path: str, # Path to dataset
        type: str = "", # must be `metrics` or `logs`
        date: str = None, # date in format `YYYY/MM/DD` or `YYYY-MM-DD`. If empty, all dates are scanned.
        filename: str = None, # name of parquet file. If empty, every `*.parquet` file is matched.
        version: str = "v1" # currently must be `v1`
    ) -> pl.LazyFrame:
    "Lazily scan chronicle parquet files into a polars LazyFrame."
    if date is None:
        # No date given: glob across every year/month/day folder.
        date = "*/*/*"
    else:
        # Folders are nested by date parts, so map `-` separators to `/`.
        date = date.replace("-", "/")
    if filename is None:
        filename = "*.parquet"
    return pl.scan_parquet(f"{path}/{version}/{type}/{date}/{filename}")



In [None]:

# Example: build a lazy scan over a single day of metrics and display it.
scan_chronicle("./data", "metrics", "2023/04/03")


In [None]:
#| export
def scan_chronicle_metrics(
        path: str, # Path to dataset
        date: str = None, # date in format `YYYY/MM/DD`. If empty, all dates are scanned.
        version: str = "v1" # currently must be `v1`
) -> pl.LazyFrame:
    "Lazily scan chronicle metrics parquet files into a polars LazyFrame."
    # Thin wrapper: delegate to `scan_chronicle` with the type fixed to metrics.
    return scan_chronicle(path, "metrics", date, version=version)

def scan_chronicle_logs(
        path: str, # Path to dataset
        date: str = None, # date in format `YYYY/MM/DD`. If empty, all dates are scanned.
        version: str = "v1" # currently must be `v1`
) -> pl.LazyFrame:
    "Lazily scan chronicle logs parquet files into a polars LazyFrame."
    # Thin wrapper: delegate to `scan_chronicle` with the type fixed to logs.
    return scan_chronicle(path, "logs", date, version=version)


In [None]:
# Smoke test: scanning metrics is lazy, and collecting yields the expected schema.
expected_metric_columns = [
    'service', 'host', 'os', 'attributes',
    'name', 'description', 'unit', 'type', 'timestamp',
    'value_float', 'value_int', 'value_uint', 'value_column',
]
z = scan_chronicle_metrics("./data", "2023/04/03")
assert type(z) == pl.LazyFrame
assert z.collect().columns == expected_metric_columns

In [None]:

# Smoke test: scanning logs is lazy, and collecting yields the expected schema.
expected_log_columns = ['service', 'host', 'os', 'attributes', 'body', 'timestamp']
z = scan_chronicle_logs("./data", "2023/04/03")
assert type(z) == pl.LazyFrame
assert z.collect().columns == expected_log_columns

## Analyse metrics

In [None]:
#| export
# @pl.api.register_dataframe_namespace("metrics")
# class ChronicleMetrics:
#     def __init__(self, 
#                  df: pl.DataFrame # A `polars` DataFrame
#                  ) -> pl.DataFrame:
#         "Initialise a chronicle metrics class"
#         self._df = df

@pl.api.register_lazyframe_namespace("metrics")
class ChronicleMetrics:
    "Chronicle metrics helpers, exposed on polars LazyFrames as the `.metrics` namespace."
    def __init__(self, 
                 ldf: pl.LazyFrame # A `polars` LazyFrame
                 ) -> None:
        "Initialise a chronicle metrics class"
        # Keep the wrapped LazyFrame; the patched methods build their queries from it.
        self._ldf = ldf



  class ChronicleMetrics:


Use `.metrics.describe()` to get a pandas DataFrame of the unique metrics in the metrics data, containing the `service`, `name`, `description` and `value_column` of each metric.

In [None]:
#| export
@patch
def describe(self: ChronicleMetrics) -> pd.DataFrame:
    "Summarise the metrics data as a pandas dataframe with one row per `service` and `name`, plus the `description` and `value_column` of each metric"
    # NOTE(review): `groupby` and the `.arr` list namespace were renamed in
    # later polars releases (`group_by`, `.list`) - confirm the pinned version.
    return (
        self._ldf
        .groupby("service", "name")
        .agg(
            # Collect the distinct values seen for each metric...
            pl.col("description").unique(),
            pl.col("value_column").unique(),
        )
        .with_columns(
            # ...and flatten each list of distinct values into a single string.
            pl.col("description").arr.join(", "),
            pl.col("value_column").arr.join("")
        )
        .sort("service", "name")
        .collect()
        .to_pandas()
    )



In [None]:

# Example: summarise the metrics available for a single day.
scan_chronicle_metrics("../temp/", "2023/04/03").metrics.describe()

Unnamed: 0,service,name,description,value_column
0,,system.cpu.time,Total CPU seconds broken down by different sta...,value_float
1,,system.memory.usage,Bytes of memory in use.,value_int
2,connect-metrics,go_goroutines,Number of goroutines that currently exist.,value_float
3,connect-metrics,go_info,Information about the Go environment.,value_float
4,connect-metrics,go_memstats_alloc_bytes,Number of bytes allocated and still in use.,value_float
...,...,...,...,...
176,workbench-metrics,scrape_series_added,The approximate number of new series in this s...,value_float
177,workbench-metrics,statsd_metric_mapper_cache_gets_total,The count of total metric cache gets.,value_float
178,workbench-metrics,statsd_metric_mapper_cache_hits_total,The count of total metric cache hits.,value_float
179,workbench-metrics,statsd_metric_mapper_cache_length,The count of unique metrics currently cached.,value_float


In [None]:

# describe() should return exactly these summary columns, in this order.
expected_describe_columns = ['service', 'name', 'description', 'value_column']
m = scan_chronicle_metrics("./data", "2023/04/03").metrics.describe()
assert list(m) == expected_describe_columns
m

Unnamed: 0,service,name,description,value_column
0,,system.cpu.time,Total CPU seconds broken down by different sta...,value_float
1,,system.memory.usage,Bytes of memory in use.,value_int
2,connect-metrics,go_goroutines,Number of goroutines that currently exist.,value_float
3,connect-metrics,go_info,Information about the Go environment.,value_float
4,connect-metrics,go_memstats_alloc_bytes,Number of bytes allocated and still in use.,value_float
...,...,...,...,...
176,workbench-metrics,scrape_series_added,The approximate number of new series in this s...,value_float
177,workbench-metrics,statsd_metric_mapper_cache_gets_total,The count of total metric cache gets.,value_float
178,workbench-metrics,statsd_metric_mapper_cache_hits_total,The count of total metric cache hits.,value_float
179,workbench-metrics,statsd_metric_mapper_cache_length,The count of unique metrics currently cached.,value_float


Use `.metrics.filter()` to filter the DataFrame on the `name` column.

In [None]:
#| export
@patch
def filter(self: ChronicleMetrics, 
        name:str, # name of metric to extract
        service:str = None, # service to extract metric from; if empty, all services are kept
        alias:str = None # alias to use for new column; defaults to `name`
    ) -> pd.DataFrame:
    "Extract a single metric from the metrics data as a pandas dataframe of `host`, `timestamp` and the metric value"
    if alias is None:
        alias = name

    # Still a lazy query here; nothing is read until `.collect()` below.
    query = self._ldf.filter(pl.col("name") == name)
    if service is not None:
        query = query.filter(pl.col("service") == service)

    # NOTE(review): only `value_float` is read; metrics stored in `value_int`
    # or `value_uint` presumably come back empty - confirm.
    return (
        query
        .sort(pl.col("host"), pl.col("timestamp"))
        .select([
            "host",
            pl.col("timestamp"),
            pl.col("value_float").alias(alias)
        ])
        .collect()
        .to_pandas()
    )


In [None]:
# Each case pairs the optional keyword arguments for `.metrics.filter()` with
# the column names the resulting pandas dataframe should have.
filter_cases = [
    ({}, ['host', 'timestamp', 'rsconnect_system_memory_used']),
    ({"alias": "memory"}, ['host', 'timestamp', 'memory']),
    ({"service": "connect-metrics", "alias": "memory"}, ['host', 'timestamp', 'memory']),
]
for filter_kwargs, expected_columns in filter_cases:
    m = scan_chronicle_metrics("./data", "2023/04/03").metrics.filter("rsconnect_system_memory_used", **filter_kwargs)
    assert type(m) == pd.DataFrame
    assert list(m) == expected_columns


You can plot a single metric using `.metrics.plot()`.

In [None]:
#| export
import altair as alt
@patch
def plot(
        self:ChronicleMetrics, # metrics namespace of a LazyFrame
        name:str, # name of metric to extract
        service:str = None, # service to extract metric from
        title:str = None, # title of plot; defaults to `alias`
        alias:str = None, # alias to use for new column; defaults to `name`
        engine: str = "altair" # plotting engine to use - `altair`; any other value uses plotly
    ):
    "Plot a selected metric as a line chart with one line per host, returning an altair chart or a plotly figure depending on `engine`"

    if alias is None:
        alias = name

    if title is None:
        title = alias

    # Pandas dataframe with `host`, `timestamp` and the metric under `alias`.
    dat = self._ldf.metrics.filter(name, service=service, alias=alias)

    if engine == 'altair':
        return (
            alt.Chart(dat, title = title)
            .mark_line()
            .encode(
                x = 'timestamp',
                y = alias,
                color = "host"
            )
        )

    # Every other engine value falls through to plotly.
    return px.line(dat, x='timestamp', y=alias, line_group="host", color="host", title=title)



In [None]:
# Example: plot one metric for one service using the default engine.
m = scan_chronicle_metrics("./data", "2023/04/03")
p = m.metrics.plot("rsconnect_system_memory_used", service = "connect-metrics", alias = "memory")
# Disabled check - presumably only valid when the plotly engine is used:
# assert str(type(p)) == "<class 'plotly.graph_objs._figure.Figure'>"

p

Plot using `altair`

## Analyse logs

In [None]:

#| export
@pl.api.register_lazyframe_namespace("logs")
class ChronicleLogs:
    "Chronicle logs helpers, exposed on polars LazyFrames as the `.logs` namespace."
    def __init__(self, 
                 df: pl.LazyFrame # A polars LazyFrame
                 ) -> None:
        "Initialise a chronicle logs class"
        # Keep the wrapped LazyFrame; the patched methods build their queries from it.
        self._ldf = df

  class ChronicleLogs:


#### Filter logs on type

You can use `.logs.filter_type()` to keep only the log entries whose JSON `body` contains a given field, for example `username`.

In [None]:
#| export
@patch
def filter_type(self: ChronicleLogs,
                value: str # Name of the JSON field in `body` to extract
    ) -> pl.DataFrame:
    "Extract all logs whose JSON `body` has a non-null `value` field, as a polars dataframe"
    # Extracted columns are prefixed with `.` (e.g. `.username`, `.type`).
    field = f".{value}"
    return (
        self._ldf
        .with_columns([
            (pl.col("body").str.json_path_match(f"$.{value}").alias(field)),
            (pl.col("body").str.json_path_match("$.type").alias(".type"))
        ])
        # Drop rows where the requested field could not be extracted.
        .filter(pl.col(field).is_not_null())
        .select(["service", "host", "timestamp", field, ".type", "body"])
        .sort("service", "host", field, "timestamp")
        .collect()
    )

In [None]:
# Smoke test: filter_type() collects the lazy query into a polars DataFrame.
logs = scan_chronicle_logs("./data", "2023/04/03").logs.filter_type("username")
assert type(logs) == pl.DataFrame

## Connect logins

In [None]:

#| export
@patch
def connect_logins(
    self: ChronicleLogs,
    ) -> pl.DataFrame:
    "Return the Connect user-login audit entries found in the logs"
    body = pl.col("body")
    # Fields pulled out of the JSON log body.
    extracted = [
        body.str.json_path_match("$.type").alias("type"),
        body.str.json_path_match("$.action").alias("action"),
        body.str.json_path_match("$.actor_description").alias("username"),
    ]
    # A Connect login is an audit entry with the `user_login` action.
    is_connect_login = (
        (pl.col("service") == "connect") &
        (pl.col("type") == "audit") &
        (pl.col("action") == "user_login")
    )
    logins = self._ldf.with_columns(extracted).filter(is_connect_login)
    return logins.select("host", "timestamp", "username", "action", "type").collect()


In [None]:

# Example: list Connect logins across every date in the dataset (no date given).
path = "./data"
scan_chronicle_logs(path).logs.connect_logins()

host,timestamp,username,action,type
str,datetime[ms],str,str,str
"""rstudio-connec…",2023-04-03 19:30:36.098,"""Ming Beckwith …","""user_login""","""audit"""
"""rstudio-connec…",2023-04-04 14:29:24.298,"""Tim Margheim (…","""user_login""","""audit"""


## Workbench logins

In [None]:
#| export
@patch
def workbench_logins(
    self: ChronicleLogs,
    ) -> pl.DataFrame:
    "Return the Workbench login entries found in the logs"
    body = pl.col("body")
    # Fields pulled out of the JSON log body.
    extracted = [
        body.str.json_path_match("$.type").alias("type"),
        body.str.json_path_match("$.action").alias("action"),
        body.str.json_path_match("$.username").alias("username"),
    ]
    # A Workbench login is an entry whose type is `auth_login`.
    is_workbench_login = (
        (pl.col("service") == "workbench") &
        (pl.col("type") == "auth_login")
    )
    logins = self._ldf.with_columns(extracted).filter(is_workbench_login)
    return logins.select("host", "timestamp", "username", "action", "type").collect()


In [None]:

# Example: list Workbench logins across every date in the dataset (no date given).
path = "./data"
scan_chronicle_logs(path).logs.workbench_logins()

host,timestamp,username,action,type
str,datetime[ms],str,str,str
"""rstudio-workbe…",2023-04-04 16:21:48.361,"""monanshi.shah""",,"""auth_login"""
"""rstudio-workbe…",2023-04-06 21:44:28.264,"""lisa.anders""",,"""auth_login"""
"""rstudio-workbe…",2023-04-07 03:46:57.960,"""james""",,"""auth_login"""
"""rstudio-workbe…",2023-04-11 17:14:54.265,"""lisa.anders""",,"""auth_login"""
"""rstudio-workbe…",2023-05-05 13:47:58.865,"""cole""",,"""auth_login"""
"""rstudio-workbe…",2023-05-08 15:19:22.190,"""andrie""",,"""auth_login"""
"""rstudio-workbe…",2023-05-08 16:58:39.698,"""monanshi.shah""",,"""auth_login"""
"""rstudio-workbe…",2023-05-09 14:30:10.252,"""monanshi.shah""",,"""auth_login"""


In [None]:
#| hide
import nbdev; nbdev.nbdev_export()