# core

> Read and query chronicle parquet files.

In [None]:
#| default_exp core

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
import re

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import polars as pl
import pyarrow.dataset as ds
import pyarrow.parquet as pq
from fastcore.basics import patch
from s3fs import S3FileSystem

## Read chronicle parquet files

Chronicle collects and stores logs and metrics in a series of parquet files.

Use `read_chronicle()` to read either logs or metrics, by specifying the path to the parquet set you need.

The file tree looks like this, with `logs` and `metrics` in separate folders inside `v1`.

``` bash
.
└── v1/
    ├── logs/
    └── metrics/
```

Inside both `logs` and `metrics` the data is stored by date, separated by year, month and day.

``` bash
.
└── v1/
    ├── logs/
    │   └── 2023/
    │       ├── 02/
    │       │   ├── 01
    │       │   ├── 02
    │       │   ├── 03
    │       │   ├── 04
    │       │   ├── 05
    │       │   └── ...
    │       ├── 03
    │       ├── 04
    │       └── ...
    └── metrics/
        └── 2023/
            ├── 02/
            │   ├── 01
            │   ├── 02
            │   ├── 03
            │   ├── 04
            │   ├── 05
            │   └── ...
            ├── 03
            ├── 04
            └── ...
```

## Using the read interface

In [None]:
#| hide
#| export
def read_chronicle(
        path: str, # Path to dataset,
        type: str = "", # must be `metrics` or `logs`
        date:str = "", # date in format `YYYY/MM/DD` (`YYYY-MM-DD` is also accepted)
        version: str = "v1" # currently must be `v1`
    ) -> pl.DataFrame:
    "Read a chronicle parquet file into a polars dataframe."
    # The on-disk tree is split as `<version>/<type>/YYYY/MM/DD`, so a dashed
    # date is converted to the slash form before the path is assembled
    # (the same normalisation `scan_chronicle` applies).
    date = date.replace("-", "/")
    path = f"{path}/{version}/{type}/{date}"
    # pyarrow loads the table; polars wraps the resulting Arrow data.
    return pl.from_arrow(pq.read_table(path))



In [None]:
#| export
def read_chronicle_metrics(
        path: str, # Path to dataset,
        date:str = "", # date in format `YYYY/MM/DD` 
        version: str = "v1" # currently must be `v1`
) -> pl.DataFrame:
    "Read the chronicle `metrics` parquet data for `date` into a polars dataframe."
    # Delegate to the generic reader with the dataset type pinned to `metrics`.
    return read_chronicle(path=path, type="metrics", date=date, version=version)

def read_chronicle_logs(
        path: str, # Path to dataset,
        date:str = "", # date in format `YYYY/MM/DD` 
        version: str = "v1" # currently must be `v1`
) -> pl.DataFrame:
    "Read the chronicle `logs` parquet data for `date` into a polars dataframe."
    # Delegate to the generic reader with the dataset type pinned to `logs`.
    return read_chronicle(path=path, type="logs", date=date, version=version)


In [None]:
# Smoke test: the eager metrics reader returns a polars DataFrame with the expected columns.
z = read_chronicle_metrics("./data", "2023/04/03")
assert isinstance(z, pl.DataFrame)
assert z.columns == [
    'service',
    'host',
    'os',
    'attributes',
    'name',
    'description',
    'unit',
    'type',
    'timestamp',
    'value_float',
    'value_int',
    'value_uint',
    'value_column'
]

In [None]:

# Smoke test: the eager logs reader returns a polars DataFrame with the expected columns.
z = read_chronicle_logs("./data", "2023/04/03")
# Check against the public `pl.DataFrame` name rather than its internal module path.
assert isinstance(z, pl.DataFrame)
assert z.columns == [
    'service', 
    'host', 
    'os', 
    'attributes', 
    'body', 
    'timestamp'
]

## Using the scan interface

In [None]:
#| export
def scan_chronicle(
        path: str, # Path to dataset,
        type: str = "", # must be `metrics` or `logs`
        date:str = "", # date in format `YYYY/MM/DD` 
        filename: str = None, # name of parquet file. If `None`, will be inferred.
        version: str = "v1" # currently must be `v1`
    ) -> pl.LazyFrame:
    "Lazily scan a chronicle parquet file into a polars LazyFrame."
    # Directories use the slash form of the date, while the inferred file name
    # uses the dashed form, so both spellings are derived from the same input.
    date = date.replace("-", "/")
    dashed_date = date.replace("/", "-")
    if filename is None:
        filename = f"{type}-{dashed_date}.parquet"
    path = f"{path}/{version}/{type}/{date}/{filename}"
    return pl.scan_parquet(path)


In [None]:

# Example: lazily scan the metrics for a single day.
scan_chronicle("./data", type="metrics", date="2023/04/03")


In [None]:
#| export
def scan_chronicle_metrics(
        path: str, # Path to dataset,
        date:str = "", # date in format `YYYY/MM/DD` 
        version: str = "v1" # currently must be `v1`
) -> pl.LazyFrame:
    "Lazily scan a chronicle metrics parquet file into a polars LazyFrame."
    # Delegates to `scan_chronicle` with the dataset type pinned to `metrics`;
    # the file name is left to be inferred from the type and date.
    return scan_chronicle(path, "metrics", date, version = version) 

def scan_chronicle_logs(
        path: str, # Path to dataset,
        date:str = "", # date in format `YYYY/MM/DD` 
        version: str = "v1" # currently must be `v1`
) -> pl.LazyFrame:
    "Lazily scan a chronicle logs parquet file into a polars LazyFrame."
    # Delegates to `scan_chronicle` with the dataset type pinned to `logs`;
    # the file name is left to be inferred from the type and date.
    return scan_chronicle(path, "logs", date, version = version) 


In [None]:
# Smoke test: the lazy metrics scanner returns a LazyFrame that collects to the expected columns.
z = scan_chronicle_metrics("./data", "2023/04/03")
assert isinstance(z, pl.LazyFrame)
assert z.collect().columns == [
    'service',
    'host',
    'os',
    'attributes',
    'name',
    'description',
    'unit',
    'type',
    'timestamp',
    'value_float',
    'value_int',
    'value_uint',
    'value_column'
]

In [None]:

# Smoke test: the lazy logs scanner returns a LazyFrame that collects to the expected columns.
z = scan_chronicle_logs("./data", "2023/04/03")
assert isinstance(z, pl.LazyFrame)
assert z.collect().columns == [
    'service', 
    'host', 
    'os', 
    'attributes', 
    'body', 
    'timestamp'
]

## Analyse metrics

In [None]:
#| export
# @pl.api.register_dataframe_namespace("metrics")
# class ChronicleMetrics:
#     def __init__(self, 
#                  df: pl.DataFrame # A `polars` DataFrame
#                  ) -> pl.DataFrame:
#         "Initialise a chronicle metrics class"
#         self._df = df

@pl.api.register_lazyframe_namespace("metrics")
class ChronicleMetrics:
    "Namespace registered on polars LazyFrames under the name `metrics`."
    def __init__(self, 
                 ldf: pl.LazyFrame # A `polars` LazyFrame
                 ) -> None:
        "Initialise a chronicle metrics class"
        # Keep the wrapped LazyFrame; methods patched onto this class read it.
        self._ldf = ldf



In [None]:
#| export
@patch
def describe(self: ChronicleMetrics) -> pd.DataFrame:
    "Reads metrics dataframe and returns a pandas dataframe with summary of service, name and description of all metrics"
    # NOTE(review): `groupby` and the `.arr` list namespace appear to be renamed
    # (`group_by`, `.list`) in newer polars releases - confirm against the pinned version.
    return (
        self._ldf
        # One output row per (service, name) pair.
        .groupby("service", "name")
        .agg(
            pl.col("description").unique(),
            pl.col("value_column").unique(),
        )
        # Collapse the per-group lists of unique values into single strings.
        .with_columns(
            pl.col("description").arr.join(", "),
            pl.col("value_column").arr.join("")
        )
        .sort("service", "name")
        .collect()
        .to_pandas()
    )



In [None]:

# NOTE(review): this example reads from `../temp/` while the other cells use `./data` - confirm the intended path.
scan_chronicle_metrics(path="../temp/", date="2023/04/03").metrics.describe()

The metrics data has a single row for each collected metric.

Use `describe()` to get a pandas DataFrame of the unique metrics in the metrics data, containing the `service`, `name`, `description` and `value_column` of each metric.

In [None]:

# Smoke test: the summary exposes exactly these columns, in this order.
m = scan_chronicle_metrics(path="./data", date="2023/04/03").metrics.describe()
assert list(m.columns) == ['service', 'name', 'description', 'value_column']
m

In [None]:
#| export
@patch
def filter(self: ChronicleMetrics, 
        name:str, # name of metric to extract
        alias:str = None # alias to use for new column
    ) -> pd.DataFrame:
    "Extract a single metric from a metrics dataframe"
    if alias == None:
        alias = name
    return (
        self._ldf
        .filter(pl.col("name") == name)
        .sort(pl.col("host"), pl.col("timestamp"))
        .select([
            "host",
            pl.col("timestamp"),
            pl.col("value_float").alias(alias)
        ])
        .collect()
        .to_pandas()
    )


You can filter the DataFrame on the `name` column.

In [None]:
# Without an alias the value column carries the metric name.
m = scan_chronicle_metrics("./data", "2023/04/03").metrics.filter("rsconnect_system_memory_used")
assert isinstance(m, pd.DataFrame)
assert list(m) == ['host', 'timestamp', 'rsconnect_system_memory_used']

# With an alias the value column is renamed.
m = scan_chronicle_metrics("./data", "2023/04/03").metrics.filter("rsconnect_system_memory_used", "memory")
assert isinstance(m, pd.DataFrame)
assert list(m) == ['host', 'timestamp', 'memory']

m


In [None]:
#| export
@patch
def plot(
        self:ChronicleMetrics, # metrics dataframe
        name:str, # name of metric to extract
        alias:str = None # alias to use for new column; defaults to `name`
    ) -> go.Figure: 
    "Plot a selected metric using a Plotly line plot"
    # `filter()` names the value column after `name` when no alias is given, so
    # resolve the same default here; otherwise `y` would be `None` instead of
    # the extracted metric column.
    if alias is None:
        alias = name

    dat = self.filter(name, alias)
    # One line per host, coloured by host.
    fig = px.line(dat, x='timestamp', y=alias, line_group="host", color="host")
    return fig

In [None]:
# Smoke test: plotting a metric yields a Plotly figure.
m = scan_chronicle_metrics("./data", "2023/04/03")
p = m.metrics.plot("rsconnect_system_memory_used", "memory")
# Check the public Figure class rather than the string form of a private module path.
assert isinstance(p, go.Figure)

p

## Analyse logs

In [None]:

#| export
@pl.api.register_lazyframe_namespace("logs")
class ChronicleLogs:
    "Namespace registered on polars LazyFrames under the name `logs`."
    def __init__(self, 
                 df: pl.LazyFrame # A polars LazyFrame
                 ) -> None:
        "Initialise a chronicle logs LazyFrame namespace"
        # Keep the wrapped LazyFrame; methods patched onto this class read it.
        self._ldf = df

#### Filter logs on type

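You can use `filter_type()` to keep only the log records whose JSON `body` contains a given field, for example `username`.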

In [None]:
#| export
@patch
def filter_type(self: ChronicleLogs,
                value: str # Name of the JSON field in `body` to extract
    ) -> pl.DataFrame:
    "Extract all logs whose JSON `body` has a non-null `value` field"
    # The extracted field is exposed as a column named `.<value>`.
    field = f".{value}"
    return (
        self._ldf
        # Pull the requested field and the `type` field out of the JSON body.
        .with_columns([
            (pl.col("body").str.json_path_match(f"$.{value}").alias(field)),
            (pl.col("body").str.json_path_match("$.type").alias(".type"))
        ])
        # Drop records where the requested field was not extracted.
        .filter(pl.col(field).is_not_null())
        .select(["service", "host", "timestamp", field, ".type", "body"])
        .sort("service", "host", field, "timestamp")
        .collect()
    )

In [None]:
# Smoke test: `filter_type` collects into an eager polars DataFrame.
logs = scan_chronicle_logs("./data", "2023/04/03").logs.filter_type("username")
assert isinstance(logs, pl.DataFrame)

In [None]:
# Example: show the log records that carry a `username` field.
scan_chronicle_logs(path="./data", date="2023/04/03").logs.filter_type(value="username")

In [None]:
#| hide
# Run the nbdev export step for this notebook's `#| export` cells.
import nbdev
nbdev.nbdev_export()