# core

> Read and query chronicle parquet files.

In [None]:
#| default_exp core

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import polars as pl
import pyarrow.dataset as ds
import pyarrow.parquet as pq
from fastcore.basics import patch
from s3fs import S3FileSystem

## Read chronicle parquet files (logs and metrics)

In [None]:
#| export
def read_chronicle(
        dataset: str # Path to dataset
    ) -> pl.DataFrame:
    "Load the chronicle parquet data found at `dataset` and return it as a polars dataframe."
    # pyarrow does the reading; polars then wraps the resulting Arrow table
    arrow_table = pq.read_table(dataset)
    return pl.from_arrow(arrow_table)


In [None]:
# Both sample datasets (metrics and logs) should load as polars dataframes
z = read_chronicle("./data/v1/metrics")
assert isinstance(z, pl.DataFrame)

z = read_chronicle("./data/v1/logs")
assert isinstance(z, pl.DataFrame)

## Analyse metrics

#### describe()

The metrics data has a single row for each collected metric.

Use `.metrics.describe()` to get a DataFrame of the unique metrics in the metrics data, containing the service, name, description and value column of each metric.

In [None]:
#| export
@pl.api.register_dataframe_namespace("metrics")
class ChronicleMetrics:
    "Polars dataframe namespace, reachable as `df.metrics`, for querying chronicle metrics data."
    def __init__(self, df: pl.DataFrame):
        # Keep the dataframe the namespace was created for so the methods can query it
        self._df = df

    def describe(self) -> pd.DataFrame:
        "Return a pandas dataframe with one row per service/name pair, listing the unique descriptions and value columns of that metric"
        # NOTE(review): newer polars releases rename `groupby` to `group_by` —
        # confirm against the polars version this package pins before changing it.
        return (
            self._df
            .groupby("service", "name")
            .agg(
                pl.col("description").unique(),
                pl.col("value_column").unique(),
            )
            .sort("service", "name")
            # The result is handed back as pandas, hence the `pd.DataFrame` return annotation
            .to_pandas()
        )


In [None]:

# Display the metric summary for the sample metrics data
read_chronicle("./data/v1/metrics/").metrics.describe()

#### filter()

In [None]:
#| export
@patch
def filter(self: ChronicleMetrics, 
        name:str, # name of metric to extract
        alias:str # alias to use for new column
    ) -> pd.DataFrame:
    "Return the rows of metric `name` as a pandas dataframe of host, timestamp and the float value renamed to `alias`"
    # Build the query lazily: keep only the requested metric, then order it
    matching_rows = self._df.lazy().filter(pl.col("name") == name)
    ordered_rows = matching_rows.sort(pl.col("host"), pl.col("timestamp"))
    output_columns = [
        "host",
        pl.col("timestamp"),
        pl.col("value_float").alias(alias),
    ]
    # Execute the lazy query and convert the polars result to pandas
    return ordered_rows.select(output_columns).collect().to_pandas()


In [None]:
# Display the extracted memory metric, with its value column named "memory"
read_chronicle("./data/v1/metrics/").metrics.filter("rsconnect_system_memory_used", "memory")

#### Plot metric

In [None]:
#| export
@patch
def plot(
        self:ChronicleMetrics, # metrics namespace of the dataframe to plot
        name:str, # name of metric to extract
        alias:str # alias to use for new column
    ) -> go.Figure:
    "Plot a selected metric against its timestamp using a Plotly line plot, with one coloured line per host"
    # Call the `filter` method directly rather than re-entering through `self._df.metrics`
    dat = self.filter(name, alias)
    return px.line(dat, x='timestamp', y=alias, line_group="host", color="host")



In [None]:
m = read_chronicle("./data/v1/metrics/")
p = m.metrics.plot("rsconnect_system_memory_used", "memory")
# Check the figure type itself instead of comparing against its string representation
assert isinstance(p, go.Figure)

# Last expression in the cell, so the notebook renders the figure
read_chronicle("./data/v1/metrics/").metrics.plot("rsconnect_system_memory_used", "memory")

## Analyse logs

In [None]:

#| export
@pl.api.register_dataframe_namespace("logs")
class ChronicleLogs:
    "Polars dataframe namespace registered under the name `logs`, for querying chronicle logs data."
    def __init__(self, df: pl.DataFrame):
        # Keep the dataframe handed to the constructor so methods can query it
        self._df = df


Overriding existing custom namespace 'logs' (on DataFrame)



In [None]:
#| export
@patch
def filter_type(self: ChronicleLogs,
                value: str # Name of the JSON field in `body` to extract
    ) -> pl.DataFrame:
    "Extract all logs whose JSON `body` has a non-null `value` field, returned as a polars dataframe with that field in column `.{value}` and the body's `type` field in column `.type`"
    field_column = f".{value}"
    return (
        self._df
        .lazy()
        .with_columns([
            pl.col("body").str.json_path_match(f"$.{value}").alias(field_column),
            pl.col("body").str.json_path_match("$.type").alias(".type"),
        ])
        # Keep only the rows where the requested field was found in the body
        .filter(pl.col(field_column).is_not_null())
        .select(["service", "host", "timestamp", field_column, ".type", "body"])
        .sort("service", "host", field_column, "timestamp")
        # No pandas conversion here: the collected polars dataframe is returned as-is
        .collect()
    )

#### Filter logs on type

In [None]:
logs = read_chronicle("./data/v1/logs").logs.filter_type("username")
assert isinstance(logs, pl.DataFrame)
# The extracted field is exposed as a dot-prefixed column
assert ".username" in logs.columns


polars.dataframe.frame.DataFrame

In [None]:

# Display the log rows whose body contains a "username" field
read_chronicle("./data/v1/logs").logs.filter_type("username")

service,host,timestamp,.username,.type,body
str,str,datetime[ms],str,str,str
"""workbench""","""rstudio-workbe…",2023-04-03 18:01:26.761,"""james""","""session_exit""","""{""pid"":236,""us…"
"""workbench""","""rstudio-workbe…",2023-04-07 03:46:57.960,"""james""","""auth_login""","""{""pid"":1059,""u…"
"""workbench""","""rstudio-workbe…",2023-04-07 03:51:15.161,"""james""","""session_start""","""{""pid"":236,""us…"
"""workbench""","""rstudio-workbe…",2023-04-07 04:19:18.561,"""james""","""session_exit""","""{""pid"":236,""us…"
"""workbench""","""rstudio-workbe…",2023-04-07 04:19:19.764,"""james""","""session_start""","""{""pid"":883,""us…"
"""workbench""","""rstudio-workbe…",2023-04-07 05:43:02.161,"""james""","""session_exit""","""{""pid"":883,""us…"
"""workbench""","""rstudio-workbe…",2023-04-10 13:14:39.961,"""james""","""session_quit""","""{""pid"":1781,""u…"
"""workbench""","""rstudio-workbe…",2023-04-10 13:15:13.561,"""james""","""session_start""","""{""pid"":236,""us…"
"""workbench""","""rstudio-workbe…",2023-04-10 13:21:53.361,"""james""","""session_start""","""{""pid"":237,""us…"
"""workbench""","""rstudio-workbe…",2023-04-10 13:41:46.960,"""james""","""session_exit""","""{""pid"":237,""us…"


In [None]:
#| hide
import nbdev

# Run nbdev's notebook export
nbdev.nbdev_export()