In [1]:
import os
from glob import glob
import duckdb
import polars as pl

In [2]:
from datetime import datetime, timedelta


def create_date_directories(base_path, start_date, end_date, format="%Y%m%d"):
    """
    Creates a list of directories for each date between start_date and end_date, inclusive.

    Args:
        base_path: The base path where directories will be created.
        start_date: The start date as a string in the specified format.
        end_date: The end date as a string in the specified format.
        format: The date format string to use (default is YYYY-MM-DD).

    Returns:
        A list of directory paths created.
    """

    # Convert dates to datetime objects
    try:
        start_date = datetime.strptime(start_date, format).date()
        end_date = datetime.strptime(end_date, format).date()
    except ValueError:
        raise ValueError(f"Invalid date format. Please use '{format}'")

    # Check if start date is before end date
    if start_date > end_date:
        raise ValueError("Start date must be before end date.")

    # Create list to store directory paths
    directory_paths = []

    # Iterate through dates and create directories
    current_date = start_date
    while current_date <= end_date:
        # Format date string
        date_str = current_date.strftime(format)
        # Create directory path
        directory_path = os.path.join(base_path, f"{date_str}.json")
        # Add path to list
        directory_paths.append(directory_path)
        # Increment date
        current_date += timedelta(days=1)

    return directory_paths


# Example usage
base_path = "../data/log_content/"
start_date = "20220401"
end_date = "20220430"
directory_paths = create_date_directories(base_path, start_date, end_date)

print("Created directories:")
for path in directory_paths:
    print(path)


Created directories:
../data/log_content/20220401.json
../data/log_content/20220402.json
../data/log_content/20220403.json
../data/log_content/20220404.json
../data/log_content/20220405.json
../data/log_content/20220406.json
../data/log_content/20220407.json
../data/log_content/20220408.json
../data/log_content/20220409.json
../data/log_content/20220410.json
../data/log_content/20220411.json
../data/log_content/20220412.json
../data/log_content/20220413.json
../data/log_content/20220414.json
../data/log_content/20220415.json
../data/log_content/20220416.json
../data/log_content/20220417.json
../data/log_content/20220418.json
../data/log_content/20220419.json
../data/log_content/20220420.json
../data/log_content/20220421.json
../data/log_content/20220422.json
../data/log_content/20220423.json
../data/log_content/20220424.json
../data/log_content/20220425.json
../data/log_content/20220426.json
../data/log_content/20220427.json
../data/log_content/20220428.json
../data/log_content/2022042

In [3]:
csvFile = "../data/log_content/*.json"
glob(csvFile)

['../data/log_content/20220415.json',
 '../data/log_content/20220403.json',
 '../data/log_content/20220423.json',
 '../data/log_content/20220419.json',
 '../data/log_content/20220418.json',
 '../data/log_content/20220422.json',
 '../data/log_content/20220402.json',
 '../data/log_content/20220414.json',
 '../data/log_content/20220425.json',
 '../data/log_content/20220409.json',
 '../data/log_content/20220429.json',
 '../data/log_content/20220413.json',
 '../data/log_content/20220405.json',
 '../data/log_content/20220404.json',
 '../data/log_content/20220412.json',
 '../data/log_content/20220428.json',
 '../data/log_content/20220408.json',
 '../data/log_content/20220424.json',
 '../data/log_content/20220427.json',
 '../data/log_content/20220407.json',
 '../data/log_content/20220411.json',
 '../data/log_content/20220410.json',
 '../data/log_content/20220406.json',
 '../data/log_content/20220426.json',
 '../data/log_content/20220430.json',
 '../data/log_content/20220401.json',
 '../data/lo

# Ingest the logging data

In [5]:
def get_log_json(paths: str | list[str]) -> pl.LazyFrame:
    """
    Function to ingest the logging json data, get the filename and add a "Date" column
    Args:
        paths (str | list[str]): a list of path to data, this path must be in glob pattern

    Returns:
        pl.LazyFame
    """
    if isinstance(paths, str):
        paths = [paths]
    # Define schema of logging data
    schema = {
        "_index" : pl.String,
        "_type"  : pl.String,
        "_id"    : pl.String,
        "_score" : pl.Int64,
        "_source": pl.Struct(
            [
                pl.Field("Contract", pl.String),
                pl.Field("Mac", pl.String),
                pl.Field("TotalDuration", pl.Int64),
                pl.Field("AppName", pl.String),
            ]
        ),
    }

    def _scan_log(path, schema):
        return (
            pl.scan_ndjson(path, schema=schema, low_memory=True)
            .with_columns(pl.Series("Date", [path]).str.extract(r"\d{8}", 0))
            .select(
                pl.col("Date").str.to_date("%Y %m %d"),
                pl.col("_index").alias("Index"),
                pl.col("_type").alias("Type"),
                pl.col("_id").alias("Id"),
                pl.col("_score").alias("Score"),
                pl.col("_source"),
            )
            .unnest("_source")
        )

    dfs = []
    # Function to scan and preprocess a single log data
    try:
        dfs = [_scan_log(path, schema) for path in paths]
    except Exception as e:
        print(f"Failed to read data from {paths}", {e})

    # Concatenate the processed lazyframes
    return pl.concat(dfs, how="vertical").lazy()

In [6]:
sources = get_log_json(directory_paths)

# Get the RFM table

In [12]:
sources.schema


OrderedDict([('Date', Date),
             ('Index', String),
             ('Type', String),
             ('Id', String),
             ('Score', Int64),
             ('Contract', String),
             ('Mac', String),
             ('TotalDuration', Int64),
             ('AppName', String)])

In [39]:
reported_date = "20220501"


def get_rfm_table(sources: pl.LazyFrame, reported_date: str = "20220501", total_date: int = 30) -> pl.LazyFrame:
    b: pl.LazyFrame = (
        sources.group_by("Contract")
        .agg(
            pl.col("Date").max().alias("LatestDate")
        )
    )

    temp = (
        sources.with_columns(
            pl.lit(reported_date).str.to_date("%Y %m %d").alias("ReportedDate")
        )
    )
    # print("Shape of temp and b")
    # print(f" shape of b {b.collect().shape}, shape of temp {temp.collect().shape}")
    rfm = (
        temp
        .filter(pl.col("Contract").str.len_chars() > 1)
        .join(b, on="Contract", how='cross')
        .group_by("Contract")
        .agg(
            (pl.col("ReportedDate") - pl.col("LatestDate")).min().alias("RecencyScore"),
            (pl.col("Date").n_unique() / pl.lit(total_date) * 100.0).round(2).alias("FrequencyScore"),
            pl.sum("TotalDuration").alias("MonetaryScore"),
        )
        .sort("Contract")

        # .with_columns(
        #     pl.col("RecencyScore").qcut(10)
        #     .over(pl.col("Contract")).alias("R"),
        #     # pl.col("FrequencyScore").qcut(3, labels=["1", "2", "3"]).alias("F"),
        #     # pl.col("MonetaryScore").qcut(3, labels=["1", "2", "3"]).alias("M")
        # )
    )

    return rfm


# pl.lit(total_date).cast(pl.Date)
# new_sources = get_log_json("../data/log_content/20220404.json")
get_rfm_table(sources).fetch(100)




Contract,RecencyScore,FrequencyScore,MonetaryScore
str,duration[ms],f64,i64
"""AGAAA0895""",1d,3.33,133645548
"""AGAAA2984""",1d,3.33,2275689
"""AGD018502""",1d,3.33,2412171
"""AGD019199""",1d,3.33,28233972
"""AGD023638""",1d,3.33,31865580
"""AGD029777""",1d,3.33,145029927
"""AGD030574""",1d,3.33,230200629
"""AGFD02286""",1d,3.33,133111488
"""AGFD03433""",1d,3.33,22077447
"""AGFD03608""",1d,3.33,59918565


In [None]:
pl.col()

In [None]:
# heap sort