In [1]:
import os
from glob import glob
from typing import List

import dotenv
import opendal
import polars as pl
import pyarrow as pa
import polars.selectors as s
from pyarrow import fs
from pyarrow.dataset import dataset

from src.helpers.utils import create_date_directories



In [2]:
# Example usage
base_path = "../data/log_content/"
start_date = "20220401"
end_date = "20220430"
directory_paths = create_date_directories(base_path, start_date, end_date)

In [47]:
def ingest_log():
    dotenv.load_dotenv()
    access_key_id = os.getenv("AWS_ACCESS_KEY_ID")
    secret_access_key = os.getenv("AWS_SECRET_ACCESS_KEY")
    region = os.getenv("AWS_REGION")
    op = opendal.Operator(
        "s3",
        root='/log_content/',
        bucket='data',
        region=region,
        endpoint='http://127.0.0.1:9000',
        access_key_id=access_key_id,
        secret_access_key=secret_access_key,
    )

    return op

In [7]:
directory_paths

['data/log_content/20220401.json',
 'data/log_content/20220402.json',
 'data/log_content/20220403.json',
 'data/log_content/20220404.json',
 'data/log_content/20220405.json',
 'data/log_content/20220406.json',
 'data/log_content/20220407.json',
 'data/log_content/20220408.json',
 'data/log_content/20220409.json',
 'data/log_content/20220410.json',
 'data/log_content/20220411.json',
 'data/log_content/20220412.json',
 'data/log_content/20220413.json',
 'data/log_content/20220414.json',
 'data/log_content/20220415.json',
 'data/log_content/20220416.json',
 'data/log_content/20220417.json',
 'data/log_content/20220418.json',
 'data/log_content/20220419.json',
 'data/log_content/20220420.json',
 'data/log_content/20220421.json',
 'data/log_content/20220422.json',
 'data/log_content/20220423.json',
 'data/log_content/20220424.json',
 'data/log_content/20220425.json',
 'data/log_content/20220426.json',
 'data/log_content/20220427.json',
 'data/log_content/20220428.json',
 'data/log_content/2

In [22]:

# setup cloud filesystem access
def ingest_by_pyarrow(paths: List[str] = None):
    dotenv.load_dotenv()
    cloudfs = fs.S3FileSystem(
        access_key=os.getenv("AWS_ACCESS_KEY_ID"),
        secret_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
        region=os.getenv("AWS_REGION"),
        endpoint_override='http://127.0.0.1:9000',
    )

    schema = pa.schema([
        pa.field('_index', pa.string()),
        pa.field('_type', pa.string()),
        pa.field('_id', pa.string()),
        pa.field('_score', pa.int64()),
        pa.field('_source',
                 pa.struct(
                     [
                         pa.field('Contract', pa.string()),
                         pa.field('Mac', pa.string()),
                         pa.field('TotalDuration', pa.int64()),
                         pa.field('AppName', pa.string()),
                     ]
                 )
                 )
    ])
    df = []

    for path in paths:
        ds = dataset(
            source=path,
            schema=schema,
            filesystem=cloudfs,
            format='json',
        )
        df.append(
            pl.scan_pyarrow_dataset(ds)
            .with_columns(pl.Series("Date", [path]).str.extract(r"\d{8}", 0))
            .select(
                pl.col("Date").str.to_date("%Y %m %d"),
                pl.col("_index").alias("Index"),
                pl.col("_type").alias("Type"),
                pl.col("_id").alias("Id"),
                pl.col("_score").alias("Score"),
                pl.col("_source"),
            )
            .unnest("_source")
        )

    return pl.concat(df)


In [23]:
del ldf
ldf = ingest_by_pyarrow(directory_paths)

In [28]:
ldf.describe()

statistic,Date,Index,Type,Id,Score,Contract,Mac,TotalDuration,AppName
str,str,str,str,str,f64,str,str,f64,str
"""count""","""48457499""","""48457499""","""48457499""","""48457499""",48457499.0,"""48457499""","""48457499""",48457499.0,"""48457499"""
"""null_count""","""0""","""0""","""0""","""0""",0.0,"""0""","""0""",0.0,"""0"""
"""mean""","""2022-04-15""",,,,0.0,,,15545.568128,
"""std""",,,,,0.0,,,188998.378306,
"""min""","""2022-04-01""","""history""","""app""","""AX_2CD-Aa1FFiv…",0.0,"""""","""001504D6BC59""",-889580000.0,"""APP"""
"""25%""","""2022-04-08""",,,,0.0,,,1054.0,
"""50%""","""2022-04-16""",,,,0.0,,,6765.0,
"""75%""","""2022-04-23""",,,,0.0,,,18562.0,
"""max""","""2022-04-30""","""history""","""vod""","""AYBxrfUVa1FFiv…",0.0,"""qad014541""","""FC017C984258""",86400.0,"""VOD"""


In [3]:
csvFile = "../data/log_content/*.json"
glob(csvFile)

['../data/log_content/20220415.json',
 '../data/log_content/20220403.json',
 '../data/log_content/20220423.json',
 '../data/log_content/20220419.json',
 '../data/log_content/20220418.json',
 '../data/log_content/20220422.json',
 '../data/log_content/20220402.json',
 '../data/log_content/20220414.json',
 '../data/log_content/20220425.json',
 '../data/log_content/20220409.json',
 '../data/log_content/20220429.json',
 '../data/log_content/20220413.json',
 '../data/log_content/20220405.json',
 '../data/log_content/20220404.json',
 '../data/log_content/20220412.json',
 '../data/log_content/20220428.json',
 '../data/log_content/20220408.json',
 '../data/log_content/20220424.json',
 '../data/log_content/20220427.json',
 '../data/log_content/20220407.json',
 '../data/log_content/20220411.json',
 '../data/log_content/20220410.json',
 '../data/log_content/20220406.json',
 '../data/log_content/20220426.json',
 '../data/log_content/20220430.json',
 '../data/log_content/20220401.json',
 '../data/lo

# Ingest the logging data

In [3]:
def get_log_json(paths: str | list[str]) -> pl.LazyFrame:
    """
    Function to ingest the logging json data, get the filename and add a "Date" column
    Args:
        paths (str | list[str]): a list of path to data, this path must be in glob pattern

    Returns:
        pl.LazyFame
    """
    if isinstance(paths, str):
        paths = [paths]
    # Define schema of logging data
    schema = {
        "_index" : pl.String,
        "_type"  : pl.String,
        "_id"    : pl.String,
        "_score" : pl.Int64,
        "_source": pl.Struct(
            [
                pl.Field("Contract", pl.String),
                pl.Field("Mac", pl.String),
                pl.Field("TotalDuration", pl.Int64),
                pl.Field("AppName", pl.String),
            ]
        ),
    }

    def _scan_log(path, schema):
        return (
            pl.scan_ndjson(path, schema=schema, low_memory=True)
            .with_columns(pl.Series("Date", [path]).str.extract(r"\d{8}", 0))
            .select(
                pl.col("Date").str.to_date("%Y %m %d"),
                pl.col("_index").alias("Index"),
                pl.col("_type").alias("Type"),
                pl.col("_id").alias("Id"),
                pl.col("_score").alias("Score"),
                pl.col("_source"),
            )
            .unnest("_source")
        )

    dfs = []
    # Function to scan and preprocess a single log data
    try:
        dfs = [_scan_log(path, schema) for path in paths]
    except Exception as e:
        print(f"Failed to read data from {paths}", {e})

    # Concatenate the processed lazyframes
    return pl.concat(dfs, how="vertical").lazy()

In [4]:
sources = get_log_json(directory_paths)

In [37]:
sources.describe()

statistic,Date,Index,Type,Id,Score,Contract,Mac,TotalDuration,AppName
str,str,str,str,str,f64,str,str,f64,str
"""count""","""48457499""","""48457499""","""48457499""","""48457499""",48457499.0,"""48457499""","""48457499""",48457499.0,"""48457499"""
"""null_count""","""0""","""0""","""0""","""0""",0.0,"""0""","""0""",0.0,"""0"""
"""mean""","""2022-04-15""",,,,0.0,,,15545.568128,
"""std""",,,,,0.0,,,188998.378306,
"""min""","""2022-04-01""","""history""","""app""","""AX_2CD-Aa1FFiv…",0.0,"""""","""001504D6BC59""",-889580000.0,"""APP"""
"""25%""","""2022-04-08""",,,,0.0,,,1054.0,
"""50%""","""2022-04-16""",,,,0.0,,,6765.0,
"""75%""","""2022-04-23""",,,,0.0,,,18562.0,
"""max""","""2022-04-30""","""history""","""vod""","""AYBxrfUVa1FFiv…",0.0,"""qad014541""","""FC017C984258""",86400.0,"""VOD"""


# Get the RFM table

In [27]:
sources.columns

['Date',
 'Index',
 'Type',
 'Id',
 'Score',
 'Contract',
 'Mac',
 'TotalDuration',
 'AppName']

In [5]:
reported_date = "20220501"


def get_rfm_table(sources: pl.LazyFrame, reported_date: str = "20220501", total_date: int = 30) -> pl.LazyFrame:
    b: pl.LazyFrame = (
        sources.group_by("Contract")
        .agg(
            pl.col("Date").max().alias("LatestDate")
        )
    )

    temp = (
        sources.with_columns(
            pl.lit(reported_date).str.to_date("%Y %m %d").alias("ReportedDate")
        )
    )

    rfm = (
        temp
        .filter(pl.col("Contract").str.len_chars() > 1)
        .join(b, on="Contract", how='left')
        .group_by("Contract")
        .agg(
            (pl.col("ReportedDate") - pl.col("LatestDate")).min().alias("Recency"),
            (pl.col("Date").n_unique() / pl.lit(total_date) * 100.0).round(2).alias("Frequency"),
            pl.sum("TotalDuration").alias("Monetary"),
        )
        .with_columns(
            pl.col("Recency").qcut(3, labels=["1", "2", "3"], allow_duplicates=True).alias("R"),
            pl.col("Frequency").qcut(3, labels=["1", "2", "3"], allow_duplicates=True).alias("F"),
            pl.col("Monetary").qcut(3, labels=["1", "2", "3"], allow_duplicates=True).alias("M")
        )
    )

    return rfm



In [7]:
get_rfm_table(sources).fetch(100)

Contract,Recency,Frequency,Monetary,R,F,M
str,duration[ms],f64,i64,cat,cat,cat
"""HNH694560""",4d,3.33,10607,"""1""","""1""","""2"""
"""THFD35248""",27d,3.33,48971,"""3""","""1""","""3"""
"""BDFD59542""",26d,3.33,44869,"""3""","""1""","""3"""
"""HNFD81424""",3d,3.33,2148,"""1""","""1""","""1"""
"""HPFD03625""",18d,3.33,85875,"""2""","""1""","""3"""
"""NBFD13727""",6d,3.33,42674,"""1""","""1""","""3"""
"""HND703462""",30d,3.33,211,"""3""","""1""","""1"""
"""SGD173674""",30d,3.33,241,"""3""","""1""","""1"""
"""TIFD02374""",29d,3.33,5540,"""3""","""1""","""1"""
"""BNFD66450""",17d,3.33,49859,"""2""","""1""","""3"""


In [8]:
app_names = [
    "CHANNEL",
    "KPLUS",
    "VOD",
    "FIMS",
    "BHD",
    "SPORT",
    "CHILD",
    "RELAX",
]

column_names = [
    "TVDuration",
    "TVDuration",
    "MovieDuration",
    "MovieDuration",
    "MovieDuration",
    "SportDuration",
    "ChildDuration",
    "RelaxDuration",
]
def get_pivot_data(sources: pl.LazyFrame, app_names: List[str], column_names: List[str]) -> pl.LazyFrame:
    if not isinstance(sources, pl.LazyFrame):
        sources = sources.lazy()

    if len(app_names) != len(column_names):
        raise ValueError("The lengths of app_names and column_names must be the same")

    mapping = dict(zip(app_names, column_names))
    pivot_df: pl.LazyFrame = (
        sources.select(
            pl.col("Contract"),
            pl.col("TotalDuration"),
            pl.col("AppName").replace(mapping, default="Unknown").alias("Type")
        )
        .filter(
            (pl.col("Contract").str.len_chars() > 1)
            & (pl.col("Type") != "Unknown")
            & (pl.col("TotalDuration") > 0)
        )
        .group_by(["Contract"])
        .agg(
            [
                pl.when(pl.col("Type") == y ).then(pl.col("TotalDuration")).sum().alias(y)
                for y in set(column_names)
            ]
        )
        .sort(["Contract", "TVDuration"])
    )

    return pivot_df

In [12]:
def get_most_watch(sources: pl.LazyFrame) -> pl.LazyFrame:
    if not isinstance(sources, pl.LazyFrame):
        sources = sources.lazy()
    columns = sources.columns[1:]
    watch_type = [item[:-8] for item in columns]

    return (
        sources
        .with_columns(
            pl.concat_list([
                pl.struct(pl.col(c).alias("l"), pl.lit(v).alias("k")) for c, v in zip(columns, watch_type)
                for c, v in zip(columns, watch_type)
            ]
            ).alias('temp')
        )
        .select(
            pl.col('Contract'),
            pl.col('temp').list.sort(descending=True).list.first().struct.field('k').alias('MostWatch')
        )
    )


In [9]:
pivot_df = get_pivot_data(sources, app_names, column_names)

In [10]:
pivot_df.fetch(100)

Contract,MovieDuration,RelaxDuration,ChildDuration,SportDuration,TVDuration
str,i64,i64,i64,i64,i64
"""AGAAA0895""",0,0,0,0,45044
"""AGAAA2984""",0,0,0,0,767
"""AGD018502""",0,0,0,0,813
"""AGD019199""",0,0,9516,0,0
"""AGD023638""",10740,0,0,0,0
"""AGD029777""",0,0,0,0,48881
"""AGD030574""",0,0,0,0,77587
"""AGFD02286""",0,0,0,0,44864
"""AGFD03433""",0,0,7441,0,0
"""AGFD03608""",0,0,0,0,20195


In [13]:
get_most_watch(pivot_df).fetch(100)

Contract,MostWatch
str,str
"""AGAAA0895""","""TV"""
"""AGAAA2984""","""TV"""
"""AGD018502""","""TV"""
"""AGD019199""","""Child"""
"""AGD023638""","""Movie"""
"""AGD029777""","""TV"""
"""AGD030574""","""TV"""
"""AGFD02286""","""TV"""
"""AGFD03433""","""Child"""
"""AGFD03608""","""TV"""


In [110]:
(pivot_df.with_columns(
    pl.concat_list([pl.struct(pl.col(c).alias("l"), pl.lit(v).alias("k")) for c, v in zip (columns, watch_type)]).alias('temp')
).select(
    pl.col('Contract'),
    pl.col('temp').list.sort(descending=True).list.first().struct.field('k')
)
 .fetch())


Contract,k
str,str
"""AGAAA0895""","""TV"""
"""AGAAA1084""","""TV"""
"""AGAAA1339""","""TV"""
"""AGAAA1998""","""Sport"""
"""AGAAA2151""","""TV"""
"""AGAAA2984""","""TV"""
"""AGAAA3269""","""TV"""
"""AGD003389""","""TV"""
"""AGD003690""","""TV"""
"""AGD005972""","""TV"""


In [105]:
columns

['MovieDuration',
 'TVDuration',
 'RelaxDuration',
 'ChildDuration',
 'SportDuration']

In [106]:
original_list = ['MovieDuration', 'TVDuration', 'RelaxDuration', 'ChildDuration', 'SportDuration']
filtered_list = [item for item in original_list if 'Duration' not in item]
filtered_list

[]