In [17]:
import logging
from pyspark.sql import SparkSession, functions as F, DataFrame
from pyspark.ml.fpm import FPGrowth

from warp.spark.io_adapters.utils.enums import AdapterType
from warp.core.io_adapters import get_adapter
from warp.spark.io import cache


def create_logger(name: str) -> logging.Logger:
    """Set up root logging at INFO level and return the logger called ``name``.

    Args:
        name: Name of the logger to fetch from the ``logging`` registry.

    Returns:
        The ``logging.Logger`` registered under ``name``.

    Note:
        ``logging.basicConfig`` is invoked on every call; per the standard
        library it leaves the root logger alone once handlers are installed,
        so repeated calls do not stack handlers.
    """
    # Timestamp, then {logger:file:line}, then level and message.
    record_format = (
        "[%(asctime)s] {%(name)s:%(filename)s:%(lineno)s}"
        " %(levelname)s - %(message)s"
    )
    logging.basicConfig(level=logging.INFO, format=record_format)

    named_logger = logging.getLogger(name)
    return named_logger


def cache_df(df: DataFrame) -> DataFrame:
    """Pass ``df`` through warp's ``cache`` helper with this notebook's fixed target.

    Args:
        df: The Spark DataFrame to hand to ``cache``.

    Returns:
        Whatever ``cache`` returns for ``df``.

    Note:
        The target is hard-coded (``file:/`` + ``home/burhan/project_data``).
        NOTE(review): presumably ``cache`` persists ``df`` under that local
        directory and returns a frame backed by it -- confirm against
        ``warp.spark.io.cache``.
    """
    cache_target = {
        "file_schema": "file:/",
        "bucket_name": "home/burhan/project_data",
    }
    return cache(df, **cache_target)


# Only let ERROR-and-above records from the py4j bridge through.
# NOTE: `logger` here is py4j's logger, not one built by create_logger().
logger = logging.getLogger('py4j')
logger.setLevel(logging.ERROR)

# Spark settings applied to the builder, in this order.
# NOTE(review): with master 'local[*]' the executors presumably live inside the
# driver JVM, so 'spark.executor.memory' may have no effect -- confirm.
_spark_conf = {
    'spark.default.parallelism': 8,
    'spark.sql.shuffle.partitions': 8,
    'spark.port.maxRetries': 32,
    'spark.sql.adaptive.enabled': False,
    'spark.driver.memory': '4g',
    'spark.executor.memory': '4g',
}

# Local-mode session ('local[*]') named 'term-project'.
builder = SparkSession.builder.master('local[*]').appName('term-project')
for _conf_key, _conf_value in _spark_conf.items():
    builder = builder.config(_conf_key, _conf_value)

# Reuse an already-running session if there is one, otherwise start a new one.
spark = builder.getOrCreate()

# Look up the adapter registered for PARQUET and instantiate it with this session.
parquet_io = get_adapter(AdapterType.PARQUET)(spark_session=spark)

In [22]:
# Location of the parquet dataset analysed below.
# Hard-coded absolute path: adjust it when running on another machine.
path = "/home/burhan/project_data/analysis_data"

In [23]:
# Read the parquet data at `path` into a Spark DataFrame using the `spark` session.
df = spark.read.parquet(path)

In [6]:
# One-row overview of the loaded data: distinct `date` values, distinct
# `sales_id` values (labelled as baskets), the number of non-null
# `transaction_id` values, and the ratio of transactions to baskets.
(
    df
    .agg(
        F.countDistinct("date").alias("date_count"),
        F.countDistinct("sales_id").alias("basket_count"),
        F.count("transaction_id").alias("transaction_count"),
    )
    .withColumn(
        "transaction_per_basket",
        F.col("transaction_count") / F.col("basket_count"),
    )
    .show()
)

In [24]:
# Collapse the rows into one row per `sales_id`, gathering the distinct
# `product_id` values into an array column named `basket`.
# collect_set de-duplicates, which matters because FPGrowth requires the items
# within a single transaction to be unique.
# NOTE: this rebinds `df`; afterwards it only has `sales_id` and `basket`, so
# re-running this cell (or the summary cell) without reloading will fail.
df = (
    df
    .groupby("sales_id")
    .agg(F.collect_set(F.col("product_id")).alias("basket"))
)

In [26]:
# Send the basket frame through cache_df and continue with the frame it returns.
# NOTE(review): presumably this materializes the grouped result so later
# actions do not recompute the aggregation -- confirm against warp's `cache`.
df = cache_df(df)

In [27]:
# Preview the current `df` (show() prints at most 20 rows by default).
df.show()

In [32]:
# Configure FP-Growth over the `basket` column.
#   minSupport=0.01    -> keep itemsets present in at least 1% of the rows
#   minConfidence=0.01 -> keep association rules with confidence >= 1%
fp = FPGrowth(
    itemsCol='basket',
    predictionCol='prediction',
    minSupport=0.01,
    minConfidence=0.01,
)

# Mine the frequent itemsets from the basket frame.
model = fp.fit(df)

In [33]:
# Print the first 10 frequent itemsets without truncating cell contents.
model.freqItemsets.show(n=10, truncate=False)

In [34]:
# Print the first 20 association rules without truncating cell contents.
model.associationRules.show(n=20, truncate=False)