In [7]:
import logging
from pyspark.sql import SparkSession, functions as F, DataFrame
from pyspark.ml.fpm import FPGrowth

from warp.spark.io_adapters.utils.enums import AdapterType
from warp.core.io_adapters import get_adapter
from warp.spark.io import cache


def create_logger(name: str) -> logging.Logger:
    """Return the logger registered under ``name``.

    Root logging is configured first via ``logging.basicConfig`` with INFO
    level and a timestamped ``{name:file:line}`` format.

    Args:
        name: Name of the logger to fetch.

    Returns:
        The ``logging.Logger`` obtained from ``logging.getLogger(name)``.
    """
    log_format = (
        "[%(asctime)s] {%(name)s:%(filename)s:%(lineno)s}"
        " %(levelname)s - %(message)s"
    )
    logging.basicConfig(format=log_format, level=logging.INFO)

    named_logger = logging.getLogger(name)
    return named_logger


def cache_df(df: DataFrame) -> DataFrame:
    """Pass ``df`` through ``cache`` with this notebook's fixed location.

    The call always uses the ``file:/`` schema and the
    ``home/burhan/project_data`` bucket name; whatever ``cache`` returns is
    handed back unchanged.

    Args:
        df: DataFrame to hand to ``cache``.

    Returns:
        The value returned by ``cache``.
    """
    cached = cache(
        df,
        file_schema="file:/",
        bucket_name="home/burhan/project_data",
    )
    return cached


# Only let ERROR-level (and above) records from the py4j logger through.
logger = logging.getLogger('py4j')
logger.setLevel(logging.ERROR)

# Local-mode session builder; options are applied one by one, in this order.
builder = SparkSession.builder.master('local[*]').appName('term-project')
for option, value in (
    ('spark.default.parallelism', 8),
    ('spark.sql.shuffle.partitions', 8),
    ('spark.port.maxRetries', 32),
    ('spark.sql.adaptive.enabled', False),
    ('spark.driver.memory', '4g'),
    ('spark.executor.memory', '4g'),
):
    builder = builder.config(option, value)

# Reuse an already-running session if there is one, otherwise start it.
spark = builder.getOrCreate()

# Look up the parquet adapter class and bind it to the session.
parquet_adapter_cls = get_adapter(AdapterType.PARQUET)
parquet_io = parquet_adapter_cls(spark_session=spark)

In [12]:
# Read the products_local dataset through the parquet adapter and bind it to df.
# NOTE(review): absolute, user-specific path — consider a configurable data dir.
df = parquet_io.read('/home/burhan/datastore/association/etl/products_local')

In [13]:
# Number of rows currently in df.
df.count()

In [8]:
path = "/home/burhan/datastore/association/etl/transactional_sales_local"

# Load the transactional sales explicitly: the cells below rely on
# transaction-level columns (sales_id, transaction_id, sales_type, date),
# and without this read `path` is never consumed, so a fresh
# Restart & Run All would run them against the products dataset loaded earlier.
# NOTE(review): assumes this dataset is the intended input for the analysis
# that follows — confirm.
df = parquet_io.read(path)

In [4]:
# Show the maximum value of the "date" column across all of df.
df.agg(F.max("date")).show()

In [9]:
# Row count per sales_type. The groupby already yields exactly one row per
# distinct key, so a trailing distinct() would only add another shuffle
# without changing the set of rows shown.
df.groupby("sales_type").agg(F.count("*")).show()

In [12]:
# Print the first rows of df (show() defaults: 20 rows, truncated values).
df.show()

In [4]:
# One-row summary: distinct dates, distinct sales_id values ("baskets"),
# non-null transaction_id count, and the ratio of transactions to baskets.
(
    df
    .agg(
        F.countDistinct(F.col("date")).alias("date_count"),
        F.countDistinct(F.col("sales_id")).alias("basket_count"),
        F.count(F.col("transaction_id")).alias("transaction_count"),
    )
    .withColumn(
        "transaction_per_basket",
        F.col("transaction_count") / F.col("basket_count"),
    )
    .show()
)

In [5]:
# Drop rows for two specific product ids, then collapse every sales_id into
# the set of distinct product ids it contains (column "basket").
# NOTE(review): the reason 890999 and 888376 are excluded is not visible
# here — worth documenting.
# NOTE(review): this rebinds df, so the cell cannot be re-run without
# reloading the data first (product_id no longer exists afterwards).
df = (
    df
    .where(
        (F.col("product_id") != F.lit(890999))
        & (F.col("product_id") != F.lit(888376))
    )
    .groupby("sales_id")
    .agg(F.collect_set(F.col("product_id")).alias("basket"))
)

In [6]:
# Rebind df to whatever cache_df returns for the current df.
df = cache_df(df)

In [34]:
# Print the first rows of df (show() defaults: 20 rows, truncated values).
df.show()

In [7]:
# FP-Growth over the "basket" column; both the support and the confidence
# thresholds are set to 0.001.
fp = FPGrowth(
    itemsCol='basket',
    predictionCol='prediction',
    minSupport=0.001,
    minConfidence=0.001,
)

# Fit on df; the resulting model exposes freqItemsets and associationRules.
model = fp.fit(df)

In [None]:
# Print up to 20 frequent itemsets with column values left untruncated.
model.freqItemsets.show(20, False)

In [None]:
# Total number of frequent itemsets found by the model.
model.freqItemsets.count()

In [37]:
# Print up to 20 association rules with column values left untruncated.
model.associationRules.show(20, False)

In [None]:
# Total number of association rules produced by the model.
model.associationRules.count()

In [8]:
# Pass the association rules through cache_df.
# NOTE(review): the returned DataFrame is not bound to a name, so other cells
# that read model.associationRules do not use it — confirm that is intended.
cache_df(model.associationRules)

In [None]:
# Pass the frequent itemsets through cache_df.
# NOTE(review): the returned DataFrame is not bound to a name — confirm the
# call is wanted only for whatever side effect cache has.
cache_df(model.freqItemsets)

In [18]:
# Print up to 211 association rules sorted by "lift" (orderBy's default
# ascending order), with column values left untruncated.
# NOTE(review): 211 looks like a hand-picked row limit — confirm or derive it.
model.associationRules.orderBy("lift").show(211, False)

In [28]:
# Inspect which serializer the session uses. The builder above never sets
# spark.serializer, and conf.get() without a default raises for an unset key,
# so fall back to Spark's documented default class name instead of failing.
spark.conf.get("spark.serializer", "org.apache.spark.serializer.JavaSerializer")