In [1]:
import logging
from pyspark.sql import SparkSession, functions as F, DataFrame
from pyspark.ml.fpm import FPGrowth

from warp.spark.io_adapters.utils.enums import AdapterType
from warp.core.io_adapters import get_adapter
from warp.spark.io import cache


def create_logger(name: str) -> logging.Logger:
    """Set up root logging and hand back the logger registered as ``name``.

    ``logging.basicConfig`` is invoked on every call with level ``INFO`` and a
    record layout of timestamp, logger name, file name, line number, severity
    and message. Per the standard library, ``basicConfig`` does nothing when
    the root logger already has handlers, so repeated calls are harmless.

    Args:
        name: Name of the logger to look up (or create).

    Returns:
        The ``logging.Logger`` instance associated with ``name``.
    """
    record_layout = (
        "[%(asctime)s] {%(name)s:%(filename)s:%(lineno)s}"
        " %(levelname)s - %(message)s"
    )
    logging.basicConfig(format=record_layout, level=logging.INFO)
    return logging.getLogger(name)


def cache_df(df: DataFrame) -> DataFrame:
    """Run ``df`` through the project's ``cache`` helper with fixed settings.

    The helper is always called with ``file_schema="file:/"`` and
    ``bucket_name="home/burhan/project_data"``; whatever it returns is passed
    straight back to the caller.

    NOTE(review): the bucket name is a machine-specific location, so this
    presumably only works on the original author's machine -- confirm before
    reusing elsewhere.

    Args:
        df: The DataFrame to cache.

    Returns:
        The value returned by ``cache`` (annotated as a ``DataFrame``).
    """
    cached = cache(
        df,
        file_schema="file:/",
        bucket_name="home/burhan/project_data",
    )
    return cached


# Raise the 'py4j' logger's threshold to ERROR so that lower-severity records
# emitted through it are dropped.
logger = logging.getLogger('py4j')
logger.setLevel(logging.ERROR)

# Local Spark session using every available core, named 'term-project'.
builder = SparkSession.builder.master('local[*]').appName('term-project')

# Parallelism: 8 default partitions and 8 shuffle partitions.
builder = builder.config('spark.default.parallelism', 8)
builder = builder.config('spark.sql.shuffle.partitions', 8)

# Allow up to 32 retries when binding to a port.
builder = builder.config('spark.port.maxRetries', 32)

# Adaptive query execution is switched off explicitly.
builder = builder.config('spark.sql.adaptive.enabled', False)

# Memory: 4 GiB each for the driver and for executors.
builder = builder.config('spark.driver.memory', '4g')
builder = builder.config('spark.executor.memory', '4g')

# Reuse an already-running session if there is one, otherwise start a new one.
spark = builder.getOrCreate()

# Parquet I/O adapter bound to the session above; used by the cells below.
parquet_io = get_adapter(AdapterType.PARQUET)(spark_session=spark)

In [2]:
# Locations of two cached datasets that the following cells read through
# `parquet_io`.
# NOTE(review): these are absolute paths under one user's home directory, so
# the notebook will not run unchanged on another machine.
rules_path: str = "/home/burhan/datastore/cache/a2507f68-3b46-4aea-a7ab-7c48c0ecce7d"
item_sets_path: str = "/home/burhan/datastore/cache/5b24b539-5ed5-4518-a857-e92711000df6"

In [3]:
# Read the dataset stored at `rules_path` through the Parquet adapter.
# Presumably these are association rules from an FPGrowth run -- TODO confirm.
rules_df = parquet_io.read(rules_path)

In [4]:
# Read the dataset stored at `item_sets_path` through the Parquet adapter.
# Presumably these are frequent item sets from an FPGrowth run -- TODO confirm.
item_sets_df = parquet_io.read(item_sets_path)

In [6]:
# Preview the loaded rules data (assuming `parquet_io.read` returns a Spark
# DataFrame, `show()` prints its first 20 rows).
rules_df.show()

In [7]:
# Preview the loaded item-sets data (assuming `parquet_io.read` returns a
# Spark DataFrame, `show()` prints its first 20 rows).
item_sets_df.show()

In [5]:
# Display the column names and types of the rules data as the cell output.
rules_df.schema

In [6]:
# Display the column names and types of the item-sets data as the cell output.
item_sets_df.schema