In [81]:
import ast
import json
from typing import Tuple

from delta import configure_spark_with_delta_pip
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.dataframe import DataFrame
from pyspark.sql.functions import from_json, col, udf, split, regexp_replace, regexp_extract, to_timestamp, to_date
from pyspark.sql.types import StructType, StructField, StringType, TimestampType

In [2]:
# Build the Spark session used by every cell below, with Delta Lake enabled.
builder = (
    SparkSession.builder.appName("homework1")
    .config("spark.driver.memory", "16g")
    .config("spark.driver.cores", 3)
    # The key must be "spark.sql.extensions"; a misspelled key is silently
    # ignored and the Delta SQL extension would never be registered.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog"
    )
)
# configure_spark_with_delta_pip receives the builder before the session is created.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

23/10/21 14:18:57 WARN Utils: Your hostname, Andrea-Le-MBP-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.1.9 instead (on interface en0)
23/10/21 14:18:57 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/Users/sonle/.sdkman/candidates/spark/3.4.0/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/sonle/.ivy2/cache
The jars for the packages stored in: /Users/sonle/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-da2e2fed-bb57-454c-8204-1bdf835d6d5c;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.4.0 in central
	found io.delta#delta-storage;2.4.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
:: resolution report :: resolve 237ms :: artifacts dl 10ms
	:: modules in use:
	io.delta#delta-core_2.12;2.4.0 from central in [default]
	io.delta#delta-storage;2.4.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   3   |   0   

In [3]:
# Every field of a log record is declared as a nullable string.
schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        "Mac",
        "SessionMainMenu",
        "LogId",
        "Event",
        "ItemId",
        "RealTimePlaying",
    )
])

# Input files: the raw log lines and the user info table.
log_dir = "/Users/sonle/Documents/GitHub/spark-playground/data/DataSampleTest/logt21.txt"
user_dir = "/Users/sonle/Documents/GitHub/spark-playground/data/DataSampleTest/user_info.txt"

In [90]:
def read_data(log_paths: str, user_info_path: str, log_schema: StructType) -> tuple[DataFrame, DataFrame, DataFrame]:
    """
    Read the user log data and the user info data and left-join them on the MAC id.

    Uses the module-level ``spark`` session.

    Log data: each text line is a Python-dict literal, so it is re-serialised to
    valid JSON (``ast.literal_eval`` + ``json.dumps``) before being parsed with
    ``log_schema``. The leading ``<word>:`` prefix of ``SessionMainMenu`` is
    stripped and the remainder is parsed as a timestamp with the pattern
    ``yyyy:MM:dd:HH:mm:ss:SSS``.

    User info data: each line is split on tabs into ``Mac`` and ``Days``. The
    first four characters of ``Mac`` are removed to produce ``MacId``, and
    ``Days`` is cast to integer. Rows whose ``Days`` is null after the cast
    are dropped.

    :param log_paths: string, a path (may be a glob pattern) for the log user data
    :param user_info_path: string, a path for the user info data
    :param log_schema: StructType schema used to parse each log record; it must
        contain at least the ``Mac`` and ``SessionMainMenu`` fields
    :return: tuple ``(df, user_df, log_df)`` where ``df`` is ``log_df`` left-joined
        with ``user_df`` (all log columns plus ``Days``), ``user_df`` has the
        columns ``MacId`` and ``Days``, and ``log_df`` is the parsed log data
    """
    # The raw lines are Python literals (not strict JSON); convert them so from_json can parse them.
    dict_to_json = udf(lambda x: json.dumps(ast.literal_eval(x)))
    #log data
    log_df = (
        spark.read.text(paths=log_paths, lineSep='\n')
        # Parse with the schema supplied by the caller rather than a module-level global.
        .withColumn("data", from_json(dict_to_json("value"), schema=log_schema, options={"encoding": "utf-8"}))
        .select("data.*")
        .withColumn("SessionMainMenu",
                    to_timestamp(regexp_replace(col("SessionMainMenu"),  r"^(\w+):", ""),
                            "yyyy:MM:dd:HH:mm:ss:SSS"))
    )
    #user info data
    user_df = (
        spark.read.text(paths=user_info_path)
        .withColumn("Mac", split(col("value"), "\t")[0])
        .withColumn("Days", split(col("value"), "\t")[1])
        .select(
            regexp_replace(col("Mac"), "^.{4}", "").cast("string").alias("MacId"),
            col("Days").cast("integer")
        )
        .filter(col("Days").isNotNull())  #we need this line to remove the first row (MAC, #days)
    )

    # Keep every log row; Days stays null when no user info row matches the Mac.
    df = log_df.join(
        user_df,
        log_df.Mac == user_df.MacId,
        'left'
    ).select(
        # Project the log columns in schema order so the result follows log_schema.
        *log_df.columns, "Days"
    )
    return df, user_df, log_df

In [91]:
# Load the joined frame together with its two source frames, then preview the join.
df, user_df, log_df = read_data(
    log_paths=log_dir,
    user_info_path=user_dir,
    log_schema=schema,
)
df.show()

+------------+--------------------+-----+------------+---------+---------------+----+
|         Mac|     SessionMainMenu|LogId|       Event|   ItemId|RealTimePlaying|Days|
+------------+--------------------+-----+------------+---------+---------------+----+
|B046FCAC0DC1|2016-02-12 12:35:...|   52|     StopVOD|100052388|          570.3| 375|
|B046FCAC0DC1|2016-02-11 01:01:...|   40|   EnterIPTV|     null|           null| 375|
|B046FCAC0DC1|2016-02-11 01:02:...|   55|     NextVOD|100052388|           null| 375|
|B046FCAC0DC1|2016-02-12 04:44:...|   18|ChangeModule|     null|           null| 375|
|B046FCAC0DC1|2016-02-12 12:35:...|   54|     PlayVOD|100052388|           null| 375|
|B046FCAC0DC1|2016-02-12 04:44:...|   40|   EnterIPTV|     null|           null| 375|
|B046FCAC0DC1|2016-02-12 12:35:...|   55|     NextVOD|100052388|           null| 375|
|B046FCAC0DC1|2016-02-12 12:35:...|   52|     StopVOD|100052388|         3384.6| 375|
|B046FCAC0DC1|2016-02-13 17:25:...|   40|   EnterIPTV|

In [92]:
# Register the user table as the temp view "user_data" and preview it through Spark SQL.
user_df.createOrReplaceTempView("user_data")
spark.sql("""
    select * from user_data
    """).show()

+------------+----+
|       MacId|Days|
+------------+----+
|B046FCB79E0B|  20|
|B046FCB3528B| 181|
|B046FCAAFB73| 426|
|B046FCAAFB72| 426|
|B046FCAA2085| 429|
|B046FCAA0669| 380|
|B046FCB343BF| 376|
|B046FCAC0CFB| 376|
|B046FCABED45| 378|
|B046FCAD80FC| 305|
|B046FCB1E3FE| 255|
|B046FCB27666| 210|
|B046FCB42341| 142|
|B046FCB6D6B2|  46|
|B046FCB6D4BC|  46|
|B046FCB6D4B6|  46|
|B046FCA6A3F4| 583|
|B046FCA86BD5| 493|
|B046FCABE3BC| 425|
|B046FCAC125F| 374|
+------------+----+
