In [1]:
import ast, json
from delta import configure_spark_with_delta_pip
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, udf, split, regexp_replace
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
from pyspark.sql.dataframe import  DataFrame

In [2]:
# Build the SparkSession used by every later cell, with Delta Lake enabled.
builder = (
    SparkSession.builder.appName("homework1")
    .config("spark.driver.memory", "16g")
    .config("spark.driver.cores", 3)
    # Delta's SQL extension is registered through the "spark.sql.extensions" key.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog"
    )
)
# configure_spark_with_delta_pip wraps the builder before the session is
# created (the cell output shows delta-core being added as a dependency).
spark = configure_spark_with_delta_pip(builder).getOrCreate()

23/10/18 20:49:51 WARN Utils: Your hostname, Andrea-Le-MBP-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.1.11 instead (on interface en0)
23/10/18 20:49:51 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/Users/sonle/.sdkman/candidates/spark/3.4.0/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/sonle/.ivy2/cache
The jars for the packages stored in: /Users/sonle/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-9d6964ce-e7ef-4c82-b69b-da86de202a9e;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.4.0 in central
	found io.delta#delta-storage;2.4.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
:: resolution report :: resolve 177ms :: artifacts dl 11ms
	:: modules in use:
	io.delta#delta-core_2.12;2.4.0 from central in [default]
	io.delta#delta-storage;2.4.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   3   |   0   

In [3]:
# Schema of one parsed log record: all six fields are declared as nullable strings.
schema = StructType(
    [
        StructField(field_name, StringType(), True)
        for field_name in (
            "Mac",
            "SessionMainMenu",
            "LogId",
            "Event",
            "ItemId",
            "RealTimePlaying",
        )
    ]
)

# Input files: the raw user log and the tab-separated user info table.
log_path = "/Users/sonle/Documents/GitHub/spark-playground/data/DataSampleTest/logt21.txt"
user_path = "/Users/sonle/Documents/GitHub/spark-playground/data/DataSampleTest/user_info.txt"

In [4]:
def read_data(log_paths: str, user_info_path: str, log_schema: StructType) -> DataFrame:
    """
    Load the user log data and the user info data and left-join them on the MAC id.

    Log data: every line is read as text, evaluated as a Python literal and
    re-serialised with ``json.dumps`` so that ``from_json`` can parse it with
    ``log_schema``.
    User info data: every line is split on tabs; the first field is the MAC
    (its first four characters are stripped to obtain ``MacId``) and the second
    field is the number of days, cast to integer.

    Uses the notebook-level ``spark`` session.

    :param log_paths: string, path (or glob pattern) of the log user data
    :param user_info_path: string, path of the user info data
    :param log_schema: StructType schema of the log data; it must provide the
        columns Mac, SessionMainMenu, LogId, Event, ItemId and RealTimePlaying,
        because the final projection selects them by name
    :return: DataFrame with the log columns plus ``Days`` (null when the MAC
        has no match in the user info data)
    """
    # The log lines are re-encoded into valid JSON strings before parsing.
    dict_to_json = udf(lambda x: json.dumps(ast.literal_eval(x)))

    # log data -- read from the function arguments, not from notebook globals
    log_df = (
        spark.read.text(paths=log_paths, lineSep='\n')
        .withColumn(
            "data",
            from_json(dict_to_json("value"), schema=log_schema, options={"encoding": "utf-8"}),
        )
        .select("data.*")
    )

    # user info data
    fields = split(col("value"), "\t")
    user_df = (
        spark.read.text(paths=user_info_path)
        .withColumn("Mac", fields[0])
        .withColumn("Days", fields[1])
        .select(
            # drop the first four characters of the MAC so it matches the log's Mac column
            regexp_replace(col("Mac"), "^.{4}", "").cast("string").alias("MacId"),
            col("Days").cast("integer")
        )
        # the header row (MAC, #days) has a non-numeric Days value, which the
        # integer cast turns into null; filtering nulls removes that row
        .filter(col("Days").isNotNull())
    )

    # Left join keeps every log row even when no user info exists for its MAC.
    df = log_df.join(
        user_df,
        log_df.Mac == user_df.MacId,
        'left'
    ).select(
        "Mac", "SessionMainMenu", "LogId", "Event", "ItemId", "RealTimePlaying", "Days"
    )
    return df

In [5]:
# Load the joined log/user-info frame and preview its first rows.
df = read_data(
    log_paths=log_path,
    user_info_path=user_path,
    log_schema=schema,
)
df.show()

[Stage 1:>                                                          (0 + 1) / 1]

+------------+--------------------+-----+------------+---------+---------------+----+
|         Mac|     SessionMainMenu|LogId|       Event|   ItemId|RealTimePlaying|Days|
+------------+--------------------+-----+------------+---------+---------------+----+
|B046FCAC0DC1|B046FCAC0DC1:2016...|   52|     StopVOD|100052388|          570.3| 375|
|B046FCAC0DC1|B046FCAC0DC1:2016...|   40|   EnterIPTV|     null|           null| 375|
|B046FCAC0DC1|B046FCAC0DC1:2016...|   55|     NextVOD|100052388|           null| 375|
|B046FCAC0DC1|B046FCAC0DC1:2016...|   18|ChangeModule|     null|           null| 375|
|B046FCAC0DC1|B046FCAC0DC1:2016...|   54|     PlayVOD|100052388|           null| 375|
|B046FCAC0DC1|B046FCAC0DC1:2016...|   40|   EnterIPTV|     null|           null| 375|
|B046FCAC0DC1|B046FCAC0DC1:2016...|   55|     NextVOD|100052388|           null| 375|
|B046FCAC0DC1|B046FCAC0DC1:2016...|   52|     StopVOD|100052388|         3384.6| 375|
|B046FCAC0DC1|B046FCAC0DC1:2016...|   40|   EnterIPTV|

                                                                                

In [7]:
# Show only the rows that belong to a single device (MAC B046FCAC0DC1).
df.where(col("Mac").isin("B046FCAC0DC1")).show()

+------------+--------------------+-----+------------+---------+---------------+----+
|         Mac|     SessionMainMenu|LogId|       Event|   ItemId|RealTimePlaying|Days|
+------------+--------------------+-----+------------+---------+---------------+----+
|B046FCAC0DC1|B046FCAC0DC1:2016...|   52|     StopVOD|100052388|          570.3| 375|
|B046FCAC0DC1|B046FCAC0DC1:2016...|   40|   EnterIPTV|     null|           null| 375|
|B046FCAC0DC1|B046FCAC0DC1:2016...|   55|     NextVOD|100052388|           null| 375|
|B046FCAC0DC1|B046FCAC0DC1:2016...|   18|ChangeModule|     null|           null| 375|
|B046FCAC0DC1|B046FCAC0DC1:2016...|   54|     PlayVOD|100052388|           null| 375|
|B046FCAC0DC1|B046FCAC0DC1:2016...|   40|   EnterIPTV|     null|           null| 375|
|B046FCAC0DC1|B046FCAC0DC1:2016...|   55|     NextVOD|100052388|           null| 375|
|B046FCAC0DC1|B046FCAC0DC1:2016...|   52|     StopVOD|100052388|         3384.6| 375|
|B046FCAC0DC1|B046FCAC0DC1:2016...|   40|   EnterIPTV|