## This Notebook Processes Movie Data as a Stream Using Spark Structured Streaming

Spark Version 3.1.2
Databricks Runtime LTS 9.1

In [0]:
from pyspark.sql.functions import to_date, input_file_name, split
from pyspark.sql.types import DateType
import logging

In [0]:
%run ../util/utils

In [0]:
# Switch Spark's datetime parsing to the legacy policy for this session.
# NOTE(review): presumably needed so two-digit years such as "96" resolve to 19xx
# rather than 20xx when release_date is parsed — confirm before removing.
time_parser_policy_key = "spark.sql.legacy.timeParserPolicy"
spark.conf.set(time_parser_policy_key, "LEGACY")

In [0]:
# Storage locations used by the movies streaming job.
movies_filepath = "dbfs:/FileStore/tables/asos_data/movies/"  # raw CSV input root
checkpoint_path = "dbfs:/FileStore/tables/asos_delta_std/movies_checkpoint"  # streaming checkpoint
movies_outpath = "dbfs:/FileStore/tables/asos_delta_std/movies/"  # Delta output

### Input Raw folder structure

In [0]:
# List the raw input root so its sub-folders are visible.
raw_input_listing = dbutils.fs.ls(movies_filepath)
display(raw_input_listing)

path,name,size
dbfs:/FileStore/tables/asos_data/movies/20220306/,20220306/,0


In [0]:
# Get the movies schema from the latest file path under the raw input root.
# NOTE(review): get_latest_file_path and get_schema are not defined in this notebook;
# presumably they are brought in by `%run ../util/utils` — confirm their exact
# semantics (e.g. how "latest" is chosen) there.
movies_schema_filepath = get_latest_file_path(movies_filepath)
movies_schema = get_schema(movies_schema_filepath, "csv")

In [0]:
# Open a streaming CSV source over every sub-folder of the raw input root.
# An explicit schema is supplied, so no "inferSchema" option is set.
# rstrip("/") keeps the glob free of a doubled slash, because movies_filepath
# already ends with "/".
df_movies = (
    spark.readStream
    .format("csv")
    .schema(movies_schema)
    .option("header", True)
    .load(movies_filepath.rstrip("/") + "/*")
)

### Casting `release_date` from String to Date and Deriving `load_date` (the Partition Column) from the Input Folder Date

In [0]:
# Parse release_date as month/day/two-digit-year. The month token must be the
# upper-case "MM": lower-case "mm" means minute-of-hour, which leaves the month
# unparsed so every date lands in January.
# NOTE(review): assumes raw values look like 01/22/96 — confirm against the source files.
#
# load_date is taken from the source file path. For
# dbfs:/FileStore/tables/asos_data/movies/<yyyyMMdd>/<file>, splitting on "/"
# puts the <yyyyMMdd> folder name at index 5; this index must change if the
# input root moves to a different depth.
df_movies = (
    df_movies
    .withColumn("release_date", to_date(df_movies.release_date, "MM/dd/yy"))
    .withColumn("load_date", to_date(split(input_file_name(), "/").getItem(5), "yyyyMMdd"))
)

In [0]:
# Append the stream to the Delta output, partitioned by load_date, using
# checkpoint_path as the checkpoint location. The cell's last expression is the
# started StreamingQuery, so its handle is shown as the cell output.
movies_stream_writer = (
    df_movies.writeStream
    .format("delta")
    .partitionBy("load_date")
    .outputMode("append")
    .option("checkpointLocation", checkpoint_path)
)
movies_stream_writer.start(movies_outpath)

Out[12]: <pyspark.sql.streaming.StreamingQuery at 0x7f3ad7d25fd0>

### Output displayed from the movies output path

In [0]:
# Batch-read the Delta output path and show its contents.
movies_delta_df = spark.read.format("delta").load(movies_outpath)
display(movies_delta_df)

movie_id,movie_title,release_date,video_release_date,imdb_url,unknown,Action,Adventure,animation,childrens,Comedy,crime,Documentary,drama,fantasy,filmnoir,horror,musical,Mystery,romance,scifi,thriller,war,western,load_date
1,Toy Story (1995),1995-01-01,,http://us.imdb.com/M/title-exact?Toy%20Story%20(1995),0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2022-03-06
2,GoldenEye (1995),1995-01-01,,http://us.imdb.com/M/title-exact?GoldenEye%20(1995),0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,2022-03-06
3,Four Rooms (1995),1995-01-01,,http://us.imdb.com/M/title-exact?Four%20Rooms%20(1995),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,2022-03-06
4,Get Shorty (1995),1995-01-01,,http://us.imdb.com/M/title-exact?Get%20Shorty%20(1995),0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,2022-03-06
5,Copycat (1995),1995-01-01,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,2022-03-06
6,Shanghai Triad (Yao a yao yao dao waipo qiao) (1995),1995-01-01,,http://us.imdb.com/Title?Yao+a+yao+yao+dao+waipo+qiao+(1995),0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2022-03-06
7,Twelve Monkeys (1995),1995-01-01,,http://us.imdb.com/M/title-exact?Twelve%20Monkeys%20(1995),0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,2022-03-06
8,Babe (1995),1995-01-01,,http://us.imdb.com/M/title-exact?Babe%20(1995),0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,2022-03-06
9,Dead Man Walking (1995),1995-01-01,,http://us.imdb.com/M/title-exact?Dead%20Man%20Walking%20(1995),0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2022-03-06
10,Richard III (1995),1996-01-22,,http://us.imdb.com/M/title-exact?Richard%20III%20(1995),0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,2022-03-06


### Below are the output folders with the checkpoint in place

In [0]:
# List the Delta output root to show the table folders and their checkpoint folders.
delta_root_listing = dbutils.fs.ls("/FileStore/tables/asos_delta_std")
display(delta_root_listing)

path,name,size
dbfs:/FileStore/tables/asos_delta_std/movies/,movies/,0
dbfs:/FileStore/tables/asos_delta_std/movies_checkpoint/,movies_checkpoint/,0
dbfs:/FileStore/tables/asos_delta_std/rating/,rating/,0
dbfs:/FileStore/tables/asos_delta_std/rating_checkpoint/,rating_checkpoint/,0
dbfs:/FileStore/tables/asos_delta_std/tags/,tags/,0
dbfs:/FileStore/tables/asos_delta_std/tags_checkpoint/,tags_checkpoint/,0
