### Streaming Startup Data (SaberFix)
Exploring Data and Answering Business Questions

In [2]:
# Spark Session is the entry point for all Spark functionality 
# Thorugh the Spark Session you are able to read data, 
# create DataFrames and transform it using Structured API's like pyspark.
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
spark = SparkSession.builder.appName("StriderTest").getOrCreate()

### Creating Views for each data file

In [None]:
### Movies View Creation
print("\n ----> Movies View Creation")
movies_schema = "TITLE STRING, DURATION_MINS FLOAT,ORIGINAL_LANGUAGE STRING,SIZE_MB DOUBLE"

df_movies = spark.read.option("header","true").schema(movies_schema).csv("data/movies.csv")
df_movies.printSchema()
df_movies.show(5)
df_movies.createOrReplaceTempView("movies")

### Users View Creation
print("\n ----> Users View Creation")
df_users = spark.read.option("header","true").csv("data/users.csv")
df_users.show(5)
df_users.createOrReplaceTempView("users")


### Streams View Cration
print("\n ----> Streams View Creation")
streams_schema = "MOVIE_TITLE STRING, USER_EMAIL STRING,SIZE_MB DOUBLE,START_AT STRING,END_AT STRING"

df_streams_1 = spark.read.option("header","true").schema(streams_schema).csv("data/streams.csv")

df_streams_2 = (df_streams_1 #### Transformation 
.withColumn('SIZE_MB', col('size_mb').astype("DOUBLE")) 
.withColumn('START_AT', to_timestamp(col("start_at"),"yyyy-MM-dd'T'HH:mm:ss.SSSZZZ")) 
.withColumn('END_AT', to_timestamp(col("end_at"),"yyyy-MM-dd'T'HH:mm:ss.SSSZZZ"))
)

df_streams_2.printSchema()
df_streams_2.show(5)
df_streams_2.createOrReplaceTempView("streams")


### Authors View Creation
print("\n ----> Authors View Creation")
df_authors = spark.read.option("header","true").json("data/authors.json")

df_authors_2 = (df_authors # transform
.withColumn('died_at', to_timestamp(col("died_at"),"yyyy-MM-dd'T'HH:mm:ss.SSSZZZ"))
) 
df_authors_2.show(5)
df_authors_2.createOrReplaceTempView("authors")


### Books View Creation
print("\n ----> Books View Creation")
df_books = spark.read.option("header","true").json("data/books.json")
df_books.show(5)
df_books.createOrReplaceTempView("books")


### Reviews View Creation
print("\n ----> Reviews View Creation")
df_reviews = spark.read.option("header","true").json("data/reviews.json")
df_reviews.show(5)
df_reviews.createOrReplaceTempView("reviews")

### What percentage of the streamed movies are based on books?

In [None]:
sf_distinct_movies = spark.sql("select distinct MOVIE_TITLE as moviesCount from streams").distinct().count()

In [None]:
spark.sql("select count(distinct book) as reviewsBooksCount from reviews").show(1)

+-----------------+
|reviewsBooksCount|
+-----------------+
|              102|
+-----------------+



In [None]:
q1 = """
SELECT m.title, r.movie
FROM movies as m
LEFT JOIN reviews as r on m.title = r.movie
WHERE r.book is not null
"""
movies_based_on_books_on_sf = spark.sql(q1).distinct().count()
movies_based_on_books_on_sf/sf_distinct_movies

0.9340659340659341

### How many users were watching "Unforgiven" on Christmas morning (between 7 and 12 am on December 25)?

In [None]:
q2 = """
SELECT START_AT
FROM streams
WHERE 
MOVIE_TITLE = 'Unforgiven'
AND HOUR(START_AT) BETWEEN 7 AND 12 
AND DATE(START_AT) = '2021-12-25'
"""
spark.sql(q2).count()

1

### How many movies based on books written by Singaporeans authors were streamed that month?

In [None]:
q3 = """
SELECT distinct movie_title
FROM streams as s
LEFT JOIN reviews as r on s.MOVIE_TITLE = r.movie
left join books b on r.book = b.name
left join authors a on  b.author = a.name
WHERE 
month(s.START_AT) = 12
and year(s.START_AT) = 2021
and a.nationality = 'Singaporeans'
"""
spark.sql(q3).count()

3

### What's the average streaming duration?

In [None]:
q4 = """
SELECT START_AT, END_AT, replace(split((END_AT - START_AT),' ')[2],"'","") diff
FROM streams 
"""
spark.sql(q4).show(100)

+--------------------+--------------------+------------+
|            START_AT|              END_AT|        diff|
+--------------------+--------------------+------------+
|2021-12-06 15:30:...|2021-12-07 11:44:...|20:14:19.078|
|2021-12-15 00:36:...|2021-12-15 01:57:...|01:21:01.766|
|2021-12-12 18:32:...|2021-12-13 16:37:...|22:05:31.779|
|2021-12-04 16:47:...|2021-12-05 03:13:...|10:26:20.379|
|2021-12-25 10:27:...|2021-12-25 20:22:...|09:55:40.465|
|2021-12-26 21:13:...|2021-12-27 07:18:...|10:05:08.437|
|2021-12-02 06:14:...|2021-12-03 03:15:...| 21:01:48.65|
|2021-12-14 23:15:...|2021-12-15 08:08:...|08:52:19.761|
|2021-12-14 02:47:...|2021-12-14 07:10:...|04:22:40.743|
|2021-12-16 22:05:...|2021-12-17 03:51:...|05:46:00.433|
|2021-12-22 03:29:...|2021-12-22 18:53:...|15:24:10.009|
|2021-12-19 16:32:...|2021-12-20 10:27:...|17:54:48.671|
|2021-12-19 17:31:...|2021-12-19 21:19:...|03:48:27.919|
|2021-12-28 19:24:...|2021-12-28 19:26:...|00:01:27.272|
|2021-12-21 16:46:...|2021-12-2