### Streaming Startup Data (SaberFix)
Exploring Data and Answering Business Questions

In [2]:
# The SparkSession is the entry point for all Spark functionality.
# Through the SparkSession you can read data, create DataFrames,
# and transform them with the Structured APIs (here, PySpark's DataFrame API).
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
# getOrCreate() hands back the already-running session when there is one,
# otherwise it starts a new session with the application name "StriderTest".
spark = (
    SparkSession.builder
    .appName("StriderTest")
    .getOrCreate()
)

### Creating Views for each data file

In [None]:
### Movies view
print("\n ----> Movies View Creation")

# DDL schema applied to the CSV so the numeric columns are typed on read.
movies_schema = "TITLE STRING, DURATION_MINS FLOAT,ORIGINAL_LANGUAGE STRING,SIZE_MB DOUBLE"

# The first line of the file is a header row, not data.
df_movies = spark.read.csv("data/movies.csv", schema=movies_schema, header="true")

# Quick sanity check: column types plus the first five rows.
df_movies.printSchema()
df_movies.show(5)

# Register the DataFrame as the temp view "movies" for Spark SQL queries.
df_movies.createOrReplaceTempView("movies")

### Users view
print("\n ----> Users View Creation")

# No schema is passed and schema inference is not enabled, so the CSV reader
# keeps its default typing for every column; the first line is a header row.
df_users = spark.read.csv("data/users.csv", header="true")

df_users.show(5)

# Register the DataFrame as the temp view "users" for Spark SQL queries.
df_users.createOrReplaceTempView("users")


### Streams view
print("\n ----> Streams View Creation")

# START_AT / END_AT are read as plain strings here and parsed into timestamps below.
streams_schema = "MOVIE_TITLE STRING, USER_EMAIL STRING,SIZE_MB DOUBLE,START_AT STRING,END_AT STRING"

df_streams_1 = spark.read.option("header","true").schema(streams_schema).csv("data/streams.csv")

# Single definition of the timestamp layout used by both columns.
# NOTE(review): under Spark 3 datetime patterns, "ZZZ" matches offsets written
# without a colon (e.g. "+0000"); values that do not match are typically turned
# into NULL rather than raising — confirm against the actual data in streams.csv.
streams_ts_format = "yyyy-MM-dd'T'HH:mm:ss.SSSZZZ"

# SIZE_MB is already DOUBLE via streams_schema, so no extra cast is needed;
# only the two timestamp columns require a transformation.
df_streams_2 = (
    df_streams_1
    .withColumn('START_AT', to_timestamp(col("start_at"), streams_ts_format))
    .withColumn('END_AT', to_timestamp(col("end_at"), streams_ts_format))
)

df_streams_2.printSchema()
df_streams_2.show(5)

# Register the transformed DataFrame as the temp view "streams" for Spark SQL queries.
df_streams_2.createOrReplaceTempView("streams")


### Authors view
print("\n ----> Authors View Creation")

# "header" is a CSV reader option and has no meaning for JSON input,
# so the JSON file is read without it.
df_authors = spark.read.json("data/authors.json")

# Parse died_at from its string form into a timestamp column (same name, replaced in place).
df_authors_2 = df_authors.withColumn(
    'died_at',
    to_timestamp(col("died_at"), "yyyy-MM-dd'T'HH:mm:ss.SSSZZZ"),
)

df_authors_2.show(5)

# Register the transformed DataFrame as the temp view "authors" for Spark SQL queries.
df_authors_2.createOrReplaceTempView("authors")


### Books view
print("\n ----> Books View Creation")

# "header" is a CSV reader option and has no meaning for JSON input,
# so the JSON file is read without it; the schema is inferred by the reader.
df_books = spark.read.json("data/books.json")
df_books.show(5)

# Register the DataFrame as the temp view "books" for Spark SQL queries.
df_books.createOrReplaceTempView("books")


### Reviews view
print("\n ----> Reviews View Creation")

# "header" is a CSV reader option and has no meaning for JSON input,
# so the JSON file is read without it; the schema is inferred by the reader.
df_reviews = spark.read.json("data/reviews.json")
df_reviews.show(5)

# Register the DataFrame as the temp view "reviews" for Spark SQL queries.
df_reviews.createOrReplaceTempView("reviews")