## Demonstrate the functionalities of boradcast variables and UDFs

In [1]:
from os import environ
from ast import literal_eval
from pyspark.sql import SparkSession, functions as f
from pyspark.sql.types import IntegerType

In [2]:
spark = SparkSession.builder.appName("best_movies").getOrCreate()
file_path = "file:///"+environ['DATA_LAKE']

In [3]:
ratings_df = spark.read.csv(file_path+"ratings.csv", header=True)

In [4]:
movies_df = spark.read.csv(file_path+"movies.csv", header=True)

### Multiple aggregations done within the dataframe

In [5]:
movie_avg_rating = ratings_df.select(ratings_df.movieId, ratings_df.rating) \
        .groupBy(ratings_df.movieId) \
        .agg(f.round(f.avg(ratings_df.rating), 2).alias("avg_rating"), \
             f.round(f.sum(ratings_df.rating), 2).alias("total_rating"), \
             f.count(ratings_df.rating).alias("num_ratings"))

### Converting to dictionary and exporting as a broadcast variable

In [6]:
movies_dict = spark.sparkContext.broadcast({int(row.asDict()["movieId"]):row.asDict()["title"].encode("ascii", "ignore") for row in movies_df.select(movies_df.movieId, movies_df.title).collect()})

### Creating a UDF

In [7]:
def lookup_movie_name(movieId):
    return movies_dict.value.get(movieId, None)

lookup_movie_name_udf = f.udf(lookup_movie_name)

### Casting the cell to a different datatype

In [8]:
result_df = movie_avg_rating \
            .withColumn("movieId", movie_avg_rating.movieId.cast(IntegerType()))

In [9]:
main_df = result_df \
        .withColumn("title", lookup_movie_name_udf(result_df.movieId)) \
        .sort(result_df.num_ratings.desc(), result_df.avg_rating.desc())

In [10]:
main_df.show()

+-------+----------+------------+-----------+-----------+
|movieId|avg_rating|total_rating|num_ratings|      title|
+-------+----------+------------+-----------+-----------+
|    356|      4.16|      1370.0|        329|[B@658d76ed|
|    318|      4.43|      1404.0|        317|[B@1fe88935|
|    296|       4.2|      1288.5|        307|[B@19b04904|
|    593|      4.16|      1161.0|        279|[B@66131672|
|   2571|      4.19|      1165.5|        278|[B@49e9b503|
|    260|      4.23|      1062.0|        251|[B@1ad539f1|
|    480|      3.75|       892.5|        238|[B@7e399b8e|
|    110|      4.03|       955.5|        237|[B@7c38fcb2|
|    589|      3.97|       889.5|        224|[B@3abf6028|
|    527|      4.23|       929.5|        220|[B@159479ac|
|   2959|      4.27|       931.5|        218| [B@528c504|
|      1|      3.92|       843.0|        215|[B@3535989b|
|   1196|      4.22|       889.5|        211| [B@ffba49e|
|     50|      4.24|       864.5|        204|[B@3e5e03e7|
|   2858|     

In [11]:
spark.stop()