## Demonstrate the functionality of broadcast variables and UDFs

In [71]:
from pyspark.sql import SparkSession, functions as f
from ast import literal_eval

In [10]:
# Obtain the Spark session for this notebook, registered under the
# application name "best_movies".
spark = (
    SparkSession.builder
    .appName("best_movies")
    .getOrCreate()
)

In [11]:
# Load the ratings CSV, using its first line as the column names.
# No schema or inferSchema option is passed to the reader.
ratings_df = (
    spark.read
    .option("header", True)
    .csv("file:///var/lib/jupyter/data/ratings.csv")
)

In [37]:
# Load the movies CSV, using its first line as the column names.
# No schema or inferSchema option is passed to the reader.
movies_df = (
    spark.read
    .option("header", True)
    .csv("file:///var/lib/jupyter/data/movies.csv")
)

### Multiple aggregations done within the dataframe

In [217]:
# One row per movieId with three aggregates over its ratings:
#   avg_rating   - mean rating, rounded to 2 decimals
#   total_rating - sum of ratings, rounded to 2 decimals
#   num_ratings  - count of rating values
movie_avg_rating = (
    ratings_df
    .select("movieId", "rating")
    .groupBy("movieId")
    .agg(
        f.round(f.avg("rating"), 2).alias("avg_rating"),
        f.round(f.sum("rating"), 2).alias("total_rating"),
        f.count("rating").alias("num_ratings"),
    )
)

### Converting to dictionary and exporting as a broadcast variable

In [218]:
# Collect (movieId, title) pairs to the driver, build an
# {int movieId: title} dictionary and broadcast it.
#
# Titles are reduced to ASCII: encode(..., "ignore") drops every non-ASCII
# character. The result is decoded again so the stored values are text
# rather than ``bytes`` (on Python 3, ``str.encode`` returns ``bytes``);
# the titles are later returned from a UDF created without an explicit
# return type.
movies_dict = spark.sparkContext.broadcast({
    int(row["movieId"]): row["title"].encode("ascii", "ignore").decode("ascii")
    for row in movies_df.select("movieId", "title").collect()
})

### Creating a UDF

In [219]:
def lookup_movie_name(movieId):
    """Return the title stored for ``movieId`` in the broadcast ``movies_dict``.

    Returns ``None`` when ``movieId`` is not a key of the broadcast dictionary.
    """
    titles_by_id = movies_dict.value
    return titles_by_id.get(movieId)

# Column-level wrapper around the lookup; no explicit return type is given.
lookup_movie_name_udf = f.udf(lookup_movie_name)

### Casting the column to a different datatype

In [220]:
# Replace the movieId column with an integer version so it matches the
# integer keys of the broadcast title dictionary.
# The target type is given by its string name ("int") so this cell does not
# depend on ``IntegerType``, which the notebook's import cell does not bring
# into scope.
result_df = movie_avg_rating.withColumn(
    "movieId", movie_avg_rating.movieId.cast("int")
)

In [223]:
# Add a "title" column by passing movieId through the lookup UDF, then order
# the rows by num_ratings and, within equal counts, by avg_rating (both
# descending).
main_df = (
    result_df
    .withColumn("title", lookup_movie_name_udf(f.col("movieId")))
    .orderBy(f.col("num_ratings").desc(), f.col("avg_rating").desc())
)

In [224]:
# Print the top of the ranked table; the arguments spell out show()'s
# defaults (first 20 rows, long cell values truncated).
main_df.show(n=20, truncate=True)

+-------+----------+------------+-----------+--------------------+
|movieId|avg_rating|total_rating|num_ratings|               title|
+-------+----------+------------+-----------+--------------------+
|    356|      4.16|      1370.0|        329| Forrest Gump (1994)|
|    318|      4.43|      1404.0|        317|Shawshank Redempt...|
|    296|       4.2|      1288.5|        307| Pulp Fiction (1994)|
|    593|      4.16|      1161.0|        279|Silence of the La...|
|   2571|      4.19|      1165.5|        278|  Matrix, The (1999)|
|    260|      4.23|      1062.0|        251|Star Wars: Episod...|
|    480|      3.75|       892.5|        238|Jurassic Park (1993)|
|    110|      4.03|       955.5|        237|   Braveheart (1995)|
|    589|      3.97|       889.5|        224|Terminator 2: Jud...|
|    527|      4.23|       929.5|        220|Schindler's List ...|
|   2959|      4.27|       931.5|        218|   Fight Club (1999)|
|      1|      3.92|       843.0|        215|    Toy Story (19

In [None]:
# Stop the Spark session now that the notebook is finished with it.
spark.stop()