# Start Spark Session

In [8]:
# Change the number of cores in this code block
# by setting `spark.master` to `local[n]` where
# n is the number of cores
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Build the Spark configuration one key at a time:
#   spark.master   -> local mode; the number in brackets is the core count
#   spark.app.name -> name given to this application
conf = pyspark.SparkConf()
conf.set('spark.master', 'local[2]')
conf.set('spark.app.name', 'Basic Setup')

# Obtain the session for this notebook using the configuration above.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

2022-05-16 00:59:21,043 WARN util.Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
2022-05-16 00:59:21,043 WARN util.Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


## Load Data

### Read data from the `movies.csv` file

In [9]:
# Read the movies CSV, using its first line as the column names, and mark
# the resulting DataFrame for caching.
movies_df = spark.read.csv(
    "file:///home/work/data/movies.csv",
    header=True,
).cache()

# Print the column names and types of the loaded DataFrame.
movies_df.printSchema()

root
 |-- movieId: string (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)



In [10]:
# Preview the first 20 movies; long cell values are truncated in the printout.
movies_df.show(n=20, truncate=True)

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
|      6|         Heat (1995)|Action|Crime|Thri...|
|      7|      Sabrina (1995)|      Comedy|Romance|
|      8| Tom and Huck (1995)|  Adventure|Children|
|      9| Sudden Death (1995)|              Action|
|     10|    GoldenEye (1995)|Action|Adventure|...|
|     11|American Presiden...|Comedy|Drama|Romance|
|     12|Dracula: Dead and...|       Comedy|Horror|
|     13|        Balto (1995)|Adventure|Animati...|
|     14|        Nixon (1995)|               Drama|
|     15|Cutthroat Island ...|Action|Adventure|...|
|     16|       Casino (1995)|         Crime|Drama|
|     17|Sen

### Read data from the `ratings.csv` file

In [11]:
# Read the ratings CSV, using its first line as the column names, and mark
# the resulting DataFrame for caching.
ratings_df = spark.read.csv(
    "file:///home/work/data/ratings.csv",
    header=True,
).cache()

# Print the column names and types of the loaded DataFrame.
ratings_df.printSchema()

root
 |-- userId: string (nullable = true)
 |-- movieId: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- timestamp: string (nullable = true)



In [12]:
# Preview the first 20 ratings; long cell values are truncated in the printout.
ratings_df.show(n=20, truncate=True)

[Stage 3:>                                                          (0 + 1) / 1]

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|    296|   5.0|1147880044|
|     1|    306|   3.5|1147868817|
|     1|    307|   5.0|1147868828|
|     1|    665|   5.0|1147878820|
|     1|    899|   3.5|1147868510|
|     1|   1088|   4.0|1147868495|
|     1|   1175|   3.5|1147868826|
|     1|   1217|   3.5|1147878326|
|     1|   1237|   5.0|1147868839|
|     1|   1250|   4.0|1147868414|
|     1|   1260|   3.5|1147877857|
|     1|   1653|   4.0|1147868097|
|     1|   2011|   2.5|1147868079|
|     1|   2012|   2.5|1147868068|
|     1|   2068|   2.5|1147869044|
|     1|   2161|   3.5|1147868609|
|     1|   2351|   4.5|1147877957|
|     1|   2573|   4.0|1147878923|
|     1|   2632|   5.0|1147878248|
|     1|   2692|   5.0|1147869100|
+------+-------+------+----------+
only showing top 20 rows



                                                                                

## Join between movies and ratings

In [13]:
# Count the movies that have no rating at all. A LEFT ANTI JOIN keeps only the
# rows of `m` (movies) for which no row of `r` (ratings) has the same movieId.
# DataFrame-API form of the same join, for reference:
#   movies_df.join(ratings_df, movies_df.movieId == ratings_df.movieId, "leftanti")

# Register both DataFrames as temp views so they can be queried with SQL.
movies_df.createOrReplaceTempView("m")
ratings_df.createOrReplaceTempView("r")

# DataFrame.show() only prints and returns None, so keep the DataFrame in
# `missing_movies` and display it in a separate statement.
missing_movies = spark.sql("SELECT COUNT(m.movieId) FROM m LEFT ANTI JOIN r ON m.movieId == r.movieId")
missing_movies.show(truncate=False)

2022-05-16 01:00:00,489 WARN memory.MemoryStore: Not enough space to cache rdd_31_3 in memory! (computed 54.6 MiB so far)
2022-05-16 01:00:00,489 WARN storage.BlockManager: Persisting block rdd_31_3 to disk instead.
2022-05-16 01:00:06,431 WARN memory.MemoryStore: Not enough space to cache rdd_31_3 in memory! (computed 35.3 MiB so far)
2022-05-16 01:00:06,724 WARN memory.MemoryStore: Not enough space to cache rdd_31_4 in memory! (computed 83.6 MiB so far)
2022-05-16 01:00:06,724 WARN storage.BlockManager: Persisting block rdd_31_4 to disk instead.
[Stage 8:>                                                          (0 + 2) / 2]

+--------------+
|count(movieId)|
+--------------+
|3376          |
+--------------+



                                                                                

In [14]:
# List the id and title of every movie that has no rating, querying the
# temp views `m` (movies) and `r` (ratings).
# DataFrame.show() only prints and returns None, so keep the DataFrame in
# `missing_movies` and display it in a separate statement.
missing_movies = spark.sql("SELECT m.movieId, m.title FROM m LEFT ANTI JOIN r ON m.movieId == r.movieId")
missing_movies.show(truncate=False)

2022-05-16 01:20:41,122 WARN memory.MemoryStore: Not enough space to cache rdd_31_3 in memory! (computed 35.3 MiB so far)
[Stage 17:>                                                         (0 + 1) / 1]

+-------+------------------------------------------------------------------+
|movieId|title                                                             |
+-------+------------------------------------------------------------------+
|100585 |Above the Street, Below the Water (Over gaden under vandet) (2009)|
|101237 |9500 Liberty (2009)                                               |
|101329 |Shining Night: A Portrait of Composer Morten Lauridsen (2012)     |
|101444 |Big Bang in Tunguska (Das Rätsel von Tunguska) (2008)             |
|101472 |Brussels Business, The (2012)                                     |
|102135 |Sadda Haq (2013)                                                  |
|102192 |Successful Calamity, A (1932)                                     |
|103241 |Flea in Her Ear, A (1968)                                         |
|103322 |Projectionist, The (1971)                                         |
|104640 |Long Dark Hall, The (1951)                                        |

                                                                                

In [22]:
# One row per rating whose movie has "no genres listed" in its genres column.
# The WHERE clause filters on a column of `m`, which rejects every row where
# `m` is NULL, so an outer join on `r` would return exactly the same rows as
# an inner join; INNER JOIN states that directly.
no_genre_ratings = spark.sql("SELECT m.movieId FROM m INNER JOIN r ON m.movieId == r.movieId WHERE m.genres like '%no genres listed%' ")

In [27]:
# Number of rows in `no_genre_ratings`; as the last expression of the cell,
# the value is displayed as the cell output.
no_genre_ratings.count()

2022-05-16 02:26:06,486 WARN memory.MemoryStore: Not enough space to cache rdd_31_0 in memory! (computed 54.6 MiB so far)
                                                                                

26627

In [26]:
# Percentage of all ratings that belong to movies without genre information.
# Each count() runs a Spark job, so compute every count once and reuse it.
total_ratings = ratings_df.count()
no_genre_count = no_genre_ratings.count()

# lost / total * 100 is the same quantity as 100 - ((total - lost) / total * 100).
# With no ratings at all there is nothing to lose, so report 0 instead of
# dividing by zero.
percent_lost = (no_genre_count / total_ratings) * 100 if total_ratings else 0.0
print(f"If drop ratings for movies without genres, we will lose {percent_lost:.3f}% of ratings data.")

2022-05-16 02:25:37,158 WARN memory.MemoryStore: Not enough space to cache rdd_31_0 in memory! (computed 54.6 MiB so far)
2022-05-16 02:25:37,542 WARN memory.MemoryStore: Not enough space to cache rdd_31_0 in memory! (computed 54.6 MiB so far)
2022-05-16 02:25:39,878 WARN memory.MemoryStore: Not enough space to cache rdd_31_0 in memory! (computed 54.6 MiB so far)


If drop ratings for movies without genres, we will lose 0.107% of ratings data.


In [7]:
# Uncomment the next line to stop the Spark session once the analysis is finished.
# spark.stop()