In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

import matplotlib.pyplot as plt

In [2]:
# Change the number of cores in this code block
# by setting `spark.master` to `local[n]` where
# n is the number of cores

conf = pyspark.SparkConf().setAll([('spark.master', 'local[2]'),
                                   ('spark.app.name', 'Basic Setup')])
spark = SparkSession.builder.config(conf=conf).getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
2022-05-16 01:49:13,609 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Load Data

Read data from the `ratings.csv` file

In [3]:
movies_df = spark.read.option("header",True).csv("file:///home/work/data/movies.csv").cache()
movies_df.printSchema()

root
 |-- movieId: string (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)



In [4]:
movies_df.show()

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
|      6|         Heat (1995)|Action|Crime|Thri...|
|      7|      Sabrina (1995)|      Comedy|Romance|
|      8| Tom and Huck (1995)|  Adventure|Children|
|      9| Sudden Death (1995)|              Action|
|     10|    GoldenEye (1995)|Action|Adventure|...|
|     11|American Presiden...|Comedy|Drama|Romance|
|     12|Dracula: Dead and...|       Comedy|Horror|
|     13|        Balto (1995)|Adventure|Animati...|
|     14|        Nixon (1995)|               Drama|
|     15|Cutthroat Island ...|Action|Adventure|...|
|     16|       Casino (1995)|         Crime|Drama|
|     17|Sen

                                                                                

## Add Year column

In [6]:
#TODO
# movies_df.select('')

## Removing IMAX
Upon visual inspection, there were only two cases for IMAX
### Case 1: IMAX was only genre
* 1 movie where this was true
    * 4460,Encounter in the Third Dimension (1999),IMAX -> only movie with IMAX as only genre -> 27 ratings
Solution: Replace pattern: ",IMAX" -> ",(no genres listed)"
### Case 2: IMAX was last genre listed
* All other 194 instances of IMAX, IMAX was the last genre
Solution: Replace pattern: "|IMAX" -> ""

In [7]:
# Case 1: IMAX is only genre
movies_df = movies_df.withColumn('genres', regexp_replace(col('genres'), ',IMAX', ',(no genres listed)'))
# Case 2: ImMAX is last genre
movies_df = movies_df.withColumn('genres', regexp_replace(col('genres'), '|IMAX', ''))

In [8]:
movies_df.filter(movies_df.genres == 'IMAX').show()

+-------+--------------------+------+
|movieId|               title|genres|
+-------+--------------------+------+
|   4460|Encounter in the ...|  IMAX|
+-------+--------------------+------+



In [9]:
genres_df = movies_df.select(split("genres","\|").alias("genres"))
genres_df = genres_df.select(explode("genres").alias("word"))
genres_df.show(5, truncate=False)

+---------+
|word     |
+---------+
|Adventure|
|Animation|
|Children |
|Comedy   |
|Fantasy  |
+---------+
only showing top 5 rows



In [10]:
distinct_genres = genres_df.groupBy('word').count().sort(col("count").desc())
print(f"Total distinct genres: {distinct_genres.count()}")
distinct_genres.show(25, truncate=False)

Total distinct genres: 20


[Stage 16:>                                                         (0 + 1) / 1]

+------------------+-----+
|word              |count|
+------------------+-----+
|Drama             |25606|
|Comedy            |16870|
|Thriller          |8654 |
|Romance           |7719 |
|Action            |7348 |
|Horror            |5989 |
|Documentary       |5605 |
|Crime             |5319 |
|(no genres listed)|5062 |
|Adventure         |4145 |
|Sci-Fi            |3595 |
|Children          |2935 |
|Animation         |2929 |
|Mystery           |2925 |
|Fantasy           |2731 |
|War               |1874 |
|Western           |1399 |
|Musical           |1054 |
|Film-Noir         |353  |
|IMAX              |195  |
+------------------+-----+



                                                                                

## Movies with no genres
Solution: Drop movies without genres
Future aim: write scraper to pull missing genres

## Movies with no ratings
Interestingly, the movies found with no ratings were found to be a mixed bag of quality. A decent check of movies without ratings were also older movies.
Solution: Drop movies since we have a way of guestimating their rating

Future Aims: Pull average rating from other sites and possibly introduce movies as a fresh experience for users.

In [11]:
# missing_movies = spark.sql("SELECT m.movieId= FROM m LEFT ANTI JOIN r ON m.movieId == r.movieId") \
#   .show(truncate=False)

# How to deal with genres?
## One hot encoding
```
Action           001
Action|Adventure 010
Drama            100
Dram|Action      110
```
https://spark.apache.org/docs/3.1.1/api/python/reference/api/pyspark.ml.feature.OneHotEncoder.html#:~:text=A%20one%2Dhot%20encoder%20that,0.0%2C%201.0%2C%200.0%5D%20
## Writing CSV
Convert 18 long array to string representation

In [5]:
# Unique movie genre combos
movies_df.select('genres').distinct().count()

1639

## Distribution of genre combination count
ie: how many movies are drama and comedy

In [None]:
# TODO