In [32]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.feature import OneHotEncoder

import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Change the number of cores in this code block
# by setting `spark.master` to `local[n]` where
# n is the number of cores

conf = pyspark.SparkConf().setAll([('spark.master', 'local[4]'),
                                   ('spark.app.name', 'Basic Setup')])
spark = SparkSession.builder.config(conf=conf).getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
2022-05-16 07:05:39,520 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Load Data

Read data from the `ratings.csv` file

In [3]:
movies_df = spark.read.option("header",True).csv("file:///home/work/data/movies.csv")
movies_df.printSchema()

root
 |-- movieId: string (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)



In [4]:
movies_df.show()

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
|      6|         Heat (1995)|Action|Crime|Thri...|
|      7|      Sabrina (1995)|      Comedy|Romance|
|      8| Tom and Huck (1995)|  Adventure|Children|
|      9| Sudden Death (1995)|              Action|
|     10|    GoldenEye (1995)|Action|Adventure|...|
|     11|American Presiden...|Comedy|Drama|Romance|
|     12|Dracula: Dead and...|       Comedy|Horror|
|     13|        Balto (1995)|Adventure|Animati...|
|     14|        Nixon (1995)|               Drama|
|     15|Cutthroat Island ...|Action|Adventure|...|
|     16|       Casino (1995)|         Crime|Drama|
|     17|Sen

## Add Year column

In [5]:
#Extracting the year from Movie Title
movies_df = movies_df.withColumn('movieYear',regexp_extract(col('title'), '(.+)(\()([0-9]{4})(\))', 3))

In [6]:
movies_df.show()

+-------+--------------------+--------------------+---------+
|movieId|               title|              genres|movieYear|
+-------+--------------------+--------------------+---------+
|      1|    Toy Story (1995)|Adventure|Animati...|     1995|
|      2|      Jumanji (1995)|Adventure|Childre...|     1995|
|      3|Grumpier Old Men ...|      Comedy|Romance|     1995|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|     1995|
|      5|Father of the Bri...|              Comedy|     1995|
|      6|         Heat (1995)|Action|Crime|Thri...|     1995|
|      7|      Sabrina (1995)|      Comedy|Romance|     1995|
|      8| Tom and Huck (1995)|  Adventure|Children|     1995|
|      9| Sudden Death (1995)|              Action|     1995|
|     10|    GoldenEye (1995)|Action|Adventure|...|     1995|
|     11|American Presiden...|Comedy|Drama|Romance|     1995|
|     12|Dracula: Dead and...|       Comedy|Horror|     1995|
|     13|        Balto (1995)|Adventure|Animati...|     1995|
|     14

## Removing IMAX
Upon visual inspection, there were only two cases for IMAX
### Case 1: IMAX was only genre
* 1 movie where this was true
    * 4460,Encounter in the Third Dimension (1999),IMAX -> only movie with IMAX as only genre -> 27 ratings
Solution: Replace pattern: ",IMAX" -> ",(no genres listed)"
### Case 2: IMAX was last genre listed
* All other 194 instances of IMAX, IMAX was the last genre
Solution: Replace pattern: "|IMAX" -> ""

In [7]:
#Check IMAX genres in records
movies_df.filter(col("genres").contains("IMAX")).show()

+-------+--------------------+--------------------+---------+
|movieId|               title|              genres|movieYear|
+-------+--------------------+--------------------+---------+
|     33|Wings of Courage ...|Adventure|Romance...|     1995|
|     37|Across the Sea of...|    Documentary|IMAX|     1995|
|    150|    Apollo 13 (1995)|Adventure|Drama|IMAX|     1995|
|    364|Lion King, The (1...|Adventure|Animati...|     1994|
|    595|Beauty and the Be...|Animation|Childre...|     1991|
|   1797|      Everest (1998)|    Documentary|IMAX|     1998|
|   3159|Fantasia 2000 (1999)|Animation|Childre...|     1999|
|   4382|       Wolves (1999)|    Documentary|IMAX|     1999|
|   4445|T-Rex: Back to th...|Adventure|Documen...|     1998|
|   4453|Michael Jordan to...|    Documentary|IMAX|     2000|
|   4454|         More (1998)|Animation|Drama|S...|     1998|
|   4455|Thrill Ride: The ...|Adventure|Documen...|     1997|
|   4456|Haunted Castle (2...|Animation|Horror|...|     2001|
|   4457

In [8]:
# Case 1: IMAX is only genre
movies_df = movies_df.withColumn('genres', regexp_replace(col('genres'), '\\|IMAX', '')) \
                    .withColumn('genres', regexp_replace(col('genres'), 'IMAX', '(no genres listed)'))
         

In [9]:
#check after IMAX genre removal 
movies_df.filter(col("genres").contains("IMAX")).show()

+-------+-----+------+---------+
|movieId|title|genres|movieYear|
+-------+-----+------+---------+
+-------+-----+------+---------+



In [10]:
#Movies before flattening the genres
movies_df.count()

62423

In [11]:
#Creating the flattened version of movies with genres
final_moveies_df = movies_df.select(col('movieId'),col('title'),col('movieYear'),explode(split("genres","\\|")).alias("genre"))

In [12]:
final_moveies_df.count()

112113

## Movies with no genres
Solution: Drop movies without genres
Future aim: write scraper to pull missing genres

In [14]:
#Removing movies with (no genres listed)
final_moveies_df = final_moveies_df.filter(col("genres") != "(no genres listed)")

In [21]:
distinct_genres = final_moveies_df.groupBy('genre').count().sort(col("count").desc())
print(f"Total distinct genres: {distinct_genres.count()}")
distinct_genres.show(25, truncate=False)

Total distinct genres: 18
+-----------+-----+
|genre      |count|
+-----------+-----+
|Drama      |25606|
|Comedy     |16870|
|Thriller   |8654 |
|Romance    |7719 |
|Action     |7348 |
|Horror     |5989 |
|Documentary|5605 |
|Crime      |5319 |
|Adventure  |4145 |
|Sci-Fi     |3595 |
|Children   |2935 |
|Animation  |2929 |
|Mystery    |2925 |
|Fantasy    |2731 |
|War        |1874 |
|Western    |1399 |
|Musical    |1054 |
|Film-Noir  |353  |
+-----------+-----+



In [22]:
final_moveies_df.count()

107050

In [23]:
final_moveies_df.show(truncate=False)

+-------+----------------------------------+---------+---------+
|movieId|title                             |movieYear|genre    |
+-------+----------------------------------+---------+---------+
|1      |Toy Story (1995)                  |1995     |Adventure|
|1      |Toy Story (1995)                  |1995     |Animation|
|1      |Toy Story (1995)                  |1995     |Children |
|1      |Toy Story (1995)                  |1995     |Comedy   |
|1      |Toy Story (1995)                  |1995     |Fantasy  |
|2      |Jumanji (1995)                    |1995     |Adventure|
|2      |Jumanji (1995)                    |1995     |Children |
|2      |Jumanji (1995)                    |1995     |Fantasy  |
|3      |Grumpier Old Men (1995)           |1995     |Comedy   |
|3      |Grumpier Old Men (1995)           |1995     |Romance  |
|4      |Waiting to Exhale (1995)          |1995     |Comedy   |
|4      |Waiting to Exhale (1995)          |1995     |Drama    |
|4      |Waiting to Exhal

## Movies with no ratings
Interestingly, the movies found with no ratings were found to be a mixed bag of quality. A decent check of movies without ratings were also older movies.
Solution: Drop movies since we have a way of guestimating their rating

Future Aims: Pull average rating from other sites and possibly introduce movies as a fresh experience for users.

In [55]:
from pyspark.sql.types import IntegerType
genre_list = np.array(final_moveies_df.select(col('genre')).distinct().collect()).squeeze().tolist()
genre_map = {g: i for i, g in enumerate(genre_list)}

# https://stackoverflow.com/questions/42980704/pyspark-create-new-column-with-mapping-from-a-dict
def translate(mapping):
    def translate_(col):
        return mapping.get(col)
    return udf(translate_, IntegerType())

final_moveies_df = final_moveies_df.withColumn("genre_id", translate(genre_map)("genre"))

In [58]:
ohe = OneHotEncoder(inputCol='genre_id', outputCol='ohe_test')

ohe_model = ohe.fit(final_moveies_df)
ohe_model.transform(final_moveies_df).show()
ohe_model.transform(final_moveies_df).head().ohe_test.toArray()

+-------+--------------------+---------+---------+--------+---------------+
|movieId|               title|movieYear|    genre|genre_id|       ohe_test|
+-------+--------------------+---------+---------+--------+---------------+
|      1|    Toy Story (1995)|     1995|Adventure|       3| (17,[3],[1.0])|
|      1|    Toy Story (1995)|     1995|Animation|      10|(17,[10],[1.0])|
|      1|    Toy Story (1995)|     1995| Children|      15|(17,[15],[1.0])|
|      1|    Toy Story (1995)|     1995|   Comedy|      14|(17,[14],[1.0])|
|      1|    Toy Story (1995)|     1995|  Fantasy|       7| (17,[7],[1.0])|
|      2|      Jumanji (1995)|     1995|Adventure|       3| (17,[3],[1.0])|
|      2|      Jumanji (1995)|     1995| Children|      15|(17,[15],[1.0])|
|      2|      Jumanji (1995)|     1995|  Fantasy|       7| (17,[7],[1.0])|
|      3|Grumpier Old Men ...|     1995|   Comedy|      14|(17,[14],[1.0])|
|      3|Grumpier Old Men ...|     1995|  Romance|       1| (17,[1],[1.0])|
|      4|Wai

array([0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

# How to deal with genres?
## One hot encoding
```
Action           001
Action|Adventure 010
Drama            100
Dram|Action      110
```
https://spark.apache.org/docs/3.1.1/api/python/reference/api/pyspark.ml.feature.OneHotEncoder.html#:~:text=A%20one%2Dhot%20encoder%20that,0.0%2C%201.0%2C%200.0%5D%20
## Writing CSV
Convert 18 long array to string representation

## Distribution of genre combination count
ie: how many movies are drama and comedy

In [19]:
# TODO