# Movielens dataset analysis

Download data from: https://www.kaggle.com/datasets/grouplens/movielens-20m-dataset?resource=download

In [1]:
import pyspark.sql.functions as f
import pyspark.sql.types as t

# Load and explore movies dataset

In [2]:
movielens_path = '/work/data/movielens'
movies_path = f'{movielens_path}/movie.csv'
movies = spark.read.format('csv').option('header', True).load(movies_path)
movies.show(truncate=False)

[Stage 1:>                                                          (0 + 1) / 1]

+-------+-------------------------------------+-------------------------------------------+
|movieId|title                                |genres                                     |
+-------+-------------------------------------+-------------------------------------------+
|1      |Toy Story (1995)                     |Adventure|Animation|Children|Comedy|Fantasy|
|2      |Jumanji (1995)                       |Adventure|Children|Fantasy                 |
|3      |Grumpier Old Men (1995)              |Comedy|Romance                             |
|4      |Waiting to Exhale (1995)             |Comedy|Drama|Romance                       |
|5      |Father of the Bride Part II (1995)   |Comedy                                     |
|6      |Heat (1995)                          |Action|Crime|Thriller                      |
|7      |Sabrina (1995)                       |Comedy|Romance                             |
|8      |Tom and Huck (1995)                  |Adventure|Children               

                                                                                

In [3]:
movies.printSchema()

root
 |-- movieId: string (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)



In [4]:
movies.count()

                                                                                

27278

In [5]:
movies.dropna().count()

                                                                                

27278

Explore genres

In [6]:
movies.select(f.split('genres', '\|')).printSchema()

root
 |-- split(genres, \|, -1): array (nullable = true)
 |    |-- element: string (containsNull = true)



In [7]:
genre_count = movies.select(f.explode(f.split('genres', '\|'))).distinct().count()
print('number of different genres: ', genre_count)

movies.select(f.explode(f.split('genres', '\|'))).distinct().show(genre_count)

                                                                                

number of different genres:  20




+------------------+
|               col|
+------------------+
|             Crime|
|           Romance|
|          Thriller|
|         Adventure|
|             Drama|
|               War|
|       Documentary|
|           Fantasy|
|           Mystery|
|           Musical|
|         Animation|
|         Film-Noir|
|(no genres listed)|
|              IMAX|
|            Horror|
|           Western|
|            Comedy|
|          Children|
|            Action|
|            Sci-Fi|
+------------------+



                                                                                

In [8]:
movies_genres = movies.select('movieId', 'title', f.explode(f.split('genres', '\|')).alias('genre'))
movies_genres.cache()
movies_genres.show(truncate=False)

[Stage 19:>                                                         (0 + 1) / 1]

+-------+----------------------------------+---------+
|movieId|title                             |genre    |
+-------+----------------------------------+---------+
|1      |Toy Story (1995)                  |Adventure|
|1      |Toy Story (1995)                  |Animation|
|1      |Toy Story (1995)                  |Children |
|1      |Toy Story (1995)                  |Comedy   |
|1      |Toy Story (1995)                  |Fantasy  |
|2      |Jumanji (1995)                    |Adventure|
|2      |Jumanji (1995)                    |Children |
|2      |Jumanji (1995)                    |Fantasy  |
|3      |Grumpier Old Men (1995)           |Comedy   |
|3      |Grumpier Old Men (1995)           |Romance  |
|4      |Waiting to Exhale (1995)          |Comedy   |
|4      |Waiting to Exhale (1995)          |Drama    |
|4      |Waiting to Exhale (1995)          |Romance  |
|5      |Father of the Bride Part II (1995)|Comedy   |
|6      |Heat (1995)                       |Action   |
|6      |H

                                                                                

In [9]:
movies_genres.groupBy('genre').agg(f.count('title').alias('count')).sort(f.col('count').desc()).show(20)

                                                                                

+------------------+-----+
|             genre|count|
+------------------+-----+
|             Drama|13344|
|            Comedy| 8374|
|          Thriller| 4178|
|           Romance| 4127|
|            Action| 3520|
|             Crime| 2939|
|            Horror| 2611|
|       Documentary| 2471|
|         Adventure| 2329|
|            Sci-Fi| 1743|
|           Mystery| 1514|
|           Fantasy| 1412|
|               War| 1194|
|          Children| 1139|
|           Musical| 1036|
|         Animation| 1027|
|           Western|  676|
|         Film-Noir|  330|
|(no genres listed)|  246|
|              IMAX|  196|
+------------------+-----+



Extract year from title

In [10]:
# version 1: simple udf that works as python's strings slicing  

def pysubstr(title, start, end):
    if start is None:
        return title[:end]
    if end is None:
        return title[start:]
    return title[start:end]
    

substr_udf = f.udf(pysubstr, t.StringType())

movies_years = movies.select(
    substr_udf( movies['title'], f.lit(0), f.lit(-7)).alias('title'),
    substr_udf( movies['title'], f.lit(-5), f.lit(-1)).cast(t.IntegerType()).alias('year')
)

movies_years.cache()
movies_years.show(truncate=False)

[Stage 22:>                                                         (0 + 1) / 1]

+------------------------------+----+
|title                         |year|
+------------------------------+----+
|Toy Story                     |1995|
|Jumanji                       |1995|
|Grumpier Old Men              |1995|
|Waiting to Exhale             |1995|
|Father of the Bride Part II   |1995|
|Heat                          |1995|
|Sabrina                       |1995|
|Tom and Huck                  |1995|
|Sudden Death                  |1995|
|GoldenEye                     |1995|
|American President, The       |1995|
|Dracula: Dead and Loving It   |1995|
|Balto                         |1995|
|Nixon                         |1995|
|Cutthroat Island              |1995|
|Casino                        |1995|
|Sense and Sensibility         |1995|
|Four Rooms                    |1995|
|Ace Ventura: When Nature Calls|1995|
|Money Train                   |1995|
+------------------------------+----+
only showing top 20 rows



                                                                                

In [11]:
movies_years.where(f.col('year').isNull()).select('title', 'year').show(50,truncate=False)

[Stage 23:>                                                         (0 + 1) / 1]

+-------------------------------------------------------------------+----+
|title                                                              |year|
+-------------------------------------------------------------------+----+
|Babe Ruth Story, The                                               |null|
|Heroes of Telemark, The                                            |null|
|Ba                                                                 |null|
|Millions Game, The (Das Millione                                   |null|
|Truth, The                                                         |null|
|Wedding Song, The                                                  |null|
|White Hell of Pitz Palu, The (Die weiße Hölle vom Piz Palü)        |null|
|Pool, The (Swimming Pool - Der Tod feiert mit)                     |null|
|Bicycle, Spoon, Apple (Bicicleta, cullera                          |null|
|Possession of David O'Reilly, The                                  |null|
|Horrible Way to Die, A  

                                                                                

In [71]:
# v2: udf that looks for the year using regular expression
def py_split_title_year(title):
    import re
    title = title.strip()
    if(re.search(r'\(\d\d\d\d\)$', title)):
        return title[:-7], int(title[-5:-1])
    return title, None


split_udf = f.udf(py_split_title_year, t.ArrayType(t.StringType(), True))

movies_years = movies.select('movieId', 
    split_udf( movies['title']).alias('split')
)
movies_years = movies_years. \
    selectExpr('movieId', 'split[0] as title', 'split[1] as year'). \
    select('movieId', 'title', f.col('year').cast(t.IntegerType()) 
)
movies_years.show(truncate=False)

+-------+------------------------------+----+
|movieId|title                         |year|
+-------+------------------------------+----+
|1      |Toy Story                     |1995|
|2      |Jumanji                       |1995|
|3      |Grumpier Old Men              |1995|
|4      |Waiting to Exhale             |1995|
|5      |Father of the Bride Part II   |1995|
|6      |Heat                          |1995|
|7      |Sabrina                       |1995|
|8      |Tom and Huck                  |1995|
|9      |Sudden Death                  |1995|
|10     |GoldenEye                     |1995|
|11     |American President, The       |1995|
|12     |Dracula: Dead and Loving It   |1995|
|13     |Balto                         |1995|
|14     |Nixon                         |1995|
|15     |Cutthroat Island              |1995|
|16     |Casino                        |1995|
|17     |Sense and Sensibility         |1995|
|18     |Four Rooms                    |1995|
|19     |Ace Ventura: When Nature 

In [74]:
movies_years.groupBy('title').count().where(f.expr('count > 1')).show()



+--------------------+-----+
|               title|count|
+--------------------+-----+
|     Misérables, Les|    9|
|              Hamlet|    8|
|Three Musketeers,...|    7|
|   Wuthering Heights|    6|
|  Christmas Carol, A|    6|
| Alice in Wonderland|    6|
|          Cinderella|    5|
|           Jane Eyre|    5|
|            Blackout|    4|
|            Paradise|    4|
|        Oliver Twist|    4|
|Dr. Jekyll and Mr...|    4|
|                Eden|    4|
|               Angel|    4|
|      Boy Meets Girl|    4|
|              Carrie|    4|
|     Treasure Island|    4|
|20,000 Leagues Un...|    4|
|Hound of the Bask...|    4|
|King Solomon's Mines|    4|
+--------------------+-----+
only showing top 20 rows





# Loading Ratings

In [13]:

ratings_path = f'{movielens_path}/rating.csv'
ratings = spark.read.format('csv').option('header', True).load(ratings_path)
ratings.show(truncate=False)

+------+-------+------+-------------------+
|userId|movieId|rating|timestamp          |
+------+-------+------+-------------------+
|1     |2      |3.5   |2005-04-02 23:53:47|
|1     |29     |3.5   |2005-04-02 23:31:16|
|1     |32     |3.5   |2005-04-02 23:33:39|
|1     |47     |3.5   |2005-04-02 23:32:07|
|1     |50     |3.5   |2005-04-02 23:29:40|
|1     |112    |3.5   |2004-09-10 03:09:00|
|1     |151    |4     |2004-09-10 03:08:54|
|1     |223    |4     |2005-04-02 23:46:13|
|1     |253    |4     |2005-04-02 23:35:40|
|1     |260    |4     |2005-04-02 23:33:46|
|1     |293    |4     |2005-04-02 23:31:43|
|1     |296    |4     |2005-04-02 23:32:47|
|1     |318    |4     |2005-04-02 23:33:18|
|1     |337    |3.5   |2004-09-10 03:08:29|
|1     |367    |3.5   |2005-04-02 23:53:00|
|1     |541    |4     |2005-04-02 23:30:03|
|1     |589    |3.5   |2005-04-02 23:45:57|
|1     |593    |3.5   |2005-04-02 23:31:01|
|1     |653    |3     |2004-09-10 03:08:11|
|1     |919    |3.5   |2004-09-1

In [14]:
ratings.count()

                                                                                

20000263

In [15]:
ratings.printSchema()

root
 |-- userId: string (nullable = true)
 |-- movieId: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- timestamp: string (nullable = true)



In [16]:
ratings.where(f.col('movieId').isNull()).count()

                                                                                

0

In [17]:
ratings.where(f.col('rating').isNull()).count()

                                                                                

0

In [18]:
ratings = ratings.select('userId', 'movieId', f.col('rating').cast(t.FloatType()))


Join ratings with movies

In [19]:
sub_ratings = ratings.drop('userId').withColumnRenamed('movieId', '_movieId')
movies_ratings = movies.drop('genres'). \
    join(
        sub_ratings, 
        movies['movieId'] == sub_ratings['_movieId']). \
drop('_movieId')
movies_ratings.show()

+-------+--------------------+------+
|movieId|               title|rating|
+-------+--------------------+------+
|      2|      Jumanji (1995)|   3.5|
|     29|City of Lost Chil...|   3.5|
|     32|Twelve Monkeys (a...|   3.5|
|     47|Seven (a.k.a. Se7...|   3.5|
|     50|Usual Suspects, T...|   3.5|
|    112|Rumble in the Bro...|   3.5|
|    151|      Rob Roy (1995)|   4.0|
|    223|       Clerks (1994)|   4.0|
|    253|Interview with th...|   4.0|
|    260|Star Wars: Episod...|   4.0|
|    293|Léon: The Profess...|   4.0|
|    296| Pulp Fiction (1994)|   4.0|
|    318|Shawshank Redempt...|   4.0|
|    337|What's Eating Gil...|   3.5|
|    367|    Mask, The (1994)|   3.5|
|    541| Blade Runner (1982)|   4.0|
|    589|Terminator 2: Jud...|   3.5|
|    593|Silence of the La...|   3.5|
|    653|  Dragonheart (1996)|   3.0|
|    919|Wizard of Oz, The...|   3.5|
+-------+--------------------+------+
only showing top 20 rows



Calculate average rating by movie and number of ratings

In [64]:
movies_ratings_agg = movies_ratings.groupBy('movieId').agg(
    f.avg(f.col('rating')).alias('avg_rating'), 
    f.count(f.col('rating')).alias('rating_count'))
movies_ratings_agg.show()

+-------+------------------+------------+
|movieId|        avg_rating|rating_count|
+-------+------------------+------------+
|    296| 4.174231169217055|       67310|
|   1090| 3.919977226720648|       15808|
|   3959| 3.699372603694667|        2869|
|   2294| 3.303207714257601|       10163|
|   6731|3.5571184995737424|        1173|
|  48738| 3.895868364160461|        4163|
|   3210|3.6711219879518073|        7968|
|  88140|3.5536100302637266|        2313|
|    467|3.3832658569500675|         741|
|   2088| 2.562729584628426|        3539|
|   2069| 3.806294326241135|        1128|
|  50802|  2.85519801980198|         404|
|    829|2.6765513454146075|        1821|
|   2136| 2.849462365591398|        2883|
|  89864|3.8558174523570714|        1994|
|   2904|3.5884353741496597|         147|
|   4821|3.1852010265183917|        1169|
|  62912|2.3253676470588234|         272|
|  55498|2.9166666666666665|          78|
|   2162|2.4223394055608822|        2086|
+-------+------------------+------

In [28]:
movies_ratings_agg.cache()

24/05/12 16:49:03 WARN CacheManager: Asked to cache already cached data.


DataFrame[movieId: string, avg_rating: double, rating_count: bigint]

In [29]:
movies_ratings_agg.count()

                                                                                

26744

In [65]:
movies_ratings_agg.agg(
        f.expr('avg(rating_count)'), 
        f.expr('percentile_approx(rating_count, 0.25)'), 
        f.expr('stddev(rating_count)'), 
        f.expr('max(rating_count)'), 
        f.expr('min(rating_count)')
    
    ).show(truncate=False)



+-----------------+-------------------------------------+--------------------+-----------------+-----------------+
|avg(rating_count)|percentile_approx(rating_count, 0.25)|stddev(rating_count)|max(rating_count)|min(rating_count)|
+-----------------+-------------------------------------+--------------------+-----------------+-----------------+
|747.8411232425965|3                                    |3085.8182679035876  |67310            |1                |
+-----------------+-------------------------------------+--------------------+-----------------+-----------------+



                                                                                

Only consider movies with more than 3 ratings

In [66]:
movies_ratings_agg = movies_ratings_agg.where(f.expr('rating_count > 3'))
movies_ratings_agg.count()

                                                                                

19374

Show 20 top rated movies

In [67]:
movies.join(movies_ratings_agg). \
    select('title', 'avg_rating', 'rating_count'). \
    sort(f.col('avg_rating').desc(), f.col('rating_count').desc()). \
    show(20, truncate=False)

[Stage 131:>                                                        (0 + 1) / 1]

+-----------------------------------------+----------+------------+
|title                                    |avg_rating|rating_count|
+-----------------------------------------+----------+------------+
|Balto (1995)                             |4.5       |11          |
|Wings of Courage (1995)                  |4.5       |11          |
|Nixon (1995)                             |4.5       |11          |
|Jumanji (1995)                           |4.5       |11          |
|Cutthroat Island (1995)                  |4.5       |11          |
|Waiting to Exhale (1995)                 |4.5       |11          |
|Casino (1995)                            |4.5       |11          |
|Grumpier Old Men (1995)                  |4.5       |11          |
|Sense and Sensibility (1995)             |4.5       |11          |
|Tom and Huck (1995)                      |4.5       |11          |
|Four Rooms (1995)                        |4.5       |11          |
|GoldenEye (1995)                         |4.5  

                                                                                