In [1]:
# Let's create our entry point

from pyspark.sql import SparkSession

In [2]:
# Entry point for all DataFrame work below: reuse the active session if one
# exists, otherwise create one under the application name "ratingsData".
spark = (
    SparkSession.builder
    .appName("ratingsData")
    .getOrCreate()
)

In [11]:
# Load the movie ratings CSV with an explicit schema instead of type inference.

ratings_location = "/home/datamaking/Documents/Hadoop/ml-latest/ratings.csv"

ratings = (
    spark.read
    .options(sep=",", header=True, quote='"')
    .schema("userId INT, movieId INT, rating DOUBLE, timestamp INT")
    .csv(ratings_location)
)

In [9]:
# Print the column names and types of the ratings DataFrame.
ratings.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)



In [12]:
# Preview the first five ratings rows.
ratings.show(n=5)

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|    307|   3.5|1256677221|
|     1|    481|   3.5|1256677456|
|     1|   1091|   1.5|1256677471|
|     1|   1257|   4.5|1256677460|
|     1|   1449|   4.5|1256677264|
+------+-------+------+----------+
only showing top 5 rows



In [13]:
# let's use sql functions to wrangle our data

from pyspark.sql import functions as f

In [14]:
# Turn the epoch-seconds column into a proper timestamp column. The raw integer
# is kept under "timestamp_unix" and the converted value takes over the name
# "timestamp".
#
# This cell rebinds `ratings` to a transformation of itself, so it is guarded:
# on a second run the rename would move the converted "timestamp" column onto
# the already existing "timestamp_unix" name and leave two columns sharing it.
if "timestamp_unix" not in ratings.columns:
    ratings = (
        ratings
        .withColumnRenamed("timestamp", "timestamp_unix")
        # epoch seconds -> formatted string -> timestamp, in a single step
        .withColumn("timestamp", f.to_timestamp(f.from_unixtime("timestamp_unix")))
    )

ratings.printSchema()
ratings.show(5)

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp_unix: integer (nullable = true)
 |-- timestamp: timestamp (nullable = true)

+------+-------+------+--------------+-------------------+
|userId|movieId|rating|timestamp_unix|          timestamp|
+------+-------+------+--------------+-------------------+
|     1|    307|   3.5|    1256677221|2009-10-28 00:00:21|
|     1|    481|   3.5|    1256677456|2009-10-28 00:04:16|
|     1|   1091|   1.5|    1256677471|2009-10-28 00:04:31|
|     1|   1257|   4.5|    1256677460|2009-10-28 00:04:20|
|     1|   1449|   4.5|    1256677264|2009-10-28 00:01:04|
+------+-------+------+--------------+-------------------+
only showing top 5 rows



In [15]:
# Preview the ratings without the raw epoch column. The result is only
# displayed, not assigned, so `ratings` itself keeps "timestamp_unix".
ratings.drop("timestamp_unix").show(n=5)

+------+-------+------+-------------------+
|userId|movieId|rating|          timestamp|
+------+-------+------+-------------------+
|     1|    307|   3.5|2009-10-28 00:00:21|
|     1|    481|   3.5|2009-10-28 00:04:16|
|     1|   1091|   1.5|2009-10-28 00:04:31|
|     1|   1257|   4.5|2009-10-28 00:04:20|
|     1|   1449|   4.5|2009-10-28 00:01:04|
+------+-------+------+-------------------+
only showing top 5 rows



In [28]:
# Loading and wrangling can be chained into one expression: read the CSV, move
# the epoch seconds aside, derive the timestamp from them, then drop the helper.

ratings_again = (
    spark.read
    .options(sep=",", header=True, quote='"')
    .schema("userId INT, movieId INT, rating DOUBLE, timestamp INT")
    .csv(ratings_location)
    .withColumnRenamed("timestamp", "timestamp_unix")
    .withColumn("timestamp", f.to_timestamp(f.from_unixtime("timestamp_unix")))
    .drop("timestamp_unix")
)


ratings_again.show(n=5)

+------+-------+------+-------------------+
|userId|movieId|rating|          timestamp|
+------+-------+------+-------------------+
|     1|    307|   3.5|2009-10-28 00:00:21|
|     1|    481|   3.5|2009-10-28 00:04:16|
|     1|   1091|   1.5|2009-10-28 00:04:31|
|     1|   1257|   4.5|2009-10-28 00:04:20|
|     1|   1449|   4.5|2009-10-28 00:01:04|
+------+-------+------+-------------------+
only showing top 5 rows



In [16]:
# Load the movie data (id, title, pipe-separated genres) with an explicit schema.
# The key column is spelled "movieId", the same spelling the ratings, links and
# tags schemas use, so joins and selects on it do not depend on Spark resolving
# column names case-insensitively.

movies_path = "/home/datamaking/Documents/Hadoop/ml-latest/movies.csv"

movies = spark.read.csv(path=movies_path,
                       sep=",",
                       header=True,
                       quote='"',
                       schema="movieId int, title string, genres string")


movies.printSchema()

movies.show(15, truncate=False)

root
 |-- movieID: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)

+-------+----------------------------------+-------------------------------------------+
|movieID|title                             |genres                                     |
+-------+----------------------------------+-------------------------------------------+
|1      |Toy Story (1995)                  |Adventure|Animation|Children|Comedy|Fantasy|
|2      |Jumanji (1995)                    |Adventure|Children|Fantasy                 |
|3      |Grumpier Old Men (1995)           |Comedy|Romance                             |
|4      |Waiting to Exhale (1995)          |Comedy|Drama|Romance                       |
|5      |Father of the Bride Part II (1995)|Comedy                                     |
|6      |Heat (1995)                       |Action|Crime|Thriller                      |
|7      |Sabrina (1995)                    |Comedy|Romance                    

In [18]:
# Filter by genre using a SQL expression string as the predicate.
# The comparison is an exact match, so only movies whose genres value is just "Crime" pass.

movies.filter("genres = 'Crime'").show(n=5, truncate=False)

+-------+------------------------------------+------+
|movieID|title                               |genres|
+-------+------------------------------------+------+
|681    |Coup de torchon (Clean Slate) (1981)|Crime |
|716    |Switchblade Sisters (1975)          |Crime |
|732    |Original Gangstas (1996)            |Crime |
|1000   |Curdled (1996)                      |Crime |
|1313   |Mad Dog Time (1996)                 |Crime |
+-------+------------------------------------+------+
only showing top 5 rows



In [19]:
# Alternatively, express the same predicate with the Column API.

movies.filter(f.col("genres") == "Crime").show(n=5, truncate=False)

+-------+------------------------------------+------+
|movieID|title                               |genres|
+-------+------------------------------------+------+
|681    |Coup de torchon (Clean Slate) (1981)|Crime |
|716    |Switchblade Sisters (1975)          |Crime |
|732    |Original Gangstas (1996)            |Crime |
|1000   |Curdled (1996)                      |Crime |
|1313   |Mad Dog Time (1996)                 |Crime |
+-------+------------------------------------+------+
only showing top 5 rows



In [25]:
# Array operations: split the pipe-delimited "genres" string into an array and
# explode it, producing one row per (movie, genre) pair.

movies_genre = (
    movies
    # The split pattern is a regex, so the pipe is escaped. A raw string keeps
    # the backslash literal instead of relying on an invalid Python escape.
    .withColumn("genres_array", f.split("genres", r"\|"))
    .withColumn("genre", f.explode("genres_array"))
    # Same key spelling as the ratings, links and tags DataFrames.
    .select("movieId", "title", "genre")
)


movies_genre.show(10)

+-------+--------------------+---------+
|MovieId|               title|    genre|
+-------+--------------------+---------+
|      1|    Toy Story (1995)|Adventure|
|      1|    Toy Story (1995)|Animation|
|      1|    Toy Story (1995)| Children|
|      1|    Toy Story (1995)|   Comedy|
|      1|    Toy Story (1995)|  Fantasy|
|      2|      Jumanji (1995)|Adventure|
|      2|      Jumanji (1995)| Children|
|      2|      Jumanji (1995)|  Fantasy|
|      3|Grumpier Old Men ...|   Comedy|
|      3|Grumpier Old Men ...|  Romance|
+-------+--------------------+---------+
only showing top 10 rows



In [28]:
# Distinct genre values across all movies.
# show() only prints and returns None, so the DataFrame is assigned first and
# displayed in a separate statement; chaining show() into the assignment would
# leave `available_genres` bound to None.
available_genres = movies_genre.select("genre").distinct()

available_genres.show()

+------------------+
|             genre|
+------------------+
|             Crime|
|           Romance|
|          Thriller|
|         Adventure|
|             Drama|
|               War|
|       Documentary|
|           Fantasy|
|           Mystery|
|           Musical|
|         Animation|
|         Film-Noir|
|(no genres listed)|
|              IMAX|
|            Horror|
|           Western|
|            Comedy|
|          Children|
|            Action|
|            Sci-Fi|
+------------------+



In [38]:
# Movies whose genres field is exactly the placeholder "(no genres listed)".
movies_without_genre = movies.filter(f.col("genres") == "(no genres listed)")

movies_without_genre.show(n=10)

print(f'There are {movies_without_genre.count()} movies without genres')

+-------+--------------------+------------------+
|movieID|               title|            genres|
+-------+--------------------+------------------+
|  83773|Away with Words (...|(no genres listed)|
|  83829|Scorpio Rising (1...|(no genres listed)|
|  84768|   Glitterbug (1994)|(no genres listed)|
|  86493|Age of the Earth,...|(no genres listed)|
|  87061|Trails (Veredas) ...|(no genres listed)|
|  91246|Milky Way (Tejút)...|(no genres listed)|
|  92435|Dancing Hawk, The...|(no genres listed)|
|  92641|Warsaw Bridge (Po...|(no genres listed)|
|  94431|Ella Lola, a la T...|(no genres listed)|
|  94657|Turkish Dance, El...|(no genres listed)|
+-------+--------------------+------------------+
only showing top 10 rows

There are 4266 movies without genres


In [54]:
# Next, the links and tags data. Links first: one row per movieId with its
# imdbId and tmdbId, all read as integers.

links_path = "/home/datamaking/Documents/Hadoop/ml-latest/links.csv"


links = (
    spark.read
    .options(sep=",", header=True, quote='"')
    .schema("movieId int, imdbId int, tmdbId int")
    .csv(links_path)
)


links.printSchema()

links.show(n=5)

root
 |-- movieId: integer (nullable = true)
 |-- imdbId: integer (nullable = true)
 |-- tmdbId: integer (nullable = true)

+-------+------+------+
|movieId|imdbId|tmdbId|
+-------+------+------+
|      1|114709|   862|
|      2|113497|  8844|
|      3|113228| 15602|
|      4|114885| 31357|
|      5|113041| 11862|
+-------+------+------+
only showing top 5 rows



In [55]:
# Load the tags file and convert its epoch-seconds "timestamp" column in place
# into a timestamp column.
tags_path = "/home/datamaking/Documents/Hadoop/ml-latest/tags.csv"

tags = (
    spark.read
    .options(sep=",", quote='"', header=True)
    .schema("userId int, movieId int, tag string, timestamp int")
    .csv(tags_path)
    .withColumn("timestamp", f.to_timestamp(f.from_unixtime("timestamp")))
)


tags.printSchema()

tags.show(n=5)

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- tag: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)

+------+-------+------------+-------------------+
|userId|movieId|         tag|          timestamp|
+------+-------+------------+-------------------+
|    14|    110|        epic|2015-09-25 05:35:38|
|    14|    110|    Medieval|2015-09-25 05:35:32|
|    14|    260|      sci-fi|2015-09-13 21:36:50|
|    14|    260|space action|2015-09-13 21:37:01|
|    14|    318|imdb top 250|2015-09-19 01:26:35|
+------+-------+------------+-------------------+
only showing top 5 rows



In [57]:
# Number of movies listed under each genre, counted over the exploded
# one-row-per-(movie, genre) DataFrame.

movies_per_genre = (
    movies_genre
    .groupBy("genre")
    .count()
)

movies_per_genre.show(n=20)

+------------------+-----+
|             genre|count|
+------------------+-----+
|             Crime| 5105|
|           Romance| 7412|
|          Thriller| 8216|
|         Adventure| 4067|
|             Drama|24144|
|               War| 1820|
|       Documentary| 5118|
|           Fantasy| 2637|
|           Mystery| 2773|
|           Musical| 1113|
|         Animation| 2663|
|         Film-Noir|  364|
|(no genres listed)| 4266|
|              IMAX|  197|
|            Horror| 5555|
|           Western| 1378|
|            Comedy|15956|
|          Children| 2749|
|            Action| 7130|
|            Sci-Fi| 3444|
+------------------+-----+



In [59]:
# Joining data: attach each tag to its movie via the shared movie id.
# "inner" is the join type Spark uses when none is given; it is spelled out here.

opinions = movies.join(tags, on=["movieId"], how="inner")

opinions.show(n=10)

+-------+--------------------+--------------------+------+--------------+-------------------+
|movieID|               title|              genres|userId|           tag|          timestamp|
+-------+--------------------+--------------------+------+--------------+-------------------+
|    110|   Braveheart (1995)|    Action|Drama|War|    14|          epic|2015-09-25 05:35:38|
|    110|   Braveheart (1995)|    Action|Drama|War|    14|      Medieval|2015-09-25 05:35:32|
|    260|Star Wars: Episod...|Action|Adventure|...|    14|        sci-fi|2015-09-13 21:36:50|
|    260|Star Wars: Episod...|Action|Adventure|...|    14|  space action|2015-09-13 21:37:01|
|    318|Shawshank Redempt...|         Crime|Drama|    14|  imdb top 250|2015-09-19 01:26:35|
|    318|Shawshank Redempt...|         Crime|Drama|    14|       justice|2015-09-19 01:26:32|
|    480|Jurassic Park (1993)|Action|Adventure|...|    14|     Dinosaurs|2015-09-25 05:36:03|
|    593|Silence of the La...|Crime|Horror|Thri...|    14|ps

In [60]:
# Left join: every movie is kept, whether or not it has tags.

opinions_left = movies.join(tags, on=["movieId"], how="left")

opinions_left.show()  # look for nulls in the tag-side columns of untagged movies

+-------+--------------------+--------------------+------+--------------------+-------------------+
|movieID|               title|              genres|userId|                 tag|          timestamp|
+-------+--------------------+--------------------+------+--------------------+-------------------+
|    148|Awfully Big Adven...|               Drama| 40716|Nudity (Topless -...|2006-09-20 09:46:17|
|    148|Awfully Big Adven...|               Drama| 73406|               1940s|2018-06-07 08:44:07|
|    148|Awfully Big Adven...|               Drama| 73406|based on novel or...|2018-06-07 08:44:07|
|    148|Awfully Big Adven...|               Drama| 73406|             england|2018-06-07 08:44:07|
|    148|Awfully Big Adven...|               Drama| 73406|           liverpool|2018-06-07 08:44:07|
|    148|Awfully Big Adven...|               Drama| 73406|     theatre company|2018-06-07 08:44:07|
|    148|Awfully Big Adven...|               Drama|103013|    nudity (topless)|2014-05-17 02:12:20|


In [67]:
# Tidy up the opinions DataFrame: left-join the tags onto the movies and keep
# only the columns of interest, with the user id first.

opinions = (
    movies
    .join(tags, on=["movieId"], how="left")
    .select("userId", "movieId", "title", "tag", "timestamp")
)

opinions.show(n=10, truncate=False)

+------+-------+-----------------------------------------+--------------+-------------------+
|userId|movieId|title                                    |tag           |timestamp          |
+------+-------+-----------------------------------------+--------------+-------------------+
|14    |110    |Braveheart (1995)                        |epic          |2015-09-25 05:35:38|
|14    |110    |Braveheart (1995)                        |Medieval      |2015-09-25 05:35:32|
|14    |260    |Star Wars: Episode IV - A New Hope (1977)|sci-fi        |2015-09-13 21:36:50|
|14    |260    |Star Wars: Episode IV - A New Hope (1977)|space action  |2015-09-13 21:37:01|
|14    |318    |Shawshank Redemption, The (1994)         |imdb top 250  |2015-09-19 01:26:35|
|14    |318    |Shawshank Redemption, The (1994)         |justice       |2015-09-19 01:26:32|
|14    |480    |Jurassic Park (1993)                     |Dinosaurs     |2015-09-25 05:36:03|
|14    |593    |Silence of the Lambs, The (1991)         |ps

In [None]:
# Bring in the ratings. The tag's "timestamp" is renamed to "tag_time" first so
# that it does not share a name with the "timestamp" column of `ratings`.

extended_opinions = (
    opinions
    .withColumnRenamed("timestamp", "tag_time")
    .join(ratings, on=["movieId", "userId"])
)



extended_opinions.show(n=10)