In [1]:
# Evaluate the kernel-provided `spark` object so the notebook displays it and we
# know it is available. Presumably this is the SparkSession created by the
# PySpark kernel — TODO confirm for your environment.
spark

In [5]:
# Read the MovieLens movie catalogue from CSV. The first line of the file is
# used as the header and column types are inferred from the data.
movies = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .load("/data/movielens/movies.csv")
)

# Preview the first rows; long cell values are truncated in the printout.
movies.show()

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
|      6|         Heat (1995)|Action|Crime|Thri...|
|      7|      Sabrina (1995)|      Comedy|Romance|
|      8| Tom and Huck (1995)|  Adventure|Children|
|      9| Sudden Death (1995)|              Action|
|     10|    GoldenEye (1995)|Action|Adventure|...|
|     11|American Presiden...|Comedy|Drama|Romance|
|     12|Dracula: Dead and...|       Comedy|Horror|
|     13|        Balto (1995)|Adventure|Animati...|
|     14|        Nixon (1995)|               Drama|
|     15|Cutthroat Island ...|Action|Adventure|...|
|     16|       Casino (1995)|         Crime|Drama|
|     17|Sen

In [6]:
# Print the column names, inferred types and nullability of the `movies` DataFrame.
movies.printSchema()

root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)



In [7]:
# Read the MovieLens user ratings from CSV. The first line of the file is used
# as the header and column types are inferred from the data.
ratings = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .load("/data/movielens/ratings.csv")
)

# Preview the first rows (show() prints the top 20 by default).
ratings.show()

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|     16|   4.0|1217897793|
|     1|     24|   1.5|1217895807|
|     1|     32|   4.0|1217896246|
|     1|     47|   4.0|1217896556|
|     1|     50|   4.0|1217896523|
|     1|    110|   4.0|1217896150|
|     1|    150|   3.0|1217895940|
|     1|    161|   4.0|1217897864|
|     1|    165|   3.0|1217897135|
|     1|    204|   0.5|1217895786|
|     1|    223|   4.0|1217897795|
|     1|    256|   0.5|1217895764|
|     1|    260|   4.5|1217895864|
|     1|    261|   1.5|1217895750|
|     1|    277|   0.5|1217895772|
|     1|    296|   4.0|1217896125|
|     1|    318|   4.0|1217895860|
|     1|    349|   4.5|1217897058|
|     1|    356|   3.0|1217896231|
|     1|    377|   2.5|1217896373|
+------+-------+------+----------+
only showing top 20 rows



In [8]:
# Print the column names, inferred types and nullability of the `ratings` DataFrame.
ratings.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)



# DataFrame DSL ... these are functions similar to those of a pandas DataFrame ... another way is to use SQL, which we will see next

In [9]:
from pyspark.sql.functions import * 

In [17]:
# Top 10 movies by average rating, restricted to movies that have at least
# 100 ratings, built step by step with the DataFrame DSL.

# Step 1: per-movie average rating and number of ratings.
per_movie_stats = ratings.groupBy("movieId").agg(
    avg("rating").alias("avg_rating"),
    count("rating").alias("rating_count"),
)

# Step 2: keep only movies with enough ratings; alias the result for the join.
frequently_rated = per_movie_stats.where("rating_count>=100").alias("t1")

# Step 3: attach the titles, rank by average rating and keep the ten best.
top_ten_movies = (
    frequently_rated
    .join(movies.alias("t2"), col("t1.movieId") == col("t2.movieId"))
    .select("avg_rating", "t1.movieId", "t2.title")
    .orderBy(col("avg_rating").desc())
    .limit(10)
)

# Second argument False disables truncation so full titles are printed.
top_ten_movies.show(10, False)

+------------------+-------+--------------------------------------+
|avg_rating        |movieId|title                                 |
+------------------+-------+--------------------------------------+
|4.454545454545454 |318    |Shawshank Redemption, The (1994)      |
|4.392857142857143 |858    |Godfather, The (1972)                 |
|4.328947368421052 |50     |Usual Suspects, The (1995)            |
|4.3019480519480515|1136   |Monty Python and the Holy Grail (1975)|
|4.296370967741935 |527    |Schindler's List (1993)               |
|4.2727272727272725|1193   |One Flew Over the Cuckoo's Nest (1975)|
|4.2711442786069655|608    |Fargo (1996)                          |
|4.264367816091954 |2571   |Matrix, The (1999)                    |
|4.260714285714286 |1221   |Godfather: Part II, The (1974)        |
|4.2592592592592595|1213   |Goodfellas (1990)                     |
+------------------+-------+--------------------------------------+



# Spark SQL

In [18]:
# Expose both DataFrames to Spark SQL as temporary views, each under the same
# name as its Python variable (an existing view of that name is replaced).
for view_name, frame in (("movies", movies), ("ratings", ratings)):
    frame.createOrReplaceTempView(view_name)

In [19]:
# List the tables/views visible to this session to confirm that the `movies`
# and `ratings` temporary views are registered.
# Calls `spark.sql` explicitly: the bare `sql` name is not defined or imported
# anywhere in this notebook and is only injected by some kernel setups (e.g.
# the PySpark shell startup script), so relying on it can raise NameError.
spark.sql("show tables").show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
|        |   movies|       true|
|        |  ratings|       true|
+--------+---------+-----------+



In [20]:
# Same "top 10 movies by average rating" report as the DataFrame DSL version,
# expressed in SQL against the `movies` and `ratings` temporary views.
#
# - Calls `spark.sql` explicitly: the bare `sql` name is not defined or imported
#   in this notebook and is only injected by some kernel setups.
# - The popularity threshold is `count(t2.rating) >= 100`, matching the DSL
#   query's `rating_count>=100` filter, so a movie with exactly 100 ratings is
#   treated the same way by both versions.
# - `limit 10` caps the result, so show() prints at most ten rows; the second
#   argument False disables truncation of long titles.
spark.sql("""
select
    t1.movieId,
    t1.title,
    avg(t2.rating) as avg_rating
from movies t1
join ratings t2 on t1.movieId = t2.movieId
group by t1.movieId, t1.title
having count(t2.rating) >= 100
order by avg_rating desc
limit 10
""").show(100, False)

+-------+--------------------------------------+------------------+
|movieId|title                                 |avg_rating        |
+-------+--------------------------------------+------------------+
|318    |Shawshank Redemption, The (1994)      |4.454545454545454 |
|858    |Godfather, The (1972)                 |4.392857142857143 |
|50     |Usual Suspects, The (1995)            |4.328947368421052 |
|1136   |Monty Python and the Holy Grail (1975)|4.3019480519480515|
|527    |Schindler's List (1993)               |4.296370967741935 |
|1193   |One Flew Over the Cuckoo's Nest (1975)|4.2727272727272725|
|608    |Fargo (1996)                          |4.2711442786069655|
|2571   |Matrix, The (1999)                    |4.264367816091954 |
|1221   |Godfather: Part II, The (1974)        |4.260714285714286 |
|1213   |Goodfellas (1990)                     |4.2592592592592595|
+-------+--------------------------------------+------------------+

