In [1]:
import glob
import os
import sys

# Locate the Spark installation; a KeyError here means SPARK_HOME is not set.
SPARK_HOME = os.environ['SPARK_HOME']

# Spark ships its own py4j as a source zip whose version differs between Spark
# releases, so discover it instead of pinning one exact file name.
_py4j_zips = sorted(glob.glob(os.path.join(SPARK_HOME, "python", "lib", "py4j-*-src.zip")))
if not _py4j_zips:
    raise ImportError(
        "No py4j-*-src.zip found under " + os.path.join(SPARK_HOME, "python", "lib")
    )
sys.path.insert(0, _py4j_zips[-1])
sys.path.insert(0, os.path.join(SPARK_HOME, "python"))

from pyspark.sql import SparkSession

# Reuse the active session if one exists, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

# Smoke test: read a local text file and print its first 10 lines untruncated.
spark.read.text("file:///etc/passwd").show(10, False)

+-------------------------------------------------+
|value                                            |
+-------------------------------------------------+
|root:x:0:0:root:/root:/bin/bash                  |
|daemon:x:1:1:daemon:/usr/sbin:/usr/sbin/nologin  |
|bin:x:2:2:bin:/bin:/usr/sbin/nologin             |
|sys:x:3:3:sys:/dev:/usr/sbin/nologin             |
|sync:x:4:65534:sync:/bin:/bin/sync               |
|games:x:5:60:games:/usr/games:/usr/sbin/nologin  |
|man:x:6:12:man:/var/cache/man:/usr/sbin/nologin  |
|lp:x:7:7:lp:/var/spool/lpd:/usr/sbin/nologin     |
|mail:x:8:8:mail:/var/mail:/usr/sbin/nologin      |
|news:x:9:9:news:/var/spool/news:/usr/sbin/nologin|
+-------------------------------------------------+
only showing top 10 rows



In [3]:
# Directory holding the MovieLens CSV files (movies.csv, ratings.csv).
# Set MOVIELENS_DIR to point the notebook at another copy of the dataset;
# when it is unset the original training-VM location is used.
base = os.environ.get("MOVIELENS_DIR", "/home/training/Downloads/datasets/movielens")

In [6]:
# Load movies.csv using its first line as the header; no schema inference is
# requested in this pass.
csv_reader = spark.read.format("csv")
csv_reader = csv_reader.option("header", True)
movies = csv_reader.load(base + "/movies.csv")
movies.show()

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
|      6|         Heat (1995)|Action|Crime|Thri...|
|      7|      Sabrina (1995)|      Comedy|Romance|
|      8| Tom and Huck (1995)|  Adventure|Children|
|      9| Sudden Death (1995)|              Action|
|     10|    GoldenEye (1995)|Action|Adventure|...|
|     11|American Presiden...|Comedy|Drama|Romance|
|     12|Dracula: Dead and...|       Comedy|Horror|
|     13|        Balto (1995)|Adventure|Animati...|
|     14|        Nixon (1995)|               Drama|
|     15|Cutthroat Island ...|Action|Adventure|...|
|     16|       Casino (1995)|         Crime|Drama|
|     17|Sen

In [7]:
# Show the (column name, type) pairs of the frame loaded without inferSchema.
movies.dtypes

[('movieId', 'string'), ('title', 'string'), ('genres', 'string')]

In [10]:
# Reload movies.csv, this time asking Spark to infer column types from the data.
# This rebinds `movies`, replacing the frame loaded without inferSchema.
typed_reader = spark.read.format("csv").options(header=True, inferSchema=True)
movies = typed_reader.load(base + "/movies.csv")
movies.show()

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
|      6|         Heat (1995)|Action|Crime|Thri...|
|      7|      Sabrina (1995)|      Comedy|Romance|
|      8| Tom and Huck (1995)|  Adventure|Children|
|      9| Sudden Death (1995)|              Action|
|     10|    GoldenEye (1995)|Action|Adventure|...|
|     11|American Presiden...|Comedy|Drama|Romance|
|     12|Dracula: Dead and...|       Comedy|Horror|
|     13|        Balto (1995)|Adventure|Animati...|
|     14|        Nixon (1995)|               Drama|
|     15|Cutthroat Island ...|Action|Adventure|...|
|     16|       Casino (1995)|         Crime|Drama|
|     17|Sen

In [11]:
# Re-check the column types after reloading with inferSchema enabled.
movies.dtypes


[('movieId', 'int'), ('title', 'string'), ('genres', 'string')]

In [12]:
# Load ratings.csv with a header row and inferred column types.
ratings_reader = (
    spark.read
    .format("csv")
    .options(header=True, inferSchema=True)
)
ratings = ratings_reader.load(base + "/ratings.csv")
ratings.show()

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|     16|   4.0|1217897793|
|     1|     24|   1.5|1217895807|
|     1|     32|   4.0|1217896246|
|     1|     47|   4.0|1217896556|
|     1|     50|   4.0|1217896523|
|     1|    110|   4.0|1217896150|
|     1|    150|   3.0|1217895940|
|     1|    161|   4.0|1217897864|
|     1|    165|   3.0|1217897135|
|     1|    204|   0.5|1217895786|
|     1|    223|   4.0|1217897795|
|     1|    256|   0.5|1217895764|
|     1|    260|   4.5|1217895864|
|     1|    261|   1.5|1217895750|
|     1|    277|   0.5|1217895772|
|     1|    296|   4.0|1217896125|
|     1|    318|   4.0|1217895860|
|     1|    349|   4.5|1217897058|
|     1|    356|   3.0|1217896231|
|     1|    377|   2.5|1217896373|
+------+-------+------+----------+
only showing top 20 rows



In [13]:
# Wildcard import of the Spark SQL column functions; the query cell below uses
# avg, count, col and desc from it.
# NOTE(review): this module also exposes names such as sum/min/max, which would
# replace the Python builtins in this namespace - confirm, and consider
# importing only the names that are actually used.
from pyspark.sql.functions import * 

In [23]:
# Top 10 movies by mean rating among movies with at least 100 ratings.
# Step 1: per-movie mean rating and number of ratings.
rating_stats = ratings.groupBy("movieId").agg(
    avg("rating").alias("avg_rating"),
    count("rating").alias("count"),
)

# Step 2: keep well-rated-enough movies and attach their titles.
popular = rating_stats.filter("count>=100").alias("t1")
titles = movies.alias("t2")
same_movie = col("t1.movieId") == col("t2.movieId")

# Step 3: rank by mean rating and print the 10 best without truncating titles.
(popular
 .join(titles, same_movie)
 .select("t1.movieId", "title", "avg_rating")
 .orderBy(desc("avg_rating"))
 .limit(10)
 .show(10, False))

+-------+--------------------------------------+------------------+
|movieId|title                                 |avg_rating        |
+-------+--------------------------------------+------------------+
|318    |Shawshank Redemption, The (1994)      |4.454545454545454 |
|858    |Godfather, The (1972)                 |4.392857142857143 |
|50     |Usual Suspects, The (1995)            |4.328947368421052 |
|1136   |Monty Python and the Holy Grail (1975)|4.3019480519480515|
|527    |Schindler's List (1993)               |4.296370967741935 |
|1193   |One Flew Over the Cuckoo's Nest (1975)|4.2727272727272725|
|608    |Fargo (1996)                          |4.2711442786069655|
|2571   |Matrix, The (1999)                    |4.264367816091954 |
|1221   |Godfather: Part II, The (1974)        |4.260714285714286 |
|1213   |Goodfellas (1990)                     |4.2592592592592595|
+-------+--------------------------------------+------------------+



In [25]:
# Expose both frames to Spark SQL as temporary views, then list the known tables.
for view_name, frame in (("movies", movies), ("ratings", ratings)):
    frame.createOrReplaceTempView(view_name)

spark.sql("show tables").show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
|        |   movies|       true|
|        |  ratings|       true|
+--------+---------+-----------+



In [26]:
# Shorthand so later cells can write sql("...") instead of spark.sql("...").
sql = spark.sql

In [27]:
# Print the column names and data types of the `movies` temp view.
sql("describe movies").show()

+--------+---------+-------+
|col_name|data_type|comment|
+--------+---------+-------+
| movieId|      int|   null|
|   title|   string|   null|
|  genres|   string|   null|
+--------+---------+-------+



In [33]:
# SQL counterpart of the DataFrame query earlier in the notebook: top 10 movies
# by mean rating among movies with at least 100 ratings. The threshold (>= 100)
# and the counted column (rating) match that query's count("rating") and
# "count>=100" filter so both versions select the same set of movies.
sql("""
select t1.movieId, t1.title, avg(t2.rating) avg_rating
from movies t1
join ratings t2 on t1.movieId = t2.movieId
group by t1.movieId, t1.title
having count(t2.rating) >= 100
order by avg_rating desc
limit 10
""").show(10, False)

+-------+--------------------------------------+------------------+
|movieId|title                                 |avg_rating        |
+-------+--------------------------------------+------------------+
|318    |Shawshank Redemption, The (1994)      |4.454545454545454 |
|858    |Godfather, The (1972)                 |4.392857142857143 |
|50     |Usual Suspects, The (1995)            |4.328947368421052 |
|1136   |Monty Python and the Holy Grail (1975)|4.3019480519480515|
|527    |Schindler's List (1993)               |4.296370967741935 |
|1193   |One Flew Over the Cuckoo's Nest (1975)|4.2727272727272725|
|608    |Fargo (1996)                          |4.2711442786069655|
|2571   |Matrix, The (1999)                    |4.264367816091954 |
|1221   |Godfather: Part II, The (1974)        |4.260714285714286 |
|1213   |Goodfellas (1990)                     |4.2592592592592595|
+-------+--------------------------------------+------------------+

