# Load data into HDFS

In [1]:
# Uncomment the commands below to copy the CSV files to the local HDFS root
# and preview them.
# Assumes that the code lives in the git directory and that data/ lives one
# level above it.
#
# These notes are `#` comments rather than a bare triple-quoted string: a
# string literal is an expression, and as the last expression in the cell it
# would be echoed as the cell's output.
#
# NOTE: the `hadoop fs -cat ... | head` previews end with
# "cat: Unable to write to output stream." -- most likely because `head`
# closes the pipe after printing its lines; the preview itself is still valid.
# !hadoop fs -copyFromLocal ../data/movies.csv /
# !hadoop fs -copyFromLocal ../data/ratings.csv /
# !hadoop fs -ls /
# !hadoop fs -cat /movies.csv | head
# !hadoop fs -cat /ratings.csv | head

Found 2 items
-rw-r--r--   1 root supergroup    3038099 2022-04-27 22:06 /movies.csv
-rw-r--r--   1 root supergroup  678260987 2022-04-27 22:06 /ratings.csv
movieId,title,genres
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy
6,Heat (1995),Action|Crime|Thriller
7,Sabrina (1995),Comedy|Romance
8,Tom and Huck (1995),Adventure|Children
9,Sudden Death (1995),Action
cat: Unable to write to output stream.
userId,movieId,rating,timestamp
1,296,5.0,1147880044
1,306,3.5,1147868817
1,307,5.0,1147868828
1,665,5.0,1147878820
1,899,3.5,1147868510
1,1088,4.0,1147868495
1,1175,3.5,1147868826
1,1217,3.5,1147878326
1,1237,5.0,1147868839
cat: Unable to write to output stream.


# Start Spark Session

In [20]:
# Change the number of cores in this code block
# by setting `spark.master` to `local[n]` where
# n is the number of cores
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Build the Spark configuration one key at a time. `spark.master` controls
# how many local cores are used (`local[n]`), `spark.app.name` is the label
# attached to this application.
conf = (
    pyspark.SparkConf()
    .set('spark.master', 'local[2]')
    .set('spark.app.name', 'Basic Setup')
)

# Reuse an already-running session if there is one, otherwise start a new one
# with the configuration above.
session_builder = SparkSession.builder.config(conf=conf)
spark = session_builder.getOrCreate()

## Load Data

### Read data from the `movies.csv` file

In [3]:
# Read movies.csv from HDFS, treating the first line as column names, and
# mark the resulting DataFrame for caching since it is reused below.
# No schema is supplied or inferred, so every column is read as a string.
movies_df = spark.read.csv("hdfs:///movies.csv", header=True).cache()
movies_df.printSchema()

root
 |-- movieId: string (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)



In [4]:
# Preview the first rows of the movies table (defaults spelled out:
# 20 rows, long cell values truncated).
movies_df.show(n=20, truncate=True)

                                                                                

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
|      6|         Heat (1995)|Action|Crime|Thri...|
|      7|      Sabrina (1995)|      Comedy|Romance|
|      8| Tom and Huck (1995)|  Adventure|Children|
|      9| Sudden Death (1995)|              Action|
|     10|    GoldenEye (1995)|Action|Adventure|...|
|     11|American Presiden...|Comedy|Drama|Romance|
|     12|Dracula: Dead and...|       Comedy|Horror|
|     13|        Balto (1995)|Adventure|Animati...|
|     14|        Nixon (1995)|               Drama|
|     15|Cutthroat Island ...|Action|Adventure|...|
|     16|       Casino (1995)|         Crime|Drama|
|     17|Sen

### Read data from the `ratings.csv` file

In [5]:
# Read ratings.csv from HDFS, treating the first line as column names, and
# mark the resulting DataFrame for caching since it is reused below.
# No schema is supplied or inferred, so every column is read as a string.
ratings_df = spark.read.csv("hdfs:///ratings.csv", header=True).cache()
ratings_df.printSchema()

root
 |-- userId: string (nullable = true)
 |-- movieId: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- timestamp: string (nullable = true)



In [6]:
# Preview the first rows of the ratings table (defaults spelled out:
# 20 rows, long cell values truncated).
ratings_df.show(n=20, truncate=True)

[Stage 3:>                                                          (0 + 1) / 1]

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|    296|   5.0|1147880044|
|     1|    306|   3.5|1147868817|
|     1|    307|   5.0|1147868828|
|     1|    665|   5.0|1147878820|
|     1|    899|   3.5|1147868510|
|     1|   1088|   4.0|1147868495|
|     1|   1175|   3.5|1147868826|
|     1|   1217|   3.5|1147878326|
|     1|   1237|   5.0|1147868839|
|     1|   1250|   4.0|1147868414|
|     1|   1260|   3.5|1147877857|
|     1|   1653|   4.0|1147868097|
|     1|   2011|   2.5|1147868079|
|     1|   2012|   2.5|1147868068|
|     1|   2068|   2.5|1147869044|
|     1|   2161|   3.5|1147868609|
|     1|   2351|   4.5|1147877957|
|     1|   2573|   4.0|1147878923|
|     1|   2632|   5.0|1147878248|
|     1|   2692|   5.0|1147869100|
+------+-------+------+----------+
only showing top 20 rows



                                                                                

## Join between movies and ratings

In [7]:
# Count the movies that have no rating at all: a LEFT ANTI JOIN keeps only
# the rows of `m` (movies) for which no row of `r` (ratings) has the same
# movieId.
#
# DataFrame-API equivalent that lists the unrated movies instead of counting
# them:
# missing_movies = movies_df.join(ratings_df, movies_df.movieId == ratings_df.movieId, "leftanti")
# missing_movies.show(truncate=False)

# Register both DataFrames as temporary views so they can be queried via SQL.
movies_df.createOrReplaceTempView("m")
ratings_df.createOrReplaceTempView("r")

# Keep the result DataFrame in `missing_movies` and display it in a separate
# step: `DataFrame.show()` returns None, so chaining it onto the assignment
# would leave `missing_movies` bound to None.
missing_movies = spark.sql("SELECT COUNT(m.movieId) FROM m LEFT ANTI JOIN r ON m.movieId == r.movieId")
missing_movies.show(truncate=False)

2022-04-27 22:07:08,494 WARN memory.MemoryStore: Not enough space to cache rdd_31_3 in memory! (computed 54.6 MiB so far)
2022-04-27 22:07:08,503 WARN storage.BlockManager: Persisting block rdd_31_3 to disk instead.
2022-04-27 22:07:13,549 WARN memory.MemoryStore: Not enough space to cache rdd_31_3 in memory! (computed 6.6 MiB so far)
[Stage 8:>                                                          (0 + 2) / 2]

+--------------+
|count(movieId)|
+--------------+
|3376          |
+--------------+



                                                                                

In [8]:
# Stop the Spark session created at the top of this notebook now that the
# analysis is finished.
spark.stop()