In [2]:
// loading ratings.csv as DataFrame

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

// Obtain the notebook's Spark session (an existing one is reused if present).
val spark = SparkSession
  .builder()
  .appName("User Rating History Partitioning")
  .getOrCreate()

// Read the ratings CSV using its first line as column names. No schema is
// supplied and inference is not enabled, so every column arrives as a string.
val ratingsDF = spark.read
  .format("csv")
  .option("header", "true")
  .load("gs://first-job-bucket/ratings.csv")

// Quick sanity check: column types and the first few rows.
ratingsDF.printSchema()
ratingsDF.show(5)

root
 |-- userId: string (nullable = true)
 |-- movieId: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- timestamp: string (nullable = true)

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|     17|   4.0|944249077|
|     1|     25|   1.0|944250228|
|     1|     29|   2.0|943230976|
|     1|     30|   5.0|944249077|
|     1|     32|   5.0|943228858|
+------+-------+------+---------+
only showing top 5 rows



spark = org.apache.spark.sql.SparkSession@22237f87
ratingsDF = [userId: string, movieId: string ... 2 more fields]


[userId: string, movieId: string ... 2 more fields]

In [3]:
// Filter ratings DataFrame

// Keep only the rows in which every one of the four fields is present.
val validRatingsDF = {
  val allFieldsPresent = Seq("userId", "movieId", "rating", "timestamp")
    .map(name => col(name).isNotNull)
    .reduce(_ && _)
  ratingsDF.filter(allFieldsPresent)
}

// Ensure ratings are numeric: a string that cannot be cast to double yields
// null, so those rows are dropped. The column itself is left as a string.
val filteredRatingsDF = validRatingsDF.where(col("rating").cast("double").isNotNull)

filteredRatingsDF.show(5)

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|     17|   4.0|944249077|
|     1|     25|   1.0|944250228|
|     1|     29|   2.0|943230976|
|     1|     30|   5.0|944249077|
|     1|     32|   5.0|943228858|
+------+-------+------+---------+
only showing top 5 rows



validRatingsDF = [userId: string, movieId: string ... 2 more fields]
filteredRatingsDF = [userId: string, movieId: string ... 2 more fields]


[userId: string, movieId: string ... 2 more fields]

In [5]:
// Create an RDD keyed by userId whose value is a List[(movieId, rating)]

import org.apache.spark.sql.functions._

// Turn each row into a (userId, (movieId, rating)) pair; all three fields are
// read as strings, matching the DataFrame's column types.
val ratingsRDD = filteredRatingsDF.rdd.map { row =>
  val movieAndRating = (row.getAs[String]("movieId"), row.getAs[String]("rating"))
  (row.getAs[String]("userId"), movieAndRating)
}

// Collect every (movieId, rating) pair belonging to a user into a single List.
val groupedRatingsRDD = ratingsRDD.groupByKey().mapValues(ratings => ratings.toList)

// Print a small sample of users with their rating lists.
for (userWithRatings <- groupedRatingsRDD.take(5)) println(userWithRatings)

(140868,List((5,3.0), (6,4.0), (7,4.0), (11,4.0), (17,3.0), (18,3.0), (21,4.0), (25,4.0), (32,3.0), (45,3.0), (50,5.0), (52,4.0), (57,4.0), (62,3.0), (64,3.0), (65,2.0), (70,3.0), (74,3.0), (75,3.0), (82,5.0), (85,4.0), (90,4.0), (93,3.0), (95,3.0), (96,5.0), (100,4.0), (102,3.0), (112,3.0), (125,4.0), (135,3.0), (141,4.0), (163,3.0), (171,5.0), (174,3.0), (176,5.0), (186,3.0), (187,3.0), (189,4.0), (194,4.0), (195,3.0), (203,3.0), (206,4.0), (215,3.0), (223,4.0), (224,4.0), (231,1.0), (232,5.0), (234,4.0), (235,5.0), (236,3.0), (237,3.0), (248,3.0), (252,4.0), (255,3.0), (256,3.0), (260,4.0), (265,5.0), (274,3.0), (275,4.0), (276,4.0), (278,3.0), (281,5.0), (287,4.0), (289,4.0), (294,4.0), (296,5.0), (305,3.0), (307,4.0), (308,4.0), (312,3.0), (314,5.0), (317,3.0), (327,4.0), (339,3.0), (341,4.0), (342,4.0), (344,3.0), (345,5.0), (355,3.0), (356,4.0), (357,5.0), (360,2.0), (367,5.0), (368,4.0), (370,2.0), (372,3.0), (373,4.0), (374,2.0), (378,3.0), (413,3.0), (417,4.0), (419,1.0), (42

lastException = null
ratingsRDD = MapPartitionsRDD[27] at map at <console>:32
groupedRatingsRDD = MapPartitionsRDD[29] at mapValues at <console>:42


MapPartitionsRDD[29] at mapValues at <console>:42

In [12]:
// Create a folder in HDFS for each of the first 10 userIds and write that user's ratings to ratings.txt

import org.apache.spark.rdd.RDD
import org.apache.hadoop.fs.{FileSystem, Path}
import java.io.{BufferedWriter, OutputStreamWriter}

// Root HDFS directory under which one sub-folder per user is created.
val outputPath = "hdfs://cluster-fcbe-m/spark/Day16_17/CS2/user-data"

// Only a 10-user sample is brought to the driver and written out.
val first10Users = groupedRatingsRDD.take(10)

// `locally` keeps the FileSystem handle scoped to the write loop instead of
// adding another top-level name to the notebook session.
locally {
  // Resolve the FileSystem once: it does not depend on the user being written,
  // so there is no need to rebuild a Configuration on every iteration.
  // It is deliberately not closed here: FileSystem.get may hand back a cached
  // instance that other code in this JVM (including Spark) is still using.
  val fs = FileSystem.get(new java.net.URI("hdfs://cluster-fcbe-m"), new org.apache.hadoop.conf.Configuration())

  first10Users.foreach { case (userId, ratingsList) =>
    // Each user gets <outputPath>/<userId>/ratings.txt
    val path = new Path(s"${outputPath}/${userId}/ratings.txt")

    // mkdirs behaves like `mkdir -p`, so an already existing folder is fine
    // and no separate exists() check is needed.
    fs.mkdirs(path.getParent)

    // One "movieId, rating" line per rating.
    val ratingsText = ratingsList.map { case (movieId, rating) =>
      s"${movieId}, ${rating}"
    }.mkString("\n")

    // Explicit UTF-8 so the file content does not depend on the JVM's default charset.
    val writer = new BufferedWriter(
      new OutputStreamWriter(fs.create(path), java.nio.charset.StandardCharsets.UTF_8)
    )

    // Closing the writer flushes it and closes the underlying HDFS stream;
    // doing so in `finally` releases the stream even when the write fails.
    try {
      writer.write(ratingsText)
    } finally {
      writer.close()
    }
  }
}

lastException = null
outputPath = hdfs://cluster-fcbe-m/spark/Day16_17/CS2/user-data
first10Users = Array((140868,List((5,3.0), (6,4.0), (7,4.0), (11,4.0), (17,3.0), (18,3.0), (21,4.0), (25,4.0), (32,3.0), (45,3.0), (50,5.0), (52,4.0), (57,4.0), (62,3.0), (64,3.0), (65,2.0), (70,3.0), (74,3.0), (75,3.0), (82,5.0), (85,4.0), (90,4.0), (93,3.0), (95,3.0), (96,5.0), (100,4.0), (102,3.0), (112,3.0), (125,4.0), (135,3.0), (141,4.0), (163,3.0), (171,5.0), (174,3.0), (176,5.0), (186,3.0), (187,3.0), (189,4.0), (194,4.0), (195,3.0), (203,3.0), (206,4.0), (215,3.0), (223,4.0), (224,4.0), (231,1.0), (232,5.0), (234,4.0), (235,5.0), (236,3.0...


Array((140868,List((5,3.0), (6,4.0), (7,4.0), (11,4.0), (17,3.0), (18,3.0), (21,4.0), (25,4.0), (32,3.0), (45,3.0), (50,5.0), (52,4.0), (57,4.0), (62,3.0), (64,3.0), (65,2.0), (70,3.0), (74,3.0), (75,3.0), (82,5.0), (85,4.0), (90,4.0), (93,3.0), (95,3.0), (96,5.0), (100,4.0), (102,3.0), (112,3.0), (125,4.0), (135,3.0), (141,4.0), (163,3.0), (171,5.0), (174,3.0), (176,5.0), (186,3.0), (187,3.0), (189,4.0), (194,4.0), (195,3.0), (203,3.0), (206,4.0), (215,3.0), (223,4.0), (224,4.0), (231,1.0), (232,5.0), (234,4.0), (235,5.0), (236,3.0...