In [2]:
import org.apache.spark.sql.functions.{explode, split, col}
import spark.implicits._

In [1]:
// Build (or reuse) the SparkSession for this notebook, using a local master,
// and keep a handle to the SparkContext that backs it.
val spark = SparkSession
  .builder()
  .appName("Movie Ratings DataSet")
  .master("local[*]")
  .getOrCreate()
val sparkConext = spark.sparkContext

spark = org.apache.spark.sql.SparkSession@2a56f541
sparkConext = org.apache.spark.SparkContext@48bababb


org.apache.spark.SparkContext@48bababb

In [3]:
// Locations of the source CSV files.
val moviesDffPath = "gs://artifacts_spark_jobs/movie.csv"
val ratingDffPath = "gs://artifacts_spark_jobs/rating.csv"

// Read both CSV files from source, treating the first line as a header and
// letting Spark infer the column types.
val moviesDff = spark.read
  .option("header", "true")
  .option("inferSchema", "true")
  .csv(moviesDffPath)
val ratingsDff = spark.read
  .option("header", "true")
  .option("inferSchema", "true")
  .csv(ratingDffPath)

moviesDffPath = gs://artifacts_spark_jobs/movie.csv
ratingDffPath = gs://artifacts_spark_jobs/rating.csv
moviesDff = [movieId: int, title: string ... 1 more field]
ratingsDff = [userId: int, movieId: int ... 2 more fields]


[userId: int, movieId: int ... 2 more fields]

In [6]:
// Produce one row per (movie, genre): the pipe-delimited `genres` string is
// split on "|" and exploded, so a movie with several genres yields several rows.
val explodedGenresDF = moviesDff.select(
  col("movieId"),
  col("title"),
  explode(split(col("genres"), "\\|")).as("genres")
)

explodedGenresDF = [movieId: int, title: string ... 1 more field]


[movieId: int, title: string ... 1 more field]

In [7]:
// Drop down from the DataFrame API to an RDD of Rows for the RDD-based steps below.
val moviesRDD: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row] = explodedGenresDF.rdd

/** Maps a raw genre label to its standardized display name.
  *
  * Three labels are rewritten ("Sci-Fi", "Film-Noir" and "(no genres listed)");
  * every other value is returned unchanged.
  *
  * @param genre the genre label as it appears in the source data
  * @return the standardized label, or `genre` itself when no rewrite applies
  */
def standardizeGenre(genre: String): String = genre match {
  case "Sci-Fi"             => "Science Fiction"
  case "Film-Noir"          => "Black and White"
  case "(no genres listed)" => "Others"
  case other                => other
}


// Re-key every (movie, genre) row as (movieId, (title, standardizedGenre)) so it
// can be joined with other pair RDDs keyed by movieId.
val standarizedData = moviesRDD.map { row =>
  val id = row.getAs[Int]("movieId")
  val title = row.getAs[String]("title")
  val cleanedGenre = standardizeGenre(row.getAs[String]("genres"))
  id -> (title -> cleanedGenre)
}

moviesRDD = MapPartitionsRDD[25] at rdd at <console>:29
standarizedData = MapPartitionsRDD[26] at map at <console>:41


standardizeGenre: (genre: String)String


MapPartitionsRDD[26] at map at <console>:41

In [8]:
// Project each rating row down to (movieId, rating), keyed by movieId for the join.
// NOTE(review): Row.getAs on a primitive type yields the zero value for a null
// cell; this assumes the ratings file has no missing movieId/rating values — confirm.
val ratingsRDD = ratingsDff.rdd.map { ratingRow =>
  (ratingRow.getAs[Int]("movieId"), ratingRow.getAs[Double]("rating"))
}

ratingsRDD = MapPartitionsRDD[32] at map at <console>:29


MapPartitionsRDD[32] at map at <console>:29

In [9]:
// Inner join on movieId: each rating is paired with every (title, genre) entry
// of its movie, giving (movieId, (rating, (title, genre))).
val moviesWithRatings: org.apache.spark.rdd.RDD[(Int, (Double, (String, String)))] =
  ratingsRDD.join(standarizedData)

moviesWithRatings = MapPartitionsRDD[35] at join at <console>:30


MapPartitionsRDD[35] at join at <console>:30

In [10]:
// Average rating per genre, as (genre, average).
//
// Each joined record contributes (rating, 1) under its genre; the pairs are
// summed per key and the total is divided by the number of ratings.
// The counter is a Long so that very large inputs cannot silently wrap around
// the way an Int counter would.
val genreAverageRatings = moviesWithRatings
  .map { case (_, (rating, (_, genre))) =>
    (genre, (rating, 1L))
  }
  .reduceByKey { case ((sum1, count1), (sum2, count2)) =>
    (sum1 + sum2, count1 + count2)
  }
  .mapValues { case (sum, count) =>
    // count is at least 1 for every key that exists, so this never divides by zero.
    sum / count.toDouble
  }

genreAverageRatings = MapPartitionsRDD[38] at mapValues at <console>:36


MapPartitionsRDD[38] at mapValues at <console>:36

In [11]:
// Turn the (genre, average) pair RDD into a DataFrame with named columns.
val genreAverageDF =
  genreAverageRatings.toDF("genre", "average_rating")

genreAverageDF = [genre: string, average_rating: double]


[genre: string, average_rating: double]

In [12]:
// HDFS location used for the Parquet output of this notebook.
val hdfsPath: String = "hdfs:/user/hdfs/CaseStudies"

hdfsPath = hdfs:/user/hdfs/CaseStudies


hdfs:/user/hdfs/CaseStudies

In [None]:
// Persist the per-genre averages as Parquet. "overwrite" mode replaces any data
// that already exists at hdfsPath.
genreAverageDF.write
  .format("parquet")
  .mode("overwrite")
  .save(hdfsPath)

In [None]:
// Read the Parquet output back and print the first rows as a sanity check.
val checkDF = spark.read.format("parquet").load(hdfsPath)
checkDF.show()