In [3]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import scala.util.Random

// Obtain the notebook's Spark session (getOrCreate reuses an existing one if present).
val spark = SparkSession.builder()
  .appName("Generate Metadata JSON")
  .getOrCreate()

// Movie catalogue CSV; read with a header row and no schema inference requested.
val moviesPath = "gs://first-job-bucket/movies.csv"
val moviesDF = spark.read.option("header", "true").csv(moviesPath)

// Extracts the first parenthesised four-digit year from a title, e.g. "Heat (1995)" -> "1995".
// When the title is null or carries no such year, a random year in [1980, 2023] is
// synthesised instead. Because of that random fallback the UDF is flagged as
// non-deterministic, so Spark will not assume repeated evaluations yield the same value.
val extractYear = udf((title: String) => {
  val yearPattern = "\\((\\d{4})\\)".r
  val fallbackFirstYear = 1980
  val fallbackLastYear = 2023

  // Option(...) guards against null titles, which would otherwise throw inside the regex call.
  Option(title)
    .flatMap(t => yearPattern.findFirstMatchIn(t))
    .map(_.group(1))
    .getOrElse {
      (fallbackFirstYear + Random.nextInt(fallbackLastYear - fallbackFirstYear + 1)).toString
    }
}).asNondeterministic()

// Metadata = movie id, title and the (extracted or synthesised) release year.
val metadataDF = moviesDF
  .select("movieId", "title")
  .withColumn("releaseYear", extractYear(col("title")))

val outputPath = "gs://first-job-bucket/metadata.json"

// coalesce(1) so the JSON output is produced by a single partition; existing output is replaced.
metadataDF.coalesce(1)
  .write
  .mode("overwrite")
  .json(outputPath)

println(s"Metadata written successfully to $outputPath!")

Metadata written successfully to gs://first-job-bucket/metadata.json!


lastException = null
spark = org.apache.spark.sql.SparkSession@343e1dd
moviesPath = gs://first-job-bucket/movies.csv
moviesDF = [movieId: string, title: string ... 1 more field]
extractYear = SparkUserDefinedFunction($Lambda$5216/0x0000000801f31040@545083dd,StringType,List(Some(class[value[0]: string])),Some(class[value[0]: string]),None,true,true)
metadataDF = [movieId: string, title: string ... 1 more field]
outputPath = gs://first-job-bucket/metadata.json


gs://first-job-bucket/metadata.json

In [4]:
import org.json4s._
import org.json4s.jackson.JsonMethods._
import java.io.PrintWriter

// Load the previously written metadata as plain text: one JSON document per line.
val metadataRDD = spark.sparkContext.textFile("gs://first-job-bucket/metadata.json")

// Decode every line with json4s and keep only the (movieId, releaseYear) pair.
val parsedMetadataRDD = metadataRDD.map { jsonLine =>
  implicit val formats = DefaultFormats
  val document = parse(jsonLine)
  (
    (document \ "movieId").extract[String],
    (document \ "releaseYear").extract[String]
  )
}

// Expose the pairs as a two-column DataFrame and preview the first rows.
val metadataFromJsonDF = parsedMetadataRDD.toDF("movieId", "releaseYear")
metadataFromJsonDF.show(5)

+-------+-----------+
|movieId|releaseYear|
+-------+-----------+
|      1|       1995|
|      2|       1995|
|      3|       1995|
|      4|       1995|
|      5|       1995|
+-------+-----------+
only showing top 5 rows



metadataRDD = gs://first-job-bucket/metadata.json MapPartitionsRDD[16] at textFile at <console>:33
parsedMetadataRDD = MapPartitionsRDD[17] at map at <console>:34
metadataFromJsonDF = [movieId: string, releaseYear: string]


[movieId: string, releaseYear: string]

In [5]:
// Count the raw titles that do not end with a parenthesised four-digit year.
val missingYearCount = moviesDF
  .where(not(col("title").rlike("\\(\\d{4}\\)$")))
  .count()

// Report the validation outcome; a count of zero means every title carries a year.
println(
  if (missingYearCount == 0) "Validation passed: All movies have years in their titles."
  else s"Validation failed: $missingYearCount movies do not have years in their titles."
)

Validation failed: 797 movies do not have years in their titles.


missingYearCount = 797


797

In [6]:
// Append " (<releaseYear>)" to every title that does not already end with a
// parenthesised four-digit year, using the release year from the parsed metadata.
// Built-in column expressions are used instead of a row-by-row map so that null
// titles do not throw, and a missing release year (possible with the left join)
// leaves the title untouched rather than producing a literal "(null)" suffix.
val enrichedMoviesDF = {
  // Same pattern the validation cells use to decide whether a title carries a year.
  val hasYearSuffix = col("title").rlike("\\(\\d{4}\\)$")
  val keepTitleAsIs = hasYearSuffix || col("releaseYear").isNull

  moviesDF
    .join(metadataFromJsonDF, Seq("movieId"), "left")
    .select(
      col("movieId"),
      when(keepTitleAsIs, col("title"))
        .otherwise(concat(col("title"), lit(" ("), col("releaseYear"), lit(")")))
        .as("title"),
      col("genres")
    )
}

// Persist the enriched catalogue as Parquet, replacing any previous output.
val outputPath = "hdfs:///spark/Day16_17/CS3/enriched-movies"
enrichedMoviesDF.write.mode("overwrite").parquet(outputPath)

println("Enriched movies data saved successfully!")

Enriched movies data saved successfully!


enrichedMoviesDF = [movieId: string, title: string ... 1 more field]
outputPath = hdfs:///spark/Day16_17/CS3/enriched-movies


hdfs:///spark/Day16_17/CS3/enriched-movies

In [7]:
// Re-run the title check on the enriched data: count titles still lacking a trailing "(YYYY)".
val missingYearCount = enrichedMoviesDF
  .where(not(col("title").rlike("\\(\\d{4}\\)$")))
  .count()

// Report the validation outcome; a count of zero means every title carries a year.
println(
  if (missingYearCount == 0) "Validation passed: All movies have years in their titles."
  else s"Validation failed: $missingYearCount movies do not have years in their titles."
)

Validation passed: All movies have years in their titles.


missingYearCount = 0


0