In [1]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

// Obtain the SparkSession used by every cell below. getOrCreate() returns an
// already-active session when one exists instead of building a second one.
val spark = {
  val sessionBuilder = SparkSession.builder()
  sessionBuilder.appName("Handling Incomplete Metadata").getOrCreate()
}


spark = org.apache.spark.sql.SparkSession@332cf136


org.apache.spark.sql.SparkSession@332cf136

In [2]:
// Storage locations used by the cells below.
// Input: movies CSV (read with a header row).
val moviesPath = "gs://bhargav-assignments/Day16And17Task/movie.csv"
// Destination for the extracted (movieId, releaseYear) metadata, written as JSON.
// NOTE(review): Spark writes a directory of part files at this path, not a single .json file.
val metadataPath = "gs://bhargav-assignments/Day16And17Task/metadata.json"
// Destination for the enriched movies, written as Parquet. Unlike the two paths above,
// this one is on HDFS rather than GCS.
val outputPath = "hdfs:///user/Day16AndDay17/case_study_3/enriched-movies"

moviesPath = gs://bhargav-assignments/Day16And17Task/movie.csv
metadataPath = gs://bhargav-assignments/Day16And17Task/metadata.json
outputPath = hdfs:///user/Day16AndDay17/case_study_3/enriched-movies


hdfs:///user/Day16AndDay17/case_study_3/enriched-movies

In [3]:
// Read the movies CSV at moviesPath, treating the first line as the header row.
// No schema is supplied and inference is not enabled, so every column is read as a string.
val moviesDF = spark.read
  .format("csv")
  .option("header", "true")
  .load(moviesPath)

// Preview the first five rows without truncating long cell values
println("Movies DataFrame:")
moviesDF.show(5, false)

Movies DataFrame:
+-------+----------------------------------+-------------------------------------------+
|movieId|title                             |genres                                     |
+-------+----------------------------------+-------------------------------------------+
|1      |Toy Story (1995)                  |Adventure|Animation|Children|Comedy|Fantasy|
|2      |Jumanji (1995)                    |Adventure|Children|Fantasy                 |
|3      |Grumpier Old Men (1995)           |Comedy|Romance                             |
|4      |Waiting to Exhale (1995)          |Comedy|Drama|Romance                       |
|5      |Father of the Bride Part II (1995)|Comedy                                     |
+-------+----------------------------------+-------------------------------------------+
only showing top 5 rows



moviesDF = [movieId: string, title: string ... 1 more field]


[movieId: string, title: string ... 1 more field]

In [4]:
// Derive per-movie metadata (movieId, releaseYear) from the movie titles.
val metadataDF = {
  // First parenthesised four-digit group in the title, e.g. "Toy Story (1995)" -> "1995".
  // regexp_extract yields an empty string (not null) when the pattern does not match.
  val yearText = regexp_extract(col("title"), "\\((\\d{4})\\)", 1)

  moviesDF.select(
    col("movieId").cast(IntegerType),
    // Only cast when a year was actually captured. Relying on "".cast(IntegerType)
    // silently becoming null works in non-ANSI mode only; with ANSI mode enabled the
    // cast of an empty string raises an error. Titles without a year (or null titles)
    // fall back to 2000.
    when(yearText =!= "", yearText.cast(IntegerType))
      .otherwise(lit(2000))
      .as("releaseYear")
  )
}

// Show extracted metadata
println("Extracted Metadata DataFrame:")
metadataDF.show(5, truncate = false)

// Save metadata as JSON at metadataPath (a gs:// location in this notebook, not HDFS),
// replacing any previous output there
metadataDF.write
  .mode("overwrite")
  .json(metadataPath)

println(s"Metadata saved to $metadataPath.")

Extracted Metadata DataFrame:
+-------+-----------+
|movieId|releaseYear|
+-------+-----------+
|1      |1995       |
|2      |1995       |
|3      |1995       |
|4      |1995       |
|5      |1995       |
+-------+-----------+
only showing top 5 rows

Metadata saved to gs://bhargav-assignments/Day16And17Task/metadata.json.


metadataDF = [movieId: int, releaseYear: int]


[movieId: int, releaseYear: int]

In [5]:
// Enrich the movies DataFrame with releaseYear from the metadata.
// A left outer join keeps every row of moviesDF; a movie with no matching metadata row
// gets a null releaseYear. (The previous coalesce(col("releaseYear"), lit(null)) was a
// no-op: falling back to null leaves nulls unchanged, so it has been removed.)
// NOTE(review): moviesDF.movieId is a string while metadataDF.movieId is an int, so the
// join relies on Spark's implicit cast when comparing keys -- confirm this is intended.
val enrichedMoviesDF = moviesDF
  .join(metadataDF, Seq("movieId"), "left_outer")

// Show enriched movies DataFrame
println("Enriched Movies DataFrame:")
enrichedMoviesDF.show(5, truncate = false)


Enriched Movies DataFrame:
+-------+----------------------------------+-------------------------------------------+-----------+
|movieId|title                             |genres                                     |releaseYear|
+-------+----------------------------------+-------------------------------------------+-----------+
|1      |Toy Story (1995)                  |Adventure|Animation|Children|Comedy|Fantasy|1995       |
|2      |Jumanji (1995)                    |Adventure|Children|Fantasy                 |1995       |
|3      |Grumpier Old Men (1995)           |Comedy|Romance                             |1995       |
|4      |Waiting to Exhale (1995)          |Comedy|Drama|Romance                       |1995       |
|5      |Father of the Bride Part II (1995)|Comedy                                     |1995       |
+-------+----------------------------------+-------------------------------------------+-----------+
only showing top 5 rows



enrichedMoviesDF = [movieId: string, title: string ... 2 more fields]


[movieId: string, title: string ... 2 more fields]

In [6]:
// Persist the enriched movies as Parquet at outputPath, replacing any previous output there
enrichedMoviesDF.write
  .format("parquet")
  .mode("overwrite")
  .save(outputPath)

// Report where the data was written
println(s"Enriched Movies DataFrame saved to $outputPath.")

Enriched Movies DataFrame saved to hdfs:///user/Day16AndDay17/case_study_3/enriched-movies.


In [7]:
// Stop the SparkSession created in the first cell. No further Spark operations
// can be run on `spark` after this call.
spark.stop()