In [None]:
from time import sleep

from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, col, from_json
from pyspark.sql.types import (
    DoubleType,
    IntegerType,
    LongType,
    StringType,
    StructField,
    StructType,
)

# Spark configuration for the streaming application: cluster master URL,
# application name, and driver/executor resource settings.
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("StreamingReviews")
    .set("spark.driver.memory", "2g")
    .set("spark.executor.cores", "1")
    .set("spark.driver.cores", "1")
)

# Schema of one review record. Field order:
# Review_ID,Movie_ID,Reviewer_Name,Review_Rating,Reviewer_Nationality,Reviewer_Age,Review_Date,Sex
# Every field is nullable. The two numeric fields use IntegerType, which is
# the integer type exported by pyspark.sql.types.
dataSchema = StructType(
    [
        StructField("Review_ID", StringType(), True),
        StructField("Movie_ID", StringType(), True),
        StructField("Reviewer_Name", StringType(), True),
        StructField("Review_Rating", IntegerType(), True),
        StructField("Reviewer_Nationality", StringType(), True),
        StructField("Reviewer_Age", IntegerType(), True),
        StructField("Review_Date", StringType(), True),
        StructField("Sex", StringType(), True),
    ]
)

# Build (or reuse) the Spark session from the configuration above; the session
# is the entry point to the Spark SQL engine.
spark = (
    SparkSession.builder
    .config(conf=sparkConf)
    .getOrCreate()
)

# Register the Google Cloud Storage connector classes with Hadoop so that
# paths using the gs:// scheme can be resolved.
conf = spark.sparkContext._jsc.hadoopConfiguration()
for _gcs_key, _gcs_impl in (
    ("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem"),
    ("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS"),
):
    conf.set(_gcs_key, _gcs_impl)

# Cloud Storage bucket the BigQuery connector uses for temporary export data.
bucket = "temp_de2024_2069997"
spark.conf.set("temporaryGcsBucket", bucket)

# Subscribe to the "reviews" Kafka topic as a streaming source. Only records
# that arrive after the query starts are read (startingOffsets = latest).
kafkaStream = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka1:9093") \
    .option("subscribe", "reviews") \
    .option("startingOffsets", "latest") \
    .load()

# Cast the Kafka message value to a string so it can be parsed.
df = kafkaStream.selectExpr("CAST(value AS STRING)")

# Parse the payload into a struct following dataSchema. The struct column is
# aliased explicitly so the expansion below does not depend on the column name
# Spark auto-generates for the from_json expression.
# NOTE(review): this assumes the message value is a JSON document - confirm
# against the producer.
df1 = df.select(from_json(col("value"), dataSchema).alias("review"))

df1.printSchema()

# Flatten the struct into one top-level column per schema field.
sdf = df1.select("review.*")

sdf.printSchema()

# NOTE(review): sdf is a streaming DataFrame, and the plan below aggregates it,
# joins the result back onto the same stream and aggregates again. Structured
# Streaming restricts chained aggregations and joins after an aggregation -
# confirm the target Spark version accepts this plan.

# Step 1: Aggregate all ratings for each nationality and calculate their average
ratings_by_nationality = (
    sdf
    .groupBy("Reviewer_Nationality")
    .agg(avg("Review_Rating").alias("Average_Rating"))
)

# Step 2: Sort nationalities by their average ratings (lowest first)
sorted_nationalities = ratings_by_nationality.orderBy(col("Average_Rating").asc())

# Step 3: Extract the nationality with the lowest average rating
lowest_avg_nationality = (
    sorted_nationalities
    .limit(1)
    .select("Reviewer_Nationality")
    .alias("Lowest_Nationality")
)

# Keep only the reviews written by that nationality. The join is expressed on
# the column name rather than on sdf[...] == lowest_avg_nationality[...]:
# both sides derive from sdf, so comparing the two column objects would be an
# ambiguous self-join reference.
lowest_nationality_reviews = sdf.join(
    lowest_avg_nationality,
    on="Reviewer_Nationality",
    how="inner",
)

# Step 4: Find the top 10 highest-rated movies for this nationality
top_movies_for_lowest_nationality = (
    lowest_nationality_reviews
    .groupBy("Movie_ID")
    .agg(avg("Review_Rating").alias("Movie_Average_Rating"))
    .orderBy(col("Movie_Average_Rating").desc())
    .limit(10)
)

def my_foreach_batch_function(df, batch_id):
    """Save one micro-batch to BigQuery through the batch writer API.

    Args:
        df: DataFrame holding the result for the current micro-batch.
        batch_id: Identifier of the micro-batch; not used by this function.
    """
    target_table = 'dataengineering-439112.labdataset.topmoviesforlowestnationality'
    writer = df.write.format('bigquery')
    writer = writer.option('table', target_table)
    # "overwrite" mode: each invocation replaces what the table held before.
    writer.mode("overwrite").save()

# Start the streaming query. Every 2 seconds the complete result table is
# handed to my_foreach_batch_function, which acts as the sink.
query = (
    top_movies_for_lowest_nationality
    .writeStream
    .outputMode("complete")
    .trigger(processingTime="2 seconds")
    .foreachBatch(my_foreach_batch_function)
    .start()
)

try:
    # Block until the query terminates or the user interrupts it.
    query.awaitTermination()
except KeyboardInterrupt:
    query.stop()
finally:
    # Stop the spark context on every exit path (interrupt, query failure or
    # normal termination) so the application does not keep running.
    spark.stop()
    print("Stoped the streaming query and the spark context")