In [None]:
import os
from datetime import datetime

from pymongo import MongoClient
from pyspark.ml.recommendation import ALSModel
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import *

# Spark configuration: build (or reuse) the session and request the Kafka
# source connector package for Structured Streaming.
spark = (
    SparkSession.builder
    .appName("MovieRecommender")
    .config(
        "spark.jars.packages",
        "org.apache.spark:spark-sql-kafka-0-10_2.12:3.4.0",
    )
    .getOrCreate()
)

# Message schema: every field of the JSON payload is declared nullable.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("userId", IntegerType()),
        ("movieId", IntegerType()),
        ("rating", FloatType()),
        ("timestamp", StringType()),
    )
])

# MongoDB configuration
# The connection string is taken from the MONGODB_URI environment variable so
# credentials do not have to live in source; when the variable is unset the
# previously hard-coded URI is used, so existing deployments keep working.
client = MongoClient(
    os.environ.get(
        "MONGODB_URI",
        'mongodb://root:example@mongodb:27017/admin?authSource=admin',
    )
)
db = client['moviedb']
predictions_collection = db['predictions']

def process_batch(df, epoch_id):
    """Save one micro-batch of predictions to MongoDB.

    Intended as a ``foreachBatch`` sink. The rows are collected, converted to
    plain dictionaries (no pandas involved) and inserted into
    ``predictions_collection`` in a single ``insert_many`` call.

    Args:
        df: Micro-batch DataFrame. The columns ``userId``, ``movieId``,
            ``prediction`` and ``timestamp`` are selected from it.
        epoch_id: Micro-batch identifier; only used in the printed messages.

    Returns:
        None. Any exception is caught and printed rather than propagated.
    """
    try:
        # Collect once and test the resulting list. The previous
        # ``df.isEmpty()`` + ``collect()`` pair evaluated the batch twice,
        # which is costly when every batch runs the model.
        rows = df.select("userId", "movieId", "prediction", "timestamp").collect()
        if not rows:
            return

        documents = [row.asDict() for row in rows]

        # Save to MongoDB
        predictions_collection.insert_many(documents)

        print(f"\nBatch {epoch_id}: {len(documents)} prédictions sauvegardées")

    except Exception as e:
        # Deliberately best-effort: report the failure and let the stream go on.
        # NOTE(review): a batch that fails here is not retried, so its
        # predictions are presumably lost — confirm this is acceptable.
        print(f"Erreur dans le batch {epoch_id}: {e}")
# Streaming source configuration: subscribe to the "movieratings" Kafka topic
# with startingOffsets set to "latest".
streaming_df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "bigdata-container:9092")
    .option("subscribe", "movieratings")
    .option("startingOffsets", "latest")
    .load()
)

# Load the trained ALS model
als_model = ALSModel.load("hdfs://bigdata-container:9000/models/als_recommender")

# Processing pipeline: decode the Kafka value as a JSON string, parse it with
# the message schema, flatten the struct, then score each row with the model.
predictions_df = (
    streaming_df
    .selectExpr("CAST(value AS STRING)")
    .select(from_json(col("value"), schema).alias("data"))
    .select("data.*")
    # from_json yields null fields for payloads that cannot be parsed, so a
    # single malformed message would otherwise reach the model without IDs.
    # NOTE(review): null IDs are expected to make ALS transform fail —
    # confirm against the Spark version in use.
    .where(col("userId").isNotNull() & col("movieId").isNotNull())
    .transform(als_model.transform)
)

# Start the streaming query: append mode, a 2-second processing-time trigger,
# and process_batch as the per-batch sink.
query = (
    predictions_df.writeStream
    .outputMode("append")
    .trigger(processingTime="2 seconds")
    .foreachBatch(process_batch)
    .start()
)

print("Consumer démarré - En attente de messages...")

# Block the driver until the query stops or fails.
query.awaitTermination()

:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-9da33ccc-d811-4ca5-ab01-dcb2864a1fef;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.4.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.4.0 in central
	found org.apache.kafka#kafka-clients;3.3.2 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.9.1 in central
	found org.slf4j#slf4j-api;2.0.6 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.apache.commons#commons-pool2;2.11.1 in central
downloading https://repo1.maven.org/maven2/org/apache/spark/spark-sql-kafka-0

Consumer démarré - En attente de messages...


25/05/05 03:50:53 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
25/05/05 03:51:04 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 2000 milliseconds, but spent 11807 milliseconds
                                                                                


Batch 1: 5 prédictions sauvegardées


25/05/05 03:51:17 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 2000 milliseconds, but spent 13151 milliseconds
25/05/05 03:51:20 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 2000 milliseconds, but spent 2616 milliseconds



Batch 2: 7 prédictions sauvegardées

Batch 4: 1 prédictions sauvegardées


25/05/05 03:51:24 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 2000 milliseconds, but spent 2362 milliseconds
                                                                                


Batch 5: 1 prédictions sauvegardées


25/05/05 03:51:30 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 2000 milliseconds, but spent 4294 milliseconds



Batch 6: 1 prédictions sauvegardées

Batch 7: 2 prédictions sauvegardées

Batch 8: 1 prédictions sauvegardées

Batch 9: 1 prédictions sauvegardées


                                                                                


Batch 10: 1 prédictions sauvegardées


25/05/05 03:51:38 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 2000 milliseconds, but spent 2510 milliseconds



Batch 11: 1 prédictions sauvegardées

Batch 12: 1 prédictions sauvegardées


25/05/05 03:51:42 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 2000 milliseconds, but spent 2286 milliseconds



Batch 13: 1 prédictions sauvegardées


25/05/05 03:51:44 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 2000 milliseconds, but spent 2196 milliseconds



Batch 14: 1 prédictions sauvegardées


                                                                                


Batch 15: 1 prédictions sauvegardées

Batch 16: 1 prédictions sauvegardées


25/05/05 03:51:52 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 2000 milliseconds, but spent 3974 milliseconds



Batch 17: 2 prédictions sauvegardées

Batch 18: 1 prédictions sauvegardées


                                                                                


Batch 19: 1 prédictions sauvegardées


                                                                                


Batch 20: 1 prédictions sauvegardées

Batch 21: 1 prédictions sauvegardées


25/05/05 03:52:02 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 2000 milliseconds, but spent 2341 milliseconds



Batch 22: 1 prédictions sauvegardées

Batch 23: 1 prédictions sauvegardées

Batch 24: 1 prédictions sauvegardées

Batch 25: 1 prédictions sauvegardées


                                                                                


Batch 26: 1 prédictions sauvegardées


25/05/05 03:52:12 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 2000 milliseconds, but spent 2284 milliseconds
                                                                                


Batch 27: 1 prédictions sauvegardées

Batch 28: 1 prédictions sauvegardées


                                                                                