In [14]:
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType, TimestampType
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col
from pyspark.ml.recommendation import ALSModel
from kafka import KafkaProducer
import json
import time
import random


# Configuration Spark
spark = SparkSession.builder \
    .appName("RealTimeRecommendations") \
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.4.0") \
    .getOrCreate()

In [15]:
# Configuration Kafka
KAFKA_BROKER = 'bigdata-container:9092'
TOPIC_NAME = 'movieratings'

# Création du producer
producer = KafkaProducer(
    bootstrap_servers=KAFKA_BROKER,
    value_serializer=lambda v: json.dumps(v).encode('utf-8')
)

# Génération de données
def generate_rating():
    return {
        'userId': random.randint(1, 1000),
        'movieId': random.randint(1, 1000),
        'rating': round(random.uniform(0.5, 5.0), 1),
        'timestamp': int(time.time())
    }

# Envoi des messages
for _ in range(50): 
    message = generate_rating()
    producer.send(TOPIC_NAME, message)
    print(f"Message envoyé: {message}")
    time.sleep(1)

Message envoyé: {'userId': 116, 'movieId': 470, 'rating': 1.6, 'timestamp': 1746182741}
Message envoyé: {'userId': 336, 'movieId': 628, 'rating': 0.8, 'timestamp': 1746182742}
-------------------------------------------
Batch: 136
-------------------------------------------
-------------------------------------------
Batch: 90
-------------------------------------------
+------+-------+----------+
|userId|movieId|prediction|
+------+-------+----------+
|   116|    470| 0.9257184|
+------+-------+----------+

+------+-------+----------+
|userId|movieId|prediction|
+------+-------+----------+
|   116|    470| 0.9257184|
+------+-------+----------+

Message envoyé: {'userId': 34, 'movieId': 82, 'rating': 2.8, 'timestamp': 1746182743}
-------------------------------------------
Batch: 137
-------------------------------------------
-------------------------------------------
Batch: 91
-------------------------------------------
+------+-------+----------+
|userId|movieId|prediction|
+-----

                                                                                

Message envoyé: {'userId': 535, 'movieId': 631, 'rating': 2.8, 'timestamp': 1746182753}
-------------------------------------------
Batch: 101
-------------------------------------------
-------------------------------------------
Batch: 147
-------------------------------------------
+------+-------+----------+
|userId|movieId|prediction|
+------+-------+----------+
|   457|    629| 4.0739007|
+------+-------+----------+

+------+-------+----------+
|userId|movieId|prediction|
+------+-------+----------+
|   457|    629| 4.0739007|
+------+-------+----------+

Message envoyé: {'userId': 992, 'movieId': 366, 'rating': 4.8, 'timestamp': 1746182754}
-------------------------------------------
Batch: 102
-------------------------------------------
-------------------------------------------
Batch: 148
-------------------------------------------
+------+-------+----------+
|userId|movieId|prediction|
+------+-------+----------+
|   535|    631| 1.5626397|
+------+-------+----------+

+----

                                                                                

-------------------------------------------
Batch: 115
-------------------------------------------
-------------------------------------------
Batch: 161
-------------------------------------------
+------+-------+----------+
|userId|movieId|prediction|
+------+-------+----------+
|   568|    952| 3.9887948|
+------+-------+----------+

+------+-------+----------+
|userId|movieId|prediction|
+------+-------+----------+
|   568|    952| 3.9887948|
+------+-------+----------+

Message envoyé: {'userId': 11, 'movieId': 746, 'rating': 2.0, 'timestamp': 1746182768}
-------------------------------------------
Batch: 116
-------------------------------------------
-------------------------------------------
Batch: 162
-------------------------------------------
+------+-------+----------+
|userId|movieId|prediction|
+------+-------+----------+
|   107|    516| 3.6655838|
+------+-------+----------+

+------+-------+----------+
|userId|movieId|prediction|
+------+-------+----------+
|   107|  

                                                                                

Message envoyé: {'userId': 67, 'movieId': 325, 'rating': 2.7, 'timestamp': 1746182783}
-------------------------------------------
Batch: 131
-------------------------------------------
-------------------------------------------
Batch: 177
-------------------------------------------
+------+-------+----------+
|userId|movieId|prediction|
+------+-------+----------+
|   817|    530| 3.2780533|
+------+-------+----------+

+------+-------+----------+
|userId|movieId|prediction|
+------+-------+----------+
|   817|    530| 3.2780533|
+------+-------+----------+

Message envoyé: {'userId': 770, 'movieId': 220, 'rating': 1.8, 'timestamp': 1746182784}
-------------------------------------------
Batch: 178
-------------------------------------------
-------------------------------------------
Batch: 132
-------------------------------------------
+------+-------+----------+
|userId|movieId|prediction|
+------+-------+----------+
|    67|    325| 2.4190583|
+------+-------+----------+

+-----

In [16]:
# Schéma des messages
schema = StructType([
    StructField("userId", IntegerType(), True),
    StructField("movieId", IntegerType(), True),
    StructField("rating", FloatType(), True),
    StructField("timestamp", TimestampType(), True)
])

# Configuration du streaming Kafka
kafka_stream = spark.readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", KAFKA_BROKER) \
    .option("subscribe", TOPIC_NAME) \
    .load()

# Transformation des messages
parsed_stream = kafka_stream.selectExpr("CAST(value AS STRING)") \
    .select(from_json(col("value"), schema).alias("data")) \
    .select("data.*")

# Chargement du modèle et prédictions
model_path = "hdfs://namenode:9000/models/als_recommender"
als_model = ALSModel.load(model_path)
predictions = als_model.transform(parsed_stream)

# Configuration et démarrage du streaming
query = predictions.select("userId", "movieId", "rating") \
    .writeStream \
    .outputMode("append") \
    .format("console") \
    .start()

# Attente des résultats
query.awaitTermination(timeout=60)

-------------------------------------------
Batch: 184
-------------------------------------------
+------+-------+----------+
|userId|movieId|prediction|
+------+-------+----------+
|   199|      8| 3.1439137|
|    61|    885| 2.0062792|
+------+-------+----------+

-------------------------------------------
Batch: 138
-------------------------------------------
+------+-------+----------+
|userId|movieId|prediction|
+------+-------+----------+
|    61|    885| 2.0062792|
+------+-------+----------+



25/05/02 10:46:32 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-46fca008-1180-4be3-bf0d-a23bd1965bb9. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/05/02 10:46:32 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/05/02 10:46:32 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.


-------------------------------------------
Batch: 0
-------------------------------------------
+------+-------+----------+
|userId|movieId|prediction|
+------+-------+----------+
+------+-------+----------+



False