In [None]:
from kafka import KafkaConsumer
import json

In [None]:
# Consumer subscribed to the 'movielens_ratings' topic on the local broker.
c = KafkaConsumer(
    'movielens_ratings',
    bootstrap_servers=['localhost:9092'],
)

In [None]:
def process_msg(msg):
    """Print a record's offset, then its JSON payload converted to a dict.

    Args:
        msg: Object exposing ``offset`` and a JSON-encoded ``value``.
    """
    print(msg.offset)
    decoded = json.loads(msg.value)
    record = dict(decoded)
    print(record)

In [None]:
# Hand every record yielded by the consumer `c` to process_msg, one at a time.
for msg in c:
    process_msg(msg)

In [None]:
from kafka import KafkaConsumer
import json

def _decode_json(raw):
    """Decode a UTF-8 encoded payload and parse it as JSON."""
    return json.loads(raw.decode('utf-8'))


# Consumer for the 'MoviesRatings' topic whose values are deserialized
# automatically through _decode_json.
consumer = KafkaConsumer(
    'MoviesRatings',
    bootstrap_servers=['localhost:9092'],
    auto_offset_reset='earliest',  # read from the beginning
    group_id='my-group',  # consumer group identifier
    value_deserializer=_decode_json,
)

# Print the offset and the already-deserialized value of each record.
for message in consumer:
    print(f"Offset: {message.offset}")
    print(f"Valeur: {message.value}")

In [None]:
!pip install numpy


In [None]:
!pip install pymongo


In [None]:
# streaming_recommendations.py
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json, explode, array, lit, to_json, struct
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, TimestampType
from pyspark.ml.recommendation import ALSModel
import time
import json
from pymongo import MongoClient
import datetime

# Create (or reuse) a Spark session, requesting the Kafka source and the
# MongoDB connector packages through spark.jars.packages.
spark = (
    SparkSession.builder
    .appName("MovieRecommendationStreaming")
    .config(
        "spark.jars.packages",
        "org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.2,org.mongodb.spark:mongo-spark-connector_2.12:3.0.1",
    )
    .getOrCreate()
)

# Schema of the incoming rating records (every field is nullable).
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("userId", IntegerType()),
        ("movieId", IntegerType()),
        ("rating", DoubleType()),
        ("timestamp", IntegerType()),
    )
])

# Load the previously trained ALS model.
model_path = "hdfs://namenode:9000/models/als"
model = ALSModel.load(model_path)


# MongoDB handles: client -> database -> collection receiving the documents.
mongo_uri = "mongodb://localhost:27017"
mongo_client = MongoClient(mongo_uri)
mongo_db = mongo_client["movie_recommender"]
mongo_collection = mongo_db["recommendations"]

# Generate recommendations for one micro-batch and save them to MongoDB
def process_batch(df, epoch_id):
    """Recommend movies for the users of one micro-batch and store the result.

    Used as the ``foreachBatch`` callback of the streaming query. For every
    distinct ``userId`` in ``df`` the module-level ALS ``model`` produces 10
    recommendations, which are flattened, joined with movie titles when the
    movies CSV can be read, shown on stdout and inserted into
    ``mongo_collection``.

    Args:
        df: DataFrame with the rows of the current micro-batch; it must
            expose a ``userId`` column.
        epoch_id: Identifier of the micro-batch, used only in log messages.

    Returns:
        None. An empty batch is skipped. Any ``Exception`` raised while
        processing is caught and printed rather than propagated.
    """
    if df.isEmpty():
        return

    try:
        # Distinct users present in this batch
        unique_users = df.select("userId").distinct()

        # Top-10 recommendations for each of those users
        recommendations = model.recommendForUserSubset(unique_users, 10)

        # Explode the recommendations array into one flat row per (user, movie)
        flat_recommendations = recommendations.select(
            col("userId"),
            explode(col("recommendations")).alias("rec")
        ).select(
            col("userId"),
            col("rec.movieId").alias("movieId"),
            col("rec.rating").alias("prediction")
        )

        # Join with the movie metadata when it is available; otherwise fall
        # back to the recommendations without titles.
        try:
            movies_df = spark.read.csv("hdfs://namenode:9000/datasets/movies.csv", header=True)
            recommendations_with_info = flat_recommendations.join(
                movies_df, flat_recommendations.movieId == movies_df.movieId
            ).select(
                flat_recommendations.userId,
                flat_recommendations.movieId,
                movies_df.title,
                flat_recommendations.prediction
            )
        except Exception as e:
            # A bare ``except:`` here used to swallow KeyboardInterrupt and
            # SystemExit as well and hid the reason for the fallback; only
            # ordinary errors are handled now, and they are reported.
            print(f"Erreur lors du chargement des informations des films: {str(e)}")
            recommendations_with_info = flat_recommendations

        # Display the recommendations
        recommendations_with_info.show(10, False)

        # Stamp each row with the current local time and collect the rows as
        # JSON strings on the driver.
        recommendations_list = recommendations_with_info.withColumn(
            "timestamp", lit(datetime.datetime.now().isoformat())
        ).toJSON().collect()

        # Insert into MongoDB (insert_many is only called with a non-empty list)
        if recommendations_list:
            documents = [json.loads(rec) for rec in recommendations_list]
            mongo_collection.insert_many(documents)

        print(f"Batch {epoch_id}: Recommandations générées et sauvegardées pour {unique_users.count()} utilisateurs")

    except Exception as e:
        print(f"Erreur lors du traitement du batch {epoch_id}: {str(e)}")

# Read the rating events from the Kafka topic as a streaming DataFrame.
kafka_stream = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "MoviesRatings")
    .option("startingOffsets", "latest")
    .load()
)

# Parse the JSON carried in the Kafka `value` column into typed columns.
parsed_stream = (
    kafka_stream
    .select(from_json(col("value").cast("string"), schema).alias("data"))
    .select("data.*")
)

# Run process_batch on every micro-batch, triggered every 10 seconds.
query = (
    parsed_stream
    .writeStream
    .foreachBatch(process_batch)
    .outputMode("update")
    .trigger(processingTime="10 seconds")
    .start()
)

# Wait for the streaming query to terminate.
query.awaitTermination()

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json, explode, lit
from pyspark.ml.recommendation import ALSModel
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType, StringType
import time
import json
from pymongo import MongoClient
import datetime


# Create (or reuse) a Spark session, requesting the Kafka source and the
# MongoDB connector packages through spark.jars.packages.
spark = (
    SparkSession.builder
    .appName("MovieRecommendationStreaming")
    .config(
        "spark.jars.packages",
        "org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.2,org.mongodb.spark:mongo-spark-connector_2.12:3.0.1",
    )
    .getOrCreate()
)

# Schema of the incoming rating records (every field is nullable).
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("userId", IntegerType()),
        ("movieId", IntegerType()),
        ("rating", DoubleType()),
        ("timestamp", IntegerType()),
    )
])

# Load the previously trained ALS model.
model_path = "hdfs://namenode:9000/models/als"
model = ALSModel.load(model_path)

# MongoDB handles: client -> database -> collection receiving the documents.
mongo_uri = "mongodb://localhost:27017"
mongo_client = MongoClient(mongo_uri)
mongo_db = mongo_client["movie_recommender"]
mongo_collection = mongo_db["recommendations"]

# Generate recommendations for one micro-batch and save them to MongoDB
def process_batch(df, epoch_id):
    """Produce and persist movie recommendations for one micro-batch.

    For every distinct ``userId`` in ``df`` the module-level ALS ``model``
    yields 10 recommendations. They are flattened, joined with movie titles
    when the movie CSV can be read, shown on stdout and inserted into
    ``mongo_collection``.

    Args:
        df: DataFrame with the rows of the current micro-batch; it must
            expose a ``userId`` column.
        epoch_id: Identifier of the micro-batch, used only in log messages.

    Returns:
        None. An empty batch is skipped. Any ``Exception`` raised while
        processing is caught and printed rather than propagated.
    """
    if df.isEmpty():
        return

    try:
        # Distinct users present in this batch
        unique_users = df.select("userId").distinct()

        # Top-10 recommendations per user, then one flat row per (user, movie)
        per_user = model.recommendForUserSubset(unique_users, 10)
        exploded = per_user.select(
            col("userId"),
            explode(col("recommendations")).alias("rec"),
        )
        flat = exploded.select(
            col("userId"),
            col("rec.movieId").alias("movieId"),
            col("rec.rating").alias("prediction"),
        )

        # Attach movie titles when the metadata is available; otherwise keep
        # the flat recommendations as they are.
        try:
            catalog = spark.read.csv("hdfs://namenode:9000/datasets/movie.csv", header=True)
            same_movie = flat.movieId == catalog.movieId
            enriched = flat.join(catalog, same_movie).select(
                flat.userId,
                flat.movieId,
                catalog.title,
                flat.prediction,
            )
        except Exception as e:
            print(f"Erreur lors du chargement des informations des films: {str(e)}")
            enriched = flat

        # Display the recommendations (optional, for debugging)
        enriched.show(10, False)

        # Stamp each row with the current local time, collect the rows as
        # JSON strings and turn them into dictionaries for MongoDB.
        stamped = enriched.withColumn(
            "timestamp", lit(datetime.datetime.now().isoformat())
        )
        documents = [json.loads(row) for row in stamped.toJSON().collect()]

        # insert_many is only called when there is something to insert
        if documents:
            mongo_collection.insert_many(documents)

        print(f"Batch {epoch_id}: Recommandations générées et sauvegardées pour {unique_users.count()} utilisateurs")

    except Exception as e:
        print(f"Erreur lors du traitement du batch {epoch_id}: {str(e)}")

# Read the rating events from the Kafka topic as a streaming DataFrame.
kafka_stream = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "MoviesRatings")
    .option("startingOffsets", "latest")
    .load()
)

# Parse the JSON carried in the Kafka `value` column into typed columns.
parsed_stream = (
    kafka_stream
    .select(from_json(col("value").cast("string"), schema).alias("data"))
    .select("data.*")
)

# Run process_batch on every micro-batch, triggered every 10 seconds.
query = (
    parsed_stream
    .writeStream
    .foreachBatch(process_batch)
    .outputMode("update")
    .trigger(processingTime="10 seconds")
    .start()
)

# Wait for the streaming query to terminate.
query.awaitTermination()


:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
org.mongodb.spark#mongo-spark-connector_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-88488a41-288b-4028-b6ae-e1d8be3b380d;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.1.2 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.1.2 in central
	found org.apache.kafka#kafka-clients;2.6.0 in central
	found com.github.luben#zstd-jni;1.4.8-1 in central
	found org.lz4#lz4-java;1.7.1 in central
	found org.xerial.snappy#snappy-java;1.1.8.2 in central
	found org.slf4j#slf4j-api;1.7.30 in central
	found org.spark-project.spark#unused;1.0.0 in central
	found org.apache.commons#commons-pool2;2.6.2 in central
	found org.mongodb.spark#mongo-spark-connector_2.12;3.0.1 in central
	found org.mongodb#mongodb-driver-sync;4.0.5 in central
	found org.mon

Erreur lors du chargement des informations des films: [PATH_NOT_FOUND] Path does not exist: hdfs://namenode:9000/datasets/movies.csv.


25/05/02 15:25:05 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS


+------+-------+----------+
|userId|movieId|prediction|
+------+-------+----------+
|3     |120821 |6.3276896 |
|3     |117907 |5.928648  |
|3     |98595  |5.924997  |
|3     |101538 |5.8830857 |
|3     |125966 |5.779213  |
|3     |102119 |5.7608495 |
|3     |74061  |5.738727  |
|3     |120313 |5.7309647 |
|3     |129514 |5.7032986 |
|3     |109887 |5.682914  |
+------+-------+----------+

Erreur lors du traitement du batch 1: localhost:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms), Timeout: 30s, Topology Description: <TopologyDescription id: 6814e3cb6b5defe400436fd8, topology_type: Unknown, servers: [<ServerDescription ('localhost', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('localhost:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms)')>]>


25/05/02 15:25:38 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 10000 milliseconds, but spent 37441 milliseconds


Erreur lors du chargement des informations des films: [PATH_NOT_FOUND] Path does not exist: hdfs://namenode:9000/datasets/movies.csv.
+------+-------+----------+
|userId|movieId|prediction|
+------+-------+----------+
|85    |120821 |5.961138  |
|85    |98595  |5.7224665 |
|85    |86237  |5.562018  |
|85    |77736  |5.491927  |
|85    |101855 |5.4835405 |
|85    |60356  |5.479856  |
|85    |85205  |5.4434586 |
|85    |117907 |5.338574  |
|85    |74061  |5.305612  |
|85    |106334 |5.2770243 |
+------+-------+----------+
only showing top 10 rows

Erreur lors du traitement du batch 2: localhost:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms), Timeout: 30s, Topology Description: <TopologyDescription id: 6814e3cb6b5defe400436fd8, topology_type: Unknown, servers: [<ServerDescription ('localhost', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('localhost:27017: [Errno 111] Connection refused (configured timeout

25/05/02 15:26:13 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 10000 milliseconds, but spent 35009 milliseconds


Erreur lors du chargement des informations des films: [PATH_NOT_FOUND] Path does not exist: hdfs://namenode:9000/datasets/movies.csv.
+------+-------+----------+
|userId|movieId|prediction|
+------+-------+----------+
|85    |120821 |5.961138  |
|85    |98595  |5.7224665 |
|85    |86237  |5.562018  |
|85    |77736  |5.491927  |
|85    |101855 |5.4835405 |
|85    |60356  |5.479856  |
|85    |85205  |5.4434586 |
|85    |117907 |5.338574  |
|85    |74061  |5.305612  |
|85    |106334 |5.2770243 |
+------+-------+----------+
only showing top 10 rows

Erreur lors du traitement du batch 3: localhost:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms), Timeout: 30s, Topology Description: <TopologyDescription id: 6814e3cb6b5defe400436fd8, topology_type: Unknown, servers: [<ServerDescription ('localhost', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('localhost:27017: [Errno 111] Connection refused (configured timeout

25/05/02 15:26:48 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 10000 milliseconds, but spent 34408 milliseconds


Erreur lors du chargement des informations des films: [PATH_NOT_FOUND] Path does not exist: hdfs://namenode:9000/datasets/movies.csv.
+------+-------+----------+
|userId|movieId|prediction|
+------+-------+----------+
|52    |120821 |5.8714366 |
|52    |7568   |4.966122  |
|52    |87948  |4.6657233 |
|52    |83531  |4.642039  |
|52    |26459  |4.5690837 |
|52    |6085   |4.53474   |
|52    |103838 |4.5006123 |
|52    |47460  |4.4819455 |
|52    |3582   |4.469969  |
|52    |77736  |4.413271  |
+------+-------+----------+
only showing top 10 rows

Erreur lors du traitement du batch 4: localhost:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms), Timeout: 30s, Topology Description: <TopologyDescription id: 6814e3cb6b5defe400436fd8, topology_type: Unknown, servers: [<ServerDescription ('localhost', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('localhost:27017: [Errno 111] Connection refused (configured timeout

25/05/02 15:27:22 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 10000 milliseconds, but spent 34102 milliseconds


Erreur lors du chargement des informations des films: [PATH_NOT_FOUND] Path does not exist: hdfs://namenode:9000/datasets/movies.csv.
+------+-------+----------+
|userId|movieId|prediction|
+------+-------+----------+
|5     |120821 |6.0885944 |
|5     |77736  |5.9700994 |
|5     |101855 |5.8657312 |
|5     |86237  |5.821442  |
|5     |85205  |5.6868367 |
|5     |41980  |5.6113367 |
|5     |98595  |5.5689597 |
|5     |129243 |5.5292807 |
|5     |120313 |5.505905  |
|5     |83531  |5.4922004 |
+------+-------+----------+
only showing top 10 rows

Erreur lors du traitement du batch 5: localhost:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms), Timeout: 30s, Topology Description: <TopologyDescription id: 6814e3cb6b5defe400436fd8, topology_type: Unknown, servers: [<ServerDescription ('localhost', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('localhost:27017: [Errno 111] Connection refused (configured timeout

25/05/02 15:27:56 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 10000 milliseconds, but spent 33880 milliseconds


Erreur lors du chargement des informations des films: [PATH_NOT_FOUND] Path does not exist: hdfs://namenode:9000/datasets/movies.csv.
+------+-------+----------+
|userId|movieId|prediction|
+------+-------+----------+
|12    |120821 |5.3279204 |
|12    |117907 |5.152245  |
|12    |98595  |5.051058  |
|12    |101538 |4.9056544 |
|12    |101855 |4.812896  |
|12    |112423 |4.754116  |
|12    |104103 |4.717584  |
|12    |102107 |4.717584  |
|12    |32230  |4.709983  |
|12    |74061  |4.683931  |
+------+-------+----------+
only showing top 10 rows

Erreur lors du traitement du batch 6: localhost:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms), Timeout: 30s, Topology Description: <TopologyDescription id: 6814e3cb6b5defe400436fd8, topology_type: Unknown, servers: [<ServerDescription ('localhost', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('localhost:27017: [Errno 111] Connection refused (configured timeout

25/05/02 15:28:29 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 10000 milliseconds, but spent 33838 milliseconds


Erreur lors du chargement des informations des films: [PATH_NOT_FOUND] Path does not exist: hdfs://namenode:9000/datasets/movies.csv.


                                                                                

+------+-------+----------+
|userId|movieId|prediction|
+------+-------+----------+
|85    |120821 |5.961138  |
|85    |98595  |5.7224665 |
|85    |86237  |5.562018  |
|85    |77736  |5.491927  |
|85    |101855 |5.4835405 |
|85    |60356  |5.479856  |
|85    |85205  |5.4434586 |
|85    |117907 |5.338574  |
|85    |74061  |5.305612  |
|85    |106334 |5.2770243 |
+------+-------+----------+
only showing top 10 rows

