In [1]:
from pyspark.sql import SparkSession

In [2]:
# Imports nécessaires
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
 
# Configuration Spark optimisée
spark = SparkSession.builder \
    .appName("MovieLensAnalysis") \
    .config("spark.hadoop.fs.defaultFS", "hdfs://bigdata-node:9000") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.memory", "4g") \
    .config("spark.memory.offHeap.enabled", "true") \
    .config("spark.memory.offHeap.size", "2g") \
    .config("spark.sql.shuffle.partitions", "20") \
    .config("spark.memory.fraction", "0.8") \
    .config("spark.memory.storageFraction", "0.3") \
    .getOrCreate()
 
 
try:
    # Chargement des données
    clean_data = spark.read.parquet("hdfs://bigdata-container:9000/datasets/clean_movies_ratings")
    # Préparation des données
    als_data = clean_data.select("userId", "movieId", "rating").cache()
    (training, test) = als_data.randomSplit([0.8, 0.2], seed=42)
    print(f"Données d'entraînement: {training.count()} lignes")
    print(f"Données de test: {test.count()} lignes")
 
    # Construction et entraînement du modèle
    als = ALS(
        maxIter=10,
        regParam=0.01,
        userCol="userId",
        itemCol="movieId",
        ratingCol="rating",
        coldStartStrategy="drop",
        nonnegative=True
    )
 
    # Entraînement
    model = als.fit(training)
 
    # Évaluation
    predictions = model.transform(test)
    evaluator = RegressionEvaluator(
        metricName="rmse",
        labelCol="rating",
        predictionCol="prediction"
    )
 
    rmse = evaluator.evaluate(predictions)
    print(f"\nPerformance du modèle:")
    print(f"RMSE: {rmse:.3f}")
 
except Exception as e:
    print("Erreur:", str(e))
finally:
    spark.stop()
 

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/30 12:18:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/04/30 12:18:31 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


Erreur: [PATH_NOT_FOUND] Path does not exist: hdfs://bigdata-container:9000/datasets/clean_movies_ratings.


In [3]:
spark = SparkSession.builder.appName("KafkaSparkDemo").master("local[*]").config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.13:3.5.1").getOrCreate()

25/04/30 12:18:34 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [4]:
df = spark.readStream.format("kafka").option("kafka.bootstrap.servers", "localhost:9092").option("subscribe", "test-topic").load()

AnalysisException: Failed to find data source: kafka. Please deploy the application as per the deployment section of Structured Streaming + Kafka Integration Guide.

In [5]:
df_parsed = df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")

NameError: name 'df' is not defined

In [6]:
query = df_parsed.writeStream.outputMode("append").format("console").start()

NameError: name 'df_parsed' is not defined

In [7]:
query.awaitTermination()

NameError: name 'query' is not defined

In [8]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("ALS_Recommendation_Batch") \
    .getOrCreate()

# Charger les données
ratings = spark.read.csv("hdfs:///datasets/rating.csv", header=True, inferSchema=True)
movies = spark.read.csv("hdfs:///datasets/movie.csv", header=True, inferSchema=True)


25/04/30 12:19:10 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
25/04/30 12:19:14 WARN FileSystem: Failed to initialize fileystem hdfs://bigdata-node:9000: java.lang.IllegalArgumentException: java.net.UnknownHostException: bigdata-node
25/04/30 12:19:14 WARN FileStreamSink: Assume no metadata directory. Error while looking for metadata directory in the path: hdfs:///datasets/rating.csv.
java.lang.IllegalArgumentException: java.net.UnknownHostException: bigdata-node
	at org.apache.hadoop.security.SecurityUtil.buildTokenService(SecurityUtil.java:466)
	at org.apache.hadoop.hdfs.NameNodeProxiesClient.createProxyWithClientProtocol(NameNodeProxiesClient.java:134)
	at org.apache.hadoop.hdfs.DFSClient.<init>(DFSClient.java:374)
	at org.apache.hadoop.hdfs.DFSClient.<init>(DFSClient.java:308)
	at org.apache.hadoop.hdfs.DistributedFileSystem.initDFSClient(DistributedFileSystem.java:202)
	at org.apache.hadoop.hdfs.DistributedFileSystem.initia

IllegalArgumentException: java.net.UnknownHostException: bigdata-node

In [9]:
# Supprimer les valeurs nulles
ratings = ratings.dropna()
movies = movies.dropna()

# Optionnel : supprimer les doublons
ratings = ratings.dropDuplicates()
movies = movies.dropDuplicates()

# Vérifier les types
ratings.printSchema()


NameError: name 'ratings' is not defined

In [10]:
ratings.show(5)
ratings.printSchema()

movies.show(5)
movies.printSchema()


NameError: name 'ratings' is not defined

In [11]:
from pyspark.sql.functions import col, sum as sum_

# Pour la table des ratings
ratings.select([
    sum_(col(c).isNull().cast("int")).alias(c)
    for c in ratings.columns
]).show()

# Pour la table des movies
movies.select([
    sum_(col(c).isNull().cast("int")).alias(c)
    for c in movies.columns
]).show()


NameError: name 'ratings' is not defined

In [12]:
!pip install numpy


[0m

In [13]:

from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

# Assure-toi d’avoir une SparkSession
spark = SparkSession.builder.appName("ALS_Model_Training").getOrCreate()

# Convertir les types de colonnes
ratings = ratings.selectExpr(
    "cast(userId as int) as userId",
    "cast(movieId as int) as movieId",
    "cast(rating as float) as rating"
)


25/04/30 12:19:44 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


NameError: name 'ratings' is not defined

In [None]:
from kafka import KafkaConsumer
import json
 
consumer = KafkaConsumer('MoviesRatings' , bootstrap_servers=['localhost:9092'])
 
def process_msg(msg):
    print(msg.offset)
    dico = dict(json.loads(msg.value))
    print(dico)
 
for msg in consumer:
    process_msg(msg)