In [None]:
# Importieren der erforderlichen Bibliotheken
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, size, array_intersect, percent_rank, abs, broadcast, slice
from pyspark.sql.window import Window
from pyspark.ml.feature import StringIndexer
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

In [None]:
# Build (or reuse) the Spark session used by every later cell.
# The settings are kept in one mapping so they are easy to scan and tune.
spark_settings = {
    "spark.driver.memory": "16g",
    "spark.executor.memory": "16g",
    "spark.executor.cores": "4",
    "spark.sql.shuffle.partitions": "200",
    "spark.driver.maxResultSize": "4g",
    "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
    "spark.kryoserializer.buffer.max": "2047m",
}

session_builder = SparkSession.builder.appName("Yelp_ALS")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()

In [None]:
# Load the Yelp review JSON file, discard rows with missing values and exact
# duplicate rows, and rename the id columns to "*_orig" so that both datasets
# can later be indexed with one shared indexer.
ratings1 = (
    spark.read.json("yelp_academic_dataset_review.json")
    .dropna()
    .dropDuplicates()
    .withColumnRenamed("user_id", "user_id_orig")
    .withColumnRenamed("business_id", "business_id_orig")
)

# The rating column "stars" is kept as is; drop the remaining review columns
# that are not needed (only those actually present in the schema).
unused_columns = [
    name
    for name in ["cool", "date", "funny", "review_id", "text", "useful"]
    if name in ratings1.columns
]
ratings1 = ratings1.drop(*unused_columns)

# Preview 20 rows of dataset 1.
ratings1.show(20)

In [None]:
# Keep a single rating per (user, business) pair and report how many remain.
# Which of several duplicate rows survives is not specified by Spark.
ratings1 = ratings1.dropDuplicates(subset=["user_id_orig", "business_id_orig"])
unique_rating_count = ratings1.count()
print(unique_rating_count)

In [None]:
# Load the post-attack dataset (swap the file name to evaluate a different
# attack type / attack size).
ratings2 = spark.read.option("header", "true").csv("all_ratings_Yelp_reverse_bandwagon_20.0.csv")

# Cast "rating" to float BEFORE dropna(): the CSV is read without a schema, so
# every column is a string. With Spark's non-ANSI cast semantics an unparseable
# value turns into null; casting first lets dropna() remove such rows instead
# of letting a null rating reach ALS later on.
ratings2 = ratings2.withColumn("rating", col("rating").cast("float"))

# Remove rows with missing values and exact duplicate rows.
ratings2 = ratings2.dropna().dropDuplicates()

# Rename the id columns so they line up with dataset 1.
ratings2 = ratings2.withColumnRenamed("userId", "user_id_orig") \
                   .withColumnRenamed("businessId", "business_id_orig")

# Drop the column that is not needed for training.
ratings2 = ratings2.drop("Label")

# Preview 20 rows of dataset 2.
ratings2.show(20)

In [None]:
# Both datasets must share one id space, so each indexer is fitted on the
# union of the distinct ids found in dataset 1 and dataset 2.

# Users: distinct original user ids from both datasets.
combined_users = (
    ratings1.select("user_id_orig")
    .union(ratings2.select("user_id_orig"))
    .distinct()
)
user_indexer = StringIndexer(inputCol="user_id_orig", outputCol="userId", handleInvalid="skip")
user_indexer_model = user_indexer.fit(combined_users)

# Items: distinct original business ids from both datasets.
combined_businesses = (
    ratings1.select("business_id_orig")
    .union(ratings2.select("business_id_orig"))
    .distinct()
)
business_indexer = StringIndexer(inputCol="business_id_orig", outputCol="businessId_new", handleInvalid="skip")
business_indexer_model = business_indexer.fit(combined_businesses)

# Apply the user indexer, then the item indexer, to each dataset and drop the
# original id columns; only the indexed columns are used from here on.
ratings1 = (
    business_indexer_model
    .transform(user_indexer_model.transform(ratings1))
    .drop("user_id_orig", "business_id_orig")
)
ratings2 = (
    business_indexer_model
    .transform(user_indexer_model.transform(ratings2))
    .drop("user_id_orig", "business_id_orig")
    # Align the rating column name with dataset 1.
    .withColumnRenamed("rating", "stars")
)

In [None]:
# Split both datasets 80/20 into training and test data.
#
# Cache the prepared rating tables first. Spark evaluates lazily, so without a
# cache every action on a split (the counts below, ALS fit/transform, the
# hit-rate windows) re-runs the whole lineage: file read, dropna/dropDuplicates
# and both indexers. Because the earlier subset dropDuplicates keeps an
# unspecified row per key, such recomputations are also not guaranteed to yield
# exactly the same rows, which could make the train and test splits drift.
ratings1.cache()
ratings2.cache()

train1, test1 = ratings1.randomSplit([0.8, 0.2], seed=42)
print(f"Datensatz 1: Trainingsset: {train1.count()} Einträge, Testset: {test1.count()} Einträge")

train2, test2 = ratings2.randomSplit([0.8, 0.2], seed=42)
print(f"Datensatz 2: Trainingsset: {train2.count()} Einträge, Testset: {test2.count()} Einträge")

In [None]:
# ALS model for dataset 1 (before the attack).
als1_params = dict(
    rank=100,
    maxIter=20,
    regParam=0.05,
    userCol="userId",
    itemCol="businessId_new",
    ratingCol="stars",
    # Rows whose prediction would be NaN are dropped from transform() output.
    coldStartStrategy="drop",
)
als1 = ALS(**als1_params)

# Train on the training split and score the held-out test split.
als_model1 = als1.fit(train1)
predictions1 = als_model1.transform(test1)

In [None]:
# RMSE and MAE of the dataset-1 model on its test predictions.
eval_columns1 = {"labelCol": "stars", "predictionCol": "prediction"}
evaluator_rmse1 = RegressionEvaluator(metricName="rmse", **eval_columns1)
evaluator_mae1 = RegressionEvaluator(metricName="mae", **eval_columns1)

# Evaluated in order: RMSE first, then MAE.
rmse1, mae1 = (
    evaluator.evaluate(predictions1)
    for evaluator in (evaluator_rmse1, evaluator_mae1)
)

print(f"Datensatz 1 - RMSE: {rmse1}")
print(f"Datensatz 1 - MAE: {mae1}")

In [None]:
# HitRate@10 / HitRate@20 for dataset 1 (before the attack).

# Shuffle partition count for the joins and aggregations in this cell.
spark.conf.set("spark.sql.shuffle.partitions", "200")

# Rank every test rating by stars, best first.
# NOTE: a window without partitionBy makes Spark process all rows in a single
# partition. percent_rank() gives tied star values the same rank, so the
# threshold below always keeps or drops a whole star tier at once.
window_spec1 = Window.orderBy(col("stars").desc())
test1 = test1.withColumn("percentile", percent_rank().over(window_spec1))

# Relevant items per user: items whose test rating has percent_rank <= 0.2.
# Cached because it feeds both the recommendation request and the join below.
relevant_items1 = (
    test1.filter(col("percentile") <= 0.2)
    .groupBy("userId")
    .agg(expr("collect_list(businessId_new) as relevant_items"))
    .cache()
)

# Only users that have relevant items survive the inner join below, so request
# the top 20 recommendations for exactly those users instead of scoring every
# user known to the model. The result is consumed by a single aggregation, so
# it does not need to be cached.
top_k_recommendations = (
    als_model1.recommendForUserSubset(relevant_items1.select("userId"), 20)
    # Keep only the recommended item ids (in ranking order), not the scores.
    .withColumn(
        "recommended_item_ids",
        expr("transform(recommendations, x -> x['businessId_new'])"),
    )
    .select("userId", "recommended_item_ids")
)

# Attach each user's relevant items (broadcast hint for the smaller table).
joined_data = top_k_recommendations.join(broadcast(relevant_items1), on="userId", how="inner")

# Number of relevant items among the first 10 and among all 20 recommendations,
# plus a per-user flag telling whether there was at least one hit.
hit_data = joined_data.withColumn(
    "hits_10",
    size(array_intersect(slice(col("recommended_item_ids"), 1, 10), col("relevant_items")))
).withColumn(
    "hits_20",
    size(array_intersect(col("recommended_item_ids"), col("relevant_items")))
).withColumn(
    "is_hit_10", col("hits_10") > 0
).withColumn(
    "is_hit_20", col("hits_20") > 0
)

# One aggregation pass yields the user count and both hit counts, instead of
# three separate jobs that would each re-evaluate the join.
# coalesce() covers the empty case, where sum() returns null.
hit_summary = hit_data.agg(
    expr("count(*) as count_users"),
    expr("coalesce(sum(cast(is_hit_10 as int)), 0) as users_hit_10"),
    expr("coalesce(sum(cast(is_hit_20 as int)), 0) as users_hit_20"),
).first()

# Hit rate = share of evaluated users with at least one hit (None if no users).
count_users = hit_summary["count_users"]
hitrate_10 = hit_summary["users_hit_10"] / count_users if count_users > 0 else None
hitrate_20 = hit_summary["users_hit_20"] / count_users if count_users > 0 else None

# The results are plain Python numbers now; release the cached table.
relevant_items1.unpersist()

print(f"Datensatz 1 - HitRate@10: {hitrate_10}")
print(f"Datensatz 1 - HitRate@20: {hitrate_20}")

In [None]:
# ALS model for dataset 2 (after the attack), configured via setters.
als2 = (
    ALS()
    .setRank(100)
    .setMaxIter(20)
    .setRegParam(0.05)
    .setUserCol("userId")
    .setItemCol("businessId_new")
    .setRatingCol("stars")
    # Rows whose prediction would be NaN are dropped from transform() output.
    .setColdStartStrategy("drop")
)

# Train on the training split and score the held-out test split.
als_model2 = als2.fit(train2)
predictions2 = als_model2.transform(test2)

In [None]:
# RMSE and MAE of the dataset-2 model on its test predictions.
eval_columns2 = {"labelCol": "stars", "predictionCol": "prediction"}
evaluator_rmse2 = RegressionEvaluator(metricName="rmse", **eval_columns2)
evaluator_mae2 = RegressionEvaluator(metricName="mae", **eval_columns2)

# Evaluated in order: RMSE first, then MAE.
rmse2, mae2 = (
    evaluator.evaluate(predictions2)
    for evaluator in (evaluator_rmse2, evaluator_mae2)
)

print(f"Datensatz 2 - RMSE: {rmse2}")
print(f"Datensatz 2 - MAE: {mae2}")

In [None]:
# HitRate@10 / HitRate@20 for dataset 2 (after the attack).

# Shuffle partition count for the joins and aggregations in this cell.
spark.conf.set("spark.sql.shuffle.partitions", "200")

# Rank every test rating by stars, best first.
# NOTE: a window without partitionBy makes Spark process all rows in a single
# partition. percent_rank() gives tied star values the same rank, so the
# threshold below always keeps or drops a whole star tier at once.
window_spec2 = Window.orderBy(col("stars").desc())
test2 = test2.withColumn("percentile", percent_rank().over(window_spec2))

# Relevant items per user: items whose test rating has percent_rank <= 0.2.
# Cached because it feeds both the recommendation request and the join below.
relevant_items2 = (
    test2.filter(col("percentile") <= 0.2)
    .groupBy("userId")
    .agg(expr("collect_list(businessId_new) as relevant_items"))
    .cache()
)

# Only users that have relevant items survive the inner join below, so request
# the top 20 recommendations for exactly those users instead of scoring every
# user known to the model. The result is consumed by a single aggregation, so
# it does not need to be cached.
top_k_recommendations = (
    als_model2.recommendForUserSubset(relevant_items2.select("userId"), 20)
    # Keep only the recommended item ids (in ranking order), not the scores.
    .withColumn(
        "recommended_item_ids",
        expr("transform(recommendations, x -> x['businessId_new'])"),
    )
    .select("userId", "recommended_item_ids")
)

# Attach each user's relevant items (broadcast hint for the smaller table).
joined_data = top_k_recommendations.join(broadcast(relevant_items2), on="userId", how="inner")

# Number of relevant items among the first 10 and among all 20 recommendations,
# plus a per-user flag telling whether there was at least one hit.
hit_data = joined_data.withColumn(
    "hits_10",
    size(array_intersect(slice(col("recommended_item_ids"), 1, 10), col("relevant_items")))
).withColumn(
    "hits_20",
    size(array_intersect(col("recommended_item_ids"), col("relevant_items")))
).withColumn(
    "is_hit_10", col("hits_10") > 0
).withColumn(
    "is_hit_20", col("hits_20") > 0
)

# One aggregation pass yields the user count and both hit counts, instead of
# three separate jobs that would each re-evaluate the join.
# coalesce() covers the empty case, where sum() returns null.
hit_summary = hit_data.agg(
    expr("count(*) as count_users"),
    expr("coalesce(sum(cast(is_hit_10 as int)), 0) as users_hit_10"),
    expr("coalesce(sum(cast(is_hit_20 as int)), 0) as users_hit_20"),
).first()

# Hit rate = share of evaluated users with at least one hit (None if no users).
count_users = hit_summary["count_users"]
hitrate_10 = hit_summary["users_hit_10"] / count_users if count_users > 0 else None
hitrate_20 = hit_summary["users_hit_20"] / count_users if count_users > 0 else None

# The results are plain Python numbers now; release the cached table.
relevant_items2.unpersist()

print(f"Datensatz 2 - HitRate@10: {hitrate_10}")
print(f"Datensatz 2 - HitRate@20: {hitrate_20}")

In [None]:
# Prediction shift: mean absolute difference between the two models'
# predictions on the (user, item) pairs present in both training sets.

# Score each model on its own training data and tag the prediction column.
train_predictions1 = als_model1.transform(train1).selectExpr(
    "userId", "businessId_new", "prediction as prediction_1"
)
train_predictions2 = als_model2.transform(train2).selectExpr(
    "userId", "businessId_new", "prediction as prediction_2"
)

# Keep only pairs that both models produced a prediction for, and add the
# per-pair absolute difference.
common_predictions = (
    train_predictions1
    .join(train_predictions2, on=["userId", "businessId_new"], how="inner")
    .withColumn("abs_diff", expr("abs(prediction_1 - prediction_2)"))
)

# Average the absolute differences (None when there are no common pairs).
prediction_shift = common_predictions.agg(expr("avg(abs_diff)")).first()[0]
print(f"Prediction Shift: {prediction_shift}")

In [None]:
# Shut down the Spark session; cells that use `spark` cannot be run after this
# until the session cell has been executed again.
spark.stop()