In [2]:
from pyspark.sql import SparkSession

# Team number; it is only used to label the Spark application below.
team = 24

# Warehouse directory handed to Spark SQL (scheme-less path).
warehouse = 'project/hive/warehouse'

# NOTE(review): the recorded run of this cell ended with "Java gateway process
# exited before sending its port number", i.e. the JVM behind PySpark never
# came up. Presumably an environment problem (Java / Spark installation) --
# confirm before relying on the cells below.
spark = (
    SparkSession.builder
    # The label used to spell "team" with a Cyrillic "а" (U+0430), which made
    # the application impossible to find by typing "team" in ASCII.
    .appName("team {} - spark ML ".format(team))
    .master("yarn")
    .config("hive.metastore.uris", "thrift://hadoop-02.uni.innopolis.ru:9883")
    .config("spark.sql.warehouse.dir", warehouse)
    .getOrCreate()
)

# Bare expression so the notebook renders the session summary.
spark

RuntimeError: Java gateway process exited before sending its port number

In [None]:
hdfs_train_path = "project/data/train"
hdfs_test_path = "project/data/test"


def verify_file_existence():
    """List the train and test directories through the Hadoop FileSystem API.

    Reads the module-level ``hdfs_train_path`` / ``hdfs_test_path`` and the
    global ``spark`` session, then prints the number of entries returned by
    ``listStatus`` for each directory followed by the listing object itself.
    Returns nothing.
    """
    hadoop_fs = spark._jvm.org.apache.hadoop.fs

    def list_dir(hdfs_dir):
        # A FileSystem handle is requested for every listing, using the
        # session's Hadoop configuration.
        filesystem = hadoop_fs.FileSystem.get(spark._jsc.hadoopConfiguration())
        return filesystem.listStatus(hadoop_fs.Path(hdfs_dir))

    train_listing = list_dir(hdfs_train_path)
    test_listing = list_dir(hdfs_test_path)

    print("\nHDFS Verification:")
    for label, listing in (("Training", train_listing), ("Test", test_listing)):
        print(f"{label} files found: {len(listing)}")
        print(listing)


verify_file_existence()

In [None]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# KMeans estimator; cluster assignments are written to the "cluster" column.
# A fixed seed makes the centroid initialisation repeatable across re-runs.
kmeans = KMeans(featuresCol="features", predictionCol="cluster", seed=42)

# Hyperparameter grid: number of clusters x initialisation mode.
param_grid = ParamGridBuilder() \
    .addGrid(kmeans.k, [5, 10]) \
    .addGrid(kmeans.initMode, ["k-means||", "random"]) \
    .build()

# Silhouette evaluator. predictionCol must name the column the model writes
# ("cluster"); the evaluator's default is "prediction", which the model output
# does not contain.
evaluator = ClusteringEvaluator(metricName="silhouette",
                                featuresCol="features",
                                predictionCol="cluster")

# 3-fold cross-validation over the grid; seed fixes the fold assignment.
cv = CrossValidator(estimator=kmeans,
                    estimatorParamMaps=param_grid,
                    evaluator=evaluator,
                    numFolds=3,
                    seed=42)

# Training.
# NOTE(review): train_data / test_data are not defined in the cells visible
# here -- confirm that an earlier cell creates them.
cv_model = cv.fit(train_data)

# Best model found by the search.
best_kmeans = cv_model.bestModel

# Persist the model. The path is scheme-less, like the data paths used in this
# notebook: "hdfs://project/..." would make "project" the namenode host.
best_kmeans.write().overwrite().save("project/models/model1")

# Predictions on the test set, written as CSV.
predictions_kmeans = best_kmeans.transform(test_data)
predictions_kmeans.select("id", "cluster").write.csv("project/output/model1_predictions", mode="overwrite")

In [None]:
from pyspark.ml.clustering import BisectingKMeans

# Bisecting KMeans estimator; assignments go to the "cluster" column.
# A fixed seed makes the fit repeatable across re-runs.
bisecting_km = BisectingKMeans(featuresCol="features", predictionCol="cluster", seed=42)

# Hyperparameter grid: number of clusters x minimum divisible cluster size.
param_grid_bisecting = ParamGridBuilder() \
    .addGrid(bisecting_km.k, [5, 10]) \
    .addGrid(bisecting_km.minDivisibleClusterSize, [2.0, 4.0]) \
    .build()

# Dedicated silhouette evaluator that reads the "cluster" column this model
# writes (the evaluator's default predictionCol is "prediction").
bisecting_evaluator = ClusteringEvaluator(metricName="silhouette",
                                          featuresCol="features",
                                          predictionCol="cluster")

# 3-fold cross-validation over the grid; seed fixes the fold assignment.
cv_bisecting = CrossValidator(estimator=bisecting_km,
                              estimatorParamMaps=param_grid_bisecting,
                              evaluator=bisecting_evaluator,
                              numFolds=3,
                              seed=42)

# Training.
# NOTE(review): train_data / test_data are not defined in the cells visible
# here -- confirm that an earlier cell creates them.
cv_bisecting_model = cv_bisecting.fit(train_data)

# Best model found by the search.
best_bisecting = cv_bisecting_model.bestModel

# Persist the model. The path is scheme-less, like the data paths used in this
# notebook: "hdfs://project/..." would make "project" the namenode host.
best_bisecting.write().overwrite().save("project/models/model2")

# Predictions on the test set, written as CSV.
predictions_bisecting = best_bisecting.transform(test_data)
predictions_bisecting.select("id", "cluster").write.csv("project/output/model2_predictions", mode="overwrite")

In [None]:
# Both models write their assignments to "cluster", so the evaluator is told
# explicitly which column to read for these calls; the param map passed to
# evaluate() overrides whatever predictionCol the evaluator was built with.
cluster_col = {evaluator.predictionCol: "cluster"}

# Silhouette of the KMeans predictions on the test set.
silhouette_kmeans = evaluator.evaluate(predictions_kmeans, cluster_col)

# Silhouette of the Bisecting KMeans predictions on the test set.
silhouette_bisecting = evaluator.evaluate(predictions_bisecting, cluster_col)

# Side-by-side comparison of the two scores.
comparison_df = spark.createDataFrame([
    ("KMeans", silhouette_kmeans),
    ("Bisecting KMeans", silhouette_bisecting)
], ["Model", "Silhouette Score"])

# Scheme-less path, like the data paths used in this notebook:
# "hdfs://project/..." would make "project" the namenode host.
comparison_df.write.csv("project/output/evaluation", mode="overwrite")

In [None]:
# Attach cluster assignments to every row. KMeansModel.predict() scores a
# single feature vector and cannot be applied to a Column, so the model's
# transform() is used; it appends the model's prediction column ("cluster").
# NOTE(review): processed_df is not defined in the cells visible here --
# confirm that an earlier cell creates it with "features", "id" and "name".
final_df = best_kmeans.transform(processed_df)
final_df.select("id", "name", "cluster").show()