In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col
import numpy as np

from pyspark.ml.functions import vector_to_array

from d_imm.imm_model import DistributedIMM

In [2]:
# Create (or reuse) the Spark session used by every cell below.
# It runs against a local master with 4g configured for both executor and driver.
spark = (
    SparkSession.builder
    .appName("DistributedIMM Dummy Data Test")
    .master("local[*]")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

In [3]:
# Toy dataset: five rows, three numeric features per row.
columns = ["feature1", "feature2", "feature3"]
dummy_data = [
    (1.0, 2.0, 3.0),
    (2.0, 3.0, 4.0),
    (3.0, 4.0, 5.0),
    (8.0, 9.0, 10.0),
    (9.0, 10.0, 11.0),
]
df = spark.createDataFrame(dummy_data, columns)

# Pack the individual feature columns into a single vector column named
# "features" and keep only that column for the downstream estimators.
assembler = VectorAssembler(inputCols=columns, outputCol="features")
feature_df = assembler.transform(df).select("features")

# Fit the reference KMeans clustering (k=2) with a fixed seed.
kmeans = KMeans(k=2, seed=1, featuresCol="features")
kmeans_model = kmeans.fit(feature_df)

# Fit DistributedIMM on the same feature frame, handing it the fitted
# KMeans model; verbose=2 is what the constructor is given here.
imm_estimator = DistributedIMM(spark, k=2, verbose=2)
d_imm_tree = imm_estimator.fit(feature_df, kmeans_model)

Running 'fit' method
Time taken to build the tree: 1 minutes and 10.07 seconds
Running '__fill_stats_distributed__' method
Time taken to fill stats: 0 minutes and 35.73 seconds
Tree building completed.


In [4]:
# Evaluate the fitted tree on the training frame using the UDF-based scorers.
score_value = d_imm_tree.score(feature_df)
surrogate_score_value = d_imm_tree.surrogate_score(feature_df)

# Report both costs (header is preceded by a blank line, one value per line).
print(
    "\n===== Distributed IMM Score Testing with Dummy Data =====",
    f"Score (K-Means Cost): {score_value:.4f}",
    f"Surrogate Score (K-Means Surrogate Cost): {surrogate_score_value:.4f}",
    sep="\n",
)

# Sanity check: the surrogate cost must not be smaller than the plain cost.
ordering_violation_msg = "Surrogate score should be greater than or equal to normal score."
assert surrogate_score_value >= score_value, ordering_violation_msg

print("✅ Score and Surrogate Score tests passed successfully.")


===== Distributed IMM Score Testing with Dummy Data =====
Score (K-Means Cost): 7.5000
Surrogate Score (K-Means Surrogate Cost): 7.5000
✅ Score and Surrogate Score tests passed successfully.


In [6]:
# Compute the score with the Spark SQL implementation.
score_value = d_imm_tree.score_sql(feature_df)

# Compute the surrogate score inside this cell so the cell does not depend on
# a value left in the kernel by a previously executed cell (which raises
# NameError on a fresh kernel, or silently reports a stale number otherwise).
# NOTE(review): the SQL variant, d_imm_tree.surrogate_score_sql(feature_df),
# was commented out in this cell; the UDF-based surrogate scorer is used here
# instead. Switch back once the SQL variant is confirmed to work.
surrogate_score_value = d_imm_tree.surrogate_score(feature_df)

# Print results
print("\n===== Distributed IMM Score Testing with Dummy Data =====")
print(f"Score (K-Means Cost): {score_value:.4f}")
print(f"Surrogate Score (K-Means Surrogate Cost): {surrogate_score_value:.4f}")

# Ensure surrogate score is greater than or equal to k-means score
assert surrogate_score_value >= score_value, "Surrogate score should be greater than or equal to normal score."

print("✅ Score and Surrogate Score tests passed successfully.")


===== Distributed IMM Score Testing with Dummy Data =====
Score (K-Means Cost): 7.5000
Surrogate Score (K-Means Surrogate Cost): 7.5000
✅ Score and Surrogate Score tests passed successfully.
