In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
import pandas as pd

from d_imm.imm_model import DistributedIMM

In [2]:
import os
import sys

# Set the Java environment variable only if needed: an existing JAVA_HOME is
# respected, and the local JDK path below is just a fallback for this machine.
os.environ.setdefault("JAVA_HOME", "C:\\Program Files\\Java\\jdk1.8.0_261")

# Run the Spark driver and the Python workers with the interpreter that is
# executing this notebook, instead of a hard-coded, user-specific venv path.
# This keeps driver and worker Python versions identical on any machine.
os.environ["PYSPARK_PYTHON"] = sys.executable
os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable

In [3]:
# Build (or reuse) the notebook's Spark session: local master using all cores,
# with 4g configured for both executor and driver memory.
spark = (
    SparkSession.builder
    .appName("KMeansIrisExample")
    .master("local[*]")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

In [4]:
# Register the zipped splitters package with the SparkContext so that code
# running in Spark tasks can import it (checked by the import probe below).
# NOTE(review): this is an absolute, machine-specific path — it must be
# adjusted before the notebook can run anywhere else.
splitters_zip_uri = "file:///C:/Users/saadha/Desktop/FYP-code/GITHUB/distributed-imm/d-imm-python/version-1/d_imm/splitters.zip"

sc = spark.sparkContext
sc.addPyFile(splitters_zip_uri)

In [5]:
def test_import(iterator):
    try:
        from d_imm.splitters.cut_finder import get_all_mistakes
        return ["Imported successfully"]
    except Exception as e:
        return [f"Import failed: {str(e)}"]

# Run the import probe once per partition (2 partitions) and show the messages.
rdd = sc.parallelize(range(10), 2)
probed = rdd.mapPartitions(test_import)
result = probed.collect()
print(result)

['Imported successfully', 'Imported successfully']


In [6]:
# Fetch the Iris dataset from the UCI Machine Learning Repository.
# The file has no header row, so the column names are supplied explicitly.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
column_names = [
    "sepal_length",
    "sepal_width",
    "petal_length",
    "petal_width",
    "species",
]
iris_df = pd.read_csv(url, names=column_names, header=None)

In [7]:
# Convert the pandas DataFrame to a Spark DataFrame
df_1 = spark.createDataFrame(iris_df)

# Number of copies of the dataset to stack row-wise. 1 keeps the dataset as-is
# (no stacking), which is what this notebook currently runs with; raise it
# (e.g. to 5) to enlarge the input for scaling experiments.
STACK_COPIES = 1

# Start from one copy and union the remaining STACK_COPIES - 1 copies onto it.
df = df_1
for _ in range(STACK_COPIES - 1):
    df = df.union(df_1)

In [8]:
# Combine the four measurement columns into one vector column named "features".
feature_columns = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

feature_df = assembler.transform(df)

In [9]:
# Configure KMeans with k=3 (one cluster per Iris species), a fixed seed,
# and the assembled "features" column as input.
kmeans = KMeans(featuresCol="features", k=3, seed=1)

# Train the clustering model on the assembled features.
model = kmeans.fit(feature_df)

In [11]:
# Fit the distributed IMM tree with mode=3 against the KMeans model.
# The result is kept for plotting in a later cell.
d_imm_3 = DistributedIMM(
    spark,
    3,
    verbose=4,
    mode=3,
).fit(feature_df, model)

Running 'fit' method
Cluster centers: [array([6.85384615, 3.07692308, 5.71538462, 2.05384615]), array([5.006, 3.418, 1.464, 0.244]), array([5.88360656, 2.74098361, 4.38852459, 1.43442623])]
Sample of clustered data:
+-----------------+----------+
|         features|prediction|
+-----------------+----------+
|[5.1,3.5,1.4,0.2]|         1|
|[4.9,3.0,1.4,0.2]|         1|
|[4.7,3.2,1.3,0.2]|         1|
|[4.6,3.1,1.5,0.2]|         1|
|[5.0,3.6,1.4,0.2]|         1|
+-----------------+----------+
only showing top 5 rows
Building node at depth 0 with 150 samples
Finding the best split using histogram thresholds in a distributed manner
Time taken to collect results from worker nodes: 0 minutes and 24.06 seconds
Flattened results: [{'feature': 0, 'threshold': 5.006, 'mistakes': 0}, {'feature': 0, 'threshold': 5.05, 'mistakes': 0}, {'feature': 0, 'threshold': 5.15, 'mistakes': 0}, {'feature': 0, 'threshold': 5.25, 'mistakes': 0}, {'feature': 0, 'threshold': 5.35, 'mistakes': 0}, {'feature': 0, 't

In [None]:
# Fit the distributed IMM tree with mode=2 against the KMeans model.
# The result is kept for plotting in a later cell.
d_imm_2 = DistributedIMM(
    spark,
    3,
    verbose=4,
    mode=2,
).fit(feature_df, model)

In [None]:
# Fit the distributed IMM tree with mode=1 against the KMeans model.
# The result is kept for plotting in a later cell.
d_imm_1 = DistributedIMM(
    spark,
    3,
    verbose=4,
    mode=1,
).fit(feature_df, model)

In [None]:
# Fit the distributed IMM tree with mode=0 against the KMeans model.
# The result is kept for plotting and feature_importance() in later cells.
d_imm_0 = DistributedIMM(
    spark,
    3,
    verbose=4,
    mode=0,
).fit(feature_df, model)

In [12]:
# Extract feature names from the VectorAssembler
feature_names = assembler.getInputCols()

# Print the feature names to confirm
print("Feature names:", feature_names)

# Plot each fitted tree under a filename whose suffix matches the mode it was
# fitted with. (The mode-0 and mode-1 trees were previously written to each
# other's filenames.) A failure stops the remaining plots and is reported.
try:
    d_imm_3.plot(filename="iris_imm_tree_3", feature_names=feature_names, view=True)
    print("Tree plot saved as 'iris_imm_tree_3.png' and displayed.")

    d_imm_2.plot(filename="iris_imm_tree_2", feature_names=feature_names, view=True)
    print("Tree plot saved as 'iris_imm_tree_2.png' and displayed.")

    d_imm_1.plot(filename="iris_imm_tree_1", feature_names=feature_names, view=True)
    print("Tree plot saved as 'iris_imm_tree_1.png' and displayed.")

    d_imm_0.plot(filename="iris_imm_tree_0", feature_names=feature_names, view=True)
    print("Tree plot saved as 'iris_imm_tree_0.png' and displayed.")
except Exception as e:
    print(f"An error occurred while plotting the tree: {e}")

Feature names: ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
Tree plot saved as 'iris_imm_tree_3.png' and displayed.


In [18]:
# Extract feature names from the VectorAssembler
feature_names = assembler.getInputCols()

# Print the feature names to confirm
print("Feature names:", feature_names)

# Plot the mode-0 tree. The filename is defined once so the file written and
# the message printed always agree; the "_0" suffix matches the model being
# plotted, so the mode-2 plot ("iris_imm_tree_2") is no longer overwritten.
plot_filename = "iris_imm_tree_0"
try:
    d_imm_0.plot(filename=plot_filename, feature_names=feature_names, view=True)
    print(f"Tree plot saved as '{plot_filename}.png' and displayed.")
except Exception as e:
    print(f"An error occurred while plotting the tree: {e}")

Feature names: ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
Tree plot saved as 'iris_imm_tree.png' and displayed.


In [17]:
# Feature importances of the tree fitted with mode=0. Left as the cell's last
# expression so the notebook displays the returned value.
d_imm_0.feature_importance()

Running 'feature_importance' method


[0, 0, 2, 0]

In [None]:
from d_imm.imm_model import Node
from pyspark.ml.functions import vector_to_array


def _leaf(cluster_label):
    """Return a Node with no children whose ``value`` is a cluster label."""
    node = Node()
    node.value = cluster_label
    return node


def _split(feature_index, threshold, left, right):
    """Return an internal Node that splits ``feature_index`` at ``threshold``."""
    node = Node()
    node.feature = feature_index
    node.value = threshold
    node.left = left
    node.right = right
    return node


# Hand-built tree used to exercise fill_stats_distributed:
#
#   root: feature 2 (petal length (cm) column index), threshold 1.9
#   ├── left : leaf, cluster 1
#   └── right: feature 2, threshold 5.1
#       ├── left : leaf, cluster 2
#       └── right: leaf, cluster 0
#
# Hand-recorded reference values from commented-out lines in the original
# cell (unverified): left leaf samples=50 / mistakes=0, right-left leaf
# samples=66 / mistakes=5, right-right leaf samples=34 / mistakes=0.
root_node = _split(
    2,
    1.9,
    left=_leaf(1),
    right=_split(2, 5.1, left=_leaf(2), right=_leaf(0)),
)

d_imm = DistributedIMM(spark, k=3, verbose=1)

# Cluster assignments plus an array copy of the feature vector.
clustered_data = model.transform(feature_df).select("features", "prediction")
clustered_data_vector = clustered_data.withColumn(
    "features_array", vector_to_array("features")
)

# Test the fill_stats_distributed method with the manually created tree
d_imm.fill_stats_distributed(root_node, clustered_data_vector)

# Print the results for verification
left_leaf = root_node.left
right_split = root_node.right
right_left_leaf = right_split.left
right_right_leaf = right_split.right

print("Root Node Stats: Samples =", root_node.samples)
print("Left Child Stats: Samples =", left_leaf.samples, "Mistakes =", left_leaf.mistakes)
print("Right Child Stats: Samples =", right_split.samples)
print("Right-Left Child Stats: Samples =", right_left_leaf.samples, "Mistakes =", right_left_leaf.mistakes)
print("Right-Right Child Stats: Samples =", right_right_leaf.samples, "Mistakes =", right_right_leaf.mistakes)

TESTING FEATURE IMPORTANCE 

In [None]:
# Compute and print the feature importances reported by `d_imm`.
# NOTE(review): in the visible cells `d_imm` is only constructed and passed
# through fill_stats_distributed(); no fit() call on it is shown — confirm that
# feature_importance() is meaningful on this instance.
feature_imp = d_imm.feature_importance()
print(feature_imp)