In [1]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import pandas as pd
import numpy as np
from ExKMC.Tree import Tree
from IPython.display import Image
import time

In [2]:

df = pd.read_csv("/Users/bojitha/Downloads/SUSY.csv.gz" , compression='gzip', header=None)

# df = pd.read_csv("experiment_results/subset_features_28_datapoints_3000000.csv")


In [5]:
df = df.drop(df.columns[0], axis=1)
df.columns = [f'feature {i+1}' for i in range(df.shape[1])]
print(df.columns)

df.columns

Index(['feature 1', 'feature 2', 'feature 3', 'feature 4', 'feature 5',
       'feature 6', 'feature 7', 'feature 8', 'feature 9', 'feature 10',
       'feature 11', 'feature 12', 'feature 13', 'feature 14', 'feature 15',
       'feature 16', 'feature 17', 'feature 18'],
      dtype='object')


Index(['feature 1', 'feature 2', 'feature 3', 'feature 4', 'feature 5',
       'feature 6', 'feature 7', 'feature 8', 'feature 9', 'feature 10',
       'feature 11', 'feature 12', 'feature 13', 'feature 14', 'feature 15',
       'feature 16', 'feature 17', 'feature 18'],
      dtype='object')

In [6]:
df.head

<bound method NDFrame.head of          feature 1  feature 2  feature 3  feature 4  feature 5  feature 6  \
0         0.972861   0.653855   1.176225   1.157156  -1.739873  -0.874309   
1         1.667973   0.064191  -1.225171   0.506102  -0.338939   1.672543   
2         0.444840  -0.134298  -0.709972   0.451719  -1.613871  -0.768661   
3         0.381256  -0.976145   0.693152   0.448959   0.891753  -0.677328   
4         1.309996  -0.690089  -0.676259   1.589283  -0.693326   0.622907   
...            ...        ...        ...        ...        ...        ...   
4999995   0.853325  -0.961783  -1.487277   0.678190   0.493580   1.647969   
4999996   0.951581   0.139370   1.436884   0.880440  -0.351948  -0.740852   
4999997   0.840389   1.419162  -1.218766   1.195631   1.695645   0.663756   
4999998   1.784218  -0.833565  -0.560091   0.953342  -0.688969  -1.428233   
4999999   0.761500   0.680454  -1.186213   1.043521  -0.316755   0.246879   

         feature 7  feature 8  feature 9  fea

In [7]:
import os

output_dir = "SUSY_experiment_results"
os.makedirs(output_dir, exist_ok=True)

X_full = df

class KMeansWrapper:
    def __init__(self, cluster_centers_ , n_clusters=10, random_state=43):
        self.n_clusters = n_clusters
        self.cluster_centers_ = cluster_centers_
        self.random_state = random_state
        self.labels_ = None

    def fit(self, X):
        # Use the 'prediction' column as cluster labels
        if 'prediction' not in X_full.columns:
            raise ValueError("The dataframe must contain a 'prediction' column.")
        self.labels_ = X_full.loc[X.index, 'prediction'].values
        return self

    def predict(self, X):
        # Return the stored labels for the given indices
        return self.labels_

def measure_runtime_and_save_tree(feature_count, data_points):
    # Select first `feature_count` columns and `data_points` rows
    X_subset = X_full.iloc[:data_points, :feature_count]
    print(X_subset.columns)
    # X_subset.to_csv(os.path.join(output_dir, f"subset_features_{feature_count}_datapoints_{data_points}.csv"), index=False)

    # Preprocess the dataset: only standardize numerical features
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), X_subset.columns)
        ]
    )

    X_preprocessed = preprocessor.fit_transform(X_subset)
    X_preprocessed = pd.DataFrame(X_preprocessed)

    # Fit KMeansWrapper model
    k = 10  # Number of clusters
    # kmeans = KMeansWrapper(cluster_centers_ =cluster_centers , n_clusters=k, random_state=43 )
    kmeans = KMeans(n_clusters=k, random_state=43)
    kmeans.fit(X_subset)

    # Measure the runtime for Tree creation
    start_time = time.time()
    tree = Tree(k=k)
    tree.fit(X_preprocessed, kmeans)
    end_time = time.time()

    tree_filename = f"tree_features_{feature_count}_datapoints_{data_points}"
    tree_filepath = os.path.join(output_dir, tree_filename)
    tree.plot(filename=tree_filepath, feature_names=list(X_subset.columns), view=False)

    score = tree.score(X_preprocessed)
    print(f"Tree Kmeans Cost score: {score}")

    surrogate_score = tree.surrogate_score(X_preprocessed)
    print(f"Tree Surrogate Cost score: {surrogate_score}")


    # Move the generated .gv and .gv.png files to the output directory if not already there
    for ext in [".gv", ".gv.png"]:
        src = f"{tree_filename}{ext}"
        dst = os.path.join(output_dir, f"{tree_filename}{ext}")
        if os.path.exists(src):
            os.replace(src, dst)

    Image(filename=f'{tree_filepath}.gv.png')

    return end_time - start_time, tree_filename

feature_counts = [18]
data_point_sizes = [1000000,3000000,5000000]

results = []

for feature_count in feature_counts:
    for data_points in data_point_sizes:
        print(f"Testing with {feature_count} features and {data_points} data points...")
        runtime, tree_filename = measure_runtime_and_save_tree(feature_count, data_points)
        print(f"Runtime: {runtime} seconds, Tree saved as {tree_filename}.gv.png")
        results.append({
            'Features': feature_count,
            'Data Points': data_points,
            'Runtime (seconds)': runtime,
            'Tree Filename': tree_filename
        })

results_df = pd.DataFrame(results)
print(results_df)

results_csv_path = os.path.join(output_dir, f'{data_point_sizes}_runtime_and_tree_results.csv')
results_df.to_csv(results_csv_path, index=False)


Testing with 18 features and 1000000 data points...
Index(['feature 1', 'feature 2', 'feature 3', 'feature 4', 'feature 5',
       'feature 6', 'feature 7', 'feature 8', 'feature 9', 'feature 10',
       'feature 11', 'feature 12', 'feature 13', 'feature 14', 'feature 15',
       'feature 16', 'feature 17', 'feature 18'],
      dtype='object')




Tree Kmeans Cost score: 12005558.07480888
Tree Surrogate Cost score: 21212113.344883427
Runtime: 21.023695707321167 seconds, Tree saved as tree_features_18_datapoints_1000000.gv.png
Testing with 18 features and 3000000 data points...
Index(['feature 1', 'feature 2', 'feature 3', 'feature 4', 'feature 5',
       'feature 6', 'feature 7', 'feature 8', 'feature 9', 'feature 10',
       'feature 11', 'feature 12', 'feature 13', 'feature 14', 'feature 15',
       'feature 16', 'feature 17', 'feature 18'],
      dtype='object')




Tree Kmeans Cost score: 37059730.35868256
Tree Surrogate Cost score: 65397630.34135698
Runtime: 90.17428183555603 seconds, Tree saved as tree_features_18_datapoints_3000000.gv.png
Testing with 18 features and 5000000 data points...
Index(['feature 1', 'feature 2', 'feature 3', 'feature 4', 'feature 5',
       'feature 6', 'feature 7', 'feature 8', 'feature 9', 'feature 10',
       'feature 11', 'feature 12', 'feature 13', 'feature 14', 'feature 15',
       'feature 16', 'feature 17', 'feature 18'],
      dtype='object')




Tree Kmeans Cost score: 61746279.08814829
Tree Surrogate Cost score: 109053295.66943437
Runtime: 187.93084597587585 seconds, Tree saved as tree_features_18_datapoints_5000000.gv.png
   Features  Data Points  Runtime (seconds)  \
0        18      1000000          21.023696   
1        18      3000000          90.174282   
2        18      5000000         187.930846   

                         Tree Filename  
0  tree_features_18_datapoints_1000000  
1  tree_features_18_datapoints_3000000  
2  tree_features_18_datapoints_5000000  


In [8]:
output_dir = "SUSY_experiment_results_2"
os.makedirs(output_dir, exist_ok=True)

X_full = df  # Assume 'df' is already defined and includes the 'prediction' column

class KMeansWrapper:
    def __init__(self, cluster_centers_, n_clusters=10, random_state=43):
        self.n_clusters = n_clusters
        self.cluster_centers_ = cluster_centers_
        self.random_state = random_state
        self.labels_ = None

    def fit(self, X):
        if 'prediction' not in X_full.columns:
            raise ValueError("The dataframe must contain a 'prediction' column.")
        self.labels_ = X_full.loc[X.index, 'prediction'].values
        return self

    def predict(self, X):
        return self.labels_

def measure_runtime_and_save_tree(feature_count, data_points):
    # Select subset
    X_subset = X_full.iloc[:data_points, :feature_count]
    print(f"Columns used: {list(X_subset.columns)}")

    # Save the subset to a CSV file
    subset_csv_path = os.path.join(output_dir, f"subset_features_{feature_count}_datapoints_{data_points}.csv")
    X_subset.to_csv(subset_csv_path, index=False)

    # Preprocess numeric features
    preprocessor = ColumnTransformer(
        transformers=[('num', StandardScaler(), X_subset.columns)]
    )
    X_preprocessed = preprocessor.fit_transform(X_subset)
    X_preprocessed = pd.DataFrame(X_preprocessed)

    # Fit KMeans model
    k = 10
    kmeans = KMeans(n_clusters=k, random_state=43)
    kmeans.fit(X_subset)

    # Fit tree and measure runtime
    start_time = time.time()
    tree = Tree(k=k)
    tree.fit(X_preprocessed, kmeans)
    end_time = time.time()

    # Save and plot the tree
    tree_filename = f"tree_features_{feature_count}_datapoints_{data_points}"
    tree_filepath = os.path.join(output_dir, tree_filename)
    tree.plot(filename=tree_filepath, feature_names=list(X_subset.columns), view=False)

    print(f"Tree KMeans Cost score: {tree.score(X_preprocessed)}")
    print(f"Tree Surrogate Cost score: {tree.surrogate_score(X_preprocessed)}")

    # Move plot files
    for ext in [".gv", ".gv.png"]:
        src = f"{tree_filename}{ext}"
        dst = os.path.join(output_dir, f"{tree_filename}{ext}")
        if os.path.exists(src):
            os.replace(src, dst)

    Image(filename=f'{tree_filepath}.gv.png')
    return end_time - start_time, tree_filename

# Parameters to test
feature_counts = [18]
data_point_sizes = [1000000, 3000000, 5000000]

results = []

for feature_count in feature_counts:
    for data_points in data_point_sizes:
        print(f"Testing with {feature_count} features and {data_points} data points...")
        runtime, tree_filename = measure_runtime_and_save_tree(feature_count, data_points)
        print(f"Runtime: {runtime:.2f} seconds, Tree saved as {tree_filename}.gv.png")
        results.append({
            'Features': feature_count,
            'Data Points': data_points,
            'Runtime (seconds)': runtime,
            'Tree Filename': tree_filename
        })

# Save results summary
results_df = pd.DataFrame(results)
print(results_df)
results_csv_path = os.path.join(output_dir, f'{data_point_sizes}_runtime_and_tree_results.csv')
results_df.to_csv(results_csv_path, index=False)

Testing with 18 features and 1000000 data points...
Columns used: ['feature 1', 'feature 2', 'feature 3', 'feature 4', 'feature 5', 'feature 6', 'feature 7', 'feature 8', 'feature 9', 'feature 10', 'feature 11', 'feature 12', 'feature 13', 'feature 14', 'feature 15', 'feature 16', 'feature 17', 'feature 18']




Tree KMeans Cost score: 12005558.07480888
Tree Surrogate Cost score: 21212113.344883427
Runtime: 23.90 seconds, Tree saved as tree_features_18_datapoints_1000000.gv.png
Testing with 18 features and 3000000 data points...
Columns used: ['feature 1', 'feature 2', 'feature 3', 'feature 4', 'feature 5', 'feature 6', 'feature 7', 'feature 8', 'feature 9', 'feature 10', 'feature 11', 'feature 12', 'feature 13', 'feature 14', 'feature 15', 'feature 16', 'feature 17', 'feature 18']




Tree KMeans Cost score: 37059730.35868256
Tree Surrogate Cost score: 65397630.34135698
Runtime: 99.65 seconds, Tree saved as tree_features_18_datapoints_3000000.gv.png
Testing with 18 features and 5000000 data points...
Columns used: ['feature 1', 'feature 2', 'feature 3', 'feature 4', 'feature 5', 'feature 6', 'feature 7', 'feature 8', 'feature 9', 'feature 10', 'feature 11', 'feature 12', 'feature 13', 'feature 14', 'feature 15', 'feature 16', 'feature 17', 'feature 18']




Tree KMeans Cost score: 61746279.08814829
Tree Surrogate Cost score: 109053295.66943437
Runtime: 231.42 seconds, Tree saved as tree_features_18_datapoints_5000000.gv.png
   Features  Data Points  Runtime (seconds)  \
0        18      1000000          23.904796   
1        18      3000000          99.651724   
2        18      5000000         231.423066   

                         Tree Filename  
0  tree_features_18_datapoints_1000000  
1  tree_features_18_datapoints_3000000  
2  tree_features_18_datapoints_5000000  
