In [1]:
import os
# Cap the thread pools of the OpenMP / MKL / OpenBLAS backends at four threads.
# This runs before the numeric libraries below are imported; such backends
# typically read these variables when they are first loaded.
for _thread_var in ("OMP_NUM_THREADS", "MKL_NUM_THREADS", "OPENBLAS_NUM_THREADS"):
    os.environ[_thread_var] = "4"

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import pandas as pd
import numpy as np
from ExKMC.Tree import Tree
from IPython.display import Image
import time

In [2]:
import os

# Echo the thread limits currently visible in the process environment so the
# benchmark output records which settings were in effect.
_thread_reports = (
    ("Number of threads used by OMP:", "OMP_NUM_THREADS"),
    ("Number of threads used by MKL:", "MKL_NUM_THREADS"),
    ("Number of threads used by OPENBLAS:", "OPENBLAS_NUM_THREADS"),
)
for _message, _variable in _thread_reports:
    print(_message, os.environ.get(_variable))

Number of threads used by OMP: 4
Number of threads used by MKL: 4
Number of threads used by OPENBLAS: 4


In [3]:

# Raw input table for the benchmark, read from a CSV next to the notebook.
DATA_PATH = "sub_data_2000000.csv"
df = pd.read_csv(DATA_PATH)


In [4]:
# Build a purely numeric working copy of the raw frame.
#
# ``DataFrame.apply`` returns a new frame, so ``df`` is left untouched and this
# cell gives the same result every time it is re-run. Writing through
# ``X_full.loc[:, col] = ...`` is deliberately avoided: that form stores the
# values into the existing column and can leave its ``object`` dtype in place,
# in which case the dtype check inside ``measure_runtime_and_save_tree`` would
# treat numeric columns as categorical.
X_full = df.apply(pd.to_numeric, errors='coerce')

# Impute entries that were missing or could not be parsed with the column mean.
# NOTE: a column without a single numeric value has a NaN mean and stays NaN.
X_full = X_full.fillna(X_full.mean())

def measure_runtime_and_save_tree(feature_count, data_points, k=10, random_state=43):
    """Time the ExKMC ``Tree`` fit on a slice of ``X_full`` and render the tree.

    The first ``data_points`` rows and first ``feature_count`` columns of the
    module-level ``X_full`` frame are preprocessed (non-object columns are
    standardized, object columns are one-hot encoded), clustered with KMeans,
    and then fitted with a ``Tree``. Only the ``Tree`` construction and fit are
    timed; preprocessing, the KMeans fit and plotting are excluded.

    Parameters
    ----------
    feature_count : int
        Number of leading columns of ``X_full`` to use.
    data_points : int
        Number of leading rows of ``X_full`` to use.
    k : int, default 10
        Number of clusters passed to both ``KMeans`` and ``Tree``.
    random_state : int, default 43
        Seed passed to ``KMeans``.

    Returns
    -------
    tuple of (float, str)
        Elapsed seconds for the ``Tree`` fit, and the base filename handed to
        ``tree.plot`` (the image is then read from ``<base>.gv.png``).
    """
    # Local import: only needed to render the image from inside a function.
    from IPython.display import display

    # Select first `feature_count` columns and `data_points` rows
    X_subset = X_full.iloc[:data_points, :feature_count]

    # Split columns by dtype so each group gets the matching transformer.
    categorical_cols = [col for col in X_subset.columns if X_subset[col].dtype == 'object']
    numerical_cols = [col for col in X_subset.columns if X_subset[col].dtype != 'object']

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_cols),  # Standardize numerical features
            ('cat', OneHotEncoder(), categorical_cols)  # One-hot encode categorical features
        ],
        # Always produce a dense array; a sparse result would not convert into
        # a plain numeric DataFrame below.
        sparse_threshold=0,
        # Keep the original column names (no "num__"/"cat__" prefixes) in the
        # plotted tree. scikit-learn raises if this makes two names collide.
        verbose_feature_names_out=False,
    )

    X_preprocessed = pd.DataFrame(preprocessor.fit_transform(X_subset))

    # The transformer emits all numeric columns first and then the expanded
    # one-hot columns, so the labels must come from the fitted transformer
    # rather than from the input frame to stay aligned with feature indices.
    feature_names = list(preprocessor.get_feature_names_out())

    # The Tree is fitted against an already-fitted KMeans model.
    kmeans = KMeans(n_clusters=k, random_state=random_state)
    kmeans.fit(X_preprocessed)

    # perf_counter is monotonic, so the measured interval cannot be distorted
    # by system clock adjustments during a long fit.
    start_time = time.perf_counter()

    tree = Tree(k=k)  # Initialize Tree model with number of clusters
    tree.fit(X_preprocessed, kmeans)  # Fit the Tree model

    elapsed = time.perf_counter() - start_time

    tree_filename = f"tree_features_{feature_count}_datapoints_{data_points}"
    tree.plot(filename=tree_filename, feature_names=feature_names, view=False)

    # A bare Image(...) expression inside a function is discarded, so the
    # rendered tree has to be displayed explicitly.
    display(Image(filename=f'{tree_filename}.gv.png'))

    return elapsed, tree_filename

# Benchmark configuration: every feature count is paired with every row count.
feature_counts = [60]
data_point_sizes = [100000, 1000000, 2000000]

# Flatten the two dimensions up front; feature count varies slowest.
_benchmark_grid = [
    (n_features, n_rows)
    for n_features in feature_counts
    for n_rows in data_point_sizes
]

results = []
for feature_count, data_points in _benchmark_grid:
    print(f"Testing with {feature_count} features and {data_points} data points...")
    runtime, tree_filename = measure_runtime_and_save_tree(feature_count, data_points)
    print(f"Runtime: {runtime} seconds, Tree saved as {tree_filename}.gv.png")

    # One summary row per run, keyed by the column names of the final table.
    record = dict(zip(
        ('Features', 'Data Points', 'Runtime (seconds)', 'Tree Filename'),
        (feature_count, data_points, runtime, tree_filename),
    ))
    results.append(record)

# Collect all runs into one table, show it, and persist it next to the notebook.
results_df = pd.DataFrame(results)
print(results_df)

results_df.to_csv('runtime_and_tree_results.csv', index=False)

Testing with 60 features and 100000 data points...
Runtime: 1.8185007572174072 seconds, Tree saved as tree_features_60_datapoints_100000.gv.png
Testing with 60 features and 1000000 data points...
Runtime: 36.28075695037842 seconds, Tree saved as tree_features_60_datapoints_1000000.gv.png
Testing with 60 features and 2000000 data points...
Runtime: 86.37850904464722 seconds, Tree saved as tree_features_60_datapoints_2000000.gv.png
   Features  Data Points  Runtime (seconds)  \
0        60       100000           1.818501   
1        60      1000000          36.280757   
2        60      2000000          86.378509   

                         Tree Filename  
0   tree_features_60_datapoints_100000  
1  tree_features_60_datapoints_1000000  
2  tree_features_60_datapoints_2000000  
