In [1]:
import os
os.environ["OMP_NUM_THREADS"]    = "4"
os.environ["MKL_NUM_THREADS"]    = "4"
os.environ["OPENBLAS_NUM_THREADS"] = "4"

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import pandas as pd
import numpy as np
from ExKMC.Tree import Tree
from IPython.display import Image
import time

In [2]:
import os

print("Number of threads used by OMP:", os.environ.get("OMP_NUM_THREADS"))
print("Number of threads used by MKL:", os.environ.get("MKL_NUM_THREADS"))
print("Number of threads used by OPENBLAS:", os.environ.get("OPENBLAS_NUM_THREADS"))

Number of threads used by OMP: 4
Number of threads used by MKL: 4
Number of threads used by OPENBLAS: 4


In [3]:

df = pd.read_csv("sub_data_2000000.csv")


In [5]:
output_folder = "outputs_US"
os.makedirs(output_folder, exist_ok=True)

X_full = df

for col in X_full.columns:
    X_full.loc[:, col] = pd.to_numeric(X_full[col], errors='coerce')

X_full.fillna(X_full.mean(), inplace=True)

def measure_runtime_and_save_tree(feature_count, data_points):
    X_subset = X_full.iloc[:data_points, :feature_count]

    categorical_cols = [col for col in X_subset.columns if X_subset[col].dtype == 'object']
    numerical_cols = [col for col in X_subset.columns if X_subset[col].dtype != 'object']

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_cols),
            ('cat', OneHotEncoder(), categorical_cols)
        ]
    )

    X_preprocessed = preprocessor.fit_transform(X_subset)
    X_preprocessed = pd.DataFrame(X_preprocessed)

    k = 10
    kmeans = KMeans(n_clusters=k, random_state=43)
    kmeans.fit(X_preprocessed)

    start_time = time.time()
    tree = Tree(k=k)
    tree.fit(X_preprocessed, kmeans)
    end_time = time.time()

    tree_filename = f"tree_features_{feature_count}_datapoints_{data_points}"
    tree_filepath = os.path.join(output_folder, tree_filename)
    tree.plot(filename=tree_filepath, feature_names=list(X_subset.columns), view=False)

    Image(filename=f'{tree_filepath}.gv.png')

    return end_time - start_time, tree_filepath

feature_counts = [45]
data_point_sizes = [100000, 1000000, 2000000]

results = []

for feature_count in feature_counts:
    for data_points in data_point_sizes:
        print(f"Testing with {feature_count} features and {data_points} data points...")
        runtime, tree_filepath = measure_runtime_and_save_tree(feature_count, data_points)
        print(f"Runtime: {runtime} seconds, Tree saved as {tree_filepath}.gv.png")
        results.append({
            'Features': feature_count,
            'Data Points': data_points,
            'Runtime (seconds)': runtime,
            'Tree Filename': tree_filepath + '.gv.png'
        })

results_df = pd.DataFrame(results)
print(results_df)

results_df.to_csv(os.path.join(output_folder, 'runtime_and_tree_results.csv'), index=False)


Testing with 45 features and 100000 data points...
Runtime: 2.6705679893493652 seconds, Tree saved as outputs_US/tree_features_45_datapoints_100000.gv.png
Testing with 45 features and 1000000 data points...
Runtime: 37.36707901954651 seconds, Tree saved as outputs_US/tree_features_45_datapoints_1000000.gv.png
Testing with 45 features and 2000000 data points...
Runtime: 69.77918791770935 seconds, Tree saved as outputs_US/tree_features_45_datapoints_2000000.gv.png
   Features  Data Points  Runtime (seconds)  \
0        45       100000           2.670568   
1        45      1000000          37.367079   
2        45      2000000          69.779188   

                                       Tree Filename  
0  outputs_US/tree_features_45_datapoints_100000....  
1  outputs_US/tree_features_45_datapoints_1000000...  
2  outputs_US/tree_features_45_datapoints_2000000...  
