In [1]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import pandas as pd
import numpy as np
from ExKMC.Tree import Tree
from IPython.display import Image
import time

# Load the clustered dataset from a CSV file in the working directory.
df = pd.read_csv("iris_stacked_clustered_data_15000.csv")

# Disabled: code that tiled the frame up to 20,000,000 rows and wrote the
# result to a separate CSV file. Kept for reference only.
# num_duplicates = 20000000 // len(df) + 1

# df_large = pd.concat([df] * num_duplicates, ignore_index=True)

# df_large = df_large.iloc[:20000000]

# df_large.to_csv("us_census_data_1990_large.csv", index=False)

In [2]:
# Number of rows in the loaded frame (length of its index).
row_count = len(df.index)
print(f"Row count of the dataframe: {row_count}")

Row count of the dataframe: 15150


In [3]:
class KMeans_Wrapper:
    """Stand-in for a fitted KMeans model backed by precomputed results.

    Instead of running any clustering, the wrapper stores labels and centers
    that were computed elsewhere and hands them back through a KMeans-like
    interface (``n_clusters``, ``cluster_centers_``, ``fit``, ``predict``).
    NOTE(review): presumably these are the members ExKMC's ``Tree.fit`` reads
    from the model it is given -- verify against the ExKMC version in use.

    Parameters
    ----------
    features : the data the labels belong to (stored, not otherwise used here).
    n_clusters : number of clusters.
    predictions : one precomputed cluster label per row of the data.
    cluster_centers_ : precomputed cluster centers.
    """

    def __init__(self, features, n_clusters, predictions, cluster_centers_):
        self.predictions = predictions
        self.features = features
        self.cluster_centers_ = cluster_centers_
        self.n_clusters = n_clusters

    def fit(self, X):
        """No-op fit: nothing is learned, the stored results are kept as-is.

        Returns ``self`` so the call can be chained like a regular estimator.
        """
        print("dummy fitting")
        return self

    def predict(self, X):
        """Return the stored labels for ``X``.

        The labels are positional, so ``X`` must have exactly as many rows as
        there are stored labels; otherwise the labels would be silently paired
        with the wrong rows.

        Raises
        ------
        ValueError
            If the number of rows in ``X`` differs from the number of labels.
        """
        # Prefer .shape so array-likes without a usable len() are handled too.
        n_rows = X.shape[0] if hasattr(X, "shape") else len(X)
        n_labels = len(self.predictions)
        if n_rows != n_labels:
            raise ValueError(
                f"X has {n_rows} rows but {n_labels} precomputed predictions are stored"
            )
        return self.predictions
    


In [4]:
# Precomputed cluster centers: three centers (one per row), four coordinates each.
cluster_centers = np.asarray(
    [
        [5.88360656, 2.74098361, 4.38852459, 1.43442623],
        [6.85384615, 3.07692308, 5.71538462, 2.05384615],
        [5.006, 3.418, 1.464, 0.244],
    ]
)

In [5]:
# Wrap the precomputed results: the last column of df supplies the labels and
# cluster_centers supplies the centers.
kmeans_wrapper = KMeans_Wrapper(
    features=df,
    n_clusters=3,
    predictions=df.iloc[:, -1].values,
    cluster_centers_=cluster_centers,
)

In [6]:
# X_full is a second name for the same DataFrame object as df (no copy is made).
X_full = df

# Disabled: per-column numeric coercion and mean imputation, kept for reference.
# for col in X_full.columns:
#     X_full.loc[:, col] = pd.to_numeric(X_full[col], errors='coerce')

# X_full.fillna(X_full.mean(), inplace=True)

def measure_runtime_and_save_tree(feature_count, data_points):
    """Fit an ExKMC ``Tree`` on a slice of ``X_full`` and time the fit.

    The first ``data_points`` rows and first ``feature_count`` columns of the
    module-level ``X_full`` are used, together with the matching slice of the
    labels held by the module-level ``kmeans_wrapper``. Restricting the columns
    keeps the label column (the last column of the frame, which the wrapper
    uses as predictions) out of the features the tree may split on.
    NOTE(review): assumes the first ``feature_count`` columns are the features
    the stored cluster centers were computed on -- confirm against the CSV.

    Parameters
    ----------
    feature_count : int
        Number of leading columns of ``X_full`` to use as features.
    data_points : int
        Number of leading rows of ``X_full`` to use.

    Returns
    -------
    tuple
        ``(elapsed_seconds, tree_filename)`` where ``elapsed_seconds`` covers
        constructing and fitting the tree, and ``tree_filename`` is the base
        name passed to ``tree.plot`` (without the ``.gv.png`` suffix).
    """
    # Local import: the notebook only imports Image from IPython.display.
    from IPython.display import display

    # Slice rows, columns and labels consistently so every label still lines
    # up with its row.
    X_subset = X_full.iloc[:data_points, :feature_count]
    subset_model = KMeans_Wrapper(
        X_subset,
        kmeans_wrapper.n_clusters,
        kmeans_wrapper.predictions[:data_points],
        kmeans_wrapper.cluster_centers_,
    )

    # The tree needs the same number of clusters as the model it explains.
    k = subset_model.n_clusters

    # perf_counter is monotonic and high-resolution, which suits short intervals.
    start_time = time.perf_counter()

    tree = Tree(k=k)  # Initialize Tree model with number of clusters
    tree.fit(X_subset, subset_model)  # Fit the Tree model

    elapsed = time.perf_counter() - start_time

    # Name the output after the configuration that was actually requested.
    tree_filename = f"tree_features_{feature_count}_datapoints_{data_points}"
    tree.plot(filename=str(tree_filename), feature_names=list(X_subset.columns), view=False)

    # A bare Image(...) expression inside a function is discarded; display()
    # is required for the picture to appear in the cell output.
    display(Image(filename=f'{tree_filename}.gv.png'))

    return elapsed, tree_filename

# Benchmark grid: each feature count is paired with each data-set size.
feature_counts = [4]
data_point_sizes = [15000]

results = []

# Run every configuration once and collect one record per run.
for feature_count, data_points in (
    (n_features, n_rows)
    for n_features in feature_counts
    for n_rows in data_point_sizes
):
    print(f"Testing with {feature_count} features and {data_points} data points...")
    runtime, tree_filename = measure_runtime_and_save_tree(feature_count, data_points)
    print(f"Runtime: {runtime} seconds, Tree saved as {tree_filename}.gv.png")
    results += [{
        'Features': feature_count,
        'Data Points': data_points,
        'Runtime (seconds)': runtime,
        'Tree Filename': tree_filename
    }]

# Tabulate the records, echo them, then write them to CSV.
results_df = pd.DataFrame(results)
print(results_df)

results_df.to_csv('runtime_and_tree_results.csv', index=False)

Testing with 4 features and 15000 data points...
Runtime: 0.041790008544921875 seconds, Tree saved as tree_features_4_datapoints_15000.gv.png
   Features  Data Points  Runtime (seconds)                     Tree Filename
0         4        15000            0.04179  tree_features_4_datapoints_15000


In [7]:
import pandas as pd
import numpy as np
import time
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
from IPython.display import Image

# Tree is imported from ExKMC.Tree in the first cell; re-run that cell first
# if the kernel has been restarted.

# Coerce every column to a numeric dtype; unparseable values become NaN.
X_full = df.apply(lambda column: pd.to_numeric(column, errors='coerce'))

# Replace each NaN with the mean of its own column.
X_full = X_full.fillna(X_full.mean(numeric_only=True))

def measure_runtime_and_save_tree(feature_count, data_points, k=10):
    """Cluster a slice of ``X_full`` with KMeans, fit an ExKMC ``Tree``, and time the tree fit.

    The first ``data_points`` rows and first ``feature_count`` columns of the
    module-level ``X_full`` are standardized (numeric columns) and one-hot
    encoded (object/category columns), clustered with KMeans, and written to a
    CSV together with the cluster labels. Only constructing and fitting the
    tree is timed; preprocessing, KMeans and plotting are not.

    Parameters
    ----------
    feature_count : int
        Number of leading columns of ``X_full`` to use.
    data_points : int
        Number of leading rows of ``X_full`` to use.
    k : int, default 10
        Number of clusters for both KMeans and the tree.

    Returns
    -------
    tuple
        ``(elapsed_seconds, tree_filename, kmeans_results_filename)``;
        ``tree_filename`` is the base name passed to ``tree.plot`` (without the
        ``.gv.png`` suffix).

    Warns
    -----
    UserWarning
        If more rows or columns are requested than ``X_full`` contains. The
        file names still carry the requested sizes.
    """
    # Local imports: neither name is imported at notebook level.
    import warnings
    from IPython.display import display

    # .iloc clamps out-of-range slices without complaint, so an oversized
    # request would otherwise be benchmarked (and recorded) under the wrong size.
    available_rows, available_cols = X_full.shape
    if data_points > available_rows or feature_count > available_cols:
        warnings.warn(
            f"Requested {data_points} rows x {feature_count} features, but X_full "
            f"only has {available_rows} rows x {available_cols} columns; "
            "the run uses what is available.",
            stacklevel=2,
        )

    # Select a subset of the dataset
    X_subset = X_full.iloc[:data_points, :feature_count]

    # Identify categorical and numerical columns
    categorical_cols = X_subset.select_dtypes(include=['object', 'category']).columns.tolist()
    numerical_cols = X_subset.select_dtypes(exclude=['object', 'category']).columns.tolist()

    # sparse_threshold=0 forces a dense result even when one-hot columns are
    # present, so the matrix can be wrapped in a DataFrame and passed to the tree.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_cols),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
        ],
        sparse_threshold=0,
    )

    # Apply transformations
    X_preprocessed = preprocessor.fit_transform(X_subset)

    # Column names of the transformed matrix, in the transformed column order.
    preprocessed_feature_names = preprocessor.get_feature_names_out()
    X_preprocessed_df = pd.DataFrame(X_preprocessed, columns=preprocessed_feature_names)

    # KMeans clustering
    kmeans = KMeans(n_clusters=k, random_state=43, n_init=10)
    kmeans.fit(X_preprocessed)

    # Add cluster assignments
    X_preprocessed_df['Cluster'] = kmeans.labels_

    # Save feature data with cluster labels
    kmeans_results_filename = f"kmeans_features_{feature_count}_datapoints_{data_points}.csv"
    X_preprocessed_df.to_csv(kmeans_results_filename, index=False)
    print(f"KMeans results saved to {kmeans_results_filename}")

    # perf_counter is monotonic and high-resolution, which suits short intervals.
    start_time = time.perf_counter()

    tree = Tree(k=k)
    tree.fit(X_preprocessed, kmeans)

    elapsed = time.perf_counter() - start_time

    # Label the plot with the columns the tree was fitted on. The transformed
    # matrix can be reordered/expanded relative to X_subset, so names are taken
    # from the transformer output with its "<transformer>__" prefix stripped.
    plot_feature_names = [name.split("__", 1)[-1] for name in preprocessed_feature_names]

    # Save and visualize tree
    tree_filename = f"tree_features_{feature_count}_datapoints_{data_points}"
    tree.plot(filename=tree_filename, feature_names=plot_feature_names, view=False)

    # A bare Image(...) expression inside a function is discarded; display()
    # is required for the picture to appear in the cell output.
    display(Image(filename=f'{tree_filename}.gv.png'))

    return elapsed, tree_filename, kmeans_results_filename

# Benchmark grid: each feature count is paired with each data-set size.
feature_counts = [60]
data_point_sizes = [100000, 500000, 1000000]
results = []

# Run every configuration once and collect one record per run.
for feature_count, data_points in (
    (n_features, n_rows)
    for n_features in feature_counts
    for n_rows in data_point_sizes
):
    print(f"Testing with {feature_count} features and {data_points} data points...")
    runtime, tree_filename, kmeans_results_filename = measure_runtime_and_save_tree(feature_count, data_points)
    print(f"Runtime: {runtime:.2f} seconds, Tree saved as {tree_filename}.gv.png")
    results += [{
        'Features': feature_count,
        'Data Points': data_points,
        'Runtime (seconds)': runtime,
        'Tree Filename': tree_filename,
        'KMeans Results Filename': kmeans_results_filename
    }]

# Tabulate the records, write them to CSV, then echo them.
results_df = pd.DataFrame(results)
results_df.to_csv('runtime_and_tree_results.csv', index=False)

print(results_df)


Testing with 60 features and 100000 data points...
KMeans results saved to kmeans_features_60_datapoints_100000.csv
Runtime: 0.03 seconds, Tree saved as tree_features_60_datapoints_100000.gv.png
Testing with 60 features and 500000 data points...
KMeans results saved to kmeans_features_60_datapoints_500000.csv
Runtime: 0.02 seconds, Tree saved as tree_features_60_datapoints_500000.gv.png
Testing with 60 features and 1000000 data points...
KMeans results saved to kmeans_features_60_datapoints_1000000.csv
Runtime: 0.03 seconds, Tree saved as tree_features_60_datapoints_1000000.gv.png
   Features  Data Points  Runtime (seconds)  \
0        60       100000           0.032372   
1        60       500000           0.022983   
2        60      1000000           0.025612   

                         Tree Filename  \
0   tree_features_60_datapoints_100000   
1   tree_features_60_datapoints_500000   
2  tree_features_60_datapoints_1000000   

                     KMeans Results Filename  
0   kme