In [1]:
import os
import sys
import jax
import jax.numpy as jnp
import flax.linen as nn
import optax
import pickle
import matplotlib.pyplot as plt
import numpy as np
from typing import Sequence, List, Dict, Any, Tuple, Optional
from tqdm import tqdm
from functools import partial

import e3nn_jax as e3nn
import plotly.graph_objects as go
from itertools import permutations
import chex


# Add src directory to path so the project-local `utils` and `spectra`
# modules can be imported. The path is derived from the current working
# directory, so the notebook is expected to run from a sibling of `src/`.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
src_dir = os.path.join(project_root, 'src')
# Guard the append so re-running this cell does not keep adding duplicates.
if src_dir not in sys.path:
    sys.path.append(src_dir)

from utils.plotters import visualize_geometry, colorplot, visualize_signal, compare_geometries
from utils.geometries import trigonal_plane, tetrahedron, octahedron, icosahedron, dodecahedron
from utils.alignment import sample_uniform_quaternion, align_signals, find_best_random_quaternion, rotate_points_quaternion, stack_points, point_distance, spherical_harmonic_distance, stack_points, choose_best_quaternion, evenly_distributed_quaternions, quaternion_rotation_distance, spherical_grid_distance
import spectra
from spectra import sum_of_diracs, powerspectrum, bispectrum, trispectrum

# Enable 64-bit precision for JAX computations in this notebook.
# NOTE(review): this flag is normally expected to be set before any JAX
# arrays are created — confirm that none of the imports above allocate
# arrays at import time.
jax.config.update("jax_enable_x64", True)

In [2]:
# Upper bound on how many local environments are kept for this experiment.
MAX_LOCAL_ENVS = 100000

# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file.
# Only load this pickle if it was produced by a trusted source.
with open('../data/qm9_local_envs.pkl', 'rb') as f:
    local_envs = pickle.load(f)

# Keep only the first MAX_LOCAL_ENVS entries (slicing is a no-op if fewer exist).
local_envs = local_envs[:MAX_LOCAL_ENVS]
print(f"Number of local environments: {len(local_envs)}")

Number of local environments: 100000


In [3]:
# Build the classification dataset: for every local environment, record its
# bispectrum (feature) and the first-axis length of its array (target).
# `lmax` is forwarded to sum_of_diracs; presumably the maximum
# spherical-harmonic degree of the signal — TODO confirm against `spectra`.
lmax = 4
# Both lists are filled in input order, so index i of each refers to
# local_envs[i].
bispectra = []
num_neighbors = []
for local_env in tqdm(local_envs):
    true_geometry = jnp.array(local_env)
    # The number of rows is taken as the neighbor count (presumably one
    # coordinate row per neighbor — verify against the pickle's layout).
    num_neighbors.append(true_geometry.shape[0])
    # Geometry -> signal -> bispectrum, using the project's `spectra` helpers.
    true_signal = sum_of_diracs(true_geometry, lmax)
    true_bispectrum = bispectrum(true_signal)
    bispectra.append(true_bispectrum)

  0%|          | 0/100000 [00:00<?, ?it/s]

100%|██████████| 100000/100000 [03:09<00:00, 528.89it/s]


In [9]:
import numpy as np
import time
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report

def train_neighbor_classifier(X, y):
    """
    Train classifiers that predict the number of neighbors from bispectrum
    vectors and return the one with the fastest test-set inference.

    Three candidates (random forest, RBF-kernel SVM, small MLP) are:
      1. compared with 5-fold cross-validation on the training split
         (reported only; it does not drive the final selection),
      2. refit on the full standardized training split,
      3. timed on a single ``predict`` call over the test split and scored
         for accuracy.
    The model with the smallest measured inference time is returned,
    regardless of its accuracy.

    Parameters:
    X (array-like): Bispectrum vectors of shape (n_samples, n_features)
    y (array-like): Number of neighbors of shape (n_samples,)

    Returns:
    best_model: The fitted model with the fastest test-set inference
    scaler: StandardScaler fitted on the training split; apply it to new
        data before calling ``best_model.predict``
    """
    # Local import: only the cross-validation step needs the pipeline helper.
    from sklearn.pipeline import make_pipeline

    # Accept lists (e.g. a list of JAX arrays) as well as numpy arrays.
    X = np.asarray(X)
    y = np.asarray(y)

    # Split data into training and test sets (stratified to keep class ratios)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Scale features; the scaler is fitted on the training split only
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Initialize models
    models = {
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
        'SVM': SVC(kernel='rbf', random_state=42),
        'Neural Network': MLPClassifier(hidden_layer_sizes=(32, 16), max_iter=1000, random_state=42)
    }

    # Evaluate models using cross-validation
    best_model_name = None
    best_cv_score = float("-inf")

    for name, model in models.items():
        # Scale inside each fold so validation folds never influence the
        # scaling statistics. cross_val_score clones the pipeline, so the
        # estimators in `models` stay unfitted here.
        cv_pipeline = make_pipeline(StandardScaler(), model)
        cv_scores = cross_val_score(cv_pipeline, X_train, y_train, cv=5, scoring='accuracy')
        mean_cv_score = np.mean(cv_scores)

        print(f"{name} - Mean CV Accuracy: {mean_cv_score:.4f}")

        if mean_cv_score > best_cv_score:
            best_cv_score = mean_cv_score
            best_model_name = name

    print(f"\nBest model based on cross-validation: {best_model_name}")

    # Train all models on the full training set and time inference
    inference_times = {}
    trained_models = {}
    test_predictions = {}

    for name, model in models.items():
        # Train model
        model.fit(X_train_scaled, y_train)
        trained_models[name] = model

        # Time a single prediction pass with a monotonic, high-resolution
        # clock, and keep the predictions so the model is not re-run.
        start_time = time.perf_counter()
        y_pred = model.predict(X_test_scaled)
        inference_time = time.perf_counter() - start_time
        inference_times[name] = inference_time
        test_predictions[name] = y_pred

        # Evaluate on the test set
        test_accuracy = accuracy_score(y_test, y_pred)

        print(f"{name} - Test accuracy: {test_accuracy:.4f}, Inference time: {inference_time:.4f} seconds")

    # Select the fastest model
    fastest_model_name = min(inference_times, key=inference_times.get)
    print(f"\nFastest model for inference: {fastest_model_name} ({inference_times[fastest_model_name]:.4f} seconds)")

    # Use the fastest model as the best model
    best_model = trained_models[fastest_model_name]

    # Print classification report for the best model (reusing its predictions)
    print("\nClassification Report for the fastest model:")
    print(classification_report(y_test, test_predictions[fastest_model_name]))

    return best_model, scaler

def predict_neighbors(model, scaler, bispectra):
    """
    Predict the number of neighbors for new bispectrum vectors.

    Parameters:
    model: Trained model exposing ``predict``
    scaler: Fitted scaler exposing ``transform``
    bispectra (array-like): Bispectrum vectors of shape (n_samples, n_features),
        or a single vector of shape (n_features,). Lists and other array-likes
        are accepted and converted with ``np.asarray``.

    Returns:
    numpy.ndarray: Predicted number of neighbors for each sample
    """
    # Coerce first: plain lists have no .ndim/.reshape, so the original
    # attribute access failed unless the caller converted beforehand.
    features = np.asarray(bispectra)

    # A single sample is promoted to a one-row batch
    if features.ndim == 1:
        features = features.reshape(1, -1)

    # Apply the same scaling used at training time, then predict
    features_scaled = scaler.transform(features)
    predictions = model.predict(features_scaled)

    return predictions

In [10]:
# Train on the bispectra computed above and persist the resulting artifacts
if __name__ == "__main__":
    # Features and targets are the per-environment lists built earlier
    X, y = bispectra, num_neighbors

    # Fit the candidates and keep the model/scaler pair that is returned
    best_model, scaler = train_neighbor_classifier(X, y)

    # Write both artifacts to disk so later cells/sessions can reload them
    import joblib
    for artifact, artifact_path in (
        (best_model, 'neighbor_classifier_model.pkl'),
        (scaler, 'neighbor_classifier_scaler.pkl'),
    ):
        joblib.dump(artifact, artifact_path)

    # Example of loading and using the saved model
    # loaded_model = joblib.load('neighbor_classifier_model.pkl')
    # loaded_scaler = joblib.load('neighbor_classifier_scaler.pkl')
    # prediction = predict_neighbors(loaded_model, loaded_scaler, new_bispectrum)

Random Forest - Mean CV Accuracy: 1.0000
SVM - Mean CV Accuracy: 1.0000
Neural Network - Mean CV Accuracy: 1.0000

Best model based on cross-validation: SVM
Random Forest - Test accuracy: 1.0000, Inference time: 0.0432 seconds
SVM - Test accuracy: 1.0000, Inference time: 0.1490 seconds
Neural Network - Test accuracy: 1.0000, Inference time: 0.0040 seconds

Fastest model for inference: Neural Network (0.0040 seconds)

Classification Report for the fastest model:
              precision    recall  f1-score   support

           1       1.00      1.00      1.00      1607
           2       1.00      1.00      1.00      4350
           3       1.00      1.00      1.00      4677
           4       1.00      1.00      1.00      9332
           5       1.00      1.00      1.00        34

    accuracy                           1.00     20000
   macro avg       1.00      1.00      1.00     20000
weighted avg       1.00      1.00      1.00     20000



In [6]:
# Run the selected model over every bispectrum in the dataset.
# predict_neighbors expects an array, so the list is converted first.
import numpy as np
bispectra_array = np.array(bispectra)
pred_num_neighbors = predict_neighbors(best_model, scaler, bispectra_array)

# Summary statistics of the predictions, emitted as one block of text
prediction_summary = [
    "Predicted neighbors statistics:",
    f"Mean: {pred_num_neighbors.mean():.2f}",
    f"Min: {pred_num_neighbors.min()}",
    f"Max: {pred_num_neighbors.max()}",
    f"Number of samples: {len(pred_num_neighbors)}",
]
print("\n".join(prediction_summary))

# Compare against the ground-truth counts when they exist in this session
if 'num_neighbors' in locals():
    from sklearn.metrics import mean_absolute_error, r2_score
    mae = mean_absolute_error(num_neighbors, pred_num_neighbors)
    r2 = r2_score(num_neighbors, pred_num_neighbors)
    print("\nModel performance:")
    print(f"Mean Absolute Error: {mae:.2f}")
    print(f"R² Score: {r2:.4f}")


Predicted neighbors statistics:
Mean: 3.09
Min: 1
Max: 5
Number of samples: 100000

Model performance:
Mean Absolute Error: 0.00
R² Score: 1.0000


In [11]:
# Reload the persisted classifier and scaler and re-check them end to end
import joblib
import numpy as np
from sklearn.metrics import accuracy_score, classification_report

# Artifacts written by the training cell
loaded_model = joblib.load('neighbor_classifier_model.pkl')
loaded_scaler = joblib.load('neighbor_classifier_scaler.pkl')

# Inference inputs and reference labels.
# NOTE: this is the same dataset the training cell was given, so most of
# these samples were seen during training; the score below is a round-trip
# sanity check of the saved files rather than a held-out estimate.
X = np.array(bispectra)
y_true = np.array(num_neighbors)

# Scale with the reloaded scaler, then predict with the reloaded model
X_scaled = loaded_scaler.transform(X)
y_pred = loaded_model.predict(X_scaled)

# Overall accuracy
accuracy = accuracy_score(y_true, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")

# Per-class precision/recall breakdown
print("\nClassification Report:")
print(classification_report(y_true, y_pred))

# Indices where prediction and label disagree
misclassified = np.flatnonzero(y_pred != y_true)
print(f"\nNumber of misclassified samples: {len(misclassified)} out of {len(y_true)}")
if misclassified.size > 0:
    print("Sample of misclassifications (first 10):")
    for i in misclassified[:10]:
        print(f"  Sample {i}: True={y_true[i]}, Predicted={y_pred[i]}")


Model Accuracy: 1.0000

Classification Report:
              precision    recall  f1-score   support

           1       1.00      1.00      1.00      8037
           2       1.00      1.00      1.00     21748
           3       1.00      1.00      1.00     23386
           4       1.00      1.00      1.00     46657
           5       1.00      1.00      1.00       172

    accuracy                           1.00    100000
   macro avg       1.00      1.00      1.00    100000
weighted avg       1.00      1.00      1.00    100000


Number of misclassified samples: 0 out of 100000
