In [None]:
import os
import time
import numpy as np
import pickle
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.linear_model import LassoLars, RidgeCV, LinearRegression
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
import matplotlib.pyplot as plt

# Start a wall-clock timer. It is read only at the very end of the script, so
# the reported duration covers loading, grid search and plotting together.
start = time.time()

# Load data, this should work with any of the fixed representations
data_dir = 'soap/average'

# Keep regular files only: a stray sub-directory (for example Jupyter's
# `.ipynb_checkpoints`) would otherwise make `open()` below raise.
# Sorting gives a deterministic load order.
# NOTE(review): the sort is lexicographic ("10" sorts before "2") — confirm it
# matches the row order of the targets in 'sample_energy.npy'.
files = np.sort([
    entry for entry in os.listdir(data_dir)
    if os.path.isfile(os.path.join(data_dir, entry))
])
data_list = []

for file in files:
    with open(os.path.join(data_dir, file), 'rb') as f:
        # SECURITY: unpickling can execute arbitrary code; only load files
        # that come from a trusted source.
        data = pickle.load(f)
        data_list.append(data)

# Sanity check: shape of the stacked per-file descriptors
print(np.shape(data_list))

# Flatten every loaded descriptor into one feature row per file
X = np.array(data_list).reshape(len(files), -1)
y = np.load('sample_energy.npy')

# Split the data into training and testing sets BEFORE any scaling, so that
# no statistic of the held-out rows can influence the preprocessing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the min-max scaler on the training rows only, then apply the same
# transform to the test rows. Fitting on the full matrix (as was done before)
# leaks the test set's per-feature min/max into the training features.
# Test values may therefore fall slightly outside [0, 1]; that is expected.
min_max_scaler = MinMaxScaler()
X_train = min_max_scaler.fit_transform(X_train)
X_test = min_max_scaler.transform(X_test)

# Kept so existing code that reads `X_minmax` keeps working: the full matrix
# transformed with the train-fitted scaler.
X_minmax = min_max_scaler.transform(X)

# Single source of truth for the benchmark: each entry pairs a display name
# with an unfitted estimator and the hyperparameter grid searched for it.
# Grid keys are plain estimator parameter names; the `model__` pipeline prefix
# is added later, inside `evaluate_model`.
# NOTE(review): the RidgeCV grid hands a bare float to `alphas`, which is
# normally given as a list of values — confirm the installed scikit-learn
# version accepts a scalar there.
_MODEL_SPECS = [
    ("ExtraTreesRegressor", ExtraTreesRegressor(), {"n_estimators": [1]}),
    ("LassoLars", LassoLars(), {"alpha": [0.001]}),
    ("RidgeCV", RidgeCV(), {"alphas": [0.001]}),
    ("LinearRegression", LinearRegression(), {}),
    ("SVR", SVR(kernel='linear', gamma='scale'), {"C": [1e-8]}),
    ("KNN", KNeighborsRegressor(), {"n_neighbors": [5], "weights": ['uniform', 'distance']}),
    ("MLP", MLPRegressor(), {"hidden_layer_sizes": [(16, 32, 64)], "alpha": [0.0001]}),
    ("AdaBoostRegressor", AdaBoostRegressor(), {"n_estimators": [5], "learning_rate": [0.1]}),
]

# name -> estimator instance (insertion order is the evaluation order)
models = {spec_name: estimator for spec_name, estimator, _ in _MODEL_SPECS}

# name -> grid of candidate hyperparameter values for that estimator
params = {spec_name: grid_values for spec_name, _, grid_values in _MODEL_SPECS}

def evaluate_model(name, model, params):
    """
    Grid-search one model, score it on the held-out set and plot the result.

    The model is wrapped in a pipeline (with a StandardScaler in front, except
    for "LinearRegression"), tuned with 5-fold GridSearchCV on the module-level
    X_train / y_train, and then evaluated on the module-level X_test / y_test.
    A 2D histogram of true vs. predicted values is shown and saved to
    "soap_<name>_plot.png", so each model keeps its own plot instead of
    overwriting a shared file.

    Args:
    name (str): Name of the model; used in the plot title, the output file
        name and the printed summary.
    model: The unfitted estimator to tune.
    params (dict): Hyperparameter grid for the model, keyed by the estimator's
        own parameter names (the pipeline prefix is added here).

    Returns:
    None
    """
    # Create a pipeline with optional standard scaling: LinearRegression is
    # fitted on the features as they are, every other model is standardized.
    steps = [('model', model)]
    if name != "LinearRegression":
        steps.insert(0, ('scl', StandardScaler()))
    pipe = Pipeline(steps=steps)

    # Perform grid search; parameters must be addressed through the pipeline
    # step name, hence the `model__` prefix.
    param_grid = {f'model__{key}': value for key, value in params.items()}
    grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5, n_jobs=-1)
    grid.fit(X_train, y_train)

    # Predict and evaluate the model on the held-out set
    y_pred = grid.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Plot the results. Both axes share one fixed window; points outside it
    # are not drawn by hist2d.
    axis_min, axis_max = 100, 800
    fig = plt.figure(figsize=(8, 6))
    plt.hist2d(
        y_test,
        y_pred,
        bins=(200, 200),
        cmap=plt.cm.jet,
        range=[[axis_min, axis_max], [axis_min, axis_max]],
    )
    plt.colorbar(label='Density')
    plt.title(f'SOAP - Average - {name}')
    plt.ylim(axis_min, axis_max)
    plt.xlim(axis_min, axis_max)
    # Diagonal marks perfect predictions (predicted == true)
    plt.plot([axis_min, axis_max], [axis_min, axis_max], color='white')
    # One file per model: a single shared file name would keep only the plot
    # of whichever model happened to run last.
    plt.savefig(f"soap_{name}_plot.png")
    plt.show()
    # Release the figure so repeated calls do not pile up open figures
    plt.close(fig)

    # Print the results
    print(f"{name} Best Params: {grid.best_params_}, MAE: {mae}, R^2: {r2}")

# Tune, score and plot every configured estimator, in dictionary order
for name in models:
    evaluate_model(name, models[name], params[name])

# Report the wall-clock time elapsed since `start` was recorded
end = time.time()
print(f"Time taken: {end - start} seconds")