### Determining the optimal number of hidden layers and neurons for an Artificial Neural Network (ANN) 
Choosing the number of hidden layers and neurons can be challenging and often requires experimentation. However, there are some guidelines and methods that can help you make an informed decision:

- Start Simple: Begin with a simple architecture and gradually increase complexity if needed.
- Grid Search/Random Search: Use grid search or random search to try different architectures.
- Cross-Validation: Use cross-validation to evaluate the performance of different architectures.
- Heuristics and Rules of Thumb: Some heuristics and empirical rules can provide starting points, such as:
  - The number of neurons in a hidden layer should be between the size of the input layer and the size of the output layer.
  - A common practice is to start with 1-2 hidden layers.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from scikeras.wrappers import KerasClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Load and prepare data
df = pd.read_csv("Churn_Modelling.csv")
# These three columns are removed so they are not used as model inputs.
df.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1, inplace=True)

# Separate features and target
X = df.drop("Exited", axis=1)
y = df["Exited"]

# Define feature types: the two listed columns are one-hot encoded, every
# remaining numeric column is standardised.
categorical_features = ['Geography', 'Gender']
numeric_features = X.select_dtypes(include=[np.number]).columns.difference(categorical_features)

# Create preprocessor
# Keras needs a dense array. The OneHotEncoder `sparse=False` keyword was
# deprecated in scikit-learn 1.2 and removed in 1.4 (renamed `sparse_output`),
# so the dense output is requested on the ColumnTransformer instead:
# `sparse_threshold=0` always returns a dense array and is accepted by both
# old and new scikit-learn releases.
preprocessor = ColumnTransformer(
    [
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ],
    sparse_threshold=0,
)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit preprocessor and get number of features after transformation
# (one-hot encoding changes the column count, so it is measured, not assumed)
preprocessor.fit(X_train)
X_train_transformed = preprocessor.transform(X_train)
n_features = X_train_transformed.shape[1]

print(f"Original features: {X_train.shape[1]}")
print(f"Features after preprocessing: {n_features}")

# Define model builder; the input width comes from scikeras metadata when it
# is supplied, otherwise from n_features in the outer scope
def build_model(neurons=32, layers=1, meta=None):
    """Build and compile a fully connected binary classifier.

    Args:
        neurons: Number of units in every hidden layer.
        layers: Number of hidden layers. Values below 1 still produce one
            hidden layer, because the first hidden layer is always added.
        meta: Optional mapping of fit-time metadata. scikeras passes it
            automatically to builders that declare a ``meta`` parameter;
            its ``"n_features_in_"`` entry is used as the input width.
            When it is omitted (e.g. a direct call), the module-level
            ``n_features`` is used instead.

    Returns:
        A compiled ``Sequential`` model ending in one sigmoid unit.
    """
    # Prefer the width reported for the data actually being fitted, so the
    # builder does not depend on a separately fitted preprocessor.
    if meta is not None and "n_features_in_" in meta:
        input_dim = meta["n_features_in_"]
    else:
        input_dim = n_features

    model = Sequential()
    model.add(Dense(neurons, activation='relu', input_shape=(input_dim,)))
    # The first hidden layer is added above, so only layers - 1 remain.
    for _ in range(layers - 1):
        model.add(Dense(neurons, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Expose the Keras builder through the scikit-learn estimator API.
# Per the original note, y is already binary 0/1, so no LabelEncoder is set up.
model = KerasClassifier(model=build_model, verbose=0)

# Chain preprocessing and the classifier into a single estimator
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', model),
])

# Search space for GridSearchCV. The `classifier__` prefix addresses the
# pipeline step; `model__` entries are forwarded to build_model.
param_grid = dict(
    classifier__model__neurons=[32, 64],
    classifier__model__layers=[1, 2],
    classifier__epochs=[10, 20],
    classifier__batch_size=[32],
)

print("Starting Grid Search...")
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=3,
    verbose=1,
    n_jobs=1,
)
grid_search.fit(X_train, y_train)

# Report the cross-validated winner
print("Best Score: ", grid_search.best_score_)
print("Best Params: ", grid_search.best_params_)

# Score the refitted best pipeline on the held-out split
best_model = grid_search.best_estimator_
test_score = best_model.score(X_test, y_test)
print(f"Test Score: {test_score}")

# Show a sample of the held-out predictions
y_pred = best_model.predict(X_test)
print(f"Predictions shape: {y_pred.shape}")
print(f"First 10 predictions: {y_pred[:10]}")
