In [7]:
import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
import umap

In [8]:
# Function definition to create a CNN model
def create_cnn_model(N, n_features, n_classes, dropout_rate):
    model = Sequential()
    model.add(Conv1D(N, kernel_size=3, activation='relu', input_shape=(n_features, 1)))
    model.add(Dropout(dropout_rate))
    model.add(BatchNormalization())
    model.add(MaxPooling1D(pool_size=(N+1)/2))
    model.add(Conv1D(2*N, kernel_size=3, activation='relu'))
    model.add(Dropout(dropout_rate))
    model.add(BatchNormalization())
    model.add(MaxPooling1D(pool_size=(2*N+1)/2))
    model.add(Flatten())
    model.add(Dense(3*N, activation='relu'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(n_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])
    return model

In [9]:
# Load the training data
train_data = pd.read_csv("/kaggle/input/swc-dataset/train_data_swc.csv")

# Extract the features (X) and target labels (y) from the training data
X = train_data.drop("y", axis=1)
y = train_data["y"]

# Load the test data
X_test = pd.read_csv("/kaggle/input/swc-dataset/test_data_swc.csv")

In [10]:
# Split the training data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Standardize the data using the training data's statistics
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

In [11]:
n_classes = len(y.unique())     # Number of classes: 9
n_features = X_train.shape[1]   # Number of features: 108

In [12]:
# Define the parameter grid for hyperparameter tuning
N = 40
dropout_rate = 0.3

params_grid_pipeline = {
    "umap__n_components": [N],
    "umap__n_neighbors": [5, 10, 20],
    "umap__min_dist": [0.1, 0.2],
}

# Create a KerasClassifier with UMAP-transformed features
umap_model = umap.UMAP()
cnn_model = KerasClassifier(build_fn=lambda: create_cnn_model(N, n_features, n_classes, dropout_rate), verbose=0)
pipeline = Pipeline([("umap", umap_model),
                     # ("reshape", FunctionTransformer(lambda x: np.reshape(x, (x.shape[0], -1, 1)))),
                     ("cnn", cnn_model)])
grid_search = GridSearchCV(estimator=pipeline, param_grid=params_grid_pipeline, cv=StratifiedKFold(n_splits=3, shuffle=True), verbose=2, n_jobs=-1)

# Fit the GridSearchCV to find the best hyperparameters
grid_search.fit(X_train, y_train)

# Access the best hyperparameters
best_params = grid_search.best_params_

Fitting 3 folds for each of 6 candidates, totalling 18 fits


  cnn_model = KerasClassifier(build_fn=lambda: create_cnn_model(N, n_features, n_classes, dropout_rate), verbose=0)
  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


[CV] END umap__min_dist=0.1, umap__n_components=40, umap__n_neighbors=5; total time= 1.9min


2023-10-19 16:48:14.357857: F tensorflow/tsl/platform/statusor.cc:33] Attempting to fetch value instead of handling error INTERNAL: failed initializing StreamExecutor for CUDA device ordinal 0: INTERNAL: failed call to cuDevicePrimaryCtxRetain: CUDA_ERROR_OUT_OF_MEMORY: out of memory; total memory reported: 17071734784


[CV] END umap__min_dist=0.1, umap__n_components=40, umap__n_neighbors=5; total time= 2.0min


TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

The exit codes of the workers are {SIGABRT(-6)}

In [None]:
# Apply UMAP to reduce dimensionality
umap_model = umap.UMAP(n_components=best_params['umap__n_components'],
                       n_neighbors=best_params['umap__n_neighbors'], min_dist=best_params['umap__min_dist'])
X_train = umap_model.fit_transform(X_train)
X_val = umap_model.transform(X_val)
X_test = umap_model.transform(X_test)

In [None]:
# Create the final CNN model with UMAP-transformed features
final_model = create_cnn_model(N, best_params['umap__n_components'], n_classes, dropout_rate)

# Train the final model on the UMAP-transformed data
final_model.fit(X_train, y_train, epochs=10, batch_size=32)

In [None]:
# Create base models (CNNs) as scikit-learn estimators
base_models = [
    ("cnn1", KerasClassifier(build_fn=lambda: create_cnn_model(N, best_params['umap__n_components'], n_classes, dropout_rate),
                             epochs=10, batch_size=32))
]

# Define the stacking ensemble model
stacking_model = StackingClassifier(estimators=base_models, final_estimator=LGBMClassifier(n_estimators=30, n_jobs=-1, force_col_wise=True))

# Fit the optimized model on the scaled training data
stacking_model.fit(X_train, y_train)

In [None]:
# Save the optimized model to a file
# model_filename = "stacking_model.pkl"
# joblib.dump(stacking_model, model_filename)

In [None]:
# Make predictions on the scaled validation set using the optimized model
val_predictions = stacking_model.predict(X_val)

# Calculate prediction probabilities for validation predictions
val_proba = stacking_model.predict_proba(X_val)

# Clip predicted probabilities to avoid extremes of the log function
val_proba = np.clip(val_proba, a_min=1e-15, a_max=1 - 1e-15)

# Calculate log loss for validation predictions
val_log_loss = log_loss(y_val, val_proba)
print(f"Validation Log Loss: {val_log_loss:.4f}")

In [None]:
# Make predictions on the scaled test data using the optimized model
test_predictions = stacking_model.predict(X_test)

# Calculate prediction probabilities for test predictions
test_proba = stacking_model.predict_proba(X_test)

# Clip predicted probabilities to avoid extremes of the log function
test_proba = np.clip(test_proba, a_min=1e-15, a_max=1 - 1e-15)

# Save the test predictions to a CSV file
submission_df = pd.DataFrame(test_proba, columns=[f"c{i}" for i in range(1, n_classes + 1)])
submission_df.to_csv("test_predictions.csv", index=False)