In [1]:
import umap
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from lightgbm import LGBMClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.ensemble import StackingClassifier
from skopt import BayesSearchCV

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


In [2]:
# Function definition to create a CNN model
def create_cnn_model(N, n_features, n_classes, dropout_rate=0.3):
    model = Sequential()
    model.add(Conv1D(N, kernel_size=3, activation='relu', input_shape=(n_features, 1)))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dense(n_classes, activation='softmax'))
    model.add(Dropout(dropout_rate))
    model.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])
    return model

In [3]:
# Load the training data
train_data = pd.read_csv("/kaggle/input/swc-dataset/train_data_swc.csv")
# Extract the features (X) and target labels (y) from the training data
X = train_data.drop("y", axis=1)
y = train_data["y"]

# Load the test data
X_test = pd.read_csv("/kaggle/input/swc-dataset/test_data_swc.csv")

In [4]:
# Standardize the data
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_test = scaler.transform(X_test)

In [5]:
n_classes = len(y.unique())     # Number of classes: 9
assert n_classes == 9

# Encode the target variable
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

In [6]:
# Split the training data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

n_features = X_train.shape[1]   # Number of features: 108
assert n_features == 108

In [7]:
# Create a KerasClassifier for the simplified CNN model
cnn_model = KerasClassifier(build_fn=lambda: create_cnn_model(64, n_features, n_classes), verbose=0)

# Create a Stacking Classifier with LGBM as the final estimator
stacking_model = StackingClassifier(estimators=[('cnn', cnn_model)], final_estimator=LGBMClassifier(n_estimators=30, n_jobs=-1, force_col_wise=True))

  cnn_model = KerasClassifier(build_fn=lambda: create_cnn_model(64, n_features, n_classes), verbose=0)


In [8]:
# Use StratifiedKFold for parameter tuning
cv = StratifiedKFold(n_splits=3, shuffle=True)

# Define hyperparameter search spaces
param_space_cnn = {
    'cnn__N': [32, 128],
    'cnn__dropout_rate': [0.2, 0.5]
}

param_space_stacking = {
    'final_estimator__estimator__n_estimators': [10, 50],
    'final_estimator__estimator__learning_rate': [0.01, 0.2]
}

pipeline = Pipeline([("cnn", cnn_model)])

# Perform hyperparameter tuning with Bayesian optimization for CNN model
opt_cnn = BayesSearchCV(estimator=pipeline, search_spaces=param_space_cnn, cv=cv, n_iter=50, scoring='neg_log_loss', n_jobs=-1, random_state=42)
opt_cnn.fit(X_train, y_train)
best_cnn_params = opt_cnn.best_params_



ValueError: N is not a legal parameter

In [None]:
# Use the best CNN parameters to create the final CNN model
final_cnn_model = create_cnn_model(best_cnn_params['cnn__N'], n_features, n_classes, best_cnn_params['cnn__dropout_rate'])

# Perform hyperparameter tuning with Bayesian optimization for Stacking Classifier
opt_stacking = BayesSearchCV(stacking_model, param_space_stacking, cv=cv, n_iter=50, scoring='neg_log_loss', n_jobs=-1, random_state=42)
opt_stacking.fit(X_train, y_train)
best_stacking_params = opt_stacking.best_params_

In [None]:
# Use the best Stacking Classifier parameters
stacking_model = StackingClassifier(estimators=[('cnn', final_cnn_model)],
                final_estimator=LGBMClassifier(n_estimators=best_stacking_params['final_estimator__estimator__n_estimators'], 
                learning_rate=best_stacking_params['final_estimator__estimator__learning_rate'], n_jobs=-1, force_col_wise=True))

# Fit the optimized model on the scaled training data
stacking_model.fit(X_train, y_train)

In [None]:
# Make predictions on the validation set
val_predictions = stacking_model.predict(X_val)

# Calculate prediction probabilities for validation predictions
val_proba = stacking_model.predict_proba(X_val)

# Clip predicted probabilities to avoid extremes of the log function
val_proba = np.clip(val_proba, a_min=1e-15, a_max=1 - 1e-15)

# Calculate log loss for validation predictions
val_log_loss = log_loss(y_val, val_proba)
print(f"Validation Log Loss: {val_log_loss:.4f}")

In [None]:
# Make predictions on the test data
test_predictions = stacking_model.predict(X_test)

# Calculate prediction probabilities for test predictions
test_proba = stacking_model.predict_proba(X_test)

# Clip predicted probabilities to avoid extremes of the log function
test_proba = np.clip(test_proba, a_min=1e-15, a_max=1 - 1e-15)

# Create a DataFrame for test predictions
submission_df = pd.DataFrame(test_proba, columns=[f"c{i}" for i in range(1, n_classes + 1)])

# Save the test predictions to a CSV file
submission_df.to_csv("test_predictions.csv", index=False)