In [8]:
!pip install xgboost

import tensorflow as tf

# Check if the GPU is available and output its name
if tf.test.gpu_device_name():
    print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))
else:
    print("Please install GPU version of TF")


Default GPU Device: /device:GPU:0


In [45]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv1D, MaxPooling1D, Flatten, Dense, Dropout, concatenate
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
# from tensorflow.keras.utils import to_categorical

import numpy as np

import xgboost as xgb
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import accuracy_score

# Confirm that GPU is being used
print("TensorFlow version:", tf.__version__)
print(tf.config.list_physical_devices('GPU'))

TensorFlow version: 2.15.0
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [41]:
def build_cnn(input_shape):
    # Input layer
    inputs = Input(shape=input_shape)

    # Convolutional layers with different kernel sizes
    conv_layers = []
    kernel_sizes = [1, 3, 5, 9, 15, 21]
    for size in kernel_sizes:
        conv = Conv1D(filters=64, kernel_size=size, activation='relu', padding='same')(inputs)
        conv_layers.append(conv)

    # Concatenate all convolutional layers
    concatenated = concatenate(conv_layers, axis=-1)

    # Final convolutional layer
    final_conv = Conv1D(filters=64, kernel_size=3, activation='relu', padding='same')(concatenated)

    # Max pooling layer
    pooled = MaxPooling1D(pool_size=5)(final_conv)

    # Dropout layer
    dropped = Dropout(rate=0.25)(pooled)

    # Flatten layer
    flatten = Flatten()(dropped)

    # Dense layer
    dense = Dense(64, activation='relu')(flatten)

      # Output layer for binary classification
    output = Dense(1, activation='sigmoid')(dense)

    # Create the model
    model = Model(inputs=inputs, outputs=output)
    return model

In [39]:
import gzip
import pickle


# Loading the compressed pickle file
with gzip.open('cnn_input_data.pkl.gz', 'rb') as f:
    df = pickle.load(f)

print(df)


         ACC                                           Encoding  Label
0     O75439  [[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...      0
1     Q2TBK2  [[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...      0
2     Q5VY80  [[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...      0
3     Q9BZM6  [[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...      0
4     O75489  [[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...      0
...      ...                                                ...    ...
2947  Q96CK0  [[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...      1
2948  Q96CK0  [[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...      1
2949  Q24JY4  [[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...      1
2950  O43257  [[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...      1
2951  Q8R331  [[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...      1

[2951 rows x 3 columns]


In [46]:
X = np.stack(df['Encoding'].values)
y = df['Label'].values.astype('int')  # Ensure labels are integers (0 or 1)

# K-Fold Cross-Validation setup
kf = KFold(n_splits=5, shuffle=True, random_state=50)
fold = 0
accuracies = []

for train_index, test_index in kf.split(X):
    fold += 1
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Build and compile CNN model
    input_shape = (1000, 20)
    cnn_model = build_cnn(input_shape)
    cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Early stopping to prevent overfitting
    early_stopping = EarlyStopping(monitor='val_loss', patience=3)

    # Train CNN model
    history = cnn_model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2, callbacks=[early_stopping])

    # Extract features for XGBoost
    feature_layer_model = Model(inputs=cnn_model.input, outputs=cnn_model.layers[-2].output)
    X_train_features = feature_layer_model.predict(X_train)
    X_test_features = feature_layer_model.predict(X_test)

    # Train XGBoost on extracted features
    xgb_model = xgb.XGBClassifier()
    xgb_model.fit(X_train_features, y_train)

    # Predict using XGBoost
    y_pred_xgb = xgb_model.predict(X_test_features)

    # Calculate and store accuracy
    accuracy = accuracy_score(y_test, y_pred_xgb)
    accuracies.append(accuracy)
    print(f"Fold {fold}, Accuracy: {accuracy:.2%}")

# Average accuracy across all folds
average_accuracy = np.mean(accuracies)
print(f"Average Accuracy: {average_accuracy:.2%}")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Fold 1, Accuracy: 92.72%
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Fold 2, Accuracy: 89.49%
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Fold 3, Accuracy: 92.03%
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Fold 4, Accuracy: 92.71%
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Fold 5, Accuracy: 89.83%
Average Accuracy: 91.36%


In [47]:
# Loading the compressed pickle file
with gzip.open('testing_data.pkl.gz', 'rb') as f:
    test_df = pickle.load(f)

print(test_df)

                                               Encoding  Label
0     [[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...      0
1     [[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...      1
2     [[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...      0
3     [[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...      0
4     [[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...      1
...                                                 ...    ...
2709  [[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...      0
2710  [[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...      0
2711  [[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...      0
2712  [[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...      0
2713  [[-0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0.0, -0...      0

[2714 rows x 2 columns]


In [50]:
X_test = np.stack(test_df['Encoding'].values)
y_test = test_df['Label'].values.astype('int')  # Ensure labels are integers (0 or 1)



X_test_features = feature_layer_model.predict(X_test)

xgb_model.fit(X_train_features, y_train)

# Predict using XGBoost
y_pred_xgb = xgb_model.predict(X_test_features)

# Calculate and store accuracy
accuracy = accuracy_score(y_test, y_pred_xgb)
print(f"Accuracy: {accuracy:.2%}")

Accuracy: 73.62%
