In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
import numpy as np

# Load the tokenized and vectorized datasets and labels from Google Drive
X_train_vectorized = np.load('/content/drive/MyDrive/X_train_vectorized.npy')
y_train = np.load('/content/drive/MyDrive/y_train.npy')

X_val_vectorized = np.load('/content/drive/MyDrive/X_val_vectorized.npy')
y_val = np.load('/content/drive/MyDrive/y_val.npy')

X_test_vectorized = np.load('/content/drive/MyDrive/X_test_vectorized.npy')
y_test = np.load('/content/drive/MyDrive/y_test.npy')

print("Datasets loaded successfully from Google Drive.")

# Check the shapes of the reloaded datasets
print("X_train_vectorized shape:", X_train_vectorized.shape)
print("y_train shape:", y_train.shape)
print("X_val_vectorized shape:", X_val_vectorized.shape)
print("y_val shape:", y_val.shape)
print("X_test_vectorized shape:", X_test_vectorized.shape)
print("y_test shape:", y_test.shape)

Datasets loaded successfully from Google Drive.
X_train_vectorized shape: (31608, 500)
y_train shape: (31608,)
X_val_vectorized shape: (10536, 500)
y_val shape: (10536,)
X_test_vectorized shape: (10537, 500)
y_test shape: (10537,)


In [10]:
#Convert labels to either 0 (normal) or 1 (abnormal) for binary model
import numpy as np

# Map the labels into two categories: 'normal' and 'abnormal'
y_train_binary = np.where(y_train == 3, 0, 1)  # 0 for 'Normal', 1 for 'Abnormal'
y_val_binary = np.where(y_val == 3, 0, 1)
y_test_binary = np.where(y_test == 3, 0, 1)

# Verify the transformation
print("Binary labels distribution:")
print(f"Training set - Normal: {np.sum(y_train_binary == 0)}, Abnormal: {np.sum(y_train_binary == 1)}")
print(f"Validation set - Normal: {np.sum(y_val_binary == 0)}, Abnormal: {np.sum(y_val_binary == 1)}")

Binary labels distribution:
Training set - Normal: 9697, Abnormal: 21911
Validation set - Normal: 3297, Abnormal: 7239


In [11]:
#Simple baseline binary model that predicts the majority class (abnormal)

from sklearn.metrics import accuracy_score, classification_report

# Set the baseline class to "Abnormal" (class 1)
baseline_class = 1

# Predict the baseline class for all datasets
y_train_pred = np.full_like(y_train_binary, baseline_class)
y_val_pred = np.full_like(y_val_binary, baseline_class)
y_test_pred = np.full_like(y_test_binary, baseline_class)

# Evaluate the baseline model
train_accuracy = accuracy_score(y_train_binary, y_train_pred)
val_accuracy = accuracy_score(y_val_binary, y_val_pred)
test_accuracy = accuracy_score(y_test_binary, y_test_pred)

print("Baseline Model Performance:")
print(f"Training Accuracy: {train_accuracy:.2f}")
print(f"Validation Accuracy: {val_accuracy:.2f}")
print(f"Test Accuracy: {test_accuracy:.2f}")

# Detailed classification report for the test set
print("\nClassification Report (Test Set):")
print(classification_report(y_test_binary, y_test_pred, target_names=['Normal', 'Abnormal']))


Baseline Model Performance:
Training Accuracy: 0.69
Validation Accuracy: 0.69
Test Accuracy: 0.68

Classification Report (Test Set):
              precision    recall  f1-score   support

      Normal       0.00      0.00      0.00      3349
    Abnormal       0.68      1.00      0.81      7188

    accuracy                           0.68     10537
   macro avg       0.34      0.50      0.41     10537
weighted avg       0.47      0.68      0.55     10537



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [12]:
#Improved model for version with binary data

import tensorflow as tf
from tensorflow.keras import layers

# Parameters
max_features = 10000  # Vocabulary size
sequence_length = 500  # Input sequence length
embedding_sizes = [8, 16, 32, 64]  # Experiment with these embedding sizes

# Function to build a binary classification model
def build_model(embedding_dim):
    model = tf.keras.Sequential([
        layers.Embedding(input_dim=max_features, output_dim=embedding_dim, input_length=sequence_length),
        layers.Dropout(0.2),
        layers.GlobalAveragePooling1D(),
        layers.Dropout(0.2),
        layers.Dense(1, activation='sigmoid')  # Output layer for binary classification
    ])
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    return model

# Loop to train models with different embedding sizes
for embedding_dim in embedding_sizes:
    print(f"Training model with embedding size {embedding_dim}...")
    model = build_model(embedding_dim)

    # Explicitly build the model
    model.build(input_shape=(None, sequence_length))
    model.summary()  # Verify the model structure

    # Debug: Check input data shapes and types
    print("X_train_vectorized shape:", X_train_vectorized.shape)
    print("y_train shape:", y_train_binary.shape)  # Use binary labels
    print("X_val_vectorized shape:", X_val_vectorized.shape)
    print("y_val shape:", y_val_binary.shape)  # Use binary labels
    print("X_train_vectorized dtype:", X_train_vectorized.dtype)

    # Ensure inputs are integer type
    X_train_vectorized = tf.cast(X_train_vectorized, tf.int32)
    X_val_vectorized = tf.cast(X_val_vectorized, tf.int32)

    # Train the model
    history = model.fit(
        X_train_vectorized, y_train_binary,  # Use binary labels
        validation_data=(X_val_vectorized, y_val_binary),  # Use binary labels
        epochs=5,
        batch_size=64,
        verbose=1
    )

    # Evaluate the model
    val_loss, val_accuracy = model.evaluate(X_val_vectorized, y_val_binary, verbose=0)
    print(f"Embedding size {embedding_dim}: Validation Accuracy = {val_accuracy:.4f}")


Training model with embedding size 8...




X_train_vectorized shape: (31608, 500)
y_train shape: (31608,)
X_val_vectorized shape: (10536, 500)
y_val shape: (10536,)
X_train_vectorized dtype: <dtype: 'int32'>
Epoch 1/5
[1m494/494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 13ms/step - accuracy: 0.6917 - loss: 0.6213 - val_accuracy: 0.6871 - val_loss: 0.5400
Epoch 2/5
[1m494/494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 14ms/step - accuracy: 0.7045 - loss: 0.5109 - val_accuracy: 0.6871 - val_loss: 0.4430
Epoch 3/5
[1m494/494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 19ms/step - accuracy: 0.8262 - loss: 0.4200 - val_accuracy: 0.8893 - val_loss: 0.3777
Epoch 4/5
[1m494/494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 16ms/step - accuracy: 0.8689 - loss: 0.3704 - val_accuracy: 0.8892 - val_loss: 0.3479
Epoch 5/5
[1m494/494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 19ms/step - accuracy: 0.8722 - loss: 0.3418 - val_accuracy: 0.8906 - val_loss: 0.3199
Embedding size 8: Vali

X_train_vectorized shape: (31608, 500)
y_train shape: (31608,)
X_val_vectorized shape: (10536, 500)
y_val shape: (10536,)
X_train_vectorized dtype: <dtype: 'int32'>
Epoch 1/5
[1m494/494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 25ms/step - accuracy: 0.6899 - loss: 0.6092 - val_accuracy: 0.6871 - val_loss: 0.4926
Epoch 2/5
[1m494/494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 23ms/step - accuracy: 0.7574 - loss: 0.4600 - val_accuracy: 0.8909 - val_loss: 0.3866
Epoch 3/5
[1m494/494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 22ms/step - accuracy: 0.8587 - loss: 0.3761 - val_accuracy: 0.8539 - val_loss: 0.3445
Epoch 4/5
[1m494/494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 22ms/step - accuracy: 0.8765 - loss: 0.3326 - val_accuracy: 0.8824 - val_loss: 0.3110
Epoch 5/5
[1m494/494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 17ms/step - accuracy: 0.8849 - loss: 0.3082 - val_accuracy: 0.8659 - val_loss: 0.3009
Embedding size 16: Va

X_train_vectorized shape: (31608, 500)
y_train shape: (31608,)
X_val_vectorized shape: (10536, 500)
y_val shape: (10536,)
X_train_vectorized dtype: <dtype: 'int32'>
Epoch 1/5
[1m494/494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 29ms/step - accuracy: 0.6957 - loss: 0.5960 - val_accuracy: 0.8595 - val_loss: 0.4491
Epoch 2/5
[1m494/494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 27ms/step - accuracy: 0.8055 - loss: 0.4196 - val_accuracy: 0.8586 - val_loss: 0.3569
Epoch 3/5
[1m494/494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 28ms/step - accuracy: 0.8650 - loss: 0.3429 - val_accuracy: 0.8965 - val_loss: 0.3153
Epoch 4/5
[1m494/494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 30ms/step - accuracy: 0.8809 - loss: 0.3125 - val_accuracy: 0.9021 - val_loss: 0.3032
Epoch 5/5
[1m494/494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 30ms/step - accuracy: 0.8829 - loss: 0.2939 - val_accuracy: 0.8776 - val_loss: 0.2833
Embedding size 32: V

X_train_vectorized shape: (31608, 500)
y_train shape: (31608,)
X_val_vectorized shape: (10536, 500)
y_val shape: (10536,)
X_train_vectorized dtype: <dtype: 'int32'>
Epoch 1/5
[1m494/494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 51ms/step - accuracy: 0.7064 - loss: 0.5770 - val_accuracy: 0.8460 - val_loss: 0.4079
Epoch 2/5
[1m494/494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 48ms/step - accuracy: 0.8351 - loss: 0.3868 - val_accuracy: 0.8881 - val_loss: 0.3273
Epoch 3/5
[1m494/494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 52ms/step - accuracy: 0.8700 - loss: 0.3261 - val_accuracy: 0.8301 - val_loss: 0.3401
Epoch 4/5
[1m494/494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 48ms/step - accuracy: 0.8759 - loss: 0.3035 - val_accuracy: 0.8476 - val_loss: 0.3119
Epoch 5/5
[1m494/494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 50ms/step - accuracy: 0.8837 - loss: 0.2902 - val_accuracy: 0.9076 - val_loss: 0.2778
Embedding size 64: V

In [1]:

# Load the sparse datasets from Google Drive
from scipy.sparse import load_npz
import numpy as np

# Load sparse matrices
X_train_bow = load_npz('/content/drive/MyDrive/X_train_bow.npz')
X_val_bow = load_npz('/content/drive/MyDrive/X_val_bow.npz')
X_test_bow = load_npz('/content/drive/MyDrive/X_test_bow.npz')



print("Datasets loaded successfully from Google Drive.")

# Check the shapes of the reloaded datasets
print("X_train_vectorized shape:", X_train_bow.shape)

print("X_val_vectorized shape:", X_val_bow.shape)

print("X_test_vectorized shape:", X_test_bow.shape)


Datasets loaded successfully from Google Drive.
X_train_vectorized shape: (31608, 48655)
X_val_vectorized shape: (10536, 48655)
X_test_vectorized shape: (10537, 48655)


In [4]:
#Check data types

# Function to inspect dataset types
def inspect_data(name, data):
    print(f"{name} Type: {type(data)}")

# Inspect all datasets
inspect_data("X_train_vectorized", X_train_vectorized)
inspect_data("y_train", y_train)
inspect_data("X_val_vectorized", X_val_vectorized)
inspect_data("y_val", y_val)
inspect_data("X_test_vectorized", X_test_vectorized)
inspect_data("y_test", y_test)


X_train_vectorized Type: <class 'numpy.ndarray'>
y_train Type: <class 'numpy.ndarray'>
X_val_vectorized Type: <class 'numpy.ndarray'>
y_val Type: <class 'numpy.ndarray'>
X_test_vectorized Type: <class 'numpy.ndarray'>
y_test Type: <class 'numpy.ndarray'>


In [5]:
#Check data types for bag-of-words model

# Function to inspect dataset types
def inspect_data(name, data):
    print(f"{name} Type: {type(data)}")

# Inspect all datasets
inspect_data("X_train_bow", X_train_vectorized)
inspect_data("X_val_bow", X_val_vectorized)
inspect_data("X_test_bow", X_test_vectorized)


X_train_bow Type: <class 'numpy.ndarray'>
X_val_bow Type: <class 'numpy.ndarray'>
X_test_bow Type: <class 'numpy.ndarray'>


In [6]:
#Create a baseline bag of words model
#Use weighted F1 score to gauge accuracy, since we have an imbalanced dataset

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report


#Train the model
model = LogisticRegression(max_iter=1000)  # Increase max_iter for convergence on large datasets
model.fit(X_train_bow, y_train)

#Validate the model
y_val_pred = model.predict(X_val_bow)
val_f1 = f1_score(y_val, y_val_pred, average='weighted')

print("Validation Weighted F1 Score:", val_f1)
print("Validation Classification Report:\n", classification_report(y_val, y_val_pred))

#Test the model
y_test_pred = model.predict(X_test_bow)
test_f1 = f1_score(y_test, y_test_pred, average='weighted')

print("Test Weighted F1 Score:", test_f1)
print("Test Classification Report:\n", classification_report(y_test, y_test_pred))



Validation Weighted F1 Score: 0.7414861942483449
Validation Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.73      0.74       726
           1       0.83      0.71      0.77       554
           2       0.70      0.68      0.69      3098
           3       0.87      0.95      0.91      3297
           4       0.74      0.56      0.64       228
           5       0.57      0.49      0.52       544
           6       0.62      0.62      0.62      2089

    accuracy                           0.75     10536
   macro avg       0.73      0.68      0.70     10536
weighted avg       0.74      0.75      0.74     10536

Test Weighted F1 Score: 0.7388481841661599
Test Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.74      0.76       776
           1       0.83      0.67      0.74       537
           2       0.68      0.68      0.68      3043
           3       0.86      0

In [8]:
import tensorflow as tf
from tensorflow.keras import layers

# Parameters
max_features = 10000  # Vocabulary size
sequence_length = 500  # Input sequence length
num_classes = 7  # Number of output classes
embedding_sizes = [8, 16, 32, 64]  # Experiment with these embedding sizes

# Function to build a multi-class classification model
def build_model(embedding_dim):
    model = tf.keras.Sequential([
        layers.Embedding(input_dim=max_features, output_dim=embedding_dim, input_length=sequence_length),
        layers.Dropout(0.2),
        layers.GlobalAveragePooling1D(),
        layers.Dropout(0.2),
        layers.Dense(num_classes, activation='softmax')  # Output layer for multi-class classification
    ])
    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    return model

# Loop to train models with different embedding sizes
for embedding_dim in embedding_sizes:
    print(f"Training model with embedding size {embedding_dim}...")
    model = build_model(embedding_dim)

    # Explicitly build the model
    model.build(input_shape=(None, sequence_length))
    model.summary()  # Verify the model structure

    # Debug: Check input data shapes and types
    print("X_train_vectorized shape:", X_train_vectorized.shape)
    print("y_train shape:", y_train.shape)
    print("X_val_vectorized shape:", X_val_vectorized.shape)
    print("y_val shape:", y_val.shape)
    print("X_train_vectorized dtype:", X_train_vectorized.dtype)

    # Ensure inputs are integer type
    X_train_vectorized = tf.cast(X_train_vectorized, tf.int32)
    X_val_vectorized = tf.cast(X_val_vectorized, tf.int32)

    # Train the model
    history = model.fit(
        X_train_vectorized, y_train,  # Replace with your vectorized input and labels
        validation_data=(X_val_vectorized, y_val),
        epochs=5,
        batch_size=64,
        verbose=1
    )

    # Evaluate the model
    val_loss, val_accuracy = model.evaluate(X_val_vectorized, y_val, verbose=0)
    print(f"Embedding size {embedding_dim}: Validation Accuracy = {val_accuracy:.4f}")



Training model with embedding size 8...




X_train_vectorized shape: (31608, 500)
y_train shape: (31608,)
X_val_vectorized shape: (10536, 500)
y_val shape: (10536,)
X_train_vectorized dtype: int64
Epoch 1/5
[1m494/494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 17ms/step - accuracy: 0.3241 - loss: 1.6923 - val_accuracy: 0.4335 - val_loss: 1.5698
Epoch 2/5
[1m494/494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 12ms/step - accuracy: 0.4014 - loss: 1.5767 - val_accuracy: 0.4675 - val_loss: 1.5109
Epoch 3/5
[1m494/494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 16ms/step - accuracy: 0.4492 - loss: 1.5184 - val_accuracy: 0.4797 - val_loss: 1.4522
Epoch 4/5
[1m494/494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 12ms/step - accuracy: 0.4734 - loss: 1.4638 - val_accuracy: 0.5111 - val_loss: 1.4057
Epoch 5/5
[1m494/494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 15ms/step - accuracy: 0.4909 - loss: 1.4122 - val_accuracy: 0.5228 - val_loss: 1.3759
Embedding size 8: Validation Accur

X_train_vectorized shape: (31608, 500)
y_train shape: (31608,)
X_val_vectorized shape: (10536, 500)
y_val shape: (10536,)
X_train_vectorized dtype: <dtype: 'int32'>
Epoch 1/5
[1m494/494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 21ms/step - accuracy: 0.3334 - loss: 1.6368 - val_accuracy: 0.4450 - val_loss: 1.5313
Epoch 2/5
[1m494/494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 16ms/step - accuracy: 0.4481 - loss: 1.5335 - val_accuracy: 0.4951 - val_loss: 1.4347
Epoch 3/5
[1m494/494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 20ms/step - accuracy: 0.4861 - loss: 1.4339 - val_accuracy: 0.4885 - val_loss: 1.3708
Epoch 4/5
[1m494/494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 21ms/step - accuracy: 0.4978 - loss: 1.3824 - val_accuracy: 0.5168 - val_loss: 1.3210
Epoch 5/5
[1m494/494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 17ms/step - accuracy: 0.5072 - loss: 1.3324 - val_accuracy: 0.5158 - val_loss: 1.2890
Embedding size 16: Va

X_train_vectorized shape: (31608, 500)
y_train shape: (31608,)
X_val_vectorized shape: (10536, 500)
y_val shape: (10536,)
X_train_vectorized dtype: <dtype: 'int32'>
Epoch 1/5
[1m494/494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 32ms/step - accuracy: 0.3489 - loss: 1.6219 - val_accuracy: 0.4658 - val_loss: 1.5070
Epoch 2/5
[1m494/494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 30ms/step - accuracy: 0.4652 - loss: 1.4916 - val_accuracy: 0.5232 - val_loss: 1.4135
Epoch 3/5
[1m494/494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 30ms/step - accuracy: 0.4969 - loss: 1.4015 - val_accuracy: 0.5112 - val_loss: 1.3291
Epoch 4/5
[1m494/494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 31ms/step - accuracy: 0.5145 - loss: 1.3274 - val_accuracy: 0.5219 - val_loss: 1.2651
Epoch 5/5
[1m494/494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 31ms/step - accuracy: 0.5285 - loss: 1.2652 - val_accuracy: 0.5633 - val_loss: 1.2224
Embedding size 32: V

X_train_vectorized shape: (31608, 500)
y_train shape: (31608,)
X_val_vectorized shape: (10536, 500)
y_val shape: (10536,)
X_train_vectorized dtype: <dtype: 'int32'>
Epoch 1/5
[1m494/494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 51ms/step - accuracy: 0.3510 - loss: 1.6140 - val_accuracy: 0.4589 - val_loss: 1.4664
Epoch 2/5
[1m494/494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 47ms/step - accuracy: 0.4796 - loss: 1.4547 - val_accuracy: 0.5085 - val_loss: 1.3508
Epoch 3/5
[1m494/494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 48ms/step - accuracy: 0.5064 - loss: 1.3398 - val_accuracy: 0.5172 - val_loss: 1.2661
Epoch 4/5
[1m494/494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 53ms/step - accuracy: 0.5260 - loss: 1.2666 - val_accuracy: 0.5539 - val_loss: 1.2140
Epoch 5/5
[1m494/494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 55ms/step - accuracy: 0.5593 - loss: 1.1896 - val_accuracy: 0.5896 - val_loss: 1.1412
Embedding size 64: V

In [9]:
#Use Keras tuner to experiment with different numbers of hidden layers, embedding sizes, learning rates
!pip install keras-tuner
import keras_tuner as kt
from tensorflow.keras import layers

# Parameters
max_features = 10000  # Vocabulary size
sequence_length = 500  # Input sequence length
num_classes = 7  # Number of output classes

# Define the model builder function
def model_builder(hp):
    model = tf.keras.Sequential()

    # Embedding layer with reduced search space for dimensions
    model.add(
        layers.Embedding(
            input_dim=max_features,
            output_dim=hp.Choice('embedding_dim', [128, 256]),
            input_length=sequence_length
        )
    )

    # Add Convolutional Layers with reduced options
    for i in range(hp.Int('num_conv_layers', 0, 1)):  # Up to 1 Conv layer
        model.add(layers.Conv1D(
            filters=hp.Choice(f'filters_{i}', [32, 64]),
            kernel_size=hp.Choice(f'kernel_size_{i}', [3]),  # Single kernel size option
            activation='relu'
        ))
        model.add(layers.MaxPooling1D(pool_size=2))

    model.add(layers.GlobalAveragePooling1D())

    # Add Dense Layers with reduced options
    for i in range(hp.Int('num_dense_layers', 1, 2)):  # Up to 2 Dense layers
        model.add(layers.Dense(
            units=hp.Choice(f'units_{i}', [64, 128]),
            activation='relu'
        ))

    # Output layer
    model.add(layers.Dense(num_classes, activation='softmax'))

    # Compile the model with fewer learning rate options
    model.compile(
        optimizer=tf.keras.optimizers.Adam(
            hp.Choice('learning_rate', [1e-3, 1e-4])
        ),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    return model

# Initialize the Keras Tuner with fewer trials
tuner = kt.RandomSearch(
    model_builder,
    objective='val_accuracy',  # Optimize for validation accuracy
    max_trials=5,
    executions_per_trial=1,  # Number of executions per trial
    directory='my_dir',  # Directory to save results
    project_name='tuning_embedding_and_layers2'
)

# Start the search with fewer epochs
tuner.search(
    X_train_vectorized, y_train,
    validation_data=(X_val_vectorized, y_val),
    epochs=3,  # Fewer epochs
    batch_size=64
)

# Retrieve and print the best hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
print(f"Best embedding size: {best_hps.get('embedding_dim')}")
print(f"Best number of convolutional layers: {best_hps.get('num_conv_layers')}")
print(f"Best number of dense layers: {best_hps.get('num_dense_layers')}")
print(f"Best learning rate: {best_hps.get('learning_rate')}")

# Build and train the best model
best_model = tuner.hypermodel.build(best_hps)
history = best_model.fit(
    X_train_vectorized, y_train,
    validation_data=(X_val_vectorized, y_val),
    epochs=5,  # Train the best model for more epochs
    batch_size=64
)




Trial 5 Complete [00h 07m 09s]
val_accuracy: 0.5467919707298279

Best val_accuracy So Far: 0.7189635634422302
Total elapsed time: 00h 41m 28s
Best embedding size: 256
Best number of convolutional layers: 1
Best number of dense layers: 1
Best learning rate: 0.001
Epoch 1/5
[1m494/494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m203s[0m 407ms/step - accuracy: 0.4770 - loss: 1.3677 - val_accuracy: 0.6526 - val_loss: 0.9374
Epoch 2/5
[1m494/494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m200s[0m 403ms/step - accuracy: 0.6640 - loss: 0.8717 - val_accuracy: 0.7076 - val_loss: 0.8026
Epoch 3/5
[1m494/494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m197s[0m 394ms/step - accuracy: 0.7335 - loss: 0.7115 - val_accuracy: 0.7199 - val_loss: 0.7539
Epoch 4/5
[1m494/494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m201s[0m 393ms/step - accuracy: 0.7795 - loss: 0.5984 - val_accuracy: 0.7401 - val_loss: 0.7058
Epoch 5/5
[1m494/494[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m199s[0m