In [38]:
import numpy as np
import tensorflow as tf
import random
import re

def generate_dataset(size=10000, length=15, alphabet="abcd", pattern_length=5, pattern=None):
    """
    Build a labelled dataset of random strings for substring detection.

    The first half of the samples (``size // 2``) each contain their pattern
    as a literal substring (label 1); the remaining samples are guaranteed
    not to contain it (label 0). Positives come first in the returned lists,
    so callers should shuffle before splitting.

    Args:
        size: total number of samples to generate.
        length: number of characters in every generated string.
        alphabet: characters used for both the strings and the random patterns.
        pattern_length: length of each randomly drawn pattern (ignored when
            ``pattern`` is given).
        pattern: optional fixed literal pattern shared by all samples. When
            None, a fresh random pattern is drawn for every sample.

    Returns:
        (dataset, regexes, labels): three parallel lists holding the
        generated strings, the pattern associated with each string, and
        the 0/1 labels.

    Raises:
        ValueError: if ``alphabet`` is empty, the pattern does not fit into
            ``length`` characters, or non-matching strings cannot exist.
    """
    if not alphabet:
        raise ValueError("alphabet must not be empty")
    if pattern is not None:
        pattern_length = len(pattern)
    if not 1 <= pattern_length <= length:
        raise ValueError("pattern length must be between 1 and `length`")

    num_positive = size // 2  # 50% should contain the pattern
    num_negative = size - num_positive  # the remainder, so odd sizes are honoured

    # With a single symbol every string of that symbol contains every pattern
    # of that symbol, so the rejection loop below would never terminate.
    symbols = set(alphabet)
    if (num_negative > 0 and len(symbols) == 1
            and (pattern is None or set(pattern) <= symbols)):
        raise ValueError("cannot generate non-matching strings from a one-symbol alphabet")

    def _next_pattern():
        # Either the caller's fixed pattern or a fresh random one.
        if pattern is not None:
            return pattern
        return ''.join([random.choice(alphabet) for _ in range(pattern_length)])

    dataset = []
    regexes = []
    labels = []

    # Strings containing the pattern: overwrite a random window with it, which
    # guarantees a match by construction (no retry loop needed).
    for _ in range(num_positive):
        regex_pattern = _next_pattern()
        rand_pos = random.randint(0, length - pattern_length)
        random_chars = [random.choice(alphabet) for _ in range(length)]
        random_chars[rand_pos:rand_pos + pattern_length] = list(regex_pattern)

        dataset.append("".join(random_chars))
        regexes.append(regex_pattern)
        labels.append(1)  # Label 1 for a match

    # Strings NOT containing the pattern: rejection-sample until one misses.
    for _ in range(num_negative):
        regex_pattern = _next_pattern()
        while True:
            random_string = "".join(random.choice(alphabet) for _ in range(length))
            # Literal containment test; the pattern is plain text, so no
            # regex escaping issues arise for unusual alphabets.
            if regex_pattern not in random_string:
                dataset.append(random_string)
                regexes.append(regex_pattern)
                labels.append(0)  # Label 0 for no match
                break

    return dataset, regexes, labels

# Seed Python's RNG so generation and shuffling are repeatable on a fresh
# kernel (Restart & Run All yields the same train/test split every time).
random.seed(42)

dataset, regexes, labels = generate_dataset()

# Combine into a single structure so strings, patterns and labels stay
# aligned while shuffling (generate_dataset returns positives first).
combined_data = list(zip(dataset, regexes, labels))
random.shuffle(combined_data)

# Train/test split (70/30), derived from the actual dataset size with integer
# arithmetic so it stays 70/30 if the dataset size changes (7000 by default).
split_idx = len(combined_data) * 7 // 10
training_data = combined_data[:split_idx]
testing_data = combined_data[split_idx:]

In [39]:
def convert_to_one_hot(data, length=15):
    """
    One-hot encode the strings of a labelled dataset.

    data is an iterable of tuples: (str, regex, label). Only the string and
    the label are used; the regex element is ignored, so the returned
    features do not encode which pattern a label refers to.

    returns:
        X: np.array of shape (num_samples, length, 4), dtype float32
           (shape (0, length, 4) for empty input)
        y: np.array of shape (num_samples,)

    Strings shorter than `length` leave their trailing rows all-zero.
    Strings longer than `length` raise IndexError, and characters outside
    'abcd' raise KeyError.
    """
    char_to_idx = {'a': 0, 'b': 1, 'c': 2, 'd': 3}

    # Materialise once so any iterable is accepted and the sample count is known.
    samples = list(data)

    # Preallocate the whole batch; this also gives the right 3-D shape when
    # there are no samples at all.
    X = np.zeros((len(samples), length, len(char_to_idx)), dtype=np.float32)
    y = []
    for row, (s, _regex, label) in enumerate(samples):
        for i, ch in enumerate(s):
            X[row, i, char_to_idx[ch]] = 1.0
        y.append(label)

    return X, np.array(y)

In [46]:
# Encode the training split first, then the test split, into one-hot feature
# tensors and label vectors.
(X_train, y_train), (X_test, y_test) = map(
    convert_to_one_hot, (training_data, testing_data)
)

# Sanity-check the encoded training shapes.
print("X_train shape:", X_train.shape)  # expected: (7000, 15, 4)
print("y_train shape:", y_train.shape)  # expected: (7000,)

X_train shape: (7000, 15, 4)
y_train shape: (7000,)


In [41]:
# Baseline model: one width-5 convolution filter, flattened into a single
# sigmoid unit that outputs the match probability.
#
# NOTE(review): X_train/X_test come from convert_to_one_hot, which encodes only
# the string and drops the per-sample pattern produced by generate_dataset.
# Since each label refers to a pattern the model never sees, accuracy near 50%
# is the expected outcome with the data as generated in this notebook.
model_single_filter = tf.keras.Sequential([
    # Declare the input explicitly instead of passing the deprecated
    # `input_shape=` argument to the first layer.
    tf.keras.Input(shape=(15, 4)),  # (sequence_length=15, one_hot_size=4)
    tf.keras.layers.Conv1D(
        filters=1,              # single filter
        kernel_size=5,
        activation='relu',
    ),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model_single_filter.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

model_single_filter.summary()

# The held-out test split doubles as validation data here, so the reported
# val_* metrics are also the test metrics.
history_single_filter = model_single_filter.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=5,               # you can increase this
    batch_size=32,
    verbose=1
)

# Sequential.layers omits the input layer, so index 0 is still the Conv1D layer.
conv_layer = model_single_filter.layers[0]
filters, biases = conv_layer.get_weights()

print("Filter shape:", filters.shape)  
# Should be (kernel_size=5, input_dim=4, num_filters=1) -> (5, 4, 1)

print("Filter weights:\n", filters[..., 0])  # Show the single filter in shape (5,4)
print("Filter bias:\n", biases)             # Shape (1,)

Epoch 1/5
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 768us/step - accuracy: 0.4922 - loss: 0.7285 - val_accuracy: 0.5017 - val_loss: 0.7015
Epoch 2/5
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 586us/step - accuracy: 0.5078 - loss: 0.6977 - val_accuracy: 0.4903 - val_loss: 0.6975
Epoch 3/5
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 545us/step - accuracy: 0.5012 - loss: 0.6962 - val_accuracy: 0.4880 - val_loss: 0.6958
Epoch 4/5
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 535us/step - accuracy: 0.4982 - loss: 0.6950 - val_accuracy: 0.4900 - val_loss: 0.6949
Epoch 5/5
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 540us/step - accuracy: 0.5006 - loss: 0.6946 - val_accuracy: 0.4920 - val_loss: 0.6945
Filter shape: (5, 4, 1)
Filter weights:
 [[-0.4002272  -0.3845229   0.06942625 -0.4660126 ]
 [ 0.30613258 -0.478914   -0.13019168  0.28906512]
 [-0.314098   -0.4933697   0.03349493 -0.5

In [42]:
# Larger comparison model: two convolution stages with pooling, global max
# pooling over the sequence axis, then a small dense classifier head.
#
# NOTE(review): like the single-filter model, this one is fed only the one-hot
# string from convert_to_one_hot and never the per-sample pattern, so extra
# capacity cannot lift it above chance on the data generated in this notebook.
model_complex = tf.keras.Sequential([
    # Declare the input explicitly instead of passing the deprecated
    # `input_shape=` argument to the first layer.
    tf.keras.Input(shape=(15, 4)),  # (sequence_length=15, one_hot_size=4)
    tf.keras.layers.Conv1D(filters=128, kernel_size=3, activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=2),

    tf.keras.layers.Conv1D(filters=512, kernel_size=3, activation='relu'),

    tf.keras.layers.GlobalMaxPooling1D(),

    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model_complex.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

model_complex.summary()

# The held-out test split doubles as validation data here, so the reported
# val_* metrics are also the test metrics.
history_complex = model_complex.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=5,
    batch_size=32,
    verbose=1
)

# Compare both trained models on the same held-out split. This relies on
# model_single_filter having been trained in an earlier cell.
loss_single, acc_single = model_single_filter.evaluate(X_test, y_test, verbose=0)
loss_complex, acc_complex = model_complex.evaluate(X_test, y_test, verbose=0)

print("Single-filter CNN - Test Loss:", loss_single, " Test Accuracy:", acc_single)
print("Complex CNN       - Test Loss:", loss_complex, " Test Accuracy:", acc_complex)


Epoch 1/5
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5018 - loss: 0.6952 - val_accuracy: 0.5027 - val_loss: 0.6931
Epoch 2/5
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.4876 - loss: 0.6932 - val_accuracy: 0.5027 - val_loss: 0.6931
Epoch 3/5
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.4953 - loss: 0.6932 - val_accuracy: 0.5023 - val_loss: 0.6931
Epoch 4/5
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5006 - loss: 0.6932 - val_accuracy: 0.4973 - val_loss: 0.6932
Epoch 5/5
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.5112 - loss: 0.6931 - val_accuracy: 0.4973 - val_loss: 0.6932
Single-filter CNN - Test Loss: 0.694471001625061  Test Accuracy: 0.492000013589859
Complex CNN       - Test Loss: 0.6931504607200623  Test Accuracy: 0.4973333477973938
