In [6]:
import numpy as np
import tensorflow as tf
import random
import re

def _random_string(alphabet, n):
    """Return a string of `n` symbols drawn uniformly (with replacement) from `alphabet`."""
    return "".join(random.choice(alphabet) for _ in range(n))


def generate_dataset(size=10000, length=15, alphabet="abcd", pattern_length=5, pattern=None):
    """
    Build a balanced dataset of random strings labelled by whether they
    contain a target pattern.

    The pattern is treated as a literal substring (not as a regular
    expression), so alphabets containing regex metacharacters are safe.

    Args:
        size: total number of samples. ``size // 2`` samples contain their
            pattern (label 1); the remaining ``size - size // 2`` do not
            (label 0), so an odd ``size`` still yields exactly ``size`` samples.
        length: number of characters in every generated string.
        alphabet: symbols used for the strings and for random patterns.
        pattern_length: length of each randomly drawn pattern. Ignored when
            ``pattern`` is given.
        pattern: optional fixed pattern shared by every sample. When None
            (the default) a fresh random pattern is drawn for each sample.

    Returns:
        (dataset, regexes, labels): three parallel lists. ``dataset[i]`` is a
        string, ``regexes[i]`` the pattern it was generated against and
        ``labels[i]`` is 1 if the string contains that pattern, else 0.
        All label-1 samples come first, followed by all label-0 samples.

    Raises:
        ValueError: if the pattern is empty, if it is longer than ``length``
            while positive samples are requested, or if no string over
            ``alphabet`` can avoid the pattern (one-symbol alphabet).
    """
    if pattern is not None:
        pattern_length = len(pattern)
    if pattern_length < 1:
        # An empty pattern is contained in every string, so no negative sample exists.
        raise ValueError("pattern must contain at least one character")

    n_positive = size // 2
    n_negative = size - n_positive

    if n_positive > 0 and pattern_length > length:
        raise ValueError(
            f"pattern length {pattern_length} does not fit into strings of length {length}"
        )

    def next_pattern():
        # Either the caller's fixed pattern or a new random one per sample.
        if pattern is not None:
            return pattern
        return _random_string(alphabet, pattern_length)

    dataset = []
    regexes = []
    labels = []

    # Positive samples: overwrite a random window of a random string with the
    # pattern. The result contains the pattern by construction, so no
    # re-check or retry loop is needed.
    for _ in range(n_positive):
        target = next_pattern()
        start = random.randint(0, length - pattern_length)
        chars = [random.choice(alphabet) for _ in range(length)]
        chars[start:start + pattern_length] = target
        dataset.append("".join(chars))
        regexes.append(target)
        labels.append(1)

    # Negative samples: rejection-sample random strings until one avoids the pattern.
    distinct_symbols = set(alphabet)
    for _ in range(n_negative):
        target = next_pattern()
        # With a single distinct symbol there is only one possible string; if it
        # contains the pattern the rejection loop below would never terminate.
        if len(distinct_symbols) == 1 and target in alphabet[0] * length:
            raise ValueError(
                "cannot generate a non-matching string: every string over "
                "this alphabet contains the pattern"
            )
        while True:
            candidate = _random_string(alphabet, length)
            if target not in candidate:
                break
        dataset.append(candidate)
        regexes.append(target)
        labels.append(0)

    return dataset, regexes, labels

# Seed Python's RNG so the generated strings, the shuffle and therefore the
# train/test split are identical on every Restart & Run All.
random.seed(42)

dataset, regexes, labels = generate_dataset()

# Combine into a single structure of (string, regex, label) tuples.
# NOTE(review): by default every sample carries its own random pattern, so a
# model that is only shown the string has no way to know which pattern the
# label refers to.
combined_data = list(zip(dataset, regexes, labels))
# generate_dataset returns all label-1 samples before the label-0 samples;
# shuffle (in place) so both splits contain a mix of the two classes.
random.shuffle(combined_data)

# Train/test split (70/30), derived from the actual sample count so it stays
# correct if the dataset size changes. Integer arithmetic avoids float rounding.
split_idx = len(combined_data) * 7 // 10
training_data = combined_data[:split_idx]
testing_data = combined_data[split_idx:]

In [7]:
def convert_to_one_hot(data, length=15, alphabet="abcd"):
    """
    One-hot encode the strings of a labelled dataset.

    data is an iterable of tuples: (str, regex, label).
    Only the string and the label are used. The regex is NOT encoded, so a
    model trained on X never sees which pattern a label refers to.

    Strings shorter than `length` leave their trailing rows all-zero.
    A string longer than `length` raises IndexError; a character that is not
    in `alphabet` raises KeyError.

    returns:
        X: np.array of shape (num_samples, length, len(alphabet)), float32
        y: np.array of shape (num_samples,)
    """
    char_to_idx = {ch: idx for idx, ch in enumerate(alphabet)}

    # Materialise once so generators / zip objects are accepted as well.
    samples = list(data)

    # Preallocate the full tensor: this keeps the 3-D shape even when there
    # are no samples and avoids building a list of per-sample matrices.
    X = np.zeros((len(samples), length, len(alphabet)), dtype=np.float32)
    y = []
    for row, (s, _regex, label) in enumerate(samples):
        for pos, ch in enumerate(s):
            X[row, pos, char_to_idx[ch]] = 1.0
        y.append(label)

    return X, np.array(y)

In [8]:
# One-hot encode both splits; each call yields an (inputs, labels) pair.
(X_train, y_train), (X_test, y_test) = (
    convert_to_one_hot(split) for split in (training_data, testing_data)
)

# Report the training array shapes: (7000, 15, 4) and (7000,) in the recorded run.
for caption, array in (("X_train shape:", X_train), ("y_train shape:", y_train)):
    print(caption, array.shape)

X_train shape: (7000, 15, 4)
y_train shape: (7000,)


In [9]:
# Seed Python, NumPy and TensorFlow so weight initialisation and training are
# repeatable across Restart & Run All.
tf.keras.utils.set_random_seed(42)

# Keep a direct handle on the convolution so its learned weights can be read
# back below without depending on the layer's position inside the model.
conv_layer = tf.keras.layers.Conv1D(
    filters=1,              # single filter
    kernel_size=5,          # five positions wide, same as the generated patterns
    activation='relu'
)

model_single_filter = tf.keras.Sequential([
    # Explicit Input layer (instead of the deprecated `input_shape=` layer
    # argument); the shape is taken from the data: (sequence_length, one_hot_size).
    tf.keras.Input(shape=X_train.shape[1:]),
    conv_layer,
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model_single_filter.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

model_single_filter.summary()

# NOTE(review): in the recorded run validation accuracy stays at ~0.5. The
# inputs contain only the one-hot string, while each sample was labelled
# against its own pattern, so chance-level accuracy is what one would expect.
history_single_filter = model_single_filter.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=5,               # you can increase this
    batch_size=32,
    verbose=1
)

filters, biases = conv_layer.get_weights()

print("Filter shape:", filters.shape)
# Should be (kernel_size=5, input_dim=4, num_filters=1) -> (5, 4, 1)

print("Filter weights:\n", filters[..., 0])  # Show the single filter in shape (5,4)
print("Filter bias:\n", biases)             # Shape (1,)

Epoch 1/5
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 827us/step - accuracy: 0.5032 - loss: 0.7140 - val_accuracy: 0.5120 - val_loss: 0.6973
Epoch 2/5
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 490us/step - accuracy: 0.5073 - loss: 0.6976 - val_accuracy: 0.5127 - val_loss: 0.6947
Epoch 3/5
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 502us/step - accuracy: 0.4898 - loss: 0.6957 - val_accuracy: 0.5080 - val_loss: 0.6940
Epoch 4/5
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 475us/step - accuracy: 0.5039 - loss: 0.6939 - val_accuracy: 0.5070 - val_loss: 0.6937
Epoch 5/5
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 468us/step - accuracy: 0.4999 - loss: 0.6933 - val_accuracy: 0.5083 - val_loss: 0.6935
Filter shape: (5, 4, 1)
Filter weights:
 [[-0.28383324 -0.51783717 -0.11811454  0.20002685]
 [-0.1818608   0.19188856 -0.08723585  0.01992683]
 [-0.32249528  0.1724988  -0.4041516  -0.1

In [10]:
# Seed Python, NumPy and TensorFlow so this model's initial weights and
# training run are repeatable regardless of what was executed before.
tf.keras.utils.set_random_seed(42)

# Deeper variant: two convolution stages, then a global max over positions so
# the dense head sees one value per filter irrespective of where it fired.
model_complex = tf.keras.Sequential([
    # Explicit Input layer (instead of the deprecated `input_shape=` layer
    # argument); the shape is taken from the data: (sequence_length, one_hot_size).
    tf.keras.Input(shape=X_train.shape[1:]),
    tf.keras.layers.Conv1D(filters=128, kernel_size=3, activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=2),

    tf.keras.layers.Conv1D(filters=512, kernel_size=3, activation='relu'),

    tf.keras.layers.GlobalMaxPooling1D(),

    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model_complex.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

model_complex.summary()

history_complex = model_complex.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=5,
    batch_size=32,
    verbose=1
)

# Compare both models on the held-out split; evaluate() returns [loss, accuracy]
# because 'accuracy' is the only metric each model was compiled with.
loss_single, acc_single = model_single_filter.evaluate(X_test, y_test, verbose=0)
loss_complex, acc_complex = model_complex.evaluate(X_test, y_test, verbose=0)

print("Single-filter CNN - Test Loss:", loss_single, " Test Accuracy:", acc_single)
print("Complex CNN       - Test Loss:", loss_complex, " Test Accuracy:", acc_complex)


Epoch 1/5
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.4893 - loss: 0.6981 - val_accuracy: 0.5010 - val_loss: 0.6936
Epoch 2/5
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.5127 - loss: 0.6927 - val_accuracy: 0.5040 - val_loss: 0.6931
Epoch 3/5
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.5093 - loss: 0.6932 - val_accuracy: 0.4940 - val_loss: 0.6933
Epoch 4/5
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.5102 - loss: 0.6931 - val_accuracy: 0.5007 - val_loss: 0.6938
Epoch 5/5
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.5113 - loss: 0.6930 - val_accuracy: 0.5027 - val_loss: 0.6953
Single-filter CNN - Test Loss: 0.693518877029419  Test Accuracy: 0.5083333253860474
Complex CNN       - Test Loss: 0.6952851414680481  Test Accuracy: 0.5026666522026062
