In [1]:
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import re

# Step 1: Generate strings of length 15 over the alphabet {a, b, c, d}
def generate_strings(length, size, alphabet):
    return [''.join(np.random.choice(alphabet, size=length)) for _ in range(size)]

# Step 2: Label strings based on a regex (e.g., `a{3}b{2}`)
def label_strings(strings, regex):
    pattern = re.compile(regex)
    return [1 if pattern.search(s) else 0 for s in strings]

# Step 3: Balance the dataset
def balance_dataset(strings, labels):
    positive = [(s, l) for s, l in zip(strings, labels) if l == 1]
    negative = [(s, l) for s, l in zip(strings, labels) if l == 0]
    min_size = min(len(positive), len(negative))
    balanced_data = positive[:min_size] + negative[:min_size]
    np.random.shuffle(balanced_data)
    balanced_strings, balanced_labels = zip(*balanced_data)
    return list(balanced_strings), list(balanced_labels)

# Step 4: Prepare data using one-hot encoding
def one_hot_encode(strings, alphabet):
    encoder = OneHotEncoder(categories=[list(alphabet)], sparse_output=False)  # Use sparse_output
    encoded = np.array([encoder.fit_transform(np.array(list(s)).reshape(-1, 1)).flatten() for s in strings])
    return encoded

# Step 5: Divide dataset into training and testing
def split_data(data, labels, test_size=0.2):
    return train_test_split(data, labels, test_size=test_size, random_state=42)

# Parameters
alphabet = ['a', 'b', 'c', 'd']
string_length = 15
dataset_size = 10000
regex = r'a{3}b{2}'

# Generate and label data
strings = generate_strings(string_length, dataset_size, alphabet)
labels = label_strings(strings, regex)

# Balance dataset
strings_balanced, labels_balanced = balance_dataset(strings, labels)

# One-hot encode strings
data_encoded = one_hot_encode(strings_balanced, alphabet)
labels_balanced = np.array(labels_balanced)

# Split into training and testing
X_train, X_test, y_train, y_test = split_data(data_encoded, labels_balanced)

# Step 6: Implement a CNN model
model = tf.keras.Sequential([
    tf.keras.layers.Conv1D(filters=1, kernel_size=5, activation='relu', input_shape=(string_length * len(alphabet), 1)),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Reshape data for Conv1D
X_train_reshaped = X_train.reshape(-1, X_train.shape[1], 1)
X_test_reshaped = X_test.reshape(-1, X_test.shape[1], 1)

# Train the model
model.fit(X_train_reshaped, y_train, epochs=5, batch_size=32, validation_data=(X_test_reshaped, y_test))

# Analyze filter weights
filters, biases = model.layers[0].get_weights()
print("Filter weights:", filters)

# Step 7: Implement a more complex model
complex_model = tf.keras.Sequential([
    tf.keras.layers.Conv1D(filters=8, kernel_size=5, activation='relu', input_shape=(string_length * len(alphabet), 1)),
    tf.keras.layers.Conv1D(filters=16, kernel_size=3, activation='relu'),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compile and train the complex model
complex_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
complex_model.fit(X_train_reshaped, y_train, epochs=5, batch_size=32, validation_data=(X_test_reshaped, y_test))

# Evaluate performance
simple_model_performance = model.evaluate(X_test_reshaped, y_test)
complex_model_performance = complex_model.evaluate(X_test_reshaped, y_test)

print("Simple model performance:", simple_model_performance)
print("Complex model performance:", complex_model_performance)


Epoch 1/5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.4568 - loss: 0.7868 - val_accuracy: 0.4038 - val_loss: 0.7998
Epoch 2/5
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.4962 - loss: 0.7536 - val_accuracy: 0.4038 - val_loss: 0.7953
Epoch 3/5
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.4916 - loss: 0.7576 - val_accuracy: 0.4231 - val_loss: 0.7897
Epoch 4/5
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.4705 - loss: 0.7622 - val_accuracy: 0.4038 - val_loss: 0.7861
Epoch 5/5
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.5119 - loss: 0.7340 - val_accuracy: 0.4038 - val_loss: 0.7836
Filter weights: [[[ 0.6452526 ]]

 [[-0.43704367]]

 [[ 0.583862  ]]

 [[-0.71115786]]

 [[ 0.5437281 ]]]
Epoch 1/5
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.5376 - loss: 0.6871 - 