# Evaluateing agent effectiveness

- Measure the accuracy, precision, and error rate of your ML agent.
- 
- Evaluate the agent’s response time and resource utilization under different conditions.
- 
- Perform stress testing to understand the scalability of the agent.

## 1 Setup environment

In [1]:
import tensorflow as tf
from tensorflow.keras.datasets import mnist

# Load dataset
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0

# Build a simple model
model = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(input_shape=(28, 28)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(10, activation='softmax')
])

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(x_train, y_train, epochs=5, validation_data=(x_test, y_test))

2025-11-17 08:24:39.292826: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  super().__init__(**kwargs)


Epoch 1/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.9133 - loss: 0.2981 - val_accuracy: 0.9573 - val_loss: 0.1446
Epoch 2/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9574 - loss: 0.1442 - val_accuracy: 0.9683 - val_loss: 0.1065
Epoch 3/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9678 - loss: 0.1071 - val_accuracy: 0.9736 - val_loss: 0.0844
Epoch 4/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.9730 - loss: 0.0874 - val_accuracy: 0.9759 - val_loss: 0.0798
Epoch 5/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9774 - loss: 0.0735 - val_accuracy: 0.9774 - val_loss: 0.0770


<keras.src.callbacks.history.History at 0x13fa020b0>

## 2. Measure accuracy and precision

In [2]:
from sklearn.metrics import accuracy_score, precision_score
import numpy as np

# Make predictions on the test set
y_pred_probs = model.predict(x_test)
y_pred = np.argmax(y_pred_probs, axis=1)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')

# Calculate precision (average='weighted' to handle multiple classes)
precision = precision_score(y_test, y_pred, average='weighted')
print(f'Precision: {precision:.4f}')

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 860us/step
Accuracy: 0.9774
Precision: 0.9774


## 3. Evaluate response time

In [3]:
import time

# Measure response time for multiple iterations
start_time = time.time()
for _ in range(25):
    model.predict(x_test)
end_time = time.time()

average_response_time = (end_time - start_time) / 25
print(f"Average Response Time: {average_response_time:.4f} seconds")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 975us/step
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 788us/step
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 851us/step
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 853us/step
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 901us/step
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 922us/step
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 803us/step
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 824us/step
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 887us/step
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 758us/step
[1m3

## 4. Resource utilization
- CPU
- Memory

Tools such as psutil can be be used to measure resource utilzation in real time

In [4]:
import psutil

# Monitor resource usage
cpu_usage = psutil.cpu_percent()
memory_usage = psutil.virtual_memory().percent

# for better results, measure CPU usage while inference is active,
# and measure memory usage against a baseline before the model is loaded
print(f"CPU Usage: {cpu_usage}%")
print(f"Memory Usage: {memory_usage}%")

CPU Usage: 24.0%
Memory Usage: 53.8%


## 5. Perform Stress testing

In [5]:
import numpy as np
import time

# Ensure correct shape before repeating
print("Original x_test shape:", x_test.shape)  # Expected: (10000, 28, 28)

# Properly duplicate test data along batch axis
large_input = np.repeat(x_test, 10, axis=0)  # Expands batch size only

# Verify new shape
print("Large input shape after fix:", large_input.shape)  # Should be (100000, 28, 28)

# Measure performance under stress
start_time = time.time()
model.predict(large_input)  # Now matches model input (batch_size, 28, 28)
end_time = time.time()

print(f"Response Time under Stress (Reduced Size): {end_time - start_time:.4f} seconds")

Original x_test shape: (10000, 28, 28)
Large input shape after fix: (100000, 28, 28)
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 782us/step
Response Time under Stress (Reduced Size): 4.7256 seconds


### 6. Use benchmarking and cross validation

In [6]:
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

# Example data generation for demonstration (replace with actual data)
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
agent_model = RandomForestClassifier()  # Replace with your actual model

# Perform 5-fold cross-validation
cv_scores = cross_val_score(agent_model, X, y, cv=5)

# Print the cross-validation scores for each fold
print(f'Cross-Validation Scores: {cv_scores}')

# Print the mean and standard deviation of the scores
print(f'Mean CV Score: {cv_scores.mean():.4f}')
print(f'Standard Deviation of CV Scores: {cv_scores.std():.4f}')

Cross-Validation Scores: [0.94  0.915 0.895 0.885 0.865]
Mean CV Score: 0.9000
Standard Deviation of CV Scores: 0.0257
