## Progress Plots for the Training/Fine-Tuning GPT-2 Model
This script plots the progress of training/fine-tuning the GPT-2 model for observation and identification of any anomalies during training. It contains the following parts:

* Showing the plot without saving it (for observing progress during training)
* Showing the plot and saving it in the working directory

To use the script, run train_model_script() in the training script, copy its printed output, and paste it into the log_text variable. The script extracts the loss value, gradient norm, and learning rate logged over the course of training (indexed by epoch) and generates the following plots:

* Loss over Epochs
* Rate of Loss Reduction over Epochs (measured as delta loss in linear scale)
* Rate of Loss Reduction over Epochs (measured as delta loss in logarithmic scale)
* Gradient Norm over Epochs
* Learning Rate over Epochs


### Show Plot Only (Not Saved)

In [None]:
import re
import matplotlib.pyplot as plt
import numpy as np
from scipy.ndimage import gaussian_filter1d
from collections import defaultdict

# Paste the console output of the training run between the triple quotes.
log_text = """



"""

# A decimal number with an optional sign and exponent, e.g. 2.31, 5e-05, 1.2E+01.
# Python renders floats below 1e-4 in scientific notation, so a digits-and-dots
# pattern would silently skip any record holding such a value.
# Non-numeric values such as nan or inf are still not matched.
NUMBER_PATTERN = r"[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?"

# Regular expressions to find loss, gradient norm, learning rate, and epoch values
loss_pattern = re.compile(r"\{'loss': (" + NUMBER_PATTERN + r"),")
grad_norm_pattern = re.compile(r"'grad_norm': (" + NUMBER_PATTERN + r"),")
lr_pattern = re.compile(r"'learning_rate': (" + NUMBER_PATTERN + r"),")
epoch_pattern = re.compile(r"'epoch': (" + NUMBER_PATTERN + r")\}")

# A complete record is the four fields in this exact order, separated by one space:
# {'loss': ..., 'grad_norm': ..., 'learning_rate': ..., 'epoch': ...}
record_pattern = re.compile(
    " ".join(
        field.pattern
        for field in (loss_pattern, grad_norm_pattern, lr_pattern, epoch_pattern)
    )
)


def parse_training_log(text):
    """Group every complete log record found in *text* by its epoch value.

    Args:
        text: Raw console output containing records of the form
            ``{'loss': L, 'grad_norm': G, 'learning_rate': R, 'epoch': E}``.

    Returns:
        A defaultdict mapping each epoch (float) to a dict with the keys
        ``'loss'``, ``'grad_norm'`` and ``'lr'``; each holds the list of values
        logged for that epoch, in order of appearance. Text that does not
        match a complete record is ignored.
    """
    grouped = defaultdict(lambda: {'loss': [], 'grad_norm': [], 'lr': []})
    for match in record_pattern.finditer(text):
        loss, grad_norm, lr, epoch = map(float, match.groups())
        entry = grouped[epoch]
        entry['loss'].append(loss)
        entry['grad_norm'].append(grad_norm)
        entry['lr'].append(lr)
    return grouped


# Extracting the values into dictionaries grouped by epoch
data = parse_training_log(log_text)

# Collapse duplicate log entries: one mean value per metric for each distinct
# epoch, with the epochs in ascending order.
sorted_epochs = sorted(data.keys())
averaged_data = {
    'epoch': sorted_epochs,
    'loss': [np.mean(data[e]['loss']) for e in sorted_epochs],
    'grad_norm': [np.mean(data[e]['grad_norm']) for e in sorted_epochs],
    'lr': [np.mean(data[e]['lr']) for e in sorted_epochs],
}

# Short aliases for the averaged series used by the smoothing and plotting code
epoch_values, loss_values, grad_norm_values, lr_values = (
    averaged_data[key] for key in ('epoch', 'loss', 'grad_norm', 'lr')
)

# Flatten the raw (un-averaged) samples, repeating each epoch once per sample
# logged for it so the raw series stay aligned with original_epoch_values.
original_epoch_values = [
    logged_epoch
    for logged_epoch, metrics in data.items()
    for _ in metrics['loss']
]
original_loss_values = [
    sample for metrics in data.values() for sample in metrics['loss']
]
original_grad_norm_values = [
    sample for metrics in data.values() for sample in metrics['grad_norm']
]
original_lr_values = [
    sample for metrics in data.values() for sample in metrics['lr']
]

# Change in averaged loss between consecutive epochs (later minus earlier),
# plus its magnitude for the log-scale plot.
delta_loss = [
    later - earlier for earlier, later in zip(loss_values, loss_values[1:])
]
abs_delta_loss = np.abs(delta_loss)

# Standard deviation of the Gaussian kernel shared by all smoothed curves
sigma = 1

# Gaussian-smoothed versions of the delta loss, the absolute delta loss and
# the averaged gradient norm, in that order.
(
    gaussian_smoothed_delta_loss,
    gaussian_smoothed_abs_delta_loss,
    gaussian_smoothed_grad_norm,
) = (
    gaussian_filter1d(series, sigma=sigma)
    for series in (delta_loss, abs_delta_loss, grad_norm_values)
)

# Build the five-panel figure; the canvas is tall so each stacked panel stays readable
plt.figure(figsize=(12, 30))

smoothing_label = f'Gaussian Smoothed with sigma of {sigma}'
# Delta series have one point fewer than the epochs: each delta is plotted at
# the later epoch of its pair.
delta_epochs = epoch_values[1:]


def finish_panel(y_label, panel_title):
    """Apply the decorations shared by every panel to the current axes."""
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.title(panel_title)
    plt.legend()
    plt.grid(True)


# Panel 1: raw loss samples with the per-epoch average on top
plt.subplot(5, 1, 1)
plt.scatter(original_epoch_values, original_loss_values,
            label='Original Loss', color='tab:grey', alpha=0.5, s=3)
plt.plot(epoch_values, loss_values,
         label='Loss (Averaged Duplicates)', color='tab:red')
finish_panel('Loss', 'Loss over Epochs')

# Panel 2: signed delta loss on a linear scale (faint raw curve, solid smoothed curve)
plt.subplot(5, 1, 2)
plt.plot(delta_epochs, delta_loss,
         label='Delta Loss', color='tab:blue', alpha=0.25)
plt.plot(delta_epochs, gaussian_smoothed_delta_loss,
         label=smoothing_label, color='tab:blue')
finish_panel('Delta Loss', 'Rate of Loss Reduction over Epochs (Linear Scale)')

# Panel 3: absolute delta loss on a logarithmic y-axis
plt.subplot(5, 1, 3)
plt.plot(delta_epochs, abs_delta_loss,
         label='Absolute Delta Loss', color='tab:blue', alpha=0.25)
plt.plot(delta_epochs, gaussian_smoothed_abs_delta_loss,
         label=smoothing_label, color='tab:blue')
plt.yscale('log')
finish_panel('Delta Loss (Log Scale)',
             'Rate of Loss Reduction over Epochs (Log Scale)')

# Panel 4: raw gradient-norm samples, per-epoch average and smoothed average
plt.subplot(5, 1, 4)
plt.scatter(original_epoch_values, original_grad_norm_values,
            label='Original Gradient Norm', color='tab:grey', alpha=0.5, s=3)
plt.plot(epoch_values, grad_norm_values,
         label='Gradient Norm (Averaged Duplicates)', color='tab:green', alpha=0.25)
plt.plot(epoch_values, gaussian_smoothed_grad_norm,
         label=smoothing_label, color='tab:green')
finish_panel('Gradient Norm', 'Gradient Norm over Epochs')

# Panel 5: averaged learning rate
plt.subplot(5, 1, 5)
plt.plot(epoch_values, lr_values, label='Learning Rate', color='tab:orange')
finish_panel('Learning Rate', 'Learning Rate over Epochs')

plt.tight_layout()
plt.show()

### Save & Show Plot

In [None]:
import re
import matplotlib.pyplot as plt
import numpy as np
from scipy.ndimage import gaussian_filter1d
from collections import defaultdict

# Paste the console output of the training run between the triple quotes.
log_text = """



"""

# A decimal number with an optional sign and exponent, e.g. 2.31, 5e-05, 1.2E+01.
# Python renders floats below 1e-4 in scientific notation, so a digits-and-dots
# pattern would silently skip any record holding such a value.
# Non-numeric values such as nan or inf are still not matched.
NUMBER_PATTERN = r"[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?"

# Regular expressions to find loss, gradient norm, learning rate, and epoch values
loss_pattern = re.compile(r"\{'loss': (" + NUMBER_PATTERN + r"),")
grad_norm_pattern = re.compile(r"'grad_norm': (" + NUMBER_PATTERN + r"),")
lr_pattern = re.compile(r"'learning_rate': (" + NUMBER_PATTERN + r"),")
epoch_pattern = re.compile(r"'epoch': (" + NUMBER_PATTERN + r")\}")

# A complete record is the four fields in this exact order, separated by one space:
# {'loss': ..., 'grad_norm': ..., 'learning_rate': ..., 'epoch': ...}
record_pattern = re.compile(
    " ".join(
        field.pattern
        for field in (loss_pattern, grad_norm_pattern, lr_pattern, epoch_pattern)
    )
)


def parse_training_log(text):
    """Group every complete log record found in *text* by its epoch value.

    Args:
        text: Raw console output containing records of the form
            ``{'loss': L, 'grad_norm': G, 'learning_rate': R, 'epoch': E}``.

    Returns:
        A defaultdict mapping each epoch (float) to a dict with the keys
        ``'loss'``, ``'grad_norm'`` and ``'lr'``; each holds the list of values
        logged for that epoch, in order of appearance. Text that does not
        match a complete record is ignored.
    """
    grouped = defaultdict(lambda: {'loss': [], 'grad_norm': [], 'lr': []})
    for match in record_pattern.finditer(text):
        loss, grad_norm, lr, epoch = map(float, match.groups())
        entry = grouped[epoch]
        entry['loss'].append(loss)
        entry['grad_norm'].append(grad_norm)
        entry['lr'].append(lr)
    return grouped


# Extracting the values into dictionaries grouped by epoch
data = parse_training_log(log_text)

# Collapse duplicate log entries: one mean value per metric for each distinct
# epoch, with the epochs in ascending order.
sorted_epochs = sorted(data.keys())
averaged_data = {
    'epoch': sorted_epochs,
    'loss': [np.mean(data[e]['loss']) for e in sorted_epochs],
    'grad_norm': [np.mean(data[e]['grad_norm']) for e in sorted_epochs],
    'lr': [np.mean(data[e]['lr']) for e in sorted_epochs],
}

# Short aliases for the averaged series used by the smoothing and plotting code
epoch_values, loss_values, grad_norm_values, lr_values = (
    averaged_data[key] for key in ('epoch', 'loss', 'grad_norm', 'lr')
)

# Flatten the raw (un-averaged) samples, repeating each epoch once per sample
# logged for it so the raw series stay aligned with original_epoch_values.
original_epoch_values = [
    logged_epoch
    for logged_epoch, metrics in data.items()
    for _ in metrics['loss']
]
original_loss_values = [
    sample for metrics in data.values() for sample in metrics['loss']
]
original_grad_norm_values = [
    sample for metrics in data.values() for sample in metrics['grad_norm']
]
original_lr_values = [
    sample for metrics in data.values() for sample in metrics['lr']
]

# Change in averaged loss between consecutive epochs (later minus earlier),
# plus its magnitude for the log-scale plot.
delta_loss = [
    later - earlier for earlier, later in zip(loss_values, loss_values[1:])
]
abs_delta_loss = np.abs(delta_loss)

# Standard deviation of the Gaussian kernel shared by all smoothed curves
sigma = 1

# Gaussian-smoothed versions of the delta loss, the absolute delta loss and
# the averaged gradient norm, in that order.
(
    gaussian_smoothed_delta_loss,
    gaussian_smoothed_abs_delta_loss,
    gaussian_smoothed_grad_norm,
) = (
    gaussian_filter1d(series, sigma=sigma)
    for series in (delta_loss, abs_delta_loss, grad_norm_values)
)

# Build the five-panel figure; the canvas is tall so each stacked panel stays readable
plt.figure(figsize=(12, 30))

smoothing_label = f'Gaussian Smoothed with sigma of {sigma}'
# Delta series have one point fewer than the epochs: each delta is plotted at
# the later epoch of its pair.
delta_epochs = epoch_values[1:]


def finish_panel(y_label, panel_title):
    """Apply the decorations shared by every panel to the current axes."""
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.title(panel_title)
    plt.legend()
    plt.grid(True)


# Panel 1: raw loss samples with the per-epoch average on top
plt.subplot(5, 1, 1)
plt.scatter(original_epoch_values, original_loss_values,
            label='Original Loss', color='tab:grey', alpha=0.5, s=3)
plt.plot(epoch_values, loss_values,
         label='Loss (Averaged Duplicates)', color='tab:red')
finish_panel('Loss', 'Loss over Epochs')

# Panel 2: signed delta loss on a linear scale (faint raw curve, solid smoothed curve)
plt.subplot(5, 1, 2)
plt.plot(delta_epochs, delta_loss,
         label='Delta Loss', color='tab:blue', alpha=0.25)
plt.plot(delta_epochs, gaussian_smoothed_delta_loss,
         label=smoothing_label, color='tab:blue')
finish_panel('Delta Loss', 'Rate of Loss Reduction over Epochs (Linear Scale)')

# Panel 3: absolute delta loss on a logarithmic y-axis
plt.subplot(5, 1, 3)
plt.plot(delta_epochs, abs_delta_loss,
         label='Absolute Delta Loss', color='tab:blue', alpha=0.25)
plt.plot(delta_epochs, gaussian_smoothed_abs_delta_loss,
         label=smoothing_label, color='tab:blue')
plt.yscale('log')
finish_panel('Delta Loss (Log Scale)',
             'Rate of Loss Reduction over Epochs (Log Scale)')

# Panel 4: raw gradient-norm samples, per-epoch average and smoothed average
plt.subplot(5, 1, 4)
plt.scatter(original_epoch_values, original_grad_norm_values,
            label='Original Gradient Norm', color='tab:grey', alpha=0.5, s=3)
plt.plot(epoch_values, grad_norm_values,
         label='Gradient Norm (Averaged Duplicates)', color='tab:green', alpha=0.25)
plt.plot(epoch_values, gaussian_smoothed_grad_norm,
         label=smoothing_label, color='tab:green')
finish_panel('Gradient Norm', 'Gradient Norm over Epochs')

# Panel 5: averaged learning rate
plt.subplot(5, 1, 5)
plt.plot(epoch_values, lr_values, label='Learning Rate', color='tab:orange')
finish_panel('Learning Rate', 'Learning Rate over Epochs')

plt.tight_layout()

import os

# Folder that receives the image. Leave it empty to save into the current
# working directory, or set it to a path that is created on demand.
directory = ''

# os.makedirs('') raises FileNotFoundError, so only create a folder when one
# is actually named; an empty string means "use the working directory".
if directory:
    os.makedirs(directory, exist_ok=True)

# With an empty directory, os.path.join returns just the file name.
plt.savefig(os.path.join(directory, 'training_metrics_plot.png'))

# Show the figure after saving it.
plt.show()