### Linear Regression $R^2$ Analysis -- Feature vs Evaluation

#### Scatter Plot Visualization
For each of the computed metrics, we plot the linear correlation between that metric and 
each of the evaluations that we have. 

Note that we do this separately for each layer of the model, restricted to the layer types of interest.

In [10]:
import pickle 
import os 
from utils import MODEL_SIZES, CHECKPOINT_STEPS, METRICS, sort_and_filter_metrics, get_checkpoint_evals

from scipy.stats import linregress, pearsonr

from matplotlib import pyplot as plt

import numpy as np 

In [5]:
# For each model size and each metric, draw a grid of scatter plots: one row
# per layer, one column per evaluation. Every panel shows the per-layer metric
# scores against the evaluation scores, annotated with the R^2 of a linear fit.
# One PNG per (model_size, metric_name) is written to the plots directory.

for model_size in MODEL_SIZES:
    checkpoint_evals = get_checkpoint_evals(model_size)

    for metric_name in METRICS:

        # Read in the per-layer scores for this metric. A missing or unreadable
        # statistics file is reported and skipped, matching the heatmap cells,
        # instead of aborting the whole run.
        stats_path = f'/home/rd654/pretraining-playground/compiled_statistics/{model_size}/{metric_name}_per_layer.pkl'
        try:
            with open(stats_path, 'rb') as f:
                _metrics = pickle.load(f)
        except (OSError, pickle.UnpicklingError, EOFError) as err:
            print(f"Skipping {model_size}/{metric_name}: could not load {stats_path} ({err})")
            continue

        # Only the "attention.dense" filter is applied here; the exact matching
        # and ordering rules live in utils.sort_and_filter_metrics.
        metrics = sort_and_filter_metrics(_metrics, filter_layer_name="attention.dense")

        # Get the layer names and eval metrics
        layer_names = list(metrics.keys())
        eval_metrics = list(checkpoint_evals.keys())

        # plt.subplots rejects a grid with zero rows or columns.
        if not layer_names or not eval_metrics:
            print(f"Skipping {model_size}/{metric_name}: no layers or evaluations to plot")
            continue

        # Create a grid of scatter subplots. squeeze=False keeps `axs` 2-D even
        # when there is a single layer or a single evaluation, so axs[i, j]
        # indexing below always works.
        fig, axs = plt.subplots(
            len(layer_names), len(eval_metrics), figsize=(100, 100), squeeze=False
        )

        # Iterate over layer names and eval metrics
        for i, layer_name in enumerate(layer_names):
            # NOTE(review): assumes the layer scores and each eval score list
            # have the same length and ordering — confirm against utils.
            layer_metric_scores = metrics[layer_name]

            for j, eval_metric in enumerate(eval_metrics):
                eval_scores = checkpoint_evals[eval_metric]
                ax = axs[i, j]

                # Plot the scatter plot. The x-axis is labelled with the metric
                # actually being plotted rather than a fixed 'CKA' label.
                ax.scatter(layer_metric_scores, eval_scores)
                ax.set_xlabel(f'{metric_name} Scores')
                ax.set_ylabel('Eval Scores')
                ax.set_title(f'{layer_name} - {eval_metric}')

                # Fit a line and annotate the panel with its R^2
                fit = linregress(layer_metric_scores, eval_scores)
                ax.text(0.05, 0.9, f'R^2 = {fit.rvalue**2:.2f}', transform=ax.transAxes)

        # Adjust the spacing between subplots
        fig.tight_layout()

        # Save the figure and release it so figures do not accumulate in memory
        plot_dir = f"/home/rd654/pretraining-playground/plots/{model_size}/"
        os.makedirs(plot_dir, exist_ok=True)

        fig.savefig(os.path.join(plot_dir, f'{metric_name}_scatter.png'))
        plt.close(fig)


#### Heatmap Visualization

Generating heatmaps of the $R^2$ value and of the linear (Pearson) correlation coefficient for each layer/evaluation pair.

In [8]:
# For each model size and each metric, build a (layer x evaluation) heatmap of
# the R^2 of a linear fit between the per-layer metric scores and the
# evaluation scores, and save it as one PNG per (model_size, metric_name).
for model_size in MODEL_SIZES:
    checkpoint_evals = get_checkpoint_evals(model_size)

    for metric_name in METRICS:

        # Read in the per-layer scores for this metric. Only file-loading
        # failures are skipped (and reported); a bare `except` would also hide
        # KeyboardInterrupt and genuine bugs.
        stats_path = f'/home/rd654/pretraining-playground/compiled_statistics/{model_size}/{metric_name}_per_layer.pkl'
        try:
            with open(stats_path, 'rb') as f:
                _metrics = pickle.load(f)
        except (OSError, pickle.UnpicklingError, EOFError) as err:
            print(f"Skipping {model_size}/{metric_name}: could not load {stats_path} ({err})")
            continue

        # Only the "attention.dense" filter is applied here; the exact matching
        # and ordering rules live in utils.sort_and_filter_metrics.
        metrics = sort_and_filter_metrics(_metrics, filter_layer_name="attention.dense")

        # Get the layer names and eval metrics
        layer_names = list(metrics.keys())
        eval_metrics = list(checkpoint_evals.keys())

        # R^2 values: rows are layers, columns are evaluations
        r_squared_values = np.zeros((len(layer_names), len(eval_metrics)))

        # Iterate over layer names and eval metrics
        for i, layer_name in enumerate(layer_names):
            # NOTE(review): assumes the layer scores and each eval score list
            # have the same length and ordering — confirm against utils.
            layer_metric_scores = metrics[layer_name]

            for j, eval_metric in enumerate(eval_metrics):
                eval_scores = checkpoint_evals[eval_metric]

                # Fit a line and store its R^2
                fit = linregress(layer_metric_scores, eval_scores)
                r_squared_values[i, j] = fit.rvalue**2

        # Create the heatmap
        fig, ax = plt.subplots(figsize=(10, 10))
        im = ax.imshow(r_squared_values, cmap='copper')

        # Add colorbar
        cbar = ax.figure.colorbar(im, ax=ax)
        cbar.ax.set_ylabel('R^2 Value', rotation=-90, va="bottom")

        # Set tick labels and axis labels
        ax.set_xticks(np.arange(len(eval_metrics)))
        ax.set_yticks(np.arange(len(layer_names)))
        ax.set_xticklabels(eval_metrics)
        ax.set_yticklabels(layer_names)
        plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

        # Annotate every cell with its value; imshow puts columns on x and
        # rows on y, hence the (j, i) order.
        for (i, j), value in np.ndenumerate(r_squared_values):
            ax.text(j, i, f'{value:.2f}', ha="center", va="center", color="w")

        # Set the title
        ax.set_title(f"R2 value for {model_size} - {metric_name} vs Eval Metrics")

        # Save the figure and release it so figures do not accumulate in memory
        plot_dir = f"/home/rd654/pretraining-playground/plots/{model_size}/"
        os.makedirs(plot_dir, exist_ok=True)

        fig.savefig(os.path.join(plot_dir, f'{metric_name}_r2_heatmap.png'))
        plt.close(fig)

In [11]:
# For each model size and each metric, build a (layer x evaluation) heatmap of
# the Pearson correlation coefficient between the per-layer metric scores and
# the evaluation scores, and save it as one PNG per (model_size, metric_name).
for model_size in MODEL_SIZES:
    checkpoint_evals = get_checkpoint_evals(model_size)

    for metric_name in METRICS:

        # Read in the per-layer scores for this metric. Only file-loading
        # failures are skipped (and reported); a bare `except` would also hide
        # KeyboardInterrupt and genuine bugs.
        stats_path = f'/home/rd654/pretraining-playground/compiled_statistics/{model_size}/{metric_name}_per_layer.pkl'
        try:
            with open(stats_path, 'rb') as f:
                _metrics = pickle.load(f)
        except (OSError, pickle.UnpicklingError, EOFError) as err:
            print(f"Skipping {model_size}/{metric_name}: could not load {stats_path} ({err})")
            continue

        # Only the "attention.dense" filter is applied here; the exact matching
        # and ordering rules live in utils.sort_and_filter_metrics.
        metrics = sort_and_filter_metrics(_metrics, filter_layer_name="attention.dense")

        # Get the layer names and eval metrics
        layer_names = list(metrics.keys())
        eval_metrics = list(checkpoint_evals.keys())

        # Pearson r values: rows are layers, columns are evaluations
        correlation_coefficients = np.zeros((len(layer_names), len(eval_metrics)))

        # Iterate over layer names and eval metrics
        for i, layer_name in enumerate(layer_names):
            # NOTE(review): assumes the layer scores and each eval score list
            # have the same length and ordering — confirm against utils.
            layer_metric_scores = metrics[layer_name]

            for j, eval_metric in enumerate(eval_metrics):
                eval_scores = checkpoint_evals[eval_metric]

                # pearsonr returns (statistic, p-value); only r is kept
                correlation_coefficients[i, j] = pearsonr(layer_metric_scores, eval_scores)[0]

        # Create the heatmap for linear correlation coefficients; the colour
        # scale is pinned to [-1, 1] so plots are comparable across metrics.
        fig, ax = plt.subplots(figsize=(10, 10))
        im = ax.imshow(correlation_coefficients, cmap='coolwarm', vmin=-1, vmax=1)

        # Add colorbar
        cbar = ax.figure.colorbar(im, ax=ax)
        cbar.ax.set_ylabel('Linear Correlation Coefficient', rotation=-90, va="bottom")

        # Set tick labels and axis labels
        ax.set_xticks(np.arange(len(eval_metrics)))
        ax.set_yticks(np.arange(len(layer_names)))
        ax.set_xticklabels(eval_metrics)
        ax.set_yticklabels(layer_names)
        plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

        # Annotate every cell with its value; imshow puts columns on x and
        # rows on y, hence the (j, i) order.
        for (i, j), value in np.ndenumerate(correlation_coefficients):
            ax.text(j, i, f'{value:.2f}', ha="center", va="center", color="w")

        # Set the title
        ax.set_title(f"Linear Correlation Coefficients for {model_size} - {metric_name} vs Eval Metrics")

        # Save the figure and release it so figures do not accumulate in memory
        plot_dir = f"/home/rd654/pretraining-playground/plots/{model_size}/"
        os.makedirs(plot_dir, exist_ok=True)

        fig.savefig(os.path.join(plot_dir, f'{metric_name}_corr_heatmap.png'))
        plt.close(fig)
