# A notebook to analyse the effect of the pressure variables on LLM alignment

In [5]:
### imports 
import plotly.express as px
import plotly.graph_objects as go

import pandas as pd 
import matplotlib.pyplot as plt
import numpy as np
from math import pi

import  numpy as np 



In [None]:
import os

# The data files below are opened with paths relative to the working
# directory ("data/..."). Step up to the parent directory only when the
# current directory itself is named "notebooks"; stripping the substring from
# the whole path would corrupt any other path component that contains it.
cur_dir = os.getcwd()
if os.path.basename(cur_dir) == "notebooks":
    os.chdir(os.path.dirname(cur_dir))
print(os.getcwd())

In [None]:
# Load IPython's autoreload extension and switch it to mode 2.
%load_ext autoreload
%autoreload 2

## Load the data

#### NOTE: please download the .csv files from the Dropbox folder (see the link in the README). If the odds_ratios and pvalues files are missing, please run the logistic_regression notebook first.

In [None]:
## Read pressure_variables_alignment.csv into a DataFrame
df = pd.read_csv(
    "data/simulation_results/pressure_variables_alignment.csv"
)

## Quick overview: the available columns and the distinct values of "model_"
print("Columns:", df.columns)
print("All models:", df["model_"].unique())

## Radar Plots

In [8]:
### Read the p-values and the odds ratios
p_values = pd.read_csv("data/simulation_results/pvalues.csv")
df = pd.read_csv("data/simulation_results/odds_ratios.csv")

## NOTE(review): every float column of p_values is overwritten with 1, so the
## multiplication further down leaves the log odds ratios unchanged.
## Confirm that this is the intended way to switch the p-value weighting off.
num_cols = p_values.select_dtypes(include=['float']).columns
p_values[num_cols] = 1

## Put the odds ratios on a log scale (all float columns at once)
log_cols = df.select_dtypes(include=['float']).columns
df[log_cols] = np.log(df[log_cols])

## Combine the log odds ratios with the p-values, column by column
df[num_cols] = df[num_cols].mul(p_values[num_cols])

## Harmonise the column names
mapping = {
    "profit_exp0": "profit_exp_0",
    "profit_exp1": "profit_exp_1",
    "loan0": "loan_0",
    "loan1": "loan_1",
}
df = df.rename(columns=mapping)

## Harmonise the model identifiers with the names used in models_by_release
df["model"] = df["model"].replace(
    {
        'phi3.5-mini': "phi-3.5-mini",
        "llama3.1": "llama-3.1-8b",
        "claude-sonnet-3.5": "claude-3.5-sonnet",
    }
)

## Models in the order in which they are plotted
## (presumably chronological by release, going by the variable name)
models_by_release = [
    "gpt-3.5-turbo",
    'gpt-4-turbo',
    'claude-3-haiku',
    'gpt-4o',
    "claude-3.5-sonnet",
    'gpt-4o-mini',
    'llama-3.1-8b',
    'phi-3.5-mini',
    'o1-mini',
    'o1-preview',
]


## Functions to plot and visualise baseline dataframe

In [9]:
def positive_plots(df):
    """Show radar charts of the ``<category>-`` columns, one figure per model group.

    The models are taken from the module-level ``models_by_release`` list and
    plotted ``models_per_picture`` at a time.  For each model the values of
    the six ``<category>-`` columns are drawn twice on the same polar axes:
    negative values, as magnitudes, in blue ("Decrease") and positive values
    in red ("Increase").  A legend is added to the last figure only.

    Args:
        df: DataFrame with a ``model`` column and the columns ``trust-``,
            ``loan-``, ``gov-``, ``reg-``, ``risk-`` and ``profitexp-``.
            NOTE(review): each model is presumably expected to appear in
            exactly one row, otherwise the number of values does not match
            the number of axes -- confirm against the input file.

    Returns:
        None.  The figures are displayed with ``plt.show()``.
    """
    models = models_by_release

    categories = ['trust', 'loan', 'gov', 'reg', 'risk', 'profitexp']
    columns = [category + "-" for category in categories]

    # Axis labels, in the same order as ``categories``.
    pos_categories = [
        '     Distrust \n     in Ursus',
        'Large \n        Loan Owed',
        'Strong \n Governance',
        'Regulation       ',
        'Risk \n Averse',
        '     Unprofitable \n Market',
    ]
    N = len(categories)
    models_per_picture = 2

    # The axis angles are the same for every chart; the first angle is
    # repeated at the end so that the plotted outline is closed.
    angles = [n / float(N) * 2 * pi for n in range(N)]
    angles += angles[:1]

    def plot_radar_charts(models_group, fig_title, legend=False):
        # ``fig_title`` is accepted but not used: the figures are only shown,
        # they are not written to disk.
        n_cols = models_per_picture
        n_rows = 1

        # squeeze=False always returns a 2-D array of axes, so flattening
        # works whatever the grid size is.
        fig, axs = plt.subplots(
            n_rows,
            n_cols,
            figsize=(11, 4),
            subplot_kw=dict(polar=True),
            squeeze=False,
        )
        axs = axs.flatten()

        for col_idx, selected_model in enumerate(models_group):
            ax_pos = axs[col_idx]
            model_values = df.loc[df["model"] == selected_model, columns]

            # Negative entries, drawn as magnitudes.
            decrease = model_values.clip(upper=0).abs().values.flatten().tolist()
            decrease += decrease[:1]
            ax_pos.plot(angles, decrease, linewidth=4, linestyle='solid', label='Decrease', color='blue')
            ax_pos.fill(angles, decrease, 'b', alpha=0.1)

            # Positive entries.
            increase = model_values.clip(lower=0).values.flatten().tolist()
            increase += increase[:1]
            ax_pos.plot(angles, increase, linewidth=4, linestyle='solid', label='Increase', color='red')
            ax_pos.fill(angles, increase, 'r', alpha=0.1)

            ax_pos.set_xticks(angles[:-1])
            ax_pos.set_xticklabels(pos_categories, size=18)
            ax_pos.set_yticklabels([])
            ax_pos.set_title(f'{selected_model}', size=25)

        # Attach the legend to the last chart that was actually drawn.
        if legend:
            axs[len(models_group) - 1].legend(loc='lower right', prop={'size': 18})

        # Remove the axes that did not receive a model.
        for idx in range(len(models_group), n_rows * n_cols):
            fig.delaxes(axs[idx])

        plt.tight_layout()
        plt.subplots_adjust(hspace=0.5)
        plt.show()

    for i in range(0, len(models), models_per_picture):
        is_last_group = i + models_per_picture >= len(models)
        extension = "svg" if is_last_group else "png"
        plot_radar_charts(
            models[i:i + models_per_picture],
            f'charts/decrease_radar_{i}.{extension}',
            legend=is_last_group,
        )



def negative_plots(df):
    """Show radar charts of the ``<category>+`` columns, one figure per model group.

    The models are taken from the module-level ``models_by_release`` list and
    plotted ``models_per_picture`` at a time.  For each model the values of
    the six ``<category>+`` columns are drawn twice on the same polar axes:
    positive values in red ("Increase") and negative values, as magnitudes,
    in blue ("Decrease").  No legend is drawn.

    Args:
        df: DataFrame with a ``model`` column and the columns ``trust+``,
            ``loan+``, ``gov+``, ``reg+``, ``risk+`` and ``profitexp+``.
            NOTE(review): each model is presumably expected to appear in
            exactly one row, otherwise the number of values does not match
            the number of axes -- confirm against the input file.

    Returns:
        None.  The figures are displayed with ``plt.show()``.
    """
    models = models_by_release

    categories = ['trust', 'loan', 'gov', 'reg', 'risk', 'profitexp']
    columns = [category + "+" for category in categories]

    # Axis labels, in the same order as ``categories``.
    neg_categories = [
        '  Trust \n     in Ursus',
        'Small \n        Loan Owed',
        'Weak \n Governance',
        'No Regulation       ',
        'Risk \n Seeker',
        '     Profitable \n Market',
    ]

    N = len(categories)

    models_per_picture = 2

    # The axis angles are the same for every chart; the first angle is
    # repeated at the end so that the plotted outline is closed.
    angles = [n / float(N) * 2 * pi for n in range(N)]
    angles += angles[:1]

    def plot_radar_charts(models_group, fig_title):
        # ``fig_title`` is accepted but not used: the figures are only shown,
        # they are not written to disk.
        n_cols = models_per_picture
        n_rows = 1

        # squeeze=False always returns a 2-D array of axes, so flattening
        # works whatever the grid size is.
        fig, axs = plt.subplots(
            n_rows,
            n_cols,
            figsize=(11, 4),
            subplot_kw=dict(polar=True),
            squeeze=False,
        )
        axs = axs.flatten()

        for col_idx, selected_model in enumerate(models_group):
            ax_neg = axs[col_idx]
            model_values = df.loc[df["model"] == selected_model, columns]

            # Positive entries.
            increase = model_values.clip(lower=0).values.flatten().tolist()
            increase += increase[:1]
            ax_neg.plot(angles, increase, linewidth=4, linestyle='solid', label='Increase', color='red')
            ax_neg.fill(angles, increase, 'r', alpha=0.1)

            # Negative entries, drawn as magnitudes.
            decrease = model_values.clip(upper=0).abs().values.flatten().tolist()
            decrease += decrease[:1]
            ax_neg.plot(angles, decrease, linewidth=4, linestyle='solid', label='Decrease', color='blue')
            ax_neg.fill(angles, decrease, 'b', alpha=0.1)

            ax_neg.set_xticks(angles[:-1])
            ax_neg.set_xticklabels(neg_categories, size=18)
            ax_neg.set_yticklabels([])
            ax_neg.set_title(f'{selected_model}', size=25)

        # Remove the axes that did not receive a model.
        for idx in range(len(models_group), n_rows * n_cols):
            fig.delaxes(axs[idx])

        plt.tight_layout()
        plt.subplots_adjust(hspace=0.5)  # spacing between subplot rows
        plt.show()

    for i in range(0, len(models), models_per_picture):
        plot_radar_charts(models[i:i + models_per_picture], f'charts/increase_radar_{i}')

    

## Plots

In [None]:
# Show the radar charts built from the "<category>-" columns of df.
positive_plots(df)


In [None]:
# Show the radar charts built from the "<category>+" columns of df.
negative_plots(df)
