In [None]:
import os
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
from src.wimbd_ import BasePaths as PATHS
from src.wimbd_ import DataConfigs as CONFIG
from src.wimbd_ import post_filter
from src.utils import softmax

from datetime import datetime
pd.set_option('display.max_columns', 10)  # limit how many columns pandas shows when a frame is displayed

# Minute-resolution timestamp string for this session.
# NOTE(review): `timestamp` is not referenced by any of the cells visible in this notebook.
timestamp = datetime.now().strftime("%Y-%m-%d-%H:%M")

In [None]:
# all_models = ['pythia-12b', 'pythia-6.9b', 'pythia-2.8b', 'pythia-1.4b', 'pythia-410m', 'pythia-160m', 'pythia-70m', 'pythia-31m', 'pythia-14m']
# large_models = ['pythia-12b', 'pythia-6.9b', 'pythia-2.8b', 'pythia-1.4b']
# small_models = ['pythia-410m', 'pythia-160m', 'pythia-70m', 'pythia-31m', 'pythia-14m']

# all_models = ['OLMo-1B', 'OLMo-7B', 'OLMo-7B-SFT', 'OLMo-7B-Instruct']
# large_models = ['OLMo-7B', 'OLMo-7B-SFT', 'OLMo-7B-Instruct']
# small_models = ['OLMo-1B']

# Models analysed in this run.
all_models = ['pythia-12b', 'pythia-6.9b', 'open-instruct-pythia-6.9b-tulu']
# Size buckets used by the large-vs-small comparison cells further down. They must be
# defined here: with only the commented-out presets above, those cells raise NameError
# on a fresh kernel.
# NOTE(review): all active models are >= 6.9B, so they are all bucketed as "large" — confirm the intended split.
large_models = ['pythia-12b', 'pythia-6.9b', 'open-instruct-pythia-6.9b-tulu']
small_models = []

N_GRAMS = 5
# BASE_DIR = "./results/n-grams/mmlu/test-set/exp_full_None"
CORPUS = "pile"
DATASET = "mmlu"
TASKS = CONFIG.mmlu_tasks['top_diff_sft_olmo']  # task subset to keep (None keeps every task)
OMMIT_TASKS = True   # drop the tasks listed in TASKS_OMMIT
POST_FILTER = True   # run post_filter on the combined frame

# Result locations for the chosen dataset / corpus / n-gram size.
BASE_DIR = PATHS.base_ngram_paths[DATASET][CORPUS]['base_path']
BASE_PATH = os.path.join(BASE_DIR, f"{N_GRAMS}")
METHOD = "0-shot_common"
BASE_PATH_COMMON = os.path.join(BASE_PATH, "common")
BASE_PATH_ALL = os.path.join(BASE_PATH, "all")
# NOTE(review): absolute, user-specific path; consider making it configurable.
FIG_DIR = os.path.join("/share/edc/home/antonis/LLM-Incidental-Supervision/incidental-supervision/figures", DATASET, CORPUS)
# exist_ok makes the cell safe to re-run and avoids the check-then-create race.
os.makedirs(FIG_DIR, exist_ok=True)

print(f"BASE PATH: {BASE_PATH}")

TASKS_OMMIT = CONFIG.task_configs[DATASET]['ommit']

def remove_nested_lists(df):
    """Replace list/array cells with their first element, column by column.

    Only columns that contain at least one ``list`` or ``np.ndarray`` cell are
    touched; a message is printed for each such column. The frame is modified
    in place and also returned.
    """
    nested_types = (list, np.ndarray)

    def _first_if_nested(cell):
        # Unwrap containers, pass scalars through untouched.
        return cell[0] if isinstance(cell, nested_types) else cell

    for col in df.columns:
        has_nested = False
        for cell in df[col]:
            if isinstance(cell, nested_types):
                has_nested = True
                break
        if not has_nested:
            continue
        print(f"Flattening column: {col}")
        df[col] = df[col].apply(_first_if_nested)
    return df

def normalize_data(group):
    """Min-max scale a Series to the [0, 1] range.

    When every value is identical (max - min == 0) a Series of 0.5 with the
    same index is returned instead of dividing by zero.
    """
    lowest = group.min()
    spread = group.max() - lowest
    if spread != 0:
        return (group - lowest) / spread
    # Degenerate case: all values equal, so use the neutral midpoint.
    return pd.Series([0.5] * len(group), index=group.index)

In [None]:
# Pickle holding the per-model example frames for the selected METHOD (a dict keyed by model name).
dfs_all_models_pth = os.path.join(BASE_PATH, f"examples_dfs_{METHOD}_models.pkl")
# dfs_all_models_pth = "./results/n-grams/mmlu/pile/exp4_filter/test-set/exp_full_None/5/examples_dfs_common_models.pkl"

# load pickle
# NOTE: pickle.load can execute arbitrary code; only load result files produced by this project.
with open(dfs_all_models_pth, "rb") as f:
    dfs_all_models = pickle.load(f)

# keep only specific tasks
if TASKS is not None:
    dfs_all_models = {model: df[df['task'].isin(TASKS)] for model, df in dfs_all_models.items()}

# keep only the models we are interested in; copy each frame so the column added
# below is written to an independent frame rather than to a filtered slice
# (which triggers SettingWithCopyWarning)
dfs_all_models = {model: dfs_all_models[model].copy() for model in all_models}

# Combined long frame with a 'model' column identifying the source model.
df_all_models = pd.concat(
    [df.assign(model=model_name) for model_name, df in dfs_all_models.items()]
)
df_all_models = remove_nested_lists(df_all_models)

# create a perplexity column: 2 ** -log2(p) equals 1 / p
# NOTE: this is added to the per-model frames only; df_all_models was built above and does not get it.
for model in dfs_all_models:
    dfs_all_models[model]["perplexity_gold"] = dfs_all_models[model]["probs_gold"].apply(lambda x: 2 ** -np.log2(x))

In [None]:
# Display the dict of per-model example frames.
dfs_all_models

In [None]:
"""
note: probs_softmax is the normalized probability of the 4 choices (sums to 1)
      probs_nll is the actual probability assigned by the model (converted from nll to probability)

value: the occurences of a specific ngram
count: the number of occurences per example
sum: the sum of the occurences of the ngram in the whole dataset
"""

models = list(dfs_all_models.keys())
# pyplot.get_cmap(name, lut) is the supported spelling; plt.cm.get_cmap is deprecated
# and no longer available in newer Matplotlib releases.
model_colormap = plt.get_cmap('coolwarm', len(models))
# One colour per model, walking the colormap from its upper end downwards.
model_color_mapping = {model: model_colormap(1 - i / len(models)) for i, model in enumerate(models)}
print(models)

# NOTE: both filters below rebind df_all_models, so this cell is not idempotent on re-run.
if OMMIT_TASKS:
    print(f"OMMITING TASKS: {TASKS_OMMIT}")
    df_all_models = df_all_models[~df_all_models["task"].isin(TASKS_OMMIT)]

if POST_FILTER:
    print("POST FILTERING")
    df_all_models = post_filter(df_all_models)

In [None]:
# Peek at the 'example' field of the first row.
df_all_models.iloc[0]['example']

In [None]:
# Display the combined frame (pandas truncates long output).
df_all_models

In [None]:
# Split the rows by whether column 'A' equals the 'answer' column.
df_all_models_correct = df_all_models.loc[df_all_models['A'].eq(df_all_models['answer'])]
df_all_models_incorrect = df_all_models.loc[df_all_models['A'].ne(df_all_models['answer'])]

# Share of rows in each split.
p_correct = len(df_all_models_correct) / len(df_all_models)
p_incorrect = len(df_all_models_incorrect) / len(df_all_models)
print(f"Correct: {p_correct:.2f}, Incorrect: {p_incorrect:.2f}")

# Mean of the 'value' column within each split.
incorrect_mean_value = df_all_models_incorrect['value'].mean()
correct_mean_value = df_all_models_correct['value'].mean()
print(f"Correct Mean Value: {correct_mean_value} \
        Incorrect Mean Value: {incorrect_mean_value}")


In [None]:
models = list(dfs_all_models.keys())
# pyplot.get_cmap(name, lut) is the supported spelling; plt.cm.get_cmap is deprecated
# and no longer available in newer Matplotlib releases.
model_colormap = plt.get_cmap('coolwarm', len(models))
model_color_mapping = {model: model_colormap(1 - i / len(models)) for i, model in enumerate(models)}
variable = 'probs_softmax'
# variable = 'probs'
models_ommit = [] # ['pythia-410m'] # ['pythia-31m'] # ['pythia-6.9b']
normalize = True

n_groups = 10
lower_percentile = 0.2
upper_percentile = 1 - lower_percentile

# df_all_models may be a filtered slice at this point; copy it so the column
# assignments below do not raise SettingWithCopyWarning.
df_all_models = df_all_models.copy()

# Bin the gold probability into n_groups equal-width ranges
probs_gold_all = df_all_models[f'{variable}_gold']
if normalize:
    # Min-max normalise f'{variable}_gold' within each model, then bin on [0, 1]
    df_all_models[f'{variable}_gold_normalized'] = df_all_models.groupby('model')[f'{variable}_gold'].transform(normalize_data)
    probs_gold_range = np.linspace(0, 1, n_groups+1)
    df_all_models[f'{variable}_gold_range'] = pd.cut(df_all_models[f'{variable}_gold_normalized'], bins=probs_gold_range)
else:
    probs_gold_range = np.linspace(0, np.max(probs_gold_all), n_groups+1)
    df_all_models[f'{variable}_gold_range'] = pd.cut(probs_gold_all, bins=probs_gold_range)
# NOTE(review): pd.cut is called without include_lowest, so a value exactly on the
# lowest bin edge (0) presumably ends up with no bin (NaN) — confirm this is intended.

print(df_all_models['model'].unique())

# convert each interval category to its numeric midpoint
df_all_models[f'{variable}_gold_range'] = df_all_models[f'{variable}_gold_range'].apply(lambda x: x.mid).astype(float)

# bin the "count" column into equal-width ranges, stored as midpoints
n_groups_sum = 25
df_all_models['count_range'] = pd.cut(df_all_models['count'], bins=n_groups_sum)
df_all_models['count_range'] = df_all_models['count_range'].apply(lambda x: x.mid).astype(float)

# mean accuracy per (count_range, model), broadcast back onto every row
# accuracy_count = df_all_models.groupby('count_range')['accuracy'].transform('mean').reset_index(name='accuracy_count')
df_all_models['accuracy_count_model'] = df_all_models.groupby(['count_range', 'model'])['accuracy'].transform('mean')

# bin 'ds_score' into equal-width ranges, stored as midpoints
n_groups_acc = 10
df_all_models['accuracy_range'] = pd.cut(df_all_models['ds_score'], bins=n_groups_acc)
df_all_models['accuracy_range'] = df_all_models['accuracy_range'].apply(lambda x: x.mid).astype(float)

In [None]:
var_ = 'sum'
df_all_models[var_] = pd.to_numeric(df_all_models[var_], errors='coerce')
# top_k = df_all_models[var_].unique().quantile(0.9)
# Take the 10 rows with the largest value directly. Looking the rows up again by
# label (.loc[index]) returns every row sharing a label, and df_all_models is a
# concatenation of per-model frames, so its index labels may repeat.
df_top_ranges = df_all_models.nlargest(10, var_)
var_indexes = df_top_ranges.index
df_top_ranges

In [None]:
import plotly.express as px

# Histogram of the 'value' column across all rows of df_all_models,
# with bars coloured by 'task' and the task shown on hover.

# Build the histogram with plotly express (100 bins)
fig = px.histogram(df_all_models, x='value', color='task', 
                   hover_data=['task'], nbins=100,
                   title='Distribution of Values per Task')

# Show the figure
fig.show()

In [None]:
# Distinct model names present in the combined frame.
df_all_models['model'].unique()

In [None]:
# Distinct task names present in the combined frame.
print(df_all_models['task'].unique())

In [None]:
# Tasks excluded later when df_examples_task_ommited is built (not used in this cell).
ommit_tasks = ['high_school_computer_science', 'elementary_mathematics',
               'college_computer_science', 'high_school_mathematics', 
               'public_relations', 'nutrition',
               'machine_learning', 'college_mathematics'] # miscellaneous

# .copy() so the numeric conversion below writes to an independent frame instead of
# a slice of df_all_models (avoids SettingWithCopyWarning).
df_model = df_all_models[df_all_models['model'] == 'pythia-12b'].copy()

task_list = df_model['task'].unique()
df_model['value'] = pd.to_numeric(df_model['value'], errors='coerce')

# Collect the per-task top-10 frames and concatenate once at the end, rather than
# growing a frame inside the loop starting from an empty DataFrame.
top10_frames = []
for task in task_list:
    print(f"------ {task} ------")
    df_model_task = df_model[df_model['task'] == task]
    df_model_task_top10 = df_model_task.nlargest(10, 'value')
    # Show all rows of the task, largest 'value' first.
    display(df_model_task.sort_values('value', ascending=False))
    top10_frames.append(df_model_task_top10)
df_ngram_task_top10 = pd.concat(top10_frames) if top10_frames else pd.DataFrame()

# save
df_ngram_task_top10.to_csv(f"{BASE_PATH}/top10_ngrams_per_task.csv", index=False)

In [None]:
# NOTE: post_filter is already imported in the first cell; this re-import is redundant.
from src.wimbd_ import post_filter

# Task to inspect.
task = 'nutrition'

# All rows of the combined frame that belong to that task; show the first one.
df_examples_all_task = df_all_models[df_all_models['task'] == task]
df_examples_all_task.head(1)

# df_examples_all_task = post_filter(df_examples_all_task)

# df_examples_all_task_2 = df_examples_all_task.copy()

In [None]:
# Keep only the rows whose task is NOT listed in ommit_tasks.
df_examples_task_ommited = df_all_models[~df_all_models['task'].isin(ommit_tasks)]
# df_examples_all_filtered = post_filter(df_all_models.copy())
# df_examples_all_filtered = df_examples_all_filtered[~df_examples_all_filtered['task'].isin(ommit_tasks)]

In [None]:
# Histogram of 'value' coloured by task (same call as the earlier histogram cell).
# NOTE(review): this plots df_all_models, not df_examples_task_ommited built in the
# previous cell — confirm which frame was intended.
fig = px.histogram(df_all_models, x='value', color='task', 
                   hover_data=['task'], nbins=100,
                   title='Distribution of Values per Task')

# Show the figure
fig.show()

In [None]:
# Plot accuracy_count_model against count_range, one line per model.

# Collapse to one row per (example_str, model), taking the first value of every column.
df_all_models_example = df_all_models.groupby(['example_str', 'model']).first().reset_index()

plt.figure(figsize=(5, 5))

# Rows to sample per count range; None keeps every row.
N_SAMPLES = None

for n, model in enumerate(models):
    if model in models_ommit:
        continue
    df_model = df_all_models_example[df_all_models_example['model'] == model]
    
    # Number of rows available in each count range for this model
    grouped = df_model.groupby('count_range')['accuracy_count_model']
    n_examples = grouped.count()

    print(f"---------- model: {model} ----------")
    # if n == 0:
        # print(f"{n_examples}")
    
    if N_SAMPLES is not None:
        # Keep only ranges with at least N_SAMPLES rows, then draw exactly N_SAMPLES from each.
        sufficient_samples = n_examples[n_examples >= N_SAMPLES].index
        df_model_sufficient = df_model[df_model['count_range'].isin(sufficient_samples)]
        df_model_sampled = df_model_sufficient.groupby('count_range').apply(lambda x: x.sample(n=N_SAMPLES, random_state=1)).reset_index(drop=True)
    else:
        # No sub-sampling: keep every row that falls in one of the grouped ranges.
        sufficient_samples = n_examples.index
        df_model_sufficient = df_model[df_model['count_range'].isin(sufficient_samples)]
        df_model_sampled = df_model_sufficient.reset_index(drop=True)
    
    # NOTE: prints the whole frame for every model, which can produce very long output.
    print(df_model_sampled)
    
    # Mean and standard deviation per count range for the (sampled) rows.
    # NOTE(review): accuracy_count_model was computed as a per-(count_range, model) mean,
    # so its spread within one count range is presumably ~0 and the shaded band degenerate — confirm.
    grouped_sampled = df_model_sampled.groupby('count_range')['accuracy_count_model']
    mean_sampled = grouped_sampled.mean()
    std_sampled = grouped_sampled.std()

    # Line through the per-range means
    plt.plot(mean_sampled.index, mean_sampled.values, label=model, color=model_color_mapping[model])
    
    # Individual rows as scatter points
    plt.scatter(df_model_sampled['count_range'], df_model_sampled['accuracy_count_model'], color=model_color_mapping[model], s=10)
    
    # Shaded band of +/- one standard deviation around the mean
    plt.fill_between(mean_sampled.index, mean_sampled.values - std_sampled.values, mean_sampled.values + std_sampled.values, color=model_color_mapping[model], alpha=0.2)

plt.ylabel('Accuracy')
plt.xlabel('Count')
# Legend is anchored outside the axes, to the right.
plt.legend(loc='upper right', bbox_to_anchor=(1.4, 1))
plt.show()

In [None]:
# First row of the per-(example_str, model) frame.
df_all_models_example.head(1)

In [None]:
# Remember the example_str of the first row; a later cell looks up all rows with this value.
example_str = df_all_models_example.iloc[0]['example_str']

In [None]:
# Mean of 'value' over the per-(example_str, model) rows.
df_all_models_example['value'].mean()

In [None]:
# Display the per-(example_str, model) frame.
df_all_models_example

In [None]:
# All rows (one per model) for the example_str picked above.
df_all_models_example[df_all_models_example['example_str'] == example_str]

In [None]:
from src.visualize import plot_accuracy_vs_count_with_fit

# Restrict to rows with count below 500 before plotting.
df_all_models_example_range = df_all_models_example[df_all_models_example['count'] < 500]

# Plot 'probs_softmax_gold' against 'count' with a fit; degree=1 presumably selects a linear fit — see src.visualize.
plot_accuracy_vs_count_with_fit(df_all_models_example_range, models, 
                                models_ommit, model_color_mapping,
                                x='count', y='probs_softmax_gold',
                                degree=1)

In [None]:
# Rows whose 'value' is exactly 0.
df_all_models_example[df_all_models_example['value'] == 0]

In [None]:
# Rows whose 'count' exceeds 40.
df_all_models_example[df_all_models_example['count'] > 40]

In [None]:
# Distinct models present in the per-example frame.
df_all_models_example['model'].unique()

In [None]:
# Within each probability range, keep only rows whose 'value' lies between the
# lower and upper percentile of that range, then stitch the ranges back together.
ranges = df_all_models[f'{variable}_gold_range'].unique()

# Collect the per-range frames and concatenate once, rather than growing a frame
# inside the loop starting from an empty DataFrame.
filtered_frames = []
for r in ranges:
    df_r = df_all_models[df_all_models[f'{variable}_gold_range'] == r]

    # if normalize:
    #     df_r[f'{variable}_gold_range'] /= df_r[f'{variable}_gold_range'].max()

    lower_value = df_r['value'].quantile(lower_percentile)
    upper_value = df_r['value'].quantile(upper_percentile)
    filtered_df_r = df_r[(df_r['value'] >= lower_value) & (df_r['value'] <= upper_value)]
    filtered_frames.append(filtered_df_r)

    # Print (lower_value, upper_value) and max and min for each range
    # print(f"{r}: {df_r['value'].min()} - {df_r['value'].max()} -> {lower_value} - {upper_value}")

filtered_df_all_models = pd.concat(filtered_frames) if filtered_frames else pd.DataFrame()
# NOTE: this overwrites df_all_models, so every later cell sees the trimmed frame and
# re-running this cell trims the data again.
df_all_models = filtered_df_all_models
# Calculate the mean value for each probability group across all models
mean_values_all = df_all_models.groupby(f'{variable}_gold_range')['value'].mean()

# Create a line plot of mean_value vs. probs_softmax_gold for all models
plt.plot(mean_values_all.index.unique(), 
         mean_values_all.values,
         label='All Models',
         color='black')  # choose a color that stands out

plt.scatter(mean_values_all.index.unique(),
            mean_values_all.values,
            color='red', marker='x', s=100)

plt.xlabel(f'{variable}_gold')
plt.ylabel('Mean # ngrams')
plt.legend()
plt.title(f'Mean  vs. {variable}_gold, (all models)')
plt.show()

In [None]:
# Per-model mean 'value' in each gold-probability bin, drawn as one line per model.
mean_values_all = {}

models_ommit = []

for model in df_all_models['model'].unique():
    if model in models_ommit:
        continue

    # Independent copy so the column writes below never touch a slice of df_all_models.
    df_model = df_all_models[df_all_models['model'] == model].copy()
    if normalize:
        # Min-max normalise f'{variable}_gold' (df_model holds a single model)
        df_model[f'{variable}_gold_normalized'] = df_model.groupby('model')[f'{variable}_gold'].transform(normalize_data)
        probs_gold = df_model[f'{variable}_gold_normalized']
    else:
        probs_gold = df_model[f'{variable}_gold']

    # Plain column assignment replaces the existing float-midpoint column with the
    # interval categorical; `.loc[:, col] = ...` would try to write the intervals
    # into the float column in place.
    df_model[f'{variable}_gold_range'] = pd.cut(probs_gold, bins=probs_gold_range)

    # keep percentile range of value
    # lower_value = df_model['value'].quantile(lower_percentile)
    # upper_value = df_model['value'].quantile(upper_percentile)
    # df_model = df_model[(df_model['value'] >= lower_value) & (df_model['value'] <= upper_value)]

    # Mean value per probability bin. observed=False keeps empty bins (as NaN) so the
    # result lines up one-to-one with `index.categories` used for the x positions.
    mean_values = df_model.groupby(f'{variable}_gold_range', observed=False)['value'].mean()
    mean_values_all[model] = mean_values

    # Line plot of mean value vs. bin midpoint
    plt.plot(mean_values.index.categories.mid, 
             mean_values.values,
             label=model,
             color=model_color_mapping[model])
    # scatter plot
    plt.scatter(mean_values.index.categories.mid,
                mean_values.values,
                color=model_color_mapping[model], marker='x', s=50, alpha=0.5)

# Axis labels and legend are the same for every model, so set them once.
plt.xlabel(f'{variable}_gold')
plt.ylabel('Mean # ngrams')
plt.legend()

plt.title(f'Mean # ngrams vs. {variable}_gold_range, (per model)')

plt.savefig(os.path.join(FIG_DIR, f"mean_ngrams_vs_{variable}_gold.png"))

# One column per model, one row per probability bin.
df_mean_values_all = pd.DataFrame(mean_values_all)

In [None]:
## Debugging code
# chosen_model = 'pythia-70m'
# chosen_cateory = "(0.98, 0.99]"
# chosen_interval = pd.Interval(0.98, 0.99, closed='right')
# df_chosen_model = df_all_models[df_all_models['model'] == chosen_model]
# df_chosen_model = df_chosen_model[df_chosen_model[f'{variable}_gold_range'].apply(lambda x: x == chosen_interval)]
# df_chosen_model

# phrase = ['reliability validity test']
# df_all_models[df_all_models['index'].str.contains('reliability validity test')]

In [None]:
# Display the per-model table of mean values per probability bin.
df_mean_values_all

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Models drawn in this figure
included_models = ['pythia-12b']

# Get unique tasks
unique_tasks = df_all_models['task'].unique()

# One colour per task. pyplot.get_cmap(name, lut) is the supported spelling;
# plt.cm.get_cmap is deprecated and no longer available in newer Matplotlib releases.
task_colormap = plt.get_cmap('viridis', len(unique_tasks))
task_color_mapping = {task: task_colormap(i / len(unique_tasks)) for i, task in enumerate(unique_tasks)}

# df_all_models[f'{variable}_gold_range'] = pd.cut(df_all_models[f'{variable}_gold'], bins=4)  # This should be done before the loop

# Iterate over the tasks and models to plot
for n, task in enumerate(unique_tasks):

    for model in included_models:
        # Filter the DataFrame for the current task and model
        df_model = df_all_models[df_all_models['model'] == model]
        df_task_model = df_model[df_model['task'] == task].copy()  # Make a copy to avoid SettingWithCopyWarning

        # Skip if there's no data for this task in the model
        if df_task_model.empty:
            continue

        # Min-max normalise the 'value' column within the current task
        df_task_model['normalized_value'] = normalize_data(df_task_model['value'])

        # Mean normalised value for each probability group of the current task and model
        mean_values = df_task_model.groupby(f'{variable}_gold_range')['normalized_value'].mean()

        # x positions: the group keys themselves (the range column already stores
        # numeric bin midpoints at this point)
        mid_points = list(mean_values.index)

        # Line plot of mean value vs. probability range for the current task and model
        plt.plot(mid_points,
                 mean_values.values,
                 label=task,
                 color=task_color_mapping[task])

        # Scatter plot
        plt.scatter(mid_points,
                    mean_values.values,
                    color=task_color_mapping[task], marker='x', s=50)


# Set labels, legend, and title
plt.xlabel(f'{variable}_gold')
plt.ylabel('Normalized Mean # ngrams')
# plt.legend(title='Task', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.title(f'Normalized Mean # ngrams vs. {variable}_gold_range, per task (selected models)')
plt.tight_layout()  # Adjust layout to make room for the legend
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Models drawn in the per-task grid
# included_models = list(dfs_all_models.keys())
# included_models = large_models
# included_models = small_models
included_models = ['pythia-12b']

# Tasks with rows where ds_score >= min_acc, ordered by mean ds_score (highest first)
min_acc = 0.01
unique_tasks = df_all_models[df_all_models['ds_score'] >= min_acc].groupby('task')['ds_score'].mean().sort_values(ascending=False).index

# Determine the layout of the subplots
n_cols = 6  # Adjust the number of columns as needed
# At least one row, so plt.subplots does not fail when no task qualifies
n_rows = max(1, int(np.ceil(len(unique_tasks) / n_cols)))

# Create a figure and a grid of subplots
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, n_rows * 3))  # Adjust the figure size as needed
axes = axes.flatten()  # Flatten the 2D array of axes for easy iteration

# One colour per task. pyplot.get_cmap(name, lut) is the supported spelling;
# plt.cm.get_cmap is deprecated and no longer available in newer Matplotlib releases.
task_colormap = plt.get_cmap('viridis', len(unique_tasks))
task_color_mapping = {task: task_colormap(i / len(unique_tasks)) for i, task in enumerate(unique_tasks)}
# Function to normalize data
def normalize_data(data):
    """Min-max scale ``data`` (array or Series) to the [0, 1] range.

    This definition replaces the helper of the same name defined earlier in the
    notebook, so it keeps that helper's handling of constant input: when all
    values are equal, every element becomes 0.5 instead of dividing by zero.
    """
    lowest = np.min(data)
    spread = np.max(data) - lowest
    if spread == 0:
        # data == lowest everywhere, so this yields 0.5 while keeping the
        # container type (and the index, for a Series).
        return (data - lowest) + 0.5
    return (data - lowest) / spread

# 1 if the mean in the last populated bin exceeds the first, else 0 (one entry per plotted task/model)
points_delta = []

# Iterate over the tasks and models to plot
for n, task in enumerate(unique_tasks):
    ax = axes[n]  # Get the corresponding subplot axis

    for i, model in enumerate(included_models):
        # Filter the DataFrame for the current task and model
        df_model = df_all_models[df_all_models['model'] == model]
        df_task_model = df_model[df_model['task'] == task].copy()  # Make a copy to avoid SettingWithCopyWarning

        # Skip if there's no data for this task in the model
        if df_task_model.empty:
            continue

        # Bin the gold probability into n_groups equal-width ranges between 0 and this task's maximum
        probs_gold_task = df_task_model[f'{variable}_gold']
        probs_gold_task_range = np.linspace(0, np.max(probs_gold_task), n_groups+1)
        df_task_model[f'{variable}_gold_range'] = pd.cut(probs_gold_task, bins=probs_gold_task_range)

        # Mean 'value' per bin; empty bins yield NaN and are dropped
        mean_values = df_task_model.groupby(f'{variable}_gold_range', observed=False)['value'].mean().dropna()

        # Nothing to draw (and no first/last bin to compare) if no bin is populated
        if mean_values.empty:
            continue

        # Midpoints of the remaining intervals are the x positions
        mid_points = [interval.mid for interval in mean_values.index]

        # Line plot of mean value vs. probability range for the current task and model
        ax.plot(mid_points,
                mean_values.values,
                label=task,
                color=task_color_mapping[task])

        # Scatter plot
        ax.scatter(mid_points,
                   mean_values.values,
                   color=task_color_mapping[task], marker='x', s=50)

        ds_score = df_task_model['ds_score'].mean()
        ax.text(0.5, 0.9, f'DS Score: {ds_score:.2f}',
            transform=ax.transAxes, ha='center', va='center',
            fontsize=10, bbox=dict(facecolor='white', alpha=0.5))

        # track if last value is greater than first value
        if mean_values.values[-1] > mean_values.values[0]:
            points_delta.append(1)
        else:
            points_delta.append(0)

        if i == 0:
            # Set labels and title for each subplot
            # NOTE(review): the label says "Normalized" but the plotted means are raw 'value' means here — confirm.
            ax.set_xlabel(f'{variable}_gold')
            ax.set_ylabel('Normalized Mean # ngrams')
            ax.set_title(f'Task: {task}')
            # ax.legend()

# Hide the grid cells that have no task assigned
for unused_ax in axes[len(unique_tasks):]:
    unused_ax.set_visible(False)

# Figure title. Written with explicit newlines: the previous triple-quoted string
# contained an invalid "\ " escape and carried the source indentation into the title.
fig.suptitle(f"Normalized Mean # ngrams vs. {variable}_gold_range,\n"
             f"per task (selected models): increase\n"
             f"on {sum(points_delta)}/{len(points_delta)} tasks",
             y=1.02, fontsize=23)
# Adjust the layout
plt.tight_layout()
plt.savefig(os.path.join(FIG_DIR, f"mean_ngrams_vs_{variable}_gold_per_task.png"))

In [None]:
# Size of the smallest probability group in df_all_models; used as the per-group sample size below.
group_sizes = df_all_models.groupby(f'{variable}_gold_range').size()
min_samples = group_sizes.min()

# Function to sample n points from each group
def sample_n_from_group(group, n=min_samples):
    """Return ``n`` rows sampled from ``group`` with a fixed seed.

    Groups that do not have more than ``n`` rows are returned unchanged.
    The default for ``n`` is the value of ``min_samples`` at definition time.
    """
    if len(group) > n:
        return group.sample(n=n, random_state=1)
    return group

# Draw up to min_samples rows from every probability group: once over all models,
# once restricted to large_models and once restricted to small_models.
# NOTE(review): large_models / small_models must be defined before this cell runs; the
# config cell only shows commented-out definitions for them.
df_sampled_all_models = df_all_models.groupby(f'{variable}_gold_range').apply(sample_n_from_group).reset_index(drop=True)
df_sampled_large_models = df_all_models[df_all_models['model'].isin(large_models)].groupby(f'{variable}_gold_range').apply(sample_n_from_group).reset_index(drop=True)
df_sampled_small_models = df_all_models[df_all_models['model'].isin(small_models)].groupby(f'{variable}_gold_range').apply(sample_n_from_group).reset_index(drop=True)

In [None]:
# First row of the combined frame, to check the available columns.
df_all_models.head(1)

In [None]:
import matplotlib.pyplot as plt
import os

import matplotlib.pyplot as plt
import os

def plot_mean_values(df, model_group, color, label, fig_dir, variable_gold_range, file_suffix,
                     log_axis=False, flip_axes=False):
    """Plot the mean of 'value' per group of ``variable_gold_range`` with a +/- 1 std band.

    Draws on the current pyplot axes: a line, a shaded band and 'x' markers.

    Args:
        df: frame with a 'value' column and the grouping column.
        model_group: not used inside this function.
        color: colour for the line, band and markers.
        label: legend label of the line.
        fig_dir: when truthy, the current figure is saved into this directory.
        variable_gold_range: name of the grouping column.
        file_suffix: suffix of the saved file name.
        log_axis: when True, the x-axis is switched to log scale.
        flip_axes: when True the groups are on the x-axis and the means on the
            y-axis; when False the means are on the x-axis and the groups on
            the y-axis.
    """
    group_col = f'{variable_gold_range}'
    value_by_group = df.groupby(group_col)['value']
    mean_values = value_by_group.mean().astype(float)
    std_values = value_by_group.std().astype(float)

    # Interval-indexed groups are placed at their midpoints; anything else is used as is.
    group_index = mean_values.index
    if isinstance(group_index, pd.IntervalIndex):
        mid_points = group_index.mid.tolist()
    else:
        mid_points = group_index.tolist()

    # Edges of the one-standard-deviation band around the mean.
    band_low = (mean_values.values - std_values).astype(float)
    band_high = (mean_values.values + std_values).astype(float)

    if not flip_axes:
        # Means along x, groups along y.
        plt.plot(mean_values.values, mid_points, label=label, color=color)
        plt.fill_betweenx(mid_points, band_low, band_high, color=color, alpha=0.1)
        plt.scatter(mean_values.values, mid_points, color=color, marker='x', s=100)
        plt.ylabel(f'{variable_gold_range}')
        plt.xlabel('Mean # ngrams')
    else:
        # Groups along x, means along y.
        plt.plot(mid_points, mean_values.values, label=label, color=color)
        plt.fill_between(mid_points, band_low, band_high, color=color, alpha=0.1)
        plt.scatter(mid_points, mean_values.values, color=color, marker='x', s=100)
        plt.xlabel(f'{variable_gold_range}')
        plt.ylabel('Mean # ngrams')

    plt.title(f'Mean # ngrams vs. {variable_gold_range}')
    plt.legend(loc='upper left')

    if log_axis:
        plt.xscale('log')

    if fig_dir:
        plt.savefig(os.path.join(fig_dir, f"mean_ngrams_vs_{variable_gold_range}_{file_suffix}.png"))



# Plot settings passed to plot_mean_values below.
log_axis = False
flip_axes = True
# Group on the gold-probability range column.
plot_var = f'{variable}_gold_range'
# plot_mean_values(df_all_models, 'all_models', 'black', 'All Models', FIG_DIR, plot_var, 'all')
# One curve for the rows of large_models (red) and one for small_models (blue), drawn on the same axes.
# NOTE(review): large_models / small_models must be defined before this cell runs.
plot_mean_values(df_all_models[df_all_models['model'].isin(large_models)], 'large_models', 'red', 'Large Models', FIG_DIR, plot_var, 'large', 
                 log_axis=log_axis, flip_axes=flip_axes)
plot_mean_values(df_all_models[df_all_models['model'].isin(small_models)], 'small_models', 'blue', 'Small Models', FIG_DIR, plot_var, 'small',
                 log_axis=log_axis, flip_axes=flip_axes)

In [None]:
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

# Preferred display order of models
all_models = [
    'pythia-12b', 'pythia-6.9b', 'pythia-2.8b', 'pythia-1.4b',
    'pythia-410m', 'pythia-160m', 'pythia-70m', 'pythia-31m', 'pythia-14m'
]

# Only models that have a colour assigned can be drawn; indexing model_color_mapping
# with a model that was never loaded raises KeyError. Keep the preferred order for the
# known names and append any other loaded model afterwards.
plot_models = [model for model in all_models if model in model_color_mapping]
plot_models += [model for model in model_color_mapping if model not in plot_models]

# Create legend handles
legend_handles = [mpatches.Patch(color=model_color_mapping[model], label=model) for model in plot_models]

# Get unique tasks
unique_tasks = sorted(df_all_models['task'].unique())

# Determine the layout of the subplots
n_cols = 5  # Adjust the number of columns as needed
n_rows = int(np.ceil(len(unique_tasks) / n_cols))  # Adjust the number of rows as needed

fig, axes = plt.subplots(n_rows, n_cols, figsize=(n_rows * 9, n_rows * 4.5))  # Adjust the figure size as needed
axes = axes.flatten()  # Flatten the 2D array of axes for easy iteration

for n, task in enumerate(unique_tasks):
    ax = axes[n]  # Get the corresponding subplot axis
    # Mean 'model_ds_score' per model for this task, in display order
    task_avg_ds_score = df_all_models[df_all_models['task'] == task].groupby('model')['model_ds_score'].mean()
    task_avg_ds_score = task_avg_ds_score.reindex(plot_models)
    task_avg_ds_score.plot(kind='bar', ax=ax, color=[model_color_mapping[model] for model in plot_models])
    ax.set_title(task)
    ax.set_xlabel('Model')
    ax.set_ylabel('Average ds_score')

# Adjust the layout to make room for the titles
plt.tight_layout()

# Add the legend outside of the last subplot
plt.legend(handles=legend_handles, bbox_to_anchor=(1.05, 1), loc='upper left', title='Model')

plt.savefig(os.path.join(FIG_DIR, f"average_ds_score_per_task.png"))

# Show the plot
plt.show()

In [None]:
# plot_tasks = ['arithmetic_1dc', 'arithmetic_2da', 'arithmetic_2dm', 
#               'arithmetic_2ds', 'arithmetic_3da', 'arithmetic_3ds',]

# df_all_models_tasks = df_all_models[df_all_models['task'].isin(plot_tasks)]

# Same large-vs-small comparison as before, grouped on 'accuracy_range' instead of the probability range.
# NOTE(review): large_models / small_models must be defined before this cell runs.
flip_axes = True
log_axis = False
plot_var = 'accuracy_range'
plot_mean_values(df_all_models[df_all_models['model'].isin(large_models)], 
                 'large_models', 'red', 'Large Models', FIG_DIR, plot_var, 'large',
                 log_axis=log_axis, flip_axes=flip_axes)
plot_mean_values(df_all_models[df_all_models['model'].isin(small_models)], 'small_models', 'blue', 
                 'Small Models', FIG_DIR, plot_var, 'small',
                 flip_axes=flip_axes, log_axis=log_axis)

In [None]:
import matplotlib.pyplot as plt

def plot_mean_values_for_tasks(df, tasks, model_groups, colors, labels, fig_dir, variable_gold_range, file_suffix):
    """Draw one subplot per task with one mean-value curve per model group.

    Each curve is drawn by ``plot_mean_values`` with ``flip_axes=True``, i.e. the
    groups of ``variable_gold_range`` on the x-axis and the mean of 'value' on
    the y-axis.

    Args:
        df: frame with 'task', 'model', 'value' and the grouping column.
        tasks: task names, one subplot each (3 subplots per row).
        model_groups: list of model-name lists; one curve per entry.
        colors: colour per model group.
        labels: legend label per model group.
        fig_dir: directory the combined figure is saved to. It is also passed on to
            ``plot_mean_values``, which saves the figure after every curve.
        variable_gold_range: name of the grouping column.
        file_suffix: suffix used for the per-curve file names.
    """
    # Determine the layout of the subplots
    n_cols = 3  # Adjust the number of columns as needed
    n_rows = int(np.ceil(len(tasks) / n_cols))  # Adjust the number of rows as needed

    # Create a figure and a grid of subplots
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, n_rows * 5))  # Adjust the figure size as needed
    axes = axes.flatten()  # Flatten the 2D array of axes for easy iteration

    for i, task in enumerate(tasks):
        ax = axes[i]  # Get the corresponding subplot axis
        plt.sca(ax)  # plot_mean_values draws on the current pyplot axes

        for model_group, color, label in zip(model_groups, colors, labels):
            # Filter the DataFrame for the current task and model group
            df_task_model_group = df[(df['task'] == task) & (df['model'].isin(model_group))]
            # Plot the mean values for the current task and model group
            plot_mean_values(df_task_model_group, model_group, color, label, fig_dir, variable_gold_range, f"{task}_{file_suffix}", flip_axes=True)

        # Axis labels must match what flip_axes=True draws: groups on x, means on y.
        # (They were previously assigned the other way round.)
        ax.set_xlabel(f'{variable_gold_range}')
        ax.set_ylabel('Mean # ngrams')
        ax.set_title(f'Task: {task}')
        ax.legend()

    # Adjust the layout and save the figure
    plt.tight_layout()
    plt.savefig(os.path.join(fig_dir, f"mean_ngrams_vs_{variable_gold_range}_all_tasks.png"))

# Model groups, colours and legend labels, aligned by position.
# NOTE(review): large_models / small_models must be defined before this cell runs.
model_groups = [
    all_models,
    large_models,
    small_models
]
colors = ['black', 'red', 'blue']
labels = ['All Models', 'Large Models', 'Small Models']

# Tasks present in the combined frame.
tasks = df_all_models['task'].unique()
# Column to group on; the actual plotting call below is currently disabled.
plot_variable = 'accuracy_count_model' 
# plot_mean_values_for_tasks(df_all_models, tasks, model_groups, colors, labels, FIG_DIR, plot_variable, 'all_tasks')

In [None]:
# Stability check: repeatedly draw equally sized samples from every probability group
# and compare the resulting mean-'value' curves.

n_subgroups = 20  # Number of repeated samplings (one curve each)
group_freq = df_all_models.groupby(f'{variable}_gold_range').size()
group_freq_min = group_freq.min()  # Minimum frequency across groups
n_samples = group_freq_min // n_subgroups  # Number of samples drawn per group

# Reference draw: n_samples rows from each group with a fixed seed. The parts are
# concatenated in one call rather than grown from an empty DataFrame.
equal_samples_df = pd.concat(
    [group_data.sample(n=n_samples, random_state=1)
     for group_name, group_data in df_all_models.groupby(f'{variable}_gold_range')]
)

# Mean value per probability group in the reference draw
equal_samples_mean_values = equal_samples_df.groupby(f'{variable}_gold_range')['value'].mean()
# x positions for the reference line: the group keys (the range column stores numeric bin midpoints)
equal_samples_midpoints = equal_samples_mean_values.index


# One curve per repetition, all drawn on the same axes
for i in range(n_subgroups):
    # Sample from each group, seeded with the repetition number
    sampled_parts = []
    for group_name, group_data in df_all_models.groupby(f'{variable}_gold_range'):
        sampled_data = group_data.sample(n=min(n_samples, len(group_data)), random_state=i)
        sampled_parts.append(sampled_data)
    sampled_df = pd.concat(sampled_parts)

    # Mean value per probability group in this draw
    mean_values_sampled = sampled_df.groupby(f'{variable}_gold_range')['value'].mean()

    # x positions: the group keys
    midpoints = mean_values_sampled.index

    # Plot the mean values for the sampled data
    plt.plot(midpoints, 
             mean_values_sampled.values,
             label=f'Sampled Set {i+1}',
             marker='o')  # Use a marker for each point

    # Optional: make y axis log
    # plt.yscale('log')

    # Set plot title and labels (the title is overwritten on every repetition)
    plt.title(f'Mean # ngrams vs. {variable}_gold_range (Sampled Set {i+1})')
    plt.xlabel(f'{variable}_gold_range')
    plt.ylabel('Mean # ngrams')

# Plot the reference line on top
plt.plot(equal_samples_midpoints, 
            equal_samples_mean_values.values,
            label='Equal Samples Average',
            color='red', linestyle='--',
            linewidth=5)

In [None]:
# Calculate the mean value for each probability group
mean_values = df_all_models.groupby(f'{variable}_gold_range')['value'].mean()

ranges = mean_values.index

# only sample according to the group with the least samples
n_samples = min([len(df_all_models[df_all_models[f'{variable}_gold_range'] == range_]) for range_ in ranges])

# Overlaid histogram of the gold probability for each range, with equal sample sizes
for i, range_ in enumerate(ranges):
    df_range = df_all_models[df_all_models[f'{variable}_gold_range'] == range_]
    # Sample rows directly (seeded): picking index labels and looking them up with .loc
    # returns extra rows when index labels repeat.
    df_range = df_range.sample(n=n_samples, random_state=1)
    # label is the range value itself (previously the builtin `range` was passed)
    plt.hist(df_range[f'{variable}_gold'], bins=50, alpha=0.5, label=range_)
    plt.xlabel(f'{variable}_gold')
    plt.ylabel('Frequency')
    
# plot 