In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
from sklearn.metrics import confusion_matrix, accuracy_score

Read the model recovery data

In [2]:
# read all the model recovery results and compile them into a single df 
DATA_DIR = "../../data/model_recovery"
df = pd.DataFrame()
n_missing = 0
for i in range(111):
    try:
        df_i = pd.read_csv(f"{DATA_DIR}/recovery_results_{i}.csv")
    except FileNotFoundError:
        print(f"missing: {i}")
        n_missing += 1
        continue
    df = df.append(df_i)
df.index = range(len(df))

missing: 0
missing: 11
missing: 14
missing: 39
missing: 40
missing: 47
missing: 48
missing: 50
missing: 55
missing: 63
missing: 64
missing: 79
missing: 81
missing: 92
missing: 99
missing: 104


In [3]:
len(df[df["generating_model"] == "sparse_max_continuous"])

14

In [4]:
len(df[df["generating_model"] == "sparse_max_discrete"])

32

In [5]:
model_types = ("sparse_max_continuous", "sparse_max_discrete", "sparse_lqr", "lqr", "nm1", "nm2", "hc")
n_params = {"nm2": 2, "nm1": 4, "lqr": 2, "sparse_lqr": 3, "hc": 3, "sparse_max_continuous": 4, "sparse_max_discrete": 4}

# compile a dataframe with the AIC of each model on the simulated data
df_aic = pd.DataFrame()
df_aic["situation"] = df["situation"]
df_aic["generating_model"] = df["generating_model"]
for model_type in model_types:
    df_aic[model_type] = df[model_type].apply(lambda x: 2 * n_params[model_type] - 2 * x)

In [6]:
df_aic.loc[12]

situation                [241.0, -127.0, -192.0, 113.0, -224.0]
generating_model                                  hill_climbing
sparse_max_continuous                                 34.322136
sparse_max_discrete                                   30.647966
sparse_lqr                                           173.193162
lqr                                                  228.191936
nm1                                                   24.428534
nm2                                                   22.196334
hc                                                    23.317952
Name: 12, dtype: object

Print which models' data is classified as coming from which other models

In [7]:
all_true = []
all_pred = []
model_names = ("sparse_max_continuous", "sparse_max_discrete", "sparse_lqr", "lqr", "null_model_1", "null_model_2", "hill_climbing") 
for model_name in model_names:
    # print out the name of the generating model
    print(f"Generating model: {model_name.upper()}")
    
    # print the counts of each predicted model on data generated from this model
    df_model = df_aic[df_aic["generating_model"] == model_name].reset_index().drop("index", axis=1)
    print(f"n generating: {len(df_model)}")
    predicted_models = defaultdict(int)
    for index, row in df_model.iterrows():
        model_performances = list(df_model.iloc[index, 2:])
        best_model_idx = np.argmin(model_performances)
        predicted_models[model_names[best_model_idx]] += 1
        all_true.append(model_name)
        all_pred.append(model_names[best_model_idx])
    print("predicted models:")
    for model in predicted_models:
        print(f"{model}: {predicted_models[model]}")

Generating model: SPARSE_MAX_CONTINUOUS
n generating: 14
predicted models:
null_model_2: 12
hill_climbing: 2
Generating model: SPARSE_MAX_DISCRETE
n generating: 32
predicted models:
null_model_2: 31
sparse_max_continuous: 1
Generating model: SPARSE_LQR
n generating: 12
predicted models:
sparse_lqr: 12
Generating model: LQR
n generating: 0
predicted models:
Generating model: NULL_MODEL_1
n generating: 0
predicted models:
Generating model: NULL_MODEL_2
n generating: 0
predicted models:
Generating model: HILL_CLIMBING
n generating: 37
predicted models:
null_model_2: 31
hill_climbing: 4
sparse_max_continuous: 2


In [8]:
accuracy_score(all_true, all_pred)

0.16842105263157894

In [9]:
confusion_matrix(all_true, all_pred)

array([[ 4, 31,  0,  2,  0],
       [ 0,  0,  0,  0,  0],
       [ 0,  0, 12,  0,  0],
       [ 2, 12,  0,  0,  0],
       [ 0, 31,  0,  1,  0]], dtype=int64)