# Install

In [None]:
!pip install einops datasets jaxtyping better_abc fancy_einsum wandb netcal

# Setup

In [None]:
import sys
from google.colab import drive
# Mount Google Drive at /content/drive (Colab only).
drive.mount('/content/drive', force_remount=True)

# Project root on Drive. It is appended to sys.path — presumably so the
# project-local packages imported in the next cell resolve (confirm).
path_to_root = '/content/drive/My Drive/Colab Notebooks/BatuEl_Dissertation'
data_path = f'{path_to_root}/data'

sys.path.append(path_to_root)
print("Drive mounted.")

In [None]:
import torch
import tqdm
from reprshift.learning.algorithms import ERM
from reprshift.models.hparams import hparams_f
from reprshift.dataset.datasets import MultiNLI, CivilComments
from reprshift.dataset.dataloaders import InfiniteDataLoader, FastDataLoader

from reprshift.models.model_param_maps import ERM_to_HookedEncoder, load_focal, load_groupdro, load_jtt, load_lff
from reprshift.models.HookedEncoderConfig import bert_config

from transformer_lens2 import HookedEncoder, HookedTransformerConfig
import numpy as np

# Dataset

In [None]:
# Experiment selection: the seed and dataset that the rest of the notebook
# operates on. Each dataset fixes the number of classes / attributes and the
# Drive folders holding its models and saved representations.
SEED = 0
DATASET = 'CivilComments'  # 'CivilComments' , 'MultiNLI'

if DATASET == 'MultiNLI':
    NUM_CLASSES = 3
    NUM_ATTRIBUTES = 2
    # train_dataset = MultiNLI(data_path, 'tr', hparams)
    # val_dataset = MultiNLI(data_path, 'va', hparams=hparams_f('ERM'))
    # te_dataset = MultiNLI(data_path, 'te', hparams=hparams_f('ERM'))
    models_path = path_to_root + '/models/models_mnli'
    representations_path = path_to_root + '/representations/representations_mnli'
elif DATASET == 'CivilComments':
    NUM_CLASSES = 2
    NUM_ATTRIBUTES = 8
    # train_dataset = CivilComments(data_path, 'tr', hparams, granularity="fine")
    # val_dataset = CivilComments(data_path, 'va', hparams=hparams_f('ERM'))
    # te_dataset = CivilComments(data_path, 'te', hparams=hparams_f('ERM'))
    models_path = path_to_root + '/models/models_civilcomments'
    representations_path = path_to_root + '/representations/representations_civilcomments'
else:
    # Fail fast: merely printing here would leave NUM_CLASSES, models_path and
    # representations_path undefined, and later cells would die with an
    # unrelated-looking NameError.
    raise ValueError(f'Dataset Not Implemented: {DATASET!r}')

print(DATASET)

# Probe Representations

In [None]:
# Algorithm keys used below to index REPRS.
algorithm_names = [
    'random', 'randominit', 'pretrained', 'erm',
    'groupdro', 'focal', 'jtt', 'lff',
]

# NOTE(review): torch.load unpickles the file — only load files you trust.
REPRS = torch.load(f'{representations_path}/seed{SEED}_reprs')
REPRS.keys()

In [None]:
# Rows per (class, attribute) group when 3600 rows are shared evenly between
# all NUM_CLASSES * NUM_ATTRIBUTES groups.
# NOTE(review): 3600 is hard-coded — confirm it matches the saved representations.
PER_GROUP_REPR = 3600 // (NUM_CLASSES * NUM_ATTRIBUTES)
PER_GROUP_REPR

In [None]:
# Collapse REPRS[algorithm][layer][y][a] into a single tensor per
# (algorithm, layer) by concatenating every group along dim 0, iterating the
# y keys in the outer position and the a keys in the inner position.
CAT_REPRS = {}

# `algorithm_key` / `layer_key` are deliberately plain for-loop variables (not
# comprehension-scoped): other cells may read their leaked final values.
for algorithm_key in algorithm_names:
    per_layer = {}
    for layer_key in tqdm.tqdm(REPRS[algorithm_key].keys()):
        groups_by_y = REPRS[algorithm_key][layer_key]
        per_layer[layer_key] = torch.cat([
            group
            for groups_by_a in groups_by_y.values()
            for group in groups_by_a.values()
        ])
    CAT_REPRS[algorithm_key] = per_layer

In [None]:
# Build the class (Y) and attribute (A) label for every concatenated row.
#
# The labels are derived from one explicit reference entry of REPRS (last
# algorithm, last layer) instead of from loop variables leaked by another cell,
# so this cell gives the same result regardless of execution order.
ref_algorithm = algorithm_names[-1]
ref_layer = list(REPRS[ref_algorithm].keys())[-1]
ref_groups = REPRS[ref_algorithm][ref_layer]

CAT_LABEL_A = []
CAT_LABEL_Y = []

# Same y-outer / a-inner order in which the groups are concatenated, so label i
# describes concatenated row i.
for y_idx, y_key in enumerate(ref_groups.keys()):
    for a_idx, a_key in enumerate(ref_groups[y_key].keys()):
        # Use the group's real row count rather than an assumed constant, so the
        # labels stay aligned even if groups are not all the same size.
        group_size = ref_groups[y_key][a_key].shape[0]
        CAT_LABEL_A += [a_idx] * group_size
        CAT_LABEL_Y += [y_idx] * group_size

CAT_LABEL_A = torch.tensor(CAT_LABEL_A)
CAT_LABEL_Y = torch.tensor(CAT_LABEL_Y)

In [None]:
# Display the shapes of the two label tensors; they are extended in lockstep
# when built, so the two shapes should be identical.
CAT_LABEL_A.shape, CAT_LABEL_Y.shape

In [None]:
# algorithm_name = 'random'
# layer_no = 'layer10'
# X = CAT_REPRS[algorithm_name][layer_no]
# Y = CAT_LABEL_A

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score


# Probes
class OneLayerMLP(nn.Module):
    """Linear probe: one affine layer mapping inputs to ``output_dim`` scores."""

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.layer1 = nn.Linear(in_features=input_dim, out_features=output_dim)

    def forward(self, x):
        """Return the raw (unnormalised) output of the linear layer for ``x``."""
        logits = self.layer1(x)
        return logits

class TwoLayerMLP(nn.Module):
    """Non-linear probe: Linear -> ReLU -> Linear."""

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # Sub-modules are created in this order so parameter initialisation
        # consumes the random stream in the same sequence as before.
        self.layer1 = nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.relu = nn.ReLU()
        self.layer2 = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        """Return the raw (unnormalised) output scores for ``x``."""
        return self.layer2(self.relu(self.layer1(x)))

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# compact function for probing the representations
def ProbeReprs(X, Y):
    """Train a linear and a two-layer probe on ``X`` to predict ``Y``.

    The data is split 70% / 15% / 15% into train / validation / test with a
    fixed ``random_state``. Each probe is trained full-batch with Adam and
    cross-entropy; the weights with the highest validation accuracy are
    restored before the probe is scored on the test split.

    Args:
        X: 2-D tensor of inputs; axis 1 is used as the probe's input dimension.
        Y: tensor of integer class labels, one per row of ``X``.

    Returns:
        dict with keys ``'one layer'`` and ``'two layer'`` holding the test
        accuracy of the respective probe.
    """
    X_train, X_temp, Y_train, Y_temp = train_test_split(X.detach().clone(), Y.detach().clone(), test_size=0.3, random_state=0, shuffle=True)
    X_val, X_test, Y_val, Y_test = train_test_split(X_temp, Y_temp, test_size=0.5, random_state=0, shuffle=True)

    # Feature standardisation (StandardScaler) is not applied; it was disabled
    # in the original version of this function as well.

    # Convert every split produced *by this call*. The previous version fed the
    # probes `X_train_tensor` / `X_val_tensor`, names never defined in this
    # function, so it either raised NameError or silently trained on stale
    # kernel globals unrelated to the `X` passed in.
    X_train_tensor = torch.as_tensor(X_train, dtype=torch.float32)
    X_val_tensor = torch.as_tensor(X_val, dtype=torch.float32)
    X_test_tensor = torch.as_tensor(X_test, dtype=torch.float32)
    # Class indices must be integer typed, for the loss and for scoring alike.
    Y_train_tensor = torch.as_tensor(Y_train, dtype=torch.long)
    Y_val_tensor = torch.as_tensor(Y_val, dtype=torch.long)
    Y_test_tensor = torch.as_tensor(Y_test, dtype=torch.long)

    input_dim = X_train_tensor.shape[1]
    output_dim = len(np.unique(np.array(Y)))
    hidden_dim = 128
    learning_rate = 0.001
    num_epochs = 100

    # Both probes are constructed before any training, in this order.
    one_layer_model = OneLayerMLP(input_dim, output_dim)
    two_layer_model = TwoLayerMLP(input_dim, hidden_dim, output_dim)
    optimizer_one_layer = optim.Adam(one_layer_model.parameters(), lr=learning_rate)
    optimizer_two_layer = optim.Adam(two_layer_model.parameters(), lr=learning_rate)
    loss_function = nn.CrossEntropyLoss()

    results = {}
    for label, model, optimizer in (
        ('one layer', one_layer_model, optimizer_one_layer),
        ('two layer', two_layer_model, optimizer_two_layer),
    ):
        best_state, _ = _fit_probe(
            model, optimizer, loss_function,
            X_train_tensor, Y_train_tensor, X_val_tensor, Y_val_tensor,
            num_epochs,
        )
        model.load_state_dict(best_state)
        results[label] = _probe_accuracy(model, X_test_tensor, Y_test_tensor)
    return results


def _fit_probe(model, optimizer, loss_function, X_train, Y_train, X_val, Y_val, num_epochs):
    """Train ``model`` full-batch; return ``(best_state_dict, best_val_acc)``."""
    # Start below any attainable accuracy so the first epoch always records a
    # checkpoint, even when validation accuracy is exactly 0.
    best_val_acc = -1.0
    best_state = None

    for _ in range(num_epochs):
        # One full-batch optimisation step.
        model.train()
        loss = loss_function(model(X_train), Y_train)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        val_acc = _probe_accuracy(model, X_val, Y_val)
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            # state_dict() hands back references to the live parameters, which
            # the optimizer keeps updating in place; clone them so the
            # checkpoint really is the best epoch rather than the last one.
            best_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }

    return best_state, best_val_acc


def _probe_accuracy(model, X, Y):
    """Return the accuracy of ``model``'s argmax predictions on ``(X, Y)``."""
    model.eval()
    with torch.no_grad():
        predicted = model(X).argmax(dim=1)
    return accuracy_score(Y.cpu().numpy(), predicted.cpu().numpy())


In [None]:
# Probe every (algorithm, layer) representation for the class label Y and the
# attribute label A, with both probe types, then save all four accuracy tables
# (rows: layers, columns: algorithms) as a single CSV.
layer_names = list(CAT_REPRS['erm'].keys())
algorithm_columns = list(CAT_REPRS.keys())

Y_PROBE_DICT1 = pd.DataFrame(index=layer_names, columns=algorithm_columns)
Y_PROBE_DICT2 = pd.DataFrame(index=layer_names, columns=algorithm_columns)
A_PROBE_DICT1 = pd.DataFrame(index=layer_names, columns=algorithm_columns)
A_PROBE_DICT2 = pd.DataFrame(index=layer_names, columns=algorithm_columns)

for algorithm_name in algorithm_columns:
    print(algorithm_name)
    for layer_no in tqdm.tqdm(layer_names):
        layer_reprs = CAT_REPRS[algorithm_name][layer_no]

        # Write with a single `.loc[row, column]`. The chained form
        # `frame[column].loc[row] = value` assigns through an intermediate
        # Series and does not update the frame under pandas Copy-on-Write.
        PY = ProbeReprs(layer_reprs, CAT_LABEL_Y)
        Y_PROBE_DICT1.loc[layer_no, algorithm_name] = PY['one layer']
        Y_PROBE_DICT2.loc[layer_no, algorithm_name] = PY['two layer']

        PA = ProbeReprs(layer_reprs, CAT_LABEL_A)
        A_PROBE_DICT1.loc[layer_no, algorithm_name] = PA['one layer']
        A_PROBE_DICT2.loc[layer_no, algorithm_name] = PA['two layer']

# Outer index level: <target><probe depth>, e.g. 'A1L' = attribute, one-layer probe.
PROBES = pd.concat({'A1L': A_PROBE_DICT1,'A2L':A_PROBE_DICT2, 'Y1L':Y_PROBE_DICT1,'Y2L':Y_PROBE_DICT2,})
PROBES.to_csv(path_to_root + f'/results/Probe/{DATASET}_seed{SEED}')

In [None]:
import pandas as pd

# PROBE_PATH = path_to_root + f'/results/Probe/{DATASET}_seed{SEED}'
# df = pd.read_csv(PROBE_PATH, index_col=['Unnamed: 0','Unnamed: 1' ])

In [None]:
# Load the saved probe table before displaying it, so this cell also works on a
# fresh kernel (`df` was otherwise only defined by a later cell).
PROBE_PATH = path_to_root + f'/results/Probe/{DATASET}_seed{SEED}'
df = pd.read_csv(PROBE_PATH, index_col=['Unnamed: 0','Unnamed: 1' ])
df.round(3)*100

# Figures

In [None]:
### Table ###

# Columns: Algorithms
# Rows: Layers
# Cells: Probe Accuracy

# Logistic Regression Probe
# Minimum Description Length Probe

# Location of the probe results for the selected dataset / seed. The previous
# `PROBE_PATH =` had no right-hand side, which is a SyntaxError.
PROBE_PATH = path_to_root + f'/results/Probe/{DATASET}_seed{SEED}'
# The two unnamed leading CSV columns are read back as the index levels.
df = pd.read_csv(PROBE_PATH, index_col=['Unnamed: 0','Unnamed: 1' ])

In [None]:
import torch
import pandas as pd
import numpy as np

# Re-select the dataset for the figures and collect one probe table per seed,
# with every value multiplied by 100.
DATASET = 'CivilComments' # ['MultiNLI', 'CivilComments']
ProbeTables = []

# NOTE: the loop variable is the notebook-level SEED and each table is bound to
# `df`, so after this cell SEED == 2 and `df` is the seed-2 table.
for SEED in range(3):
    df = pd.read_csv(
        path_to_root + f'/results/Probe/{DATASET}_seed{SEED}',
        index_col=['Unnamed: 0','Unnamed: 1' ],
    )
    ProbeTables.append(df * 100)

In [None]:
### Test Table ###
# Element-wise mean and standard deviation (NumPy default, ddof=0) of the
# per-seed tables, re-wrapped as DataFrames with display column names.
dfs = ProbeTables
# dfs = [pd.DataFrame(TestTables[i].drop('Overall').mean()) for i in range(2)] # To Calculate average accuracy
stacked_dfs = np.stack(dfs)  # axis 0 indexes the seed
df_mean_values = stacked_dfs.mean(axis=0)
df_std_values = stacked_dfs.std(axis=0)

# NOTE(review): these names are applied positionally — assumes the CSV columns
# are in this same algorithm order; confirm.
column_names = ["Random", "Random Init", "Pretrained", "ERM", "GroupDRO", "Focal", "JTT", "LFF"]
df_mean = pd.DataFrame(df_mean_values, index=dfs[0].index, columns=column_names)
df_std = pd.DataFrame(df_std_values, index=dfs[0].index, columns=column_names)