In [None]:
import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# Choose the compute backend in priority order: CUDA GPU, then Apple MPS, then CPU.
# The chained conditional short-circuits, so the MPS check only runs when CUDA is absent.
device = torch.device(
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)

In [None]:
# Load the train / val / test splits.
# The three CSV files must be present in the working directory
# (on Colab: upload them into the session folder first).
train_df, val_df, test_df = map(
    pd.read_csv,
    ("data_train.csv", "data_val.csv", "data_test.csv"),
)

# Quick look at the first rows of the training split.
print(train_df.head())

In [None]:
# Report the number of rows in each split.
print(f"Train instances: {len(train_df)}")
print(f"Val instances: {len(val_df)}")
print(f"Test instances: {len(test_df)}")

In [None]:
# Load the pretrained BERT encoder and its matching tokenizer.
# output_hidden_states=True asks the model to expose every layer's hidden states.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, output_hidden_states=True)
model = model.to(device)  # move the weights onto the selected backend
# Switch to inference mode; as the cell's last expression this also displays the model.
model.eval()

In [None]:
# Helper: read one cached layer array from disk.
def get_layer(split, idx):
  """Return the array stored for layer ``idx`` of the given split.

  Opens ``{split}_layers.npz`` in the current working directory and reads
  the entry saved under the key ``layer{idx}``.

  Args:
    split: Filename prefix of the archive. The probing cells in this
      notebook pass ``"train_subset"`` and ``"val_subset"``.
    idx: Layer index used to build the archive key ``layer{idx}``.

  Returns:
    The numpy array stored under that key (presumably one feature row per
    instance -- confirm against the code that wrote the archive).

  Raises:
    FileNotFoundError: If ``{split}_layers.npz`` does not exist.
    KeyError: If the archive has no ``layer{idx}`` entry.
  """
  # np.load on an .npz returns a lazy NpzFile that keeps the file open.
  # The context manager closes it once the requested array has been read,
  # so repeated calls (one per layer per split) do not leak file handles.
  with np.load(f"{split}_layers.npz") as data:
    return data[f"layer{idx}"]

In [None]:
# Probing function: fit a linear classifier on one layer's features.
def train_probe(X_train, y_train, X_val, y_val):
    """Fit a logistic-regression probe and score it on the validation set.

    Args:
        X_train: Training features passed to ``LogisticRegression.fit``.
        y_train: Training labels, one per row of ``X_train``.
        X_val: Validation features passed to ``LogisticRegression.predict``.
        y_val: Validation labels, one per row of ``X_val``.

    Returns:
        A tuple ``(accuracy, clf)``: the validation accuracy from
        ``accuracy_score`` and the fitted classifier.
    """
    # class_weight="balanced" reweights classes so that a skewed label
    # distribution does not dominate the fit.
    # The former multi_class="auto" argument is omitted: "auto" is what the
    # estimator does by default, and the parameter is deprecated since
    # scikit-learn 1.5 (it warns now and is scheduled for removal).
    clf = LogisticRegression(max_iter=5000, class_weight="balanced")
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_val)
    return accuracy_score(y_val, y_pred), clf

In [None]:
# Work on the first SUBSET_SIZE instances of each split.
SUBSET_SIZE = 1000

# texts
train_subset = train_df["text"].tolist()[:SUBSET_SIZE]
val_subset = val_df["text"].tolist()[:SUBSET_SIZE]

# train labels: gender and age
y_train_subset_gender = train_df["gender"].values[:SUBSET_SIZE]
y_train_subset_age    = train_df["age"].values[:SUBSET_SIZE]

# val labels: gender and age
# These must be sliced from val_df (the same frame val_subset comes from);
# taking them from train_df would score validation predictions against
# training labels.
y_val_subset_gender = val_df["gender"].values[:SUBSET_SIZE]
y_val_subset_age    = val_df["age"].values[:SUBSET_SIZE]

In [None]:
# Gender probes: fit one probe per cached layer (indices 0..12) and
# record the validation accuracy of each.
results_subset_gender = []
for layer_idx in range(13):
    train_features = get_layer("train_subset", layer_idx)
    val_features = get_layer("val_subset", layer_idx)
    acc, _ = train_probe(
        train_features, y_train_subset_gender,
        val_features, y_val_subset_gender,
    )
    results_subset_gender.append(acc)
    print(f"[Gender] Layer {layer_idx}: probe accuracy = {acc:.3f}")

In [None]:
# Age probes: fit one probe per cached layer (indices 0..12) and
# record the validation accuracy of each.
results_subset_age = []
for layer_idx in range(13):
    acc, _ = train_probe(
        get_layer("train_subset", layer_idx), y_train_subset_age,
        # Score against the validation labels. Passing the training labels
        # here would compare validation predictions with the wrong targets.
        get_layer("val_subset", layer_idx), y_val_subset_age
    )
    results_subset_age.append(acc)
    print(f"[Age] Layer {layer_idx}: probe accuracy = {acc:.3f}")

In [None]:
import matplotlib.pyplot as plt

# Plot per-layer probe accuracy for both targets on a single set of axes.
fig, ax = plt.subplots()
ax.plot(range(13), results_subset_gender, marker="o", label="Gender")
ax.plot(range(13), results_subset_age, marker="s", label="Age")
ax.set(
    xlabel="Layer",
    ylabel="Probe Accuracy",
    title="Probing Pretrained BERT for Gender vs Age",
)
ax.legend()
plt.show()