In [1]:
import os
import torch
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

In [2]:
# Configuration
# Layer indices; they are interpolated into the activation/CSV file names.
layers = [15, 16, 17]
# Activation streams; the loops below iterate over the keys of this mapping.
streams = {'residual': 'r'}
# Tense classes and the corpora whose files are pooled together when loading.
labels = ['past', 'present', 'future']
corpora = ['nontemporal', 'temporal']
# Directory the saved activations are read from and the CSVs are written to.
output_dir = './latent_outputs'
# Number of features kept per class when ranking by probe weight.
top_k = 10

# Seed NumPy and PyTorch so repeated runs start from the same RNG state.
np.random.seed(0)
torch.manual_seed(0)
# Gradients are never needed in this notebook. Kept as the last expression so
# the cell's displayed output stays the same.
torch.set_grad_enabled(False)

<torch.autograd.grad_mode.set_grad_enabled at 0x14c027daaa40>

### Features from cosine similarity

In [3]:
def load_features_and_labels(layer, stream, corpora, labels, output_dir):
    """Load saved feature activations and their tense labels, stacked row-wise.

    For every ``(corpus, label)`` pair this reads
    ``{corpus}_{label}_l{layer}_{stream}_feature_acts.pt`` and the matching
    ``{corpus}_{label}_l{layer}_{stream}_metadata.parquet`` from ``output_dir``.

    Args:
        layer: Layer index used in the file names.
        stream: Stream name used in the file names.
        corpora: Iterable of corpus names (outer loop).
        labels: Iterable of label names (inner loop).
        output_dir: Directory containing the ``.pt`` and ``.parquet`` files.

    Returns:
        Tuple ``(H, Y)``. ``H`` is a float32 array of shape
        ``(n_samples, n_features)`` where every dimension after the first of
        each saved tensor is flattened into the feature axis. ``Y`` is an
        array holding the ``'tense'`` column of the metadata, in the same row
        order as ``H``.

    Raises:
        ValueError: If a metadata file does not have exactly one row per
            activation row, which would silently misalign features and labels.
    """
    H_parts, Y = [], []
    for corpus in corpora:
        for lbl in labels:
            stem = os.path.join(output_dir, f"{corpus}_{lbl}_l{layer}_{stream}")
            acts_path = f"{stem}_feature_acts.pt"
            meta_path = f"{stem}_metadata.parquet"

            # NOTE: torch.load unpickles the file; only point this at files
            # produced by a trusted pipeline.
            # map_location="cpu" lets tensors saved from a GPU be read on a
            # CPU-only machine. The float32 cast avoids the BFloat16 -> NumPy
            # conversion error.
            tensor = torch.load(acts_path, map_location="cpu").detach().to(torch.float32)

            # Flatten everything after the leading (sample) dimension. Using
            # the trailing shape instead of numel() // shape[0] also works
            # when the tensor has zero rows.
            n_rows = tensor.shape[0]
            feat_dim = int(np.prod(tensor.shape[1:]))
            feats = tensor.numpy().reshape(n_rows, feat_dim)

            # Load labels
            df = pd.read_parquet(meta_path)
            if len(df) != n_rows:
                raise ValueError(
                    f"{meta_path} has {len(df)} rows but {acts_path} "
                    f"has {n_rows} activation rows"
                )

            H_parts.append(feats)
            Y.extend(df['tense'].tolist())
    H = np.vstack(H_parts)
    return H, np.array(Y)

In [4]:
# 2 Train one-vs-rest probe once per layer/stream, then score the
#   cosine-selected top-k features of every (label, corpus) pair.
#
# The pooled features and the fitted probe depend only on (layer, stream), so
# they are loaded/fitted once per pair and reused for every label and corpus.
# One CSV is written per (layer, stream).

le = LabelEncoder().fit(labels)
class_names = list(le.classes_)
for layer in layers:
    for stream in streams:
        H, y = load_features_and_labels(layer, stream, corpora, labels, output_dir)
        y_int = le.transform(y)
        probe = LogisticRegression(multi_class='ovr', solver='liblinear', C=1.0, max_iter=1000)
        probe.fit(H, y_int)
        coefs = probe.coef_  # shape [n_classes, m]; row i is indexed by encoded class i

        # |Cohen's d| of every feature for each one-vs-rest split
        cohen_d_abs = {}
        for cls_idx, tense in enumerate(class_names):
            is_pos = y_int == cls_idx
            H_pos, H_neg = H[is_pos], H[~is_pos]
            sd_pos = H_pos.std(axis=0, ddof=1)
            sd_neg = H_neg.std(axis=0, ddof=1)
            # The small epsilon keeps the denominator non-zero for features
            # that are constant in both groups.
            pooled_sd = np.sqrt((sd_pos**2 + sd_neg**2)/2 + 1e-8)
            cohen_d_abs[tense] = np.abs((H_pos.mean(axis=0) - H_neg.mean(axis=0)) / pooled_sd)

        records = []
        for target_label in labels:
            w_vec = coefs[class_names.index(target_label)]
            d_vec = cohen_d_abs[target_label]
            for corpus in corpora:
                # 3 Load cosine-selected top-k features
                idx_path = os.path.join(
                    output_dir,
                    f"top_{corpus}_{target_label}_indices_l{layer}_{stream}.pt"
                )
                topk = torch.load(idx_path)  # list of indices

                # 4 Compute metrics on those features
                for f in topk:
                    # int() stores a plain integer even if the saved indices
                    # turn out to be tensor elements rather than Python ints.
                    f = int(f)
                    records.append({
                        'corpus': corpus,
                        'label': target_label,
                        'layer': layer,
                        'stream': stream,
                        'feature': f,
                        'weight_abs': abs(w_vec[f]),
                        'cohen_d_abs': d_vec[f]
                    })

        # 5 Save and display (inside the stream loop so each stream gets its
        #   own file and `stream` is always the one the records belong to)
        df = pd.DataFrame(records)
        csv = os.path.join(
            output_dir,
            f"cosine_features_strength_l{layer}_{stream}.csv"
        )
        df.to_csv(csv, index=False)
        print(f"Layer {layer}, stream {stream}: metrics saved to {csv}")



Layer 15, stream residual: metrics saved to ./latent_outputs/cosine_features_strength_l15_residual.csv




Layer 16, stream residual: metrics saved to ./latent_outputs/cosine_features_strength_l16_residual.csv




Layer 17, stream residual: metrics saved to ./latent_outputs/cosine_features_strength_l17_residual.csv


### Features from probes

In [5]:
# For every layer/stream: fit a one-vs-rest probe on the pooled corpora, keep
# the top_k features with the largest absolute weight for each class, and
# record their |weight| and |Cohen's d| in one CSV per (layer, stream).
le = LabelEncoder().fit(labels)
for layer in layers:
    for stream in streams:
        # Use the configured corpora (same call as the cosine-feature cell)
        # rather than a second hard-coded copy of the list.
        H, y = load_features_and_labels(layer, stream, corpora, labels, output_dir)
        y_int = le.transform(y)
        probe = LogisticRegression(
            multi_class='ovr',
            solver='liblinear',
            C=1.0,
            max_iter=1000
        )
        probe.fit(H, y_int)
        W = probe.coef_       # shape [n_classes, m]

        records = []
        for cls_idx, target in enumerate(le.classes_):
            w_vec = W[cls_idx]

            # One-vs-rest split for this class
            is_pos = y_int == cls_idx
            H_pos, H_neg = H[is_pos], H[~is_pos]

            # Pooled-sd denominator; the epsilon guards against division by
            # zero for features that are constant in both groups.
            sd_pos = H_pos.std(axis=0, ddof=1)
            sd_neg = H_neg.std(axis=0, ddof=1)
            pooled_sd = np.sqrt((sd_pos**2 + sd_neg**2)/2 + 1e-8)
            d_abs = np.abs((H_pos.mean(axis=0) - H_neg.mean(axis=0)) / pooled_sd)

            # Rank features by absolute weight (ascending) and keep the last
            # top_k. An explicit start index is used because `[-top_k:]`
            # would return *every* feature when top_k == 0.
            order = np.argsort(np.abs(w_vec))
            topk = order[max(len(order) - top_k, 0):]

            for f in topk:
                records.append({
                    'label'       : target,
                    'layer'       : layer,
                    'stream'      : stream,
                    'feature'     : int(f),
                    'weight_abs'  : float(abs(w_vec[f])),
                    'cohen_d_abs' : float(d_abs[f])
                })

        df = pd.DataFrame(records)
        out_csv = os.path.join(
            output_dir,
            f"probe_features_strength_l{layer}_{stream}.csv"
        )
        df.to_csv(out_csv, index=False)
        print(f"Layer {layer}, stream {stream}: saved to {out_csv}")



Layer 15, stream residual: saved to ./latent_outputs/probe_features_strength_l15_residual.csv




Layer 16, stream residual: saved to ./latent_outputs/probe_features_strength_l16_residual.csv




Layer 17, stream residual: saved to ./latent_outputs/probe_features_strength_l17_residual.csv


### Getting intersection of cosine and probe

In [6]:
# Configuration
# Settings for the intersection step. These select which per-layer/stream
# CSV files are read from `output_dir` and merged.
layers = [15, 16, 17]
streams = ['residual']  # a plain list of stream names here
labels = ['past', 'present', 'future']
output_dir = './latent_outputs'

In [7]:
# Collect all intersection records
# Collect, per layer/stream, the (label, feature) pairs that appear in BOTH
# the cosine-selected and the probe-selected feature tables, then write the
# combined table to chosen_features.csv.
key_cols = ['label', 'feature']
metric_cols = ['weight_abs', 'cohen_d_abs']
all_records = []

for layer in layers:
    for stream in streams:
        # Paths
        path_cos = os.path.join(
            output_dir,
            f"cosine_features_strength_l{layer}_{stream}.csv"
        )
        path_prb = os.path.join(
            output_dir,
            f"probe_features_strength_l{layer}_{stream}.csv"
        )

        # Load, keep only the relevant columns, and suffix the metric names
        # with their source. The cosine table has one row per
        # (corpus, label, feature); once the corpus column is dropped, a
        # feature selected for both corpora would appear twice and produce
        # duplicate rows in the merge, so keep one row per (label, feature).
        df_cos = (
            pd.read_csv(path_cos)[key_cols + metric_cols]
            .drop_duplicates(subset=key_cols)
            .rename(columns={
                'weight_abs':   'weight_abs_cosine',
                'cohen_d_abs':  'cohen_d_abs_cosine'
            })
        )
        df_prb = (
            pd.read_csv(path_prb)[key_cols + metric_cols]
            .rename(columns={
                'weight_abs':   'weight_abs_probe',
                'cohen_d_abs':  'cohen_d_abs_probe'
            })
        )

        # Inner merge on label & feature = intersection of the two selections
        df_merged = pd.merge(
            df_cos,
            df_prb,
            on=key_cols,
            how='inner'
        )

        # Annotate layer & stream
        df_merged['layer']  = layer
        df_merged['stream'] = stream

        # Collect
        all_records.append(df_merged)

# Concatenate all and save
df_all = pd.concat(all_records, ignore_index=True)
out_path = os.path.join(output_dir, "chosen_features.csv")
df_all.to_csv(out_path, index=False)
print(f"Saved merged intersection strengths to {out_path}")

Saved merged intersection strengths to ./latent_outputs/chosen_features.csv


- weight_abs

    - Magnitude of logistic‐regression coefficient.

    - No absolute “unit” – depends on feature scaling.

    - In your layer 15 data it spans roughly $0.0$ to $\sim 2.2$.

    - Use relative ranking: features with $|w_i| > 1$ are strong contributors; those with $|w_i| < 0.5$ are weak.

- cohen_d_abs

    - Standardized effect‐size:

        - $0.2$ = small

        - $0.5$ = medium

        - $0.8$ = large

        - $> 1.3$ = very large

    - Features with $d > 0.8$ are reliably discriminative; those with $d < 0.5$ are borderline.

- Conclusion

    - Aim for features with both $|w_i| > 1$ and $d_i > 0.8$.

    - Lower values indicate a weak or noisy signal.

- Visualize using table