In [1]:
import numpy as np
import pandas as pd
import math
import os

# ------------------------
# CONFIG
# ------------------------
# Width of one REE vector, and the number of frequencies that fill it
# (HALF sine channels followed by HALF cosine channels).
EMB_DIM = 320
HALF = EMB_DIM // 2

# Destination folder for the embedding arrays; created on the spot if absent.
SAVE_DIR = "./data/archs4/ree_embeds"
os.makedirs(SAVE_DIR, exist_ok=True)

# Input files: one expression table per split, all in the same folder.
_PROCESSED_DIR = "./data/archs4/processed_short_proteins"
TRAIN_PARQUET = f"{_PROCESSED_DIR}/train_expr_logtpm_short.parquet"
VAL_PARQUET = f"{_PROCESSED_DIR}/val_expr_logtpm_short.parquet"
TEST_PARQUET = f"{_PROCESSED_DIR}/test_expr_logtpm_short.parquet"  # if exists


# ------------------------
# 1. Load Data
# ------------------------
def load_matrix(path):
    """
    Read a parquet table and return it transposed as a float32 array.

    The table's columns become the rows of the result. The original note on
    this code labels the result "samples × genes", i.e. the file is assumed
    to store genes as rows and samples as columns (not checked here).

    path : location of the parquet file
    Returns a 2-D float32 numpy array; its shape is also printed.
    """
    frame = pd.read_parquet(path)
    matrix = frame.transpose().astype(np.float32).to_numpy()
    print(f"Loaded {path}: {matrix.shape}")
    return matrix

# Read the three splits in order (train, val, test); each call prints the
# shape of the matrix it loaded.
X_train = load_matrix(path=TRAIN_PARQUET)
X_val = load_matrix(path=VAL_PARQUET)
X_test = load_matrix(path=TEST_PARQUET)


# ------------------------
# 2. Compute θ frequencies
# ------------------------
# θ_i = 100^(2i / d) for i = 1 ... d/2, where d = EMB_DIM.
i = np.arange(1, HALF + 1)  # 1-based frequency index

# Evaluate the power in numpy's default precision, then store as float32.
theta = np.power(100, 2 * i / EMB_DIM).astype(np.float32)  # shape: [HALF]
print("θ shape:", theta.shape)


# ------------------------
# 3. REE function
# ------------------------
def compute_ree(X, freqs=None):
    """
    Sinusoidal embedding of every entry of X.

    Each value x is expanded into the vector
    [sin(x·θ_1), ..., sin(x·θ_H), cos(x·θ_1), ..., cos(x·θ_H)].

    Parameters
    ----------
    X : array-like, normally shape [samples, genes]
        Values to embed. Any rank is accepted; the embedding axis is
        appended as a new last axis.
    freqs : 1-D array-like of length H, optional
        Frequencies θ. When omitted, the notebook-level `theta` is used.

    Returns
    -------
    float32 ndarray of shape X.shape + (2 * H,)
        For a [samples, genes] input and the default `theta` this is
        [samples, genes, 2 * len(theta)].

    Notes
    -----
    The result takes X.size * 2 * H * 4 bytes. Besides the result, only the
    `angles` array (half as many elements) is alive during the call, because
    sin and cos are written straight into the output buffer instead of going
    through separate arrays, a concatenation and a final cast.
    """
    X = np.asarray(X)
    freqs = theta if freqs is None else np.asarray(freqs)
    n_freq = freqs.shape[0]

    # New trailing axis on X broadcasts against the 1-D frequency vector:
    # [..., 1] * [H] -> [..., H]
    angles = X[..., None] * freqs

    # One float32 buffer for the whole result: sines first, cosines second.
    ree = np.empty(angles.shape[:-1] + (2 * n_freq,), dtype=np.float32)
    np.sin(angles, out=ree[..., :n_freq])
    np.cos(angles, out=ree[..., n_freq:])

    return ree


# ------------------------
# 4. Compute REE for train/val/test
# 5. Save results
# ------------------------
# Steps 4 and 5 are fused: the embedding of a whole split does not fit in
# memory. With the shapes logged by the last run (76286 × 19357 for train) and
# 320 float32 channels, the train array alone is about 1.9 TB, and that run's
# output stops right after "Computing REE (train)...". Each split is therefore
# embedded a few samples at a time and written directly into its .npy file.
# NOTE: the files are still that large on disk; check free space before
# running.
REE_CHUNK_BYTES = 256 * 1024**2  # upper bound for one in-memory chunk of output


def _ree_to_npy(X, out_path, chunk_bytes=REE_CHUNK_BYTES):
    """
    Embed X with compute_ree in sample chunks and stream the result to out_path.

    X           : array of shape [samples, ...]; chunks are taken along axis 0
    out_path    : destination .npy file (overwritten if it exists)
    chunk_bytes : target size of one chunk of embedded output; at least one
                  sample is always processed per chunk

    Returns the saved array: a read-only memory map of out_path, or a regular
    array if the result is empty.
    """
    n_samples = X.shape[0]

    # Embed a single-sample slice to learn the per-sample output shape.
    out_shape = (n_samples,) + compute_ree(X[:1]).shape[1:]
    itemsize = np.dtype(np.float32).itemsize
    bytes_per_sample = max(1, int(np.prod(out_shape[1:], dtype=np.int64)) * itemsize)
    rows_per_chunk = max(1, chunk_bytes // bytes_per_sample)

    if 0 in out_shape:
        # An empty file cannot be memory-mapped; write it the ordinary way.
        np.save(out_path, np.empty(out_shape, dtype=np.float32))
        return np.load(out_path)

    total_gib = n_samples * bytes_per_sample / 1024**3
    print(f"  streaming {out_shape} float32 (~{total_gib:.1f} GiB) to {out_path}")

    # open_memmap writes a regular .npy header, so np.load can read the file.
    sink = np.lib.format.open_memmap(
        out_path, mode="w+", dtype=np.float32, shape=out_shape
    )
    for start in range(0, n_samples, rows_per_chunk):
        stop = min(start + rows_per_chunk, n_samples)
        sink[start:stop] = compute_ree(X[start:stop])
    sink.flush()
    del sink  # release the writable mapping before reopening read-only

    return np.load(out_path, mmap_mode="r")


print("Computing REE (train)...")
REE_train = _ree_to_npy(X_train, f"{SAVE_DIR}/REE_train_dim320.npy")
print("REE train:", REE_train.shape)

print("Computing REE (val)...")
REE_val = _ree_to_npy(X_val, f"{SAVE_DIR}/REE_val_dim320.npy")
print("REE val:", REE_val.shape)

print("Computing REE (test)...")
REE_test = _ree_to_npy(X_test, f"{SAVE_DIR}/REE_test_dim320.npy")
print("REE test:", REE_test.shape)

print("\nSaved:")
print(" - REE_train_dim320.npy")
print(" - REE_val_dim320.npy")
print(" - REE_test_dim320.npy")


Loaded ./data/archs4/processed_short_proteins/train_expr_logtpm_short.parquet: (76286, 19357)
Loaded ./data/archs4/processed_short_proteins/val_expr_logtpm_short.parquet: (9557, 19357)
Loaded ./data/archs4/processed_short_proteins/test_expr_logtpm_short.parquet: (9446, 19357)
θ shape: (160,)
Computing REE (train)...


: 

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def plot_ree_components(emb_dim=320, base=100, x_max=10, n_points=2000,
                        freq_indices=(0, 50, 100, 150)):
    """
    Plot sin(θ·x) for a few of the REE frequencies.

    The frequencies are rebuilt locally as θ_i = base^(2i / emb_dim) for
    i = 1 ... emb_dim // 2, so this cell does not overwrite the notebook-level
    `theta` and `i` that the embedding code reads.

    emb_dim      : embedding width d; emb_dim // 2 frequencies are generated
    base         : base of the geometric frequency progression
    x_max        : upper end of the plotted x range, which starts at 0
                   (the x axis is labelled as log TPM expression)
    n_points     : number of x samples across the range
    freq_indices : 0-based positions in the frequency array to draw
    """
    exponents = 2 * np.arange(1, emb_dim // 2 + 1) / emb_dim
    freqs = base ** exponents

    x = np.linspace(0, x_max, n_points)  # expression values (log TPM range)

    fig, ax = plt.subplots(figsize=(12, 6))
    for idx in freq_indices:
        freq = freqs[idx]
        ax.plot(x, np.sin(freq * x), label=f"θ[{idx}] = {freq:.2e}")

    ax.set_title("REE Sinusoidal Embedding Components Across Frequencies")
    ax.set_xlabel("Expression value (log TPM)")
    ax.set_ylabel("sin(θx)")
    ax.legend()
    ax.grid(alpha=0.3)
    plt.show()


plot_ree_components()
