In [8]:
import os
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import glob


In [9]:
# Folder containing sample blocks (read from and written to by later cells)
block_dir = '../data/blocks'

# Number of blocks saved earlier, counted from the train-split files in
# block_dir. The pattern is built from block_dir so the count always refers
# to the same folder the later cells load from.
# NOTE(review): later cells index blocks as 0..num_blocks-1, which assumes the
# files are numbered contiguously from 0 — confirm against the cell that
# wrote them.
num_blocks = len(glob.glob(f'{block_dir}/X_train_block_*.csv'))

# Set fixed number of PCA components (e.g., 50)
fixed_n_components = 50

In [6]:
# For every block: standardize the features, then project them onto a fixed
# number of principal components. The scaler and the PCA are both fitted on
# the training split only and then applied to the test split, so no test-set
# statistics enter the transform.

# Seed for PCA. With the default svd_solver='auto', scikit-learn may select a
# randomized solver depending on the input shape; fixing random_state keeps
# the components (and the CSVs written from them) identical across re-runs.
pca_random_state = 42

# Storage for transformed blocks (list position == block number)
X_trains_pca = []
X_tests_pca = []
pca_models = []

for i in range(num_blocks):
    print(f"\nProcessing Block {i}...")

    # Load original train/test splits
    # NOTE(review): every CSV column is treated as a feature below. The
    # recorded run reports 562 input columns — confirm the block files do not
    # carry an index/ID/label column that should be excluded.
    X_train = pd.read_csv(f'{block_dir}/X_train_block_{i}.csv')
    X_test = pd.read_csv(f'{block_dir}/X_test_block_{i}.csv')

    # Step 1: Standardize (fit on train, reuse the same statistics for test)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Step 2: Apply PCA with fixed number of components (fit on train only)
    pca = PCA(n_components=fixed_n_components, random_state=pca_random_state)
    X_train_pca = pca.fit_transform(X_train_scaled)
    X_test_pca = pca.transform(X_test_scaled)

    # Step 3: Store results
    X_trains_pca.append(X_train_pca)
    X_tests_pca.append(X_test_pca)
    pca_models.append(pca)

    print(f"  Original dims: {X_train.shape[1]}")
    print(f"  Reduced dims: {X_train_pca.shape[1]}")



Processing Block 0...
  Original dims: 562
  Reduced dims: 50

Processing Block 1...
  Original dims: 562
  Reduced dims: 50

Processing Block 2...
  Original dims: 562
  Reduced dims: 50

Processing Block 3...
  Original dims: 562
  Reduced dims: 50

Processing Block 4...
  Original dims: 562
  Reduced dims: 50


In [7]:
# Write each block's PCA-reduced train and test matrices back into block_dir
# as CSV files, one pair per block, without the DataFrame index.
for i in range(num_blocks):
    train_csv_path = f'{block_dir}/X_train_pca_block_{i}.csv'
    reduced_train = pd.DataFrame(X_trains_pca[i])
    reduced_train.to_csv(train_csv_path, index=False)

    test_csv_path = f'{block_dir}/X_test_pca_block_{i}.csv'
    reduced_test = pd.DataFrame(X_tests_pca[i])
    reduced_test.to_csv(test_csv_path, index=False)
