In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import os


# Load the merged dataset that is going to be partitioned.
full_df = pd.read_csv('../data/full_merged.csv')

# Step 1: give every row a uniform random key.
# The global NumPy seed is fixed first so a re-run produces the same keys.
np.random.seed(42)
full_df['rand_key'] = np.random.rand(full_df.shape[0])

# Step 2: ordering the rows by the random key shuffles them (this is the
# core of RSP); the index is then renumbered 0..n-1 in the shuffled order.
full_df_sorted = (
    full_df
    .sort_values(by='rand_key')
    .reset_index(drop=True)
)

# Step 3: work out how many blocks to cut.
block_size = 500  # Choose based on dataset size
total_rows = full_df_sorted.shape[0]
# Floor division: only complete blocks are counted, so a trailing partial
# block of (total_rows % block_size) rows is not included in num_blocks.
num_blocks = total_rows // block_size

print(
    f"Total Rows: {total_rows}\n"
    f"Block Size: {block_size}\n"
    f"Number of Blocks: {num_blocks}"
)

# Step 4: Initialize storage for all splits (one entry per block, in block order)
X_trains = []
X_tests = []
y_trains = []
y_tests = []

# The label is looked up as the last column once the helper 'rand_key' column
# is set aside by name, so the lookup no longer depends on 'rand_key' happening
# to be the final column of the frame.
# NOTE(review): assumes the label is the last column of the source data — confirm.
label_column = full_df_sorted.columns.drop('rand_key')[-1]

# Step 5: Iterate over each block
for block_index in range(num_blocks):
    start = block_index * block_size
    end = start + block_size

    # drop() returns a new frame, so the block is independent of
    # full_df_sorted and the helper key never reaches the features.
    block_df = full_df_sorted.iloc[start:end].drop(columns=['rand_key'])

    # Split into features and labels
    X = block_df.drop(columns=[label_column])
    y = block_df[label_column]

    # 60/40 train-test split, stratified on the label; the fixed random_state
    # makes the split repeatable across runs.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.6, stratify=y, random_state=42
    )

    # Store splits
    X_trains.append(X_train)
    X_tests.append(X_test)
    y_trains.append(y_train)
    y_tests.append(y_test)

    print(f"Block {block_index} processed: {X_train.shape[0]} train / {X_test.shape[0]} test samples")

print(f"\nCompleted RSP across {num_blocks} blocks.")


Total Rows: 10299
Block Size: 500
Number of Blocks: 20
Block 0 processed: 300 train / 200 test samples
Block 1 processed: 300 train / 200 test samples
Block 2 processed: 300 train / 200 test samples
Block 3 processed: 300 train / 200 test samples
Block 4 processed: 300 train / 200 test samples
Block 5 processed: 300 train / 200 test samples
Block 6 processed: 300 train / 200 test samples
Block 7 processed: 300 train / 200 test samples
Block 8 processed: 300 train / 200 test samples
Block 9 processed: 300 train / 200 test samples
Block 10 processed: 300 train / 200 test samples
Block 11 processed: 300 train / 200 test samples
Block 12 processed: 300 train / 200 test samples
Block 13 processed: 300 train / 200 test samples
Block 14 processed: 300 train / 200 test samples
Block 15 processed: 300 train / 200 test samples
Block 16 processed: 300 train / 200 test samples
Block 17 processed: 300 train / 200 test samples
Block 18 processed: 300 train / 200 test samples
Block 19 processed: 300 train / 200 test samples

In [None]:
# Saving blocks in the local storage
blocks_dir = '../data/blocks'

# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (a no-op when it is already there).
os.makedirs(blocks_dir, exist_ok=True)

for i in range(num_blocks):
    # Four CSV files per block; index=False keeps the row index out of the files.
    X_trains[i].to_csv(f'{blocks_dir}/X_train_block_{i}.csv', index=False)
    X_tests[i].to_csv(f'{blocks_dir}/X_test_block_{i}.csv', index=False)
    y_trains[i].to_csv(f'{blocks_dir}/y_train_block_{i}.csv', index=False)
    y_tests[i].to_csv(f'{blocks_dir}/y_test_block_{i}.csv', index=False)

print(f"All {num_blocks} sample blocks saved to /data/blocks/")

✅ All 20 sample blocks saved to /data/blocks/
