In [2]:
#***********************************************************************
#
# split_data.ipynb - split script to ensure consistent testing 
#
#***********************************************************************
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit

# ------------------------------------------------------------------
#  Config
# ------------------------------------------------------------------
INPUT_CSV = "star_classification.csv"   # source dataset read by the load step
OUTPUT_DIR = "data/splits"              # folder to store split CSVs

# One output file per split, all placed under OUTPUT_DIR.
TRAIN_CSV, TEST_CSV, PROD_CSV = (
    os.path.join(OUTPUT_DIR, filename)
    for filename in ("train.csv", "test.csv", "prod_monitor.csv")
)

TARGET_COL = "class"  # label column; also the stratification key for the splits

# ------------------------------------------------------------------
#  Load data
# ------------------------------------------------------------------
# Read the full dataset, then shuffle every row (frac=1.0) with a fixed
# seed so the row order is reproducible; the pre-shuffle index is dropped.
df = (
    pd.read_csv(INPUT_CSV)
    .sample(frac=1.0, random_state=42)
    .reset_index(drop=True)
)

# ------------------------------------------------------------------
#  First split: 40% production / 60% remaining
#    Stratified on the target column so both parts keep the class balance.
# ------------------------------------------------------------------
if TARGET_COL not in df.columns:
    raise ValueError(f"Target column '{TARGET_COL}' not found in dataframe")

y = df[TARGET_COL]
X = df.drop(columns=[TARGET_COL])

# test_size=0.40 -> 40% of the rows are held out for production monitoring.
sss_prod = StratifiedShuffleSplit(n_splits=1, test_size=0.40, random_state=42)

# n_splits=1, so the splitter yields exactly one (remaining, prod) index pair.
remaining_idx, prod_idx = next(sss_prod.split(X, y))

df_remaining = df.iloc[remaining_idx].reset_index(drop=True)
df_prod = df.iloc[prod_idx].reset_index(drop=True)

# ------------------------------------------------------------------
#  Second split: 80/20 of remaining -> train/test
#    Remaining is 60% of the total, so train = 80% of 60% = 48%
#    and test = 20% of 60% = 12%.
# ------------------------------------------------------------------
y_rem = df_remaining[TARGET_COL]
X_rem = df_remaining.drop(columns=[TARGET_COL])

# test_size=0.20 applies to the remaining 60%, i.e. 12% of the total.
sss_train_test = StratifiedShuffleSplit(n_splits=1, test_size=0.20, random_state=42)

# n_splits=1, so the splitter yields exactly one (train, test) index pair.
train_idx, test_idx = next(sss_train_test.split(X_rem, y_rem))

df_train = df_remaining.iloc[train_idx].reset_index(drop=True)
df_test = df_remaining.iloc[test_idx].reset_index(drop=True)



In [3]:
# Disabled debugging preview of the three splits. The print calls live inside
# a triple-quoted string, so they never run; as the cell's last expression the
# string itself is simply echoed back as the cell output.
'''
print(df_train.head())
print(df_test.head())
print(df_prod.head())'''

'\nprint(df_train.head())\nprint(df_test.head())\nprint(df_prod.head())'

In [4]:
# Summarise each split's shape and the CSV path it is destined for.
# In the visible notebook the CSVs are only written by a later cell, so the
# heading must not claim that the files have already been saved.
print("Split summary (destination paths):")
print(f"  Train: {df_train.shape} -> {TRAIN_CSV}")
print(f"  Test:  {df_test.shape}  -> {TEST_CSV}")
print(f"  Prod:  {df_prod.shape}  -> {PROD_CSV}")

# Sanity check on proportions
n_total = len(df)

# The three splits are expected to partition the dataset: every row lands in
# exactly one of them, so their sizes must add up to the total row count.
assert len(df_train) + len(df_test) + len(df_prod) == n_total, (
    "train/test/prod sizes do not add up to the full dataset"
)

print("\nProportions of total:")
print("  Train: {:.2%}".format(len(df_train) / n_total))
print("  Test:  {:.2%}".format(len(df_test) / n_total))
print("  Prod:  {:.2%}".format(len(df_prod) / n_total))

Saved splits:
  Train: (48000, 18) -> data/splits\train.csv
  Test:  (12000, 18)  -> data/splits\test.csv
  Prod:  (40000, 18)  -> data/splits\prod_monitor.csv

Proportions of total:
  Train: 48.00%
  Test:  12.00%
  Prod:  40.00%


In [5]:
# ------------------------------------------------------------------
#  Persist the splits: make sure the output folder exists, then write
#  each split to its CSV without the index column
# ------------------------------------------------------------------
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Same write order as before: train, test, production-monitoring.
for _split_df, _csv_path in (
    (df_train, TRAIN_CSV),
    (df_test, TEST_CSV),
    (df_prod, PROD_CSV),
):
    _split_df.to_csv(_csv_path, index=False)