In [4]:
import pandas as pd 
import os
from sklearn.model_selection import train_test_split

In [7]:
# 1. Define Paths and Split Settings
# Adjust the input path if your file name is different (e.g., 'telco_churn.csv')
input_path = r'../data/raw/Churn.csv'
output_dir = r'../data/raw/splits'

# Everything a reader might want to tune lives here, so the two split calls
# below can never drift out of sync with each other.
TARGET_COL = 'Churn'   # label column used for stratification
TEST_FRACTION = 0.2    # share of the ORIGINAL rows held out as the test set
VAL_FRACTION = 0.2     # share of the ORIGINAL rows held out as the validation set
SEED = 42              # passed as random_state to both split calls

# Both hold-out shares must be positive and leave room for a training split.
assert 0 < TEST_FRACTION and 0 < VAL_FRACTION and TEST_FRACTION + VAL_FRACTION < 1, (
    "TEST_FRACTION and VAL_FRACTION must be positive and sum to less than 1"
)

# 2. Load the Data
df = pd.read_csv(input_path)

# Fail fast with a descriptive message rather than a bare KeyError('Churn')
# raised from inside the split call.
if TARGET_COL not in df.columns:
    raise KeyError(
        f"Column {TARGET_COL!r} not found in {input_path}; "
        f"available columns: {list(df.columns)}"
    )

# 3. Split the Data
# First, split into (Train + Validation) and Test.
train_val_df, test_df = train_test_split(
    df,
    test_size=TEST_FRACTION,
    random_state=SEED,
    stratify=df[TARGET_COL],  # Important: keep the churn ratio consistent
)

# Second, split (Train + Validation) into Train and Validation.
# VAL_FRACTION is expressed relative to the ORIGINAL data, but this call only
# sees the remaining (1 - TEST_FRACTION) share, so the fraction is rescaled:
# 0.20 / (1 - 0.20) = 0.25, and 0.25 * 0.80 = 0.20 of the original.
val_fraction_of_remainder = VAL_FRACTION / (1 - TEST_FRACTION)
train_df, val_df = train_test_split(
    train_val_df,
    test_size=val_fraction_of_remainder,
    random_state=SEED,
    stratify=train_val_df[TARGET_COL],
)

# Sanity check: the three splits together account for every loaded row.
assert len(train_df) + len(val_df) + len(test_df) == len(df), "rows lost or duplicated during splitting"


In [8]:
# 4. Make sure the destination folder exists (no error if it is already there)
os.makedirs(output_dir, exist_ok=True)

# 5. Write each split to its own CSV file inside output_dir
train_path = os.path.join(output_dir, 'train.csv')
val_path = os.path.join(output_dir, 'validation.csv')
test_path = os.path.join(output_dir, 'test.csv')

# index=False: the DataFrame index is not written out as an extra column.
for split_frame, split_path in (
    (train_df, train_path),
    (val_df, val_path),
    (test_df, test_path),
):
    split_frame.to_csv(split_path, index=False)

# Report the (rows, columns) shape of every split and where the files went.
print("✅ Data split successfully!")
for split_label, split_frame in (
    ("Train", train_df),
    ("Validation", val_df),
    ("Test", test_df),
):
    print(f"{split_label} shape: {split_frame.shape}")
print(f"Files saved to: {output_dir}")

✅ Data split successfully!
Train shape: (4225, 21)
Validation shape: (1409, 21)
Test shape: (1409, 21)
Files saved to: ../data/raw/splits
