# Data Splitting
This notebook splits the cleaned dataset into a class-balanced testing set (an equal number of stroke and non-stroke cases) and a training set made up of all remaining rows.

In [31]:
import pandas as pd
from sklearn.model_selection import train_test_split


# Load the cleaned dataset from disk into a DataFrame
data = pd.read_csv(filepath_or_buffer='clean_data.csv')

## Filter Stroke and Non-Stroke Cases
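Separate the data into stroke and non-stroke cases, sample 124 rows from each to build a balanced testing set, and keep every remaining row as training data.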

In [32]:
# Rows drawn from each class for the balanced testing set, and the seed
# shared by every sampling step so the split is reproducible.
TEST_ROWS_PER_CLASS = 124
RANDOM_SEED = 42

# Training rows are selected by index label further down, so labels must be
# unique; otherwise rows sharing a label with a test row would silently vanish.
assert data.index.is_unique, "data index must be unique to split by index label"

# Filter stroke (y == 1) and non-stroke (y == 0) cases
stroke_cases = data[data['y'] == 1]
non_stroke_cases = data[data['y'] == 0]

# Fail with a clear message when a class is too small to sample from
# (pandas would raise a less specific ValueError in the same situation).
smallest_class_size = min(len(stroke_cases), len(non_stroke_cases))
if smallest_class_size < TEST_ROWS_PER_CLASS:
    raise ValueError(
        f"Need at least {TEST_ROWS_PER_CLASS} rows per class, "
        f"but the smallest class has only {smallest_class_size}"
    )

# Sample the same number of cases from each class, without replacement
stroke_sample = stroke_cases.sample(n=TEST_ROWS_PER_CLASS, random_state=RANDOM_SEED)
non_stroke_sample = non_stroke_cases.sample(n=TEST_ROWS_PER_CLASS, random_state=RANDOM_SEED)

# Combine the samples to create the testing set
testing_data = pd.concat([stroke_sample, non_stroke_sample])

# Shuffle the testing data so the two classes are interleaved
testing_data = testing_data.sample(frac=1, random_state=RANDOM_SEED)

# Define training data as all data except the testing data
training_data = data[~data.index.isin(testing_data.index)]

# Sanity checks: the testing set is balanced and the two sets partition `data`
assert len(testing_data) == 2 * TEST_ROWS_PER_CLASS
assert len(training_data) + len(testing_data) == len(data)



## Save the Data
Save the training and testing data into separate CSV files.

In [34]:
# Write the training split to a CSV file, omitting the index column
training_data.to_csv(path_or_buf='training_data.csv', index=False)

# Write the testing split to a CSV file, omitting the index column
testing_data.to_csv(path_or_buf='testing_data.csv', index=False)