In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler

# Seed handed to the splitter so that rerunning the notebook reproduces the same split
RANDOM_STATE = 42

### Load the data

In [12]:
# header=None: the first row of the file is read as data rather than as column
# names, so pandas labels the columns with integers
raw_data = pd.read_csv("data/cardio.csv", header=None)
# Preview the first rows of the loaded frame
raw_data.head()

Unnamed: 0,0,1,2,3
0,-2.302585,5.371103,10.716107,0
1,-2.302585,5.088213,8.418058,0
2,-2.302585,5.464255,7.113224,0
3,-2.302585,5.451468,7.616825,0
4,-2.302585,5.476882,6.186414,0


### Check the class distribution

In [13]:
# The class label sits in the last column of the raw frame
raw_labels = raw_data.iloc[:, -1]
print(raw_labels.value_counts())

# Summing the labels counts the rows marked 1, so sum / row count is the outlier share
raw_outlier_ratio = raw_labels.sum() / len(raw_data)
print("Outlier Ratio: {:.5f}".format(raw_outlier_ratio))

0    565287
1      2211
Name: 3, dtype: int64
Outlier Ratio: 0.00390


### Split the data into training data and test data
Stratified splitting forces the training and test sets to have the same class ratios as the original data.

In [14]:
# Fraction of all samples held out as the test set
test_size = 0.2

# Fixing random_state makes the split identical on every rerun
splitter = StratifiedShuffleSplit(
    n_splits=1,
    test_size=test_size,
    random_state=RANDOM_STATE,
)

# The last column is the class label; all remaining columns are features
labels = raw_data.iloc[:, -1].copy()
features = raw_data.drop(raw_data.columns[-1], axis=1)
scaler = StandardScaler()

# n_splits=1, so the generator yields exactly one (train, test) pair of positional indices
train_index, test_index = next(splitter.split(features, labels))

# Training split: the scaler is fitted on these rows only
y_train = labels.iloc[train_index].values.reshape(-1, 1)
train_scaled = scaler.fit_transform(features.iloc[train_index])
# X_train ends up holding the scaled features plus the label as its last column;
# stacking the labels next to the float features turns them into floats as well
X_train = pd.DataFrame(
    np.concatenate((train_scaled, y_train), axis=1),
    columns=raw_data.columns,
)

# Test split: transformed with the statistics learned from the training rows
y_test = labels.iloc[test_index].values.reshape(-1, 1)
test_scaled = scaler.transform(features.iloc[test_index])
X_test = pd.DataFrame(
    np.concatenate((test_scaled, y_test), axis=1),
    columns=raw_data.columns,
)
      

In [15]:
# Preview the test split: scaled feature columns followed by the label column
X_test.head()

Unnamed: 0,0,1,2,3
0,-0.073142,0.32108,-1.158103,0.0
1,-0.073142,0.670684,-0.413108,0.0
2,-0.073142,0.274371,-0.995558,0.0
3,-0.073142,0.723031,1.390202,0.0
4,-0.073142,-0.03677,1.40873,0.0


In [16]:
# The label is the last column of the training frame
train_labels = X_train.iloc[:, -1]
print(train_labels.value_counts())

# Share of training rows labelled as outliers; should match the ratio of the full data
train_outlier_ratio = train_labels.sum() / len(X_train)
print("Outlier Ratio: {:.5f}".format(train_outlier_ratio))

0.0    452229
1.0      1769
Name: 3, dtype: int64
Outlier Ratio: 0.00390


In [17]:
# The label is the last column of the test frame
test_labels = X_test.iloc[:, -1]
print(test_labels.value_counts())

# Share of test rows labelled as outliers; should match the ratio of the full data
test_outlier_ratio = test_labels.sum() / len(X_test)
print("Outlier Ratio: {:.5f}".format(test_outlier_ratio))

0.0    113058
1.0       442
Name: 3, dtype: int64
Outlier Ratio: 0.00389


### Write training data and test data to different files

In [18]:
dirname = "data/"

# Change filenames according to dataset
# index=False leaves the row index out of the written files
for split_frame, split_filename in (
    (X_train, "cardio_train.csv"),
    (X_test, "cardio_test.csv"),
):
    split_frame.to_csv(dirname + split_filename, index=False)