In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler

# Seed handed to the splitter as random_state so that reruns give the same split
RANDOM_STATE = 42

### Load the data

In [2]:
# header=None: the file's first row is read as data, so columns get positional integer names
raw_data = pd.read_csv("data/cardio.csv", header=None)
# Preview the first rows
raw_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,0.004912,0.693191,-0.20364,0.595322,0.35319,-0.061401,-0.278295,-1.650444,0.759072,-0.420487,...,-0.798376,1.854728,0.622631,0.963083,0.301464,0.193113,0.231498,-0.289786,-0.493294,0
1,0.110729,-0.079903,-0.20364,1.268942,0.396246,-0.061401,-0.278295,-1.71027,0.759072,-0.420487,...,-0.798376,1.854728,0.278625,0.963083,0.301464,0.129265,0.093563,-0.256385,-0.493294,0
2,0.216546,-0.272445,-0.20364,1.050988,0.148753,-0.061401,-0.278295,-1.71027,1.106509,-0.420487,...,-1.332931,0.314688,2.342663,-0.488279,0.061002,0.065417,0.024596,-0.256385,1.140018,0
3,0.004912,0.727346,-0.20364,1.212171,-0.683598,-0.061401,-0.278295,-1.71027,1.106509,-0.420487,...,-1.332931,0.314688,1.65465,-0.488279,0.061002,0.193113,0.093563,-0.323186,1.140018,0
4,-0.100905,0.363595,1.321366,1.02712,0.141359,-0.061401,-0.278295,-0.992364,-0.051613,-0.420487,...,-0.085638,-0.565334,0.278625,-0.488279,-0.059229,0.065417,0.024596,-0.456787,1.140018,0


### Check the class distribution

In [3]:
# The last column is the class label
raw_labels = raw_data.iloc[:, -1]
print(raw_labels.value_counts())

# Summing the labels counts the rows marked 1; dividing by the row count gives their share
raw_outlier_ratio = raw_labels.sum() / len(raw_data)
print("Outlier Ratio: {:.5f}".format(raw_outlier_ratio))

0    1655
1     176
Name: 21, dtype: int64
Outlier Ratio: 0.09612


### Split the data into training data and test data
A stratified split forces the training and test sets to keep the same class ratios as the original data.

In [4]:
# Fraction of the samples held out as the test set
test_size = 0.2

# A fixed seed makes the split reproducible when the notebook is rerun
splitter = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=RANDOM_STATE)

# Every column except the last is a feature; the last column is the label
features = raw_data.drop(raw_data.columns[-1],axis=1)
labels = raw_data.iloc[:,-1].copy()
scaler = StandardScaler()

# n_splits=1, so the generator yields exactly one (train, test) pair of positional indices
train_index, test_index = next(splitter.split(features, labels))

# Labels reshaped into single-column 2-D arrays so they can be stacked next to the features
y_train = labels.iloc[train_index].values.reshape(-1,1)
y_test = labels.iloc[test_index].values.reshape(-1,1)

# The scaler is fitted on the training features only and then reused, unchanged, on the test features
train_scaled = scaler.fit_transform(features.iloc[train_index])
test_scaled = scaler.transform(features.iloc[test_index])

# Despite the X_ prefix, both frames carry the label as their last column,
# and they reuse the column names of the raw data
X_train = pd.DataFrame(np.hstack((train_scaled, y_train)), columns=raw_data.columns)
X_test = pd.DataFrame(np.hstack((test_scaled, y_test)), columns=raw_data.columns)
      

In [5]:
# Preview the scaled test rows; the last column is the label appended during the split
X_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,-0.084895,0.993819,-0.132189,-0.316082,-0.266745,-0.058097,-0.280042,0.645243,0.984256,-0.423521,...,0.66441,0.044474,-0.40866,-0.503625,-0.356148,0.324272,0.304523,0.146801,-0.498742,0.0
1,1.812605,-0.507759,-0.209259,-0.768025,-0.679756,-0.058097,-0.280042,0.407164,-0.523379,0.075802,...,-0.688932,0.32107,-0.068149,-0.503625,0.905742,1.285533,1.19343,-0.460691,1.145758,0.0
2,0.231355,-0.295428,-0.056341,-1.356515,-0.406597,-0.058097,-0.280042,0.704762,2.028003,-0.423521,...,-1.294374,1.095536,-1.089682,-0.503625,0.124572,0.13202,0.167768,0.146801,-0.498742,0.0
3,-0.506562,1.865213,-0.209259,1.903086,-0.379831,-0.058097,-0.280042,-1.199865,0.056481,-0.423521,...,0.66441,0.652984,-0.40866,-0.503625,1.146102,0.965113,1.056675,-0.055696,-0.498742,0.0
4,0.336772,1.361135,-0.163376,0.335726,-0.679756,-0.058097,-0.280042,0.942841,-0.639351,-0.423521,...,1.198624,-0.066164,-0.749171,-0.503625,0.725472,0.644693,0.64641,-0.49444,-0.498742,0.0


In [6]:
# The last column of the training frame is the class label
train_labels = X_train.iloc[:, -1]
print(train_labels.value_counts())

# Sum of the labels divided by the number of training rows
train_outlier_ratio = train_labels.sum() / len(X_train)
print("Outlier Ratio: {:.5f}".format(train_outlier_ratio))

0.0    1323
1.0     141
Name: 21, dtype: int64
Outlier Ratio: 0.09631


In [7]:
# The last column of the test frame is the class label
test_labels = X_test.iloc[:, -1]
print(test_labels.value_counts())

# Sum of the labels divided by the number of test rows
test_outlier_ratio = test_labels.sum() / len(X_test)
print("Outlier Ratio: {:.5f}".format(test_outlier_ratio))

0.0    332
1.0     35
Name: 21, dtype: int64
Outlier Ratio: 0.09537


### Write training data and test data to different files

In [8]:
dirname = "data/"

# Change filenames according to dataset
train_path = dirname + "cardio_train.csv"
test_path = dirname + "cardio_test.csv"

# index=False keeps the row index out of the files; the label stays as the last column
X_train.to_csv(train_path, index=False)
X_test.to_csv(test_path, index=False)