In [1]:
import os
import time
import math
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold

# root data directory, relative to the notebook's working directory;
# the input csv is read from here and the split output dir is built beneath it
data_dir = '../data'

In [2]:
class DataSplitter():
    '''
    CLASS: DataSplitter = split features X and labels y into train / test parts, either with a single
                          stratified train_test_split() or with StratifiedKFold(), and save the parts as csv's
    IN: X = feature table, one row per sample
        y = labels, row-aligned with X (the book keeping counts label 0 against everything else)
        cv_mode = 'kfold' selects the stratified K-fold split; any other value selects the single split
        rand_mode = 'random' seeds the split from the current time; any other value uses the fixed seed 42
        out_dir = directory the csv's are written to (created if missing)
        tsize = passed as test_size to train_test_split(); not used if cv_mode == 'kfold'
        Kfolds = number of folds; not used if cv_mode != 'kfold'
    '''
    def __init__(self, X: pd.DataFrame, y: pd.Series, cv_mode: str, rand_mode: str, out_dir: str,
                 tsize: float, Kfolds: int):
        self.X = X
        self.y = y
        self.cv_mode = cv_mode
        self.rand_mode = rand_mode
        self.out_dir = out_dir

        self.tsize = tsize    # not used if cv_mode == 'kfold'
        self.Kfolds = Kfolds  # not used if cv_mode != 'kfold'

    def scale_X(self) -> None:
        '''
        METHOD: scale_X = standardise self.X with StandardScaler(), replacing self.X by the scaled values
        NOTE: fit_transform() returns a numpy array by default, so self.X stops being a DataFrame here;
              _kfold_train_test_split() below copes with both
        NOTE(review): the scaler is fitted on all rows, i.e. before any train/test split -- confirm intended
        '''
        sc = StandardScaler()
        self.X = sc.fit_transform(self.X)

    def _random_state(self) -> int:
        '''
        METHOD: _random_state = the seed used by both split modes
        OUT: the current unix time in whole seconds if rand_mode == 'random', otherwise 42
        '''
        if self.rand_mode == 'random':
            return math.floor(time.time())
        return 42

    def _standard_split(self) -> None:
        '''
        METHOD: _standard_split = use train_test_split() to split the data via random seed or random_state=42
                                  results are stored in self.X_train, self.X_test, self.y_train, self.y_test
        '''
        self.X_train, self.X_test, self.y_train, self.y_test = \
            train_test_split(self.X, self.y, test_size=self.tsize, stratify=self.y,
                             random_state=self._random_state())

    def _kfold_train_test_split(self, train: np.ndarray, test: np.ndarray) -> tuple:
        '''
        METHOD: _kfold_train_test_split = this acts as the K-fold'ed version of train_test_split()
        IN: train = a list of indices designating the train data (the rest of the folds)
            test = a list of indices designating the test data (a single fold)
        OUT: X_train, X_test, y_train, y_test = the actual data split by the folding
                                                (X parts as numpy arrays, y parts as pandas Series)
        '''
        # NOTE: StratifiedKFold() splits in terms of iloc indices
        if isinstance(self.X, pd.DataFrame):
            X_train = self.X.iloc[train, :].values
            X_test = self.X.iloc[test, :].values
        else:
            # self.X is already array-like (e.g. after scale_X()), which has no .iloc
            X_all = np.asarray(self.X)
            X_train = X_all[train]
            X_test = X_all[test]
        y_train = self.y.iloc[train]
        y_test = self.y.iloc[test]
        return X_train, X_test, y_train, y_test

    def _kfold_split(self) -> None:
        '''
        METHOD: _kfold_split = this is the K-fold'ed version of _standard_split above
                               it uses StratifiedKFold() and _kfold_train_test_split() via random seed or otherwise
                               results are stored per fold in the lists self.K_X_train, self.K_X_test,
                               self.K_y_train, self.K_y_test
        '''
        skf = StratifiedKFold(n_splits=self.Kfolds, shuffle=True, random_state=self._random_state())
        self.K_X_train = []
        self.K_X_test = []
        self.K_y_train = []
        self.K_y_test = []
        for train, test in skf.split(self.X, self.y):
            X_train, X_test, y_train, y_test = self._kfold_train_test_split(train, test)
            self.K_X_train.append(X_train)
            self.K_X_test.append(X_test)
            self.K_y_train.append(y_train)
            self.K_y_test.append(y_test)

    @staticmethod
    def _count_labels(labels) -> tuple:
        '''
        METHOD: _count_labels = count the labels equal to 0 and the remaining ones
        OUT: (count1, count0) = number of labels != 0, number of labels == 0
        '''
        count0 = int((np.asarray(labels) == 0).sum())
        return len(labels) - count0, count0

    @staticmethod
    def _ones_to_zeros(count1: int, count0: int) -> float:
        '''
        METHOD: _ones_to_zeros = count1 / count0 that does not raise when there are no 0's
        OUT: the ratio; inf if there are 1's but no 0's, nan if both counts are 0
        '''
        if count0:
            return count1 / count0
        return float('inf') if count1 else float('nan')

    def _book_keeping(self, y_train: pd.Series, y_test: pd.Series) -> None:
        '''
        METHOD: _book_keeping = print the ratio of 1's to 0's in the given train and test labels
        '''
        y_train_count1, y_train_count0 = self._count_labels(y_train)
        y_test_count1, y_test_count0 = self._count_labels(y_test)
        train_ratio = self._ones_to_zeros(y_train_count1, y_train_count0)
        test_ratio = self._ones_to_zeros(y_test_count1, y_test_count0)

        print(f"  -- y_train has {y_train_count1} / {y_train_count0} = {train_ratio:.2f}x 1's to 0's")
        print(f"     y_test has {y_test_count1} / {y_test_count0} = {test_ratio:.2f}x 1's to 0's")

    def perform_splitting(self) -> None:
        '''
        METHOD: perform_splitting = run _kfold_split() or _standard_split() and print the class ratios
        '''
        print("- some book keeping:")
        # NOTE(review): the reference counts below are hard-coded, not computed from self.y --
        #               presumably they describe the original dataset; confirm
        print(f"  compared to 564 / 192 = {564/192:.2f}x 1's (PD) to 0's (HC) in the full dataset,")
        if self.cv_mode == 'kfold':
            self._kfold_split()
            for kk in range(self.Kfolds):
                print(f"  < fold {kk} >")
                self._book_keeping(self.K_y_train[kk], self.K_y_test[kk])
        else:
            self._standard_split()
            self._book_keeping(self.y_train, self.y_test)

    def save(self) -> None:
        '''
        METHOD: save = save all the split out data into their respective csv's
                       y's are written with Series.to_csv(), X's with np.savetxt() (comma-delimited)
                       a split method must have been run first, otherwise the split attributes do not exist
        '''
        # exist_ok avoids the check-then-create race of isdir() + makedirs()
        os.makedirs(self.out_dir, exist_ok=True)

        print("- saving y's and X's to csv...")
        if self.cv_mode == 'kfold':
            for kk in range(self.Kfolds):
                print(f"  -- saving fold {kk}")
                suffix = f'-{kk}_kfold{self.Kfolds}.csv'
                self.K_y_train[kk].to_csv(os.path.join(self.out_dir, 'y_train' + suffix))
                self.K_y_test[kk].to_csv(os.path.join(self.out_dir, 'y_test' + suffix))
                np.savetxt(os.path.join(self.out_dir, 'X_train' + suffix), self.K_X_train[kk], delimiter=',')
                np.savetxt(os.path.join(self.out_dir, 'X_test' + suffix), self.K_X_test[kk], delimiter=',')
        else:
            self.y_train.to_csv(os.path.join(self.out_dir, 'y_train.csv'))
            self.y_test.to_csv(os.path.join(self.out_dir, 'y_test.csv'))
            np.savetxt(os.path.join(self.out_dir, 'X_train.csv'), self.X_train, delimiter=',')
            np.savetxt(os.path.join(self.out_dir, 'X_test.csv'), self.X_test, delimiter=',')
        print("  ...done.")

    def split_and_save(self) -> None:
        '''
        METHOD: split_and_save = run perform_splitting() and then save()
        '''
        self.perform_splitting()
        self.save()

In [9]:
# Dataset variants (only one is active at a time; each has its own output directory):
#   pd_speech_features_reduced.csv                 -> data_dir + '/split/pure'
#   pd_speech_features_reduced_outrmv.csv          -> data_dir + '/split/outrmv'
#   pd_speech_features_reduced_outrmv_balanced.csv -> data_dir + '/split/outrmv-balanced'   (active)
df = pd.read_csv(f'{data_dir}/pd_speech_features_reduced_outrmv_balanced.csv')
out_dir = f'{data_dir}/split/outrmv-balanced'

display(df)
print(df.shape)

# labels: the 'class' column
y = df['class']
print(y)

# features: every column except 'id' and 'class'
features = df.columns.tolist()
del features[features.index('id')]
del features[features.index('class')]
X = df[features]

Unnamed: 0,id,gender,PPE,DFA,RPDE,numPeriodsPulses,meanPeriodPulses,stdDevPeriodPulses,locPctJitter,locAbsJitter,...,tqwt_kurtosisValue_dec_27,tqwt_kurtosisValue_dec_28,tqwt_kurtosisValue_dec_29,tqwt_kurtosisValue_dec_30,tqwt_kurtosisValue_dec_32,tqwt_kurtosisValue_dec_33,tqwt_kurtosisValue_dec_34,tqwt_kurtosisValue_dec_35,tqwt_kurtosisValue_dec_36,class
0,0,1,0.85247,0.71826,0.57227,239,0.008064,0.000087,0.00218,0.000018,...,1.5466,1.5620,2.6445,3.8686,5.1221,4.4625,2.6202,3.0004,18.9405,1
1,0,1,0.76686,0.69481,0.53966,233,0.008258,0.000073,0.00195,0.000016,...,1.5530,1.5589,3.6107,23.5155,11.0261,9.5082,6.5245,6.3431,45.1780,1
2,0,1,0.85083,0.67604,0.58982,231,0.008340,0.000060,0.00176,0.000015,...,1.5399,1.5643,2.3308,9.4959,11.0177,4.8066,2.9199,3.1495,4.7666,1
3,1,0,0.41121,0.79672,0.59257,177,0.010858,0.000183,0.00419,0.000046,...,6.9761,3.7805,3.5664,5.2558,4.2235,4.6857,4.8460,6.2650,4.0603,1
4,1,0,0.32790,0.79782,0.53028,235,0.008162,0.002669,0.00535,0.000044,...,7.8832,6.1727,5.8416,6.0805,7.7817,11.6891,8.2103,5.0559,6.1164,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117,221,0,0.75490,0.62956,0.59795,497,0.003876,0.000099,0.00097,0.000004,...,2.5234,3.0159,2.7802,3.2202,5.6255,5.0766,3.5998,2.9165,3.4242,0
1118,123,0,0.80766,0.73961,0.20569,444,0.004335,0.000024,0.00072,0.000003,...,104.9547,75.3909,42.6083,16.8329,6.9441,3.7733,3.4248,4.5198,3.2221,0
1119,96,1,0.79551,0.61508,0.56689,188,0.010242,0.000062,0.00211,0.000022,...,1.7180,1.6951,1.6727,4.3372,17.0599,7.7388,7.0915,7.4624,3.2975,0
1120,123,0,0.83918,0.80293,0.29879,364,0.005275,0.001674,0.00174,0.000009,...,11.8291,7.0768,3.9566,3.7104,4.3985,2.8546,2.7479,3.3696,3.1582,0


(1122, 480)
0       1
1       1
2       1
3       1
4       1
       ..
1117    0
1118    0
1119    0
1120    0
1121    0
Name: class, Length: 1122, dtype: int64


In [10]:
# split for: standard train_test_split
# (single stratified split with test_size=0.2 and the fixed seed; Kfolds is unused in this mode)

ds_standard = DataSplitter(
    X=X,
    y=y,
    cv_mode='standard',
    rand_mode='non-random',
    out_dir=f'{out_dir}/standard',
    tsize=0.2,
    Kfolds=-1,
)
ds_standard.split_and_save()

- some book keeping:
  compared to 564 / 192 = 2.94x 1's (PD) to 0's (HC) in the full dataset,
  -- y_train has 449 / 448 = 1.00x 1's to 0's
     y_test has 112 / 113 = 0.99x 1's to 0's
- saving y's and X's to csv...
  ...done.


In [11]:
# split for: stratified K-fold CV split
# (5 stratified folds with the fixed seed; tsize is unused in this mode)

ds_kfold = DataSplitter(
    X=X,
    y=y,
    cv_mode='kfold',
    rand_mode='non-random',
    out_dir=f'{out_dir}/kfold',
    tsize=-1,
    Kfolds=5,
)
ds_kfold.split_and_save()

- some book keeping:
  compared to 564 / 192 = 2.94x 1's (PD) to 0's (HC) in the full dataset,
  < fold 0 >
  -- y_train has 448 / 449 = 1.00x 1's to 0's
     y_test has 113 / 112 = 1.01x 1's to 0's
  < fold 1 >
  -- y_train has 449 / 448 = 1.00x 1's to 0's
     y_test has 112 / 113 = 0.99x 1's to 0's
  < fold 2 >
  -- y_train has 449 / 449 = 1.00x 1's to 0's
     y_test has 112 / 112 = 1.00x 1's to 0's
  < fold 3 >
  -- y_train has 449 / 449 = 1.00x 1's to 0's
     y_test has 112 / 112 = 1.00x 1's to 0's
  < fold 4 >
  -- y_train has 449 / 449 = 1.00x 1's to 0's
     y_test has 112 / 112 = 1.00x 1's to 0's
- saving y's and X's to csv...
  -- saving fold 0
  -- saving fold 1
  -- saving fold 2
  -- saving fold 3
  -- saving fold 4
  ...done.


In [None]:
# F- I-- N---