In [1]:
import os
import time
import math
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold

# Base folder for the project's data files (relative path); later cells build
# both the input csv path and the output split folders from it.
data_dir = '../data'

In [2]:
class DataSplitter():
    '''
    CLASS: DataSplitter = splits features X and labels y into train/test pieces and saves them as csv's
    IN: X = feature table (one row per sample)
        y = labels aligned with the rows of X (1's and 0's are counted by the book keeping)
        cv_mode = 'kfold' for a stratified K-fold split; any other value gives a single stratified split
        rand_mode = 'random' to seed from the current time; any other value uses the fixed seed
        out_dir = folder the csv's are written to (created if missing)
        tsize = test fraction for the single split (not used if cv_mode == 'kfold')
        Kfolds = number of folds (not used if cv_mode != 'kfold')
    '''
    # seed used whenever rand_mode != 'random', so that the splits are reproducible
    FIXED_SEED = 42
    # class counts of the full dataset, only printed as a reference by perform_splitting()
    FULL_DATASET_ONES = 564   # 1's (PD)
    FULL_DATASET_ZEROS = 192  # 0's (HC)

    def __init__(self, X: pd.DataFrame, y: pd.Series, cv_mode: str, rand_mode: str, out_dir: str,
                 tsize: float, Kfolds: int):
        self.X = X
        self.y = y
        self.cv_mode = cv_mode
        self.rand_mode = rand_mode
        self.out_dir = out_dir
        
        self.tsize = tsize    # not used if cv_mode == 'kfold'
        self.Kfolds = Kfolds  # not used if cv_mode != 'kfold'
    
    def scale_X(self) -> None:
        '''
        METHOD: scale_X = standardize the features with StandardScaler()
        NOTE: self.X becomes a numpy array afterwards (fit_transform() returns an array)
        NOTE(review): the scaler is fit on all of self.X; if this runs before splitting, the test rows
                      contribute to the scaling statistics -- confirm that this is intended
        '''
        sc = StandardScaler()
        self.X = sc.fit_transform(self.X)
    
    def _random_state(self) -> int:
        '''
        METHOD: _random_state = pick the seed shared by both split flavours
        OUT: the current unix time in whole seconds if rand_mode == 'random', FIXED_SEED otherwise
        '''
        if self.rand_mode == 'random':
            return math.floor(time.time())
        return self.FIXED_SEED
    
    def _standard_split(self) -> None:
        '''
        METHOD: _standard_split = use train_test_split() to split the data, stratified on y
                                  results are stored in self.X_train, self.X_test, self.y_train, self.y_test
        '''
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=self.tsize, stratify=self.y, random_state=self._random_state())
    
    def _kfold_train_test_split(self, train: np.ndarray, test: np.ndarray) -> tuple:
        '''
        METHOD: _kfold_train_test_split = this acts as the K-fold'ed version of train_test_split()
        IN: train = a list of indices designating the train data (the rest of the folds)
            test = a list of indices designating the test data (a single fold)
        OUT: X_train, X_test, y_train, y_test = the actual data split by the folding
                                                (X's as numpy arrays, y's as pandas Series)
        '''
        # NOTE: StratifiedKFold() splits in terms of iloc indices
        if isinstance(self.X, pd.DataFrame):
            X_train = self.X.iloc[train, :].values
            X_test = self.X.iloc[test, :].values
        else:
            # self.X is already an array once scale_X() has run, so it has no .iloc
            X_all = np.asarray(self.X)
            X_train = X_all[train]
            X_test = X_all[test]
        y_train = self.y.iloc[train]
        y_test = self.y.iloc[test]
        return X_train, X_test, y_train, y_test
    
    def _kfold_split(self) -> None:
        '''
        METHOD: _kfold_split = this is the K-fold'ed version of _standard_split above
                               it uses StratifiedKFold() and _kfold_train_test_split()
                               results are stored per fold in self.K_X_train, self.K_X_test,
                               self.K_y_train, self.K_y_test
        '''
        skf = StratifiedKFold(n_splits=self.Kfolds, shuffle=True, random_state=self._random_state())
        self.K_X_train = []
        self.K_X_test = []
        self.K_y_train = []
        self.K_y_test = []
        for train, test in skf.split(self.X, self.y):
            X_train, X_test, y_train, y_test = self._kfold_train_test_split(train, test)
            self.K_X_train.append(X_train)
            self.K_X_test.append(X_test)
            self.K_y_train.append(y_train)
            self.K_y_test.append(y_test)
    
    @staticmethod
    def _ratio_text(ones: int, zeros: int) -> str:
        '''
        METHOD: _ratio_text = format ones/zeros as e.g. '1.00x'; 'n/a' when there are no 0's to divide by
        '''
        if zeros == 0:
            return "n/a"
        return f"{ones/zeros:.2f}x"
    
    def _book_keeping(self, y_train: pd.Series, y_test: pd.Series) -> None:
        '''
        METHOD: book_keeping = check the ratio of 1's to 0's in the splits
        IN: y_train, y_test = the labels of one split; everything that is not 0 is counted as a 1
        '''
        y_train_count0 = int((np.asarray(y_train) == 0).sum())
        y_train_count1 = len(y_train) - y_train_count0
        y_test_count0 = int((np.asarray(y_test) == 0).sum())
        y_test_count1 = len(y_test) - y_test_count0
        
        train_ratio = self._ratio_text(y_train_count1, y_train_count0)
        test_ratio = self._ratio_text(y_test_count1, y_test_count0)
        print(f"  -- y_train has {y_train_count1} / {y_train_count0} = {train_ratio} 1's to 0's")
        print(f"     y_test has {y_test_count1} / {y_test_count0} = {test_ratio} 1's to 0's")
    
    def perform_splitting(self) -> None:
        '''
        METHOD: perform_splitting = run _kfold_split() or _standard_split(), then print the class balance
                                    of every resulting split next to that of the full dataset
        '''
        ones, zeros = self.FULL_DATASET_ONES, self.FULL_DATASET_ZEROS
        print("- some book keeping:")
        print(f"  compared to {ones} / {zeros} = {ones/zeros:.2f}x 1's (PD) to 0's (HC) in the full dataset,")
        if self.cv_mode == 'kfold':
            self._kfold_split()
            for kk in range(self.Kfolds):
                print(f"  < fold {kk} >")
                self._book_keeping(self.K_y_train[kk], self.K_y_test[kk])
        else:
            self._standard_split()
            self._book_keeping(self.y_train, self.y_test)
    
    def save(self) -> None:
        '''
        METHOD: save = save all the split out data into their respective csv's
                       (perform_splitting() has to run first, it creates the data saved here)
                       y's are written with Series.to_csv(), X's with np.savetxt() (comma delimited)
        '''
        # exist_ok avoids the check-then-create race of a separate isdir() test
        os.makedirs(self.out_dir, exist_ok=True)
        
        print("- saving y's and X's to csv...")
        if self.cv_mode == 'kfold':
            for kk in range(self.Kfolds):
                print(f"  -- saving fold {kk}")
                tag = f"-{kk}_kfold{self.Kfolds}.csv"
                self.K_y_train[kk].to_csv(os.path.join(self.out_dir, 'y_train' + tag))
                self.K_y_test[kk].to_csv(os.path.join(self.out_dir, 'y_test' + tag))
                np.savetxt(os.path.join(self.out_dir, 'X_train' + tag), self.K_X_train[kk], delimiter=',')
                np.savetxt(os.path.join(self.out_dir, 'X_test' + tag), self.K_X_test[kk], delimiter=',')
        else:
            self.y_train.to_csv(os.path.join(self.out_dir, 'y_train.csv'))
            self.y_test.to_csv(os.path.join(self.out_dir, 'y_test.csv'))
            np.savetxt(os.path.join(self.out_dir, 'X_train.csv'), self.X_train, delimiter=',')
            np.savetxt(os.path.join(self.out_dir, 'X_test.csv'), self.X_test, delimiter=',')
        print("  ...done.")
    
    def split_and_save(self) -> None:
        '''
        METHOD: split_and_save = run perform_splitting() and save() in succession
        '''
        self.perform_splitting()
        self.save()

In [33]:
# Dataset variants and the output folder each one is paired with (switch both lines together):
#   pd_speech_features_reduced.csv                 -> /split/pure
#   pd_speech_features_reduced_outrmv.csv          -> /split/outrmv
#   pd_speech_features_reduced_outrmv_balanced.csv -> /split/outrmv-balanced   (active)
df = pd.read_csv(f"{data_dir}/pd_speech_features_reduced_outrmv_balanced.csv")
out_dir = f"{data_dir}/split/outrmv-balanced"

# quick look at the loaded table and its dimensions
display(df)
print(df.shape)

# labels: the 'class' column
y = df['class']
print(y)

# predictors: every column except the 'id' and the 'class' label
features = df.columns.tolist()
features.remove('id')
features.remove('class')
X = df[features]

Unnamed: 0,id,gender,PPE,DFA,RPDE,numPeriodsPulses,meanPeriodPulses,stdDevPeriodPulses,ddpJitter,apq5Shimmer,...,tqwt_kurtosisValue_dec_26,tqwt_kurtosisValue_dec_27,tqwt_kurtosisValue_dec_28,tqwt_kurtosisValue_dec_29,tqwt_kurtosisValue_dec_30,tqwt_kurtosisValue_dec_31,tqwt_kurtosisValue_dec_33,tqwt_kurtosisValue_dec_35,tqwt_kurtosisValue_dec_36,class
0,0,1,0.823387,0.696370,0.567250,234.333333,0.008220,0.000073,0.001760,0.045560,...,1.591700,1.546500,1.561733,2.862000,12.293333,9.717500,6.259100,4.164333,22.961700,1
1,1,0,0.415637,0.793993,0.592453,211.000000,0.008884,0.001849,0.005473,0.036503,...,1.881900,7.049367,4.918567,4.827133,6.117633,8.599667,7.933133,4.941833,4.467233,1
2,2,1,0.801973,0.619967,0.520563,318.333333,0.006041,0.000104,0.000973,0.026073,...,1.590333,1.581967,41.129400,31.201933,14.584467,5.446800,4.772067,11.848100,5.552367,1
3,3,0,0.828707,0.626097,0.537183,492.000000,0.003913,0.000042,0.000260,0.027467,...,5.676767,2.382533,1.677633,1.908400,2.842167,3.493867,3.085267,4.032933,22.773633,1
4,4,0,0.831287,0.779397,0.726717,361.666667,0.005622,0.002023,0.003290,0.108363,...,3.614567,3.881267,4.104600,4.285233,2.953200,2.799933,2.811367,13.338833,63.766900,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
371,173,1,0.816993,0.590523,0.485107,343.333333,0.005610,0.000048,0.000377,0.047977,...,1.742567,7.019500,6.661367,6.797900,7.258700,4.913100,2.478833,4.062433,5.462367,0
372,189,0,0.834303,0.687160,0.294950,406.666667,0.004734,0.000037,0.000360,0.031340,...,3.222667,50.354967,20.084500,5.067233,3.050800,3.202900,2.935767,3.099533,3.476433,0
373,75,0,0.597490,0.802380,0.431527,316.000000,0.006081,0.000935,0.001600,0.062293,...,1.598133,2.165500,8.089833,6.083700,5.110700,4.043133,4.045600,4.669567,3.557500,0
374,103,0,0.695677,0.705133,0.593263,318.000000,0.007010,0.000683,0.001650,0.033240,...,54.418867,36.123100,27.244533,7.919000,3.229800,3.667867,2.823633,3.133967,3.184200,0


(376, 363)
0      1
1      1
2      1
3      1
4      1
      ..
371    0
372    0
373    0
374    0
375    0
Name: class, Length: 376, dtype: int64


In [34]:
# Single stratified train/test split: 20% test, fixed seed; Kfolds is unused in this mode (-1 placeholder).
ds_standard = DataSplitter(
    X=X,
    y=y,
    cv_mode='standard',
    rand_mode='non-random',
    out_dir=out_dir + '/standard',
    tsize=0.2,
    Kfolds=-1,
)
ds_standard.split_and_save()

- some book keeping:
  compared to 564 / 192 = 2.94x 1's (PD) to 0's (HC) in the full dataset,
  -- y_train has 150 / 150 = 1.00x 1's to 0's
     y_test has 38 / 38 = 1.00x 1's to 0's
- saving y's and X's to csv...
  ...done.


In [35]:
# Stratified 5-fold CV split with a fixed seed; tsize is unused in this mode (-1 placeholder).
ds_kfold = DataSplitter(
    X=X,
    y=y,
    cv_mode='kfold',
    rand_mode='non-random',
    out_dir=out_dir + '/kfold',
    tsize=-1,
    Kfolds=5,
)
ds_kfold.split_and_save()

- some book keeping:
  compared to 564 / 192 = 2.94x 1's (PD) to 0's (HC) in the full dataset,
  < fold 0 >
  -- y_train has 150 / 150 = 1.00x 1's to 0's
     y_test has 38 / 38 = 1.00x 1's to 0's
  < fold 1 >
  -- y_train has 150 / 151 = 0.99x 1's to 0's
     y_test has 38 / 37 = 1.03x 1's to 0's
  < fold 2 >
  -- y_train has 150 / 151 = 0.99x 1's to 0's
     y_test has 38 / 37 = 1.03x 1's to 0's
  < fold 3 >
  -- y_train has 151 / 150 = 1.01x 1's to 0's
     y_test has 37 / 38 = 0.97x 1's to 0's
  < fold 4 >
  -- y_train has 151 / 150 = 1.01x 1's to 0's
     y_test has 37 / 38 = 0.97x 1's to 0's
- saving y's and X's to csv...
  -- saving fold 0
  -- saving fold 1
  -- saving fold 2
  -- saving fold 3
  -- saving fold 4
  ...done.


In [None]:
# F- I-- N---