In [1]:
import os
import time
import math
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold

# root data folder (relative to this notebook): the input csv is read from here
# and the split outputs are written underneath it
data_dir = '../data'

In [2]:
# load the reduced speech-feature table and show it together with its shape
csv_path = data_dir + '/pd_speech_features_reduced.csv'
df = pd.read_csv(csv_path)
display(df)
print(df.shape)

# target vector: the 'class' column
y = df['class']
print(y)

# feature matrix: every column except the identifier and the label
# NOTE(review): if the csv carries a saved row-index column (e.g. 'Unnamed: 0'),
#               it stays in `features` -- confirm that this is intended
features = df.columns.tolist()
for non_feature in ('id', 'class'):
    features.remove(non_feature)
X = df[features]

Unnamed: 0,id,gender,PPE,DFA,RPDE,numPulses,numPeriodsPulses,meanPeriodPulses,stdDevPeriodPulses,locPctJitter,...,tqwt_kurtosisValue_dec_28,tqwt_kurtosisValue_dec_29,tqwt_kurtosisValue_dec_30,tqwt_kurtosisValue_dec_31,tqwt_kurtosisValue_dec_32,tqwt_kurtosisValue_dec_33,tqwt_kurtosisValue_dec_34,tqwt_kurtosisValue_dec_35,tqwt_kurtosisValue_dec_36,class
0,0,1,0.85247,0.71826,0.57227,240,239,0.008064,0.000087,0.00218,...,1.5620,2.6445,3.8686,4.2105,5.1221,4.4625,2.6202,3.0004,18.9405,1
1,0,1,0.76686,0.69481,0.53966,234,233,0.008258,0.000073,0.00195,...,1.5589,3.6107,23.5155,14.1962,11.0261,9.5082,6.5245,6.3431,45.1780,1
2,0,1,0.85083,0.67604,0.58982,232,231,0.008340,0.000060,0.00176,...,1.5643,2.3308,9.4959,10.7458,11.0177,4.8066,2.9199,3.1495,4.7666,1
3,1,0,0.41121,0.79672,0.59257,178,177,0.010858,0.000183,0.00419,...,3.7805,3.5664,5.2558,14.0403,4.2235,4.6857,4.8460,6.2650,4.0603,1
4,1,0,0.32790,0.79782,0.53028,236,235,0.008162,0.002669,0.00535,...,6.1727,5.8416,6.0805,5.7621,7.7817,11.6891,8.2103,5.0559,6.1164,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
751,250,0,0.80903,0.56355,0.28385,417,416,0.004627,0.000052,0.00064,...,3.0706,3.0190,3.1212,2.4921,3.5844,3.5400,3.3805,3.2003,6.8671,0
752,250,0,0.16084,0.56499,0.59194,415,413,0.004550,0.000220,0.00143,...,1.9704,1.7451,1.8277,2.4976,5.2981,4.2616,6.3042,10.9058,28.4170,0
753,251,0,0.88389,0.72335,0.46815,381,380,0.005069,0.000103,0.00076,...,51.5607,44.4641,26.1586,6.3076,2.8601,2.5361,3.5377,3.3545,5.0424,0
754,251,0,0.83782,0.74890,0.49823,340,339,0.005679,0.000055,0.00092,...,19.1607,12.8312,8.9434,2.2044,1.9496,1.9664,2.6801,2.8332,3.7131,0


(756, 695)
0      1
1      1
2      1
3      1
4      1
      ..
751    0
752    0
753    0
754    0
755    0
Name: class, Length: 756, dtype: int64


In [3]:
class DataSplitter():
    '''
    CLASS: DataSplitter = splits (X, y) into train/test sets, stratified on y, and saves them to csv
    IN: X = feature table
        y = labels; for the book keeping, 0's are counted and every other label is treated as a 1
        cv_mode = 'kfold' for a stratified K-fold split, anything else for a single train_test_split()
        rand_mode = 'random' to seed from the current time, anything else for the fixed seed
        out_dir = folder that receives the csv's (created if it does not exist)
        tsize = test fraction of the single split (not used if cv_mode == 'kfold')
        Kfolds = number of folds (not used if cv_mode != 'kfold')
    NOTE(review): the splits are stratified on the label only. If several rows of X come from the
                  same subject, rows of one subject can end up in both train and test -- confirm
                  whether a grouped split is required.
    '''
    FIXED_SEED = 42  # seed used whenever rand_mode != 'random', so that the split is reproducible

    def __init__(self, X: pd.DataFrame, y: pd.Series, cv_mode: str, rand_mode: str, out_dir: str,
                 tsize: float, Kfolds: int):
        self.X = X
        self.y = y
        self.cv_mode = cv_mode
        self.rand_mode = rand_mode
        self.out_dir = out_dir

        self.tsize = tsize    # not used if cv_mode == 'kfold'
        self.Kfolds = Kfolds  # not used if cv_mode != 'kfold'

    def _random_state(self) -> int:
        '''
        METHOD: _random_state = pick the seed shared by both kinds of split
        OUT: the current unix time floored to whole seconds if rand_mode == 'random', FIXED_SEED otherwise
        '''
        if self.rand_mode == 'random':
            return math.floor(time.time())
        return self.FIXED_SEED

    @staticmethod
    def _class_counts(labels) -> tuple:
        '''
        METHOD: _class_counts = count the labels of one split
        IN: labels = the labels to count
        OUT: (count1, count0) = number of labels that are not 0, number of labels equal to 0
        '''
        count0 = int(np.count_nonzero(np.asarray(labels) == 0))
        return len(labels) - count0, count0

    @staticmethod
    def _ratio(count1: int, count0: int) -> float:
        '''
        METHOD: _ratio = count1 / count0 without raising when there are no 0's
        OUT: the ratio; inf if there are 1's but no 0's, nan if both counts are 0
        '''
        if count0 == 0:
            return math.inf if count1 else math.nan
        return count1 / count0

    def _standard_split(self) -> None:
        '''
        METHOD: _standard_split = use train_test_split() to split the data, stratified on y
                                  the result is stored in self.X_train, self.X_test, self.y_train, self.y_test
        '''
        self.X_train, self.X_test, self.y_train, self.y_test = \
            train_test_split(self.X, self.y, test_size=self.tsize, stratify=self.y,
                             random_state=self._random_state())

    def _kfold_train_test_split(self, train: np.ndarray, test: np.ndarray) -> tuple:
        '''
        METHOD: _kfold_train_test_split = this acts as the K-fold'ed version of train_test_split()
        IN: train = a list of indices designating the train data (the rest of the folds)
            test = a list of indices designating the test data (a single fold)
        OUT: X_train, X_test, y_train, y_test = the actual data split by the folding
                                                (the X's as numpy arrays, the y's as pandas Series)
        '''
        # NOTE: StratifiedKFold() splits in terms of iloc indices
        X_train = self.X.iloc[train, :].values
        X_test = self.X.iloc[test, :].values
        y_train = self.y.iloc[train]
        y_test = self.y.iloc[test]
        return X_train, X_test, y_train, y_test

    def _kfold_split(self) -> None:
        '''
        METHOD: _kfold_split = this is the K-fold'ed version of _standard_split above
                               it uses a shuffled StratifiedKFold() and _kfold_train_test_split()
                               the per-fold results are stored in the lists self.K_X_train, self.K_X_test,
                               self.K_y_train, self.K_y_test
        '''
        skf = StratifiedKFold(n_splits=self.Kfolds, shuffle=True, random_state=self._random_state())
        self.K_X_train = []
        self.K_X_test = []
        self.K_y_train = []
        self.K_y_test = []
        for train, test in skf.split(self.X, self.y):
            X_train, X_test, y_train, y_test = self._kfold_train_test_split(train, test)
            self.K_X_train.append(X_train)
            self.K_X_test.append(X_test)
            self.K_y_train.append(y_train)
            self.K_y_test.append(y_test)

    def _book_keeping(self, y_train: pd.Series, y_test: pd.Series) -> None:
        '''
        METHOD: _book_keeping = print the ratio of 1's to 0's in the train and test labels
        IN: y_train, y_test = the labels of the two halves of one split
        '''
        train1, train0 = self._class_counts(y_train)
        test1, test0 = self._class_counts(y_test)

        print(f"  -- y_train has {train1} / {train0} = {self._ratio(train1, train0):.2f}x 1's to 0's")
        print(f"     y_test has {test1} / {test0} = {self._ratio(test1, test0):.2f}x 1's to 0's")

    def perform_splitting(self) -> None:
        '''
        METHOD: perform_splitting = run _kfold_split() or _standard_split() and print the class balance
                                    of every split next to the class balance of the full y
        '''
        # the reference ratio is derived from self.y so that it stays correct for any dataset
        full1, full0 = self._class_counts(self.y)
        print("- some book keeping:")
        print(f"  compared to {full1} / {full0} = {self._ratio(full1, full0):.2f}x 1's (PD) to 0's (HC) in the full dataset,")
        if self.cv_mode == 'kfold':
            self._kfold_split()
            for kk, (y_train, y_test) in enumerate(zip(self.K_y_train, self.K_y_test)):
                print(f"  < fold {kk} >")
                self._book_keeping(y_train, y_test)
        else:
            self._standard_split()
            self._book_keeping(self.y_train, self.y_test)

    def save(self) -> None:
        '''
        METHOD: save = save all the split out data into their respective csv's
                       the y's are written with Series.to_csv(), the X's with np.savetxt() (values only)
                       perform_splitting() has to be run first, it creates the data saved here
        '''
        os.makedirs(self.out_dir, exist_ok=True)

        print("- saving y's and X's to csv...")
        if self.cv_mode == 'kfold':
            folds = zip(self.K_X_train, self.K_X_test, self.K_y_train, self.K_y_test)
            for kk, (X_train, X_test, y_train, y_test) in enumerate(folds):
                print(f"  -- saving fold {kk}")
                tag = f"-{kk}_kfold{self.Kfolds}.csv"
                y_train.to_csv(os.path.join(self.out_dir, 'y_train' + tag))
                y_test.to_csv(os.path.join(self.out_dir, 'y_test' + tag))
                np.savetxt(os.path.join(self.out_dir, 'X_train' + tag), X_train, delimiter=',')
                np.savetxt(os.path.join(self.out_dir, 'X_test' + tag), X_test, delimiter=',')
        else:
            self.y_train.to_csv(os.path.join(self.out_dir, 'y_train.csv'))
            self.y_test.to_csv(os.path.join(self.out_dir, 'y_test.csv'))
            np.savetxt(os.path.join(self.out_dir, 'X_train.csv'), self.X_train, delimiter=',')
            np.savetxt(os.path.join(self.out_dir, 'X_test.csv'), self.X_test, delimiter=',')
        print("  ...done.")

    def split_and_save(self) -> None:
        '''
        METHOD: split_and_save = run perform_splitting() and save() in succession
        '''
        self.perform_splitting()
        self.save()

In [4]:
# single stratified train/test split (80/20) with the fixed seed;
# Kfolds is not used in this mode, hence the -1 placeholder

ds_standard = DataSplitter(
    X=X,
    y=y,
    cv_mode='standard',
    rand_mode='non-random',
    out_dir=data_dir + '/split/standard',
    tsize=0.2,
    Kfolds=-1,
)
ds_standard.split_and_save()

- some book keeping:
  compared to 564 / 192 = 2.94x 1's (PD) to 0's (HC) in the full dataset,
  -- y_train has 451 / 153 = 2.95x 1's to 0's
     y_test has 113 / 39 = 2.90x 1's to 0's
- saving y's and X's to csv...
  ...done.


In [5]:
# stratified 5-fold CV split with the fixed seed;
# tsize is not used in this mode, hence the -1 placeholder

ds_kfold = DataSplitter(
    X=X,
    y=y,
    cv_mode='kfold',
    rand_mode='non-random',
    out_dir=data_dir + '/split/kfold',
    tsize=-1,
    Kfolds=5,
)
ds_kfold.split_and_save()

- some book keeping:
  compared to 564 / 192 = 2.94x 1's (PD) to 0's (HC) in the full dataset,
  < fold 0 >
  -- y_train has 451 / 153 = 2.95x 1's to 0's
     y_test has 113 / 39 = 2.90x 1's to 0's
  < fold 1 >
  -- y_train has 451 / 154 = 2.93x 1's to 0's
     y_test has 113 / 38 = 2.97x 1's to 0's
  < fold 2 >
  -- y_train has 451 / 154 = 2.93x 1's to 0's
     y_test has 113 / 38 = 2.97x 1's to 0's
  < fold 3 >
  -- y_train has 451 / 154 = 2.93x 1's to 0's
     y_test has 113 / 38 = 2.97x 1's to 0's
  < fold 4 >
  -- y_train has 452 / 153 = 2.95x 1's to 0's
     y_test has 112 / 39 = 2.87x 1's to 0's
- saving y's and X's to csv...
  -- saving fold 0
  -- saving fold 1
  -- saving fold 2
  -- saving fold 3
  -- saving fold 4
  ...done.


In [None]:
# F- I-- N---