In [1]:
import os
import time
import math
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold

# Root directory (relative to the notebook's working directory) holding the input csv
# and receiving the split outputs written by later cells.
data_dir = '../data'

In [2]:
# Read the reduced speech-feature table, then show the frame and its dimensions.
df = pd.read_csv(f'{data_dir}/pd_speech_features_reduced.csv')
display(df)
print(df.shape)

# Target vector: the 'class' column of the table.
y = df['class']
print(y)

# Feature matrix: every column except the identifier ('id') and the label ('class').
# NOTE(review): the displayed rows show several entries sharing one `id`; confirm that
# splitting row-wise (rather than per id) downstream is intended.
features = df.columns.tolist()
for non_feature in ('id', 'class'):
    features.remove(non_feature)
X = df.loc[:, features]

Unnamed: 0,id,DFA,mean_MFCC_2nd_coef,std_6th_delta,std_7th_delta,std_8th_delta,std_9th_delta,std_10th_delta,std_11th_delta,std_delta_delta_log_energy,...,tqwt_minValue_dec_12,tqwt_minValue_dec_13,tqwt_minValue_dec_14,tqwt_maxValue_dec_11,tqwt_maxValue_dec_12,tqwt_maxValue_dec_13,tqwt_maxValue_dec_14,tqwt_kurtosisValue_dec_26,tqwt_kurtosisValue_dec_27,class
0,0,0.71826,2.48740,0.028830,0.042253,0.028115,0.047180,0.035775,0.020827,0.016392,...,-0.024286,-0.048924,-0.108390,0.022796,0.024286,0.048924,0.105750,1.6058,1.5466,1
1,0,0.69481,2.89860,0.050259,0.037071,0.022166,0.039071,0.030383,0.032360,0.014222,...,-0.099695,-0.112210,-0.101340,0.080975,0.099695,0.112210,0.121580,1.5772,1.5530,1
2,0,0.67604,3.22080,0.054611,0.051511,0.032326,0.047357,0.031811,0.036249,0.039709,...,-0.026241,-0.039886,-0.081836,0.013810,0.026241,0.039886,0.081818,1.5921,1.5399,1
3,1,0.79672,3.10230,0.035774,0.035142,0.032577,0.024291,0.032300,0.029051,0.015560,...,-0.129990,-0.268930,-0.586380,0.111290,0.122970,0.273530,0.567740,1.8829,6.9761,1
4,1,0.79782,2.94510,0.029721,0.033943,0.031989,0.026002,0.028222,0.030469,0.011037,...,-0.115900,-0.227210,-0.505800,0.086621,0.102950,0.235860,0.567490,1.8821,7.8832,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
751,250,0.56355,-1.97830,0.028405,0.039347,0.032791,0.036182,0.030312,0.029681,0.004271,...,-0.491410,-0.651480,-0.974550,0.300560,0.463150,0.646970,0.983470,10.1906,3.4394,0
752,250,0.56499,-1.45210,0.056871,0.051124,0.041971,0.034773,0.057365,0.039443,0.035989,...,-0.369920,-0.591670,-1.129400,0.136800,0.270620,0.727660,1.012000,22.4043,19.6733,0
753,251,0.72335,-1.13730,0.026838,0.038261,0.029193,0.031031,0.030826,0.024491,0.006292,...,-0.298120,-0.357650,-0.335780,0.168010,0.277380,0.348020,0.318670,2.3279,137.3126,0
754,251,0.74890,-0.17925,0.039557,0.032910,0.027188,0.030468,0.031574,0.027947,0.010716,...,-0.190990,-0.269840,-0.296620,0.116660,0.211770,0.265250,0.266000,1.6052,5.4425,0


(756, 52)
0      1
1      1
2      1
3      1
4      1
      ..
751    0
752    0
753    0
754    0
755    0
Name: class, Length: 756, dtype: int64


In [3]:
class DataSplitter():
    '''
    CLASS: DataSplitter = splits (X, y) into train/test parts and saves them as csv's
    IN: X = feature table
        y = labels; the book keeping counts entries equal to 0 as 0's and everything else as 1's
        cv_mode = 'kfold' for a stratified K-fold split; any other value gives a single
                  stratified train/test split
        rand_mode = 'random' seeds the split from the current time; any other value uses DEFAULT_SEED
        out_dir = directory the csv's are written to (created if missing)
        tsize = test fraction for the single split (not used if cv_mode == 'kfold')
        Kfolds = number of folds (not used if cv_mode != 'kfold')
    '''
    DEFAULT_SEED = 42  # fixed seed for reproducible splits when rand_mode != 'random'

    def __init__(self, X: pd.DataFrame, y: pd.Series, cv_mode: str, rand_mode: str, out_dir: str,
                 tsize: float, Kfolds: int):
        self.X = X
        self.y = y
        self.cv_mode = cv_mode
        self.rand_mode = rand_mode
        self.out_dir = out_dir

        self.tsize = tsize    # not used if cv_mode == 'kfold'
        self.Kfolds = Kfolds  # not used if cv_mode != 'kfold'

    def scale_X(self) -> None:
        '''
        METHOD: scale_X = standardise self.X with StandardScaler(), replacing self.X by the result
        NOTE: the scaler is fitted on the whole of self.X, i.e. before any splitting, so rows that
              later land in a test split contribute to the fitted statistics.
        NOTE(review): fit_transform() is expected to hand back a plain array rather than a
                      DataFrame; _kfold_train_test_split() therefore accepts both.
        '''
        sc = StandardScaler()
        self.X = sc.fit_transform(self.X)

    def _random_state(self) -> int:
        '''
        METHOD: _random_state = pick the seed shared by both split flavours
        OUT: the current time in whole seconds if rand_mode == 'random', DEFAULT_SEED otherwise
        '''
        if self.rand_mode == 'random':
            return math.floor(time.time())
        return self.DEFAULT_SEED

    def _standard_split(self) -> None:
        '''
        METHOD: _standard_split = use train_test_split() to split the data, stratified on y,
                                  seeded via _random_state()
        OUT: sets self.X_train, self.X_test, self.y_train, self.y_test
        '''
        self.X_train, self.X_test, self.y_train, self.y_test = \
            train_test_split(self.X, self.y, test_size=self.tsize, stratify=self.y,
                             random_state=self._random_state())

    def _kfold_train_test_split(self, train: np.ndarray, test: np.ndarray) -> tuple:
        '''
        METHOD: _kfold_train_test_split = this acts as the K-fold'ed version of train_test_split()
        IN: train = a list of indices designating the train data (the rest of the folds)
            test = a list of indices designating the test data (a single fold)
        OUT: X_train, X_test, y_train, y_test = the actual data split by the folding
             (X parts as arrays, y parts as slices of self.y)
        '''
        # NOTE: StratifiedKFold() splits in terms of positional (iloc) indices
        if isinstance(self.X, pd.DataFrame):
            X_train = self.X.iloc[train, :].values
            X_test = self.X.iloc[test, :].values
        else:
            # self.X is no longer a DataFrame once scale_X() has run, so it has no .iloc;
            # index it positionally as an array instead
            X_all = np.asarray(self.X)
            X_train = X_all[train, :]
            X_test = X_all[test, :]
        y_train = self.y.iloc[train]
        y_test = self.y.iloc[test]
        return X_train, X_test, y_train, y_test

    def _kfold_split(self) -> None:
        '''
        METHOD: _kfold_split = this is the K-fold'ed version of _standard_split above
                               it uses StratifiedKFold() (shuffled, seeded via _random_state())
                               and _kfold_train_test_split()
        OUT: sets self.K_X_train, self.K_X_test, self.K_y_train, self.K_y_test (one entry per fold)
        '''
        skf = StratifiedKFold(n_splits=self.Kfolds, shuffle=True, random_state=self._random_state())
        self.K_X_train = []
        self.K_X_test = []
        self.K_y_train = []
        self.K_y_test = []
        for train, test in skf.split(self.X, self.y):
            X_train, X_test, y_train, y_test = self._kfold_train_test_split(train, test)
            self.K_X_train.append(X_train)
            self.K_X_test.append(X_test)
            self.K_y_train.append(y_train)
            self.K_y_test.append(y_test)

    @staticmethod
    def _class_counts(labels) -> tuple:
        '''
        METHOD: _class_counts = count the labels
        OUT: (ones, zeros) where zeros = number of entries equal to 0 and ones = all the rest
        '''
        zeros = int((np.asarray(labels) == 0).sum())
        return len(labels) - zeros, zeros

    @staticmethod
    def _ratio(ones: int, zeros: int) -> float:
        '''
        METHOD: _ratio = ones / zeros without raising when there are no 0's
        OUT: the ratio; inf if there are only 1's, nan if there are no labels at all
        '''
        if zeros:
            return ones / zeros
        return math.inf if ones else math.nan

    def _book_keeping(self, y_train: pd.Series, y_test: pd.Series) -> None:
        '''
        METHOD: book_keeping = check the ratio of 1's to 0's in the splits
        '''
        train_ones, train_zeros = self._class_counts(y_train)
        test_ones, test_zeros = self._class_counts(y_test)

        print(f"  -- y_train has {train_ones} / {train_zeros} = {self._ratio(train_ones, train_zeros):.2f}x 1's to 0's")
        print(f"     y_test has {test_ones} / {test_zeros} = {self._ratio(test_ones, test_zeros):.2f}x 1's to 0's")

    def perform_splitting(self) -> None:
        '''
        METHOD: perform_splitting = run _kfold_split() or _standard_split() and print the class
                                    balance of every split next to that of the full self.y
        '''
        # the reference ratio is computed from self.y so it stays correct for any dataset
        full_ones, full_zeros = self._class_counts(self.y)
        print("- some book keeping:")
        print(f"  compared to {full_ones} / {full_zeros} = {self._ratio(full_ones, full_zeros):.2f}x "
              f"1's (PD) to 0's (HC) in the full dataset,")
        if self.cv_mode == 'kfold':
            self._kfold_split()
            for kk in range(self.Kfolds):
                print(f"  < fold {kk} >")
                self._book_keeping(self.K_y_train[kk], self.K_y_test[kk])
        else:
            self._standard_split()
            self._book_keeping(self.y_train, self.y_test)

    def save(self) -> None:
        '''
        METHOD: save = save all the split out data into their respective csv's
        NOTE: must run after perform_splitting(); y's are written with Series.to_csv(),
              X's with np.savetxt() (comma separated)
        '''
        # exist_ok avoids failing if the directory appears between a check and the creation
        os.makedirs(self.out_dir, exist_ok=True)

        print("- saving y's and X's to csv...")
        if self.cv_mode == 'kfold':
            for kk in range(self.Kfolds):
                print(f"  -- saving fold {kk}")
                suffix = f"-{kk}_kfold{self.Kfolds}.csv"
                self.K_y_train[kk].to_csv(os.path.join(self.out_dir, 'y_train' + suffix))
                self.K_y_test[kk].to_csv(os.path.join(self.out_dir, 'y_test' + suffix))
                np.savetxt(os.path.join(self.out_dir, 'X_train' + suffix), self.K_X_train[kk], delimiter=',')
                np.savetxt(os.path.join(self.out_dir, 'X_test' + suffix), self.K_X_test[kk], delimiter=',')
        else:
            self.y_train.to_csv(os.path.join(self.out_dir, 'y_train.csv'))
            self.y_test.to_csv(os.path.join(self.out_dir, 'y_test.csv'))
            np.savetxt(os.path.join(self.out_dir, 'X_train.csv'), self.X_train, delimiter=',')
            np.savetxt(os.path.join(self.out_dir, 'X_test.csv'), self.X_test, delimiter=',')
        print("  ...done.")

    def split_and_save(self) -> None:
        '''
        METHOD: split_and_save = run the above methods in succession
        '''
        self.perform_splitting()
        self.save()

In [4]:
# Single stratified train/test split with the fixed seed, written to <data_dir>/split/standard.
ds_standard = DataSplitter(
    X=X,
    y=y,
    cv_mode='standard',
    rand_mode='non-random',
    out_dir=data_dir + '/split/standard',
    tsize=0.2,   # fraction of rows held out as the test set
    Kfolds=-1,   # placeholder: not used unless cv_mode == 'kfold'
)
ds_standard.split_and_save()

- some book keeping:
  compared to 564 / 192 = 2.94x 1's (PD) to 0's (HC) in the full dataset,
  -- y_train has 451 / 153 = 2.95x 1's to 0's
     y_test has 113 / 39 = 2.90x 1's to 0's
- saving y's and X's to csv...
  ...done.


In [5]:
# Stratified 5-fold cross-validation split with the fixed seed, written to <data_dir>/split/kfold.
ds_kfold = DataSplitter(
    X=X,
    y=y,
    cv_mode='kfold',
    rand_mode='non-random',
    out_dir=data_dir + '/split/kfold',
    tsize=-1,    # placeholder: not used when cv_mode == 'kfold'
    Kfolds=5,    # number of folds
)
ds_kfold.split_and_save()

- some book keeping:
  compared to 564 / 192 = 2.94x 1's (PD) to 0's (HC) in the full dataset,
  < fold 0 >
  -- y_train has 451 / 153 = 2.95x 1's to 0's
     y_test has 113 / 39 = 2.90x 1's to 0's
  < fold 1 >
  -- y_train has 451 / 154 = 2.93x 1's to 0's
     y_test has 113 / 38 = 2.97x 1's to 0's
  < fold 2 >
  -- y_train has 451 / 154 = 2.93x 1's to 0's
     y_test has 113 / 38 = 2.97x 1's to 0's
  < fold 3 >
  -- y_train has 451 / 154 = 2.93x 1's to 0's
     y_test has 113 / 38 = 2.97x 1's to 0's
  < fold 4 >
  -- y_train has 452 / 153 = 2.95x 1's to 0's
     y_test has 112 / 39 = 2.87x 1's to 0's
- saving y's and X's to csv...
  -- saving fold 0
  -- saving fold 1
  -- saving fold 2
  -- saving fold 3
  -- saving fold 4
  ...done.


In [None]:
# F- I-- N---