In [1]:
import PyQt5
%config InlineBackend.figure_format = 'retina'
%matplotlib qt5

#python packages
import numpy as np
import scipy as sp
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
import pac
import warnings

import seaborn as sns
sns.set_style('white')

import imp
import importlib

import shape
import utils
from load_features import load_WaveformShape_features, Bandpower_features, mean_and_peak_freqs, statistics, fractal_dimensions, entropies
from val_metrics import *

# Re-import the local helper modules so edits made to utils.py / shape.py on
# disk are picked up without restarting the kernel. importlib.reload is the
# supported replacement for the deprecated imp.reload.
importlib.reload(utils)
importlib.reload(shape)

<module 'shape' from 'C:\\Users\\USER\\Documents\\Yachay_Tech\\Thesis_Project\\ParkinsonsDetection\\python_scripts\\shape.py'>


# Visualizing cross-validation behavior in scikit-learn

Choosing the right cross-validation object is a crucial part of fitting a
model properly. There are many ways to split data into training and test
sets in order to avoid model overfitting, to standardize the number of
groups in test sets, etc.

This example visualizes the behavior of several common scikit-learn objects
for comparison.


In [2]:
from sklearn.model_selection import (TimeSeriesSplit, KFold, ShuffleSplit,
                                     StratifiedKFold, GroupShuffleSplit,
                                     GroupKFold, StratifiedShuffleSplit)
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
# Fix the seed of NumPy's global random number generator.
np.random.seed(1338)
# Colormap used for the class and group rows of the plots below.
cmap_data = plt.cm.Paired
# Colormap used for the train/test rows of the cross-validation plots.
cmap_cv = plt.cm.coolwarm
# Number of splits requested from every cross-validation object below.
n_splits = 10

## Visualize our data

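First, we must understand the structure of our data. Instead of the randomly
generated points used in the original scikit-learn example, this notebook uses
the EEG feature matrix `X` and the class labels `y` assembled in the cells
below: two classes (selected by `comparison`) and 81 "groups" of 8 consecutive
samples each.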

As we'll see, some cross-validation objects do specific things with
labeled data, others behave differently with grouped data, and others
do not use this information.

To begin, we'll visualize our data.



In [3]:
"""Which comparison to make:
    1. Off-med vs Controls
    2. On-med vs Controls
    3. Off-med vs On-med
    """
comparison = 1
dataset = 'UNM'  # UCSD or UNM

# Fail fast on an unsupported choice: the labelling and dataset-selection cells
# further down only handle 1, 2 and 3, so any other value would leave their
# variables undefined (or stale from a previous run).
if comparison not in (1, 2, 3):
    raise ValueError(
        "comparison must be 1, 2 or 3, got {!r}".format(comparison))

In [4]:
# Options used when loading the data and meta-data.
EO = False        # passed to utils.loadPD; "EO" stands for Eyes Opened
all_chan = False  # passed to utils.loadPD

# [low, high] edges of each band, in order: Delta, Theta, Alpha, Beta, Gamma.
# NOTE(review): no band covers 12-16 -- confirm this gap is intentional.
bands = [
    [0.5, 4],
    [4, 8],
    [8, 12],
    [16, 32],
    [32, 64],
]

In [5]:
# Load the recording meta-data. In the cells below S, Smed and Sc are used as
# the row counts of the 'off', 'on' and 'C' feature columns respectively, Fs is
# passed on as the `Fs` argument of the feature extractors, and flo/fhi are
# passed to utils.measure_pac. `t` is not used in the cells shown here.
Fs, t, S, Sc, Smed, flo, fhi = utils.loadmeta()  
# `eeg` is indexed below with the keys 'off', 'on' and 'C'; `rejects` is handed
# to the shape and PAC measurement helpers together with it.
eeg,rejects = utils.loadPD(EO, all_chan, dataset) # EO means Eyes Opened

In [6]:
"""This function calculates the shape measures calculated for analysis
    of the PD data set

    1. Peak and trough times(pks,trs)
    2. Peak and trough sharpness(pksharp,trsharp)
    3. Rise and decay steepness(risteep,desteep)
    4. Sharpness ratio(ShR)
    5. Steepness ratio(StR)
    6. Peak-to-trough ratio(PTR)
    7. Rise-to-decay ratio(RDR)
    """
widthS = 3 #To calculate Waveform Shape features

# Waveform-shape measures. ShR, PTR, StR and RDR are indexed below with the
# keys 'off', 'on' and 'C'; pks and trs are not used in the cells shown here.
pks,trs,ShR,PTR,StR,RDR = utils.measure_shape(eeg, rejects, widthS=widthS)
"""
Algorithms for estimating phase-amplitude coupling
"""
# NOTE(review): this rebinds the name `pac`, which the first cell imported as a
# module (`import pac`). After this line `pac` is the measurement result,
# indexed below with 'off', 'on' and 'C', and the module is no longer reachable
# under that name.
pac = utils.measure_pac(eeg,rejects,flo,fhi,Fs=Fs)

In [7]:
"""
Absolute and Relative BandPower features, from all five bands: Delta, Theta, Alpha, Beta, Gamma
"""
# Band-power features for the three recordings ('off', 'on', 'C'), each paired
# with its own count (S, Smed, Sc). The only difference between the two groups
# of calls is the fifth argument: False for the abs_* results, True for rel_*.
abs_powerOff, abs_powerOn, abs_powerCtl = (
    Bandpower_features(eeg[key], Fs, bands, n_subj, False, 'welch')
    for key, n_subj in (('off', S), ('on', Smed), ('C', Sc))
)

rel_powerOff, rel_powerOn, rel_powerCtl = (
    Bandpower_features(eeg[key], Fs, bands, n_subj, True, 'welch')
    for key, n_subj in (('off', S), ('on', Smed), ('C', Sc))
)

In [8]:
"""
Mean and Peak Frequency from the spectrum
"""
# mean_and_peak_freqs returns the mean frequencies at index 0 and the peak
# frequencies at index 1. Call it once per recording and unpack both results,
# instead of running the same computation twice and keeping one half each time.
meanFreqsOff, peakFreqsOff = mean_and_peak_freqs(eeg['off'], Fs, S)[:2]
meanFreqsOn, peakFreqsOn = mean_and_peak_freqs(eeg['on'], Fs, Smed)[:2]
meanFreqsCtl, peakFreqsCtl = mean_and_peak_freqs(eeg['C'], Fs, Sc)[:2]

In [9]:
"""This cell calculates statistical measures extracted from EEG for analysis
    of the PD data set

    1. Mean
    2. Standard Deviation
    3. Skewness
    4. Kurtosis
    5. Maximum
    6. Minimum
    7. 5th percentile value
    8. 25th percentile value
    9. 75th percentile value
    10. 95th percentile value
    11. Median
    12. Variance
    13. Root Mean Square value
    """
# Statistical features for the 'off', 'on' and 'C' recordings, each paired with
# its own count; .get() is called on every statistics(...) result.
statsOff, statsOn, statsCtl = (
    statistics(eeg[key], n_subj).get()
    for key, n_subj in (('off', S), ('on', Smed), ('C', Sc))
)

In [10]:
# Fractal-dimension features for the 'off', 'on' and 'C' recordings, each
# paired with its own count.
fractalOff, fractalOn, fractalCtl = (
    fractal_dimensions(eeg[key], n_subj)
    for key, n_subj in (('off', S), ('on', Smed), ('C', Sc))
)

In [11]:
# Entropy features for the 'off', 'on' and 'C' recordings; the sampling
# frequency Fs is passed as the last argument.
entOff, entOn, entCtl = (
    entropies(eeg[key], n_subj, Fs)
    for key, n_subj in (('off', S), ('on', Smed), ('C', Sc))
)

In [12]:
# Class I ('off' recordings): PAC and the four shape ratios, each reshaped into
# an (S, 1) column so they can be concatenated side by side later.
f1_B, f2_B, f3_B, f4_B, f5_B = (
    np.reshape(measure['off'], (S, 1))
    for measure in (pac, ShR, StR, PTR, RDR)
)
cl_B = np.ones((S, 1))  # class label 1

In [13]:
# create features of class II ('on' recordings), each as an (Smed, 1) column
f1_C    = np.reshape(pac['on'],(Smed,1))
f2_C    = np.reshape(ShR['on'],(Smed,1))
f3_C    = np.reshape(StR['on'],(Smed,1))
f4_C    = np.reshape(PTR['on'],(Smed,1))
f5_C    = np.reshape(RDR['on'],(Smed,1))

# The label of this class depends on the comparison: 0 for comparisons 1 and 3,
# 1 for comparison 2 (where the other class is labelled -1).
if comparison == 1 or comparison == 3:
    cl_C    = np.zeros((Smed,1))
elif comparison == 2:
    cl_C    = np.ones((Smed,1))
else:
    # Without this branch cl_C would stay undefined, or keep its value from an
    # earlier run of the cell, and the error would surface far from its cause.
    raise ValueError(
        "comparison must be 1, 2 or 3, got {!r}".format(comparison))

In [14]:
# Class III ('C' recordings): PAC and the four shape ratios, each reshaped into
# an (Sc, 1) column.
f1_E, f2_E, f3_E, f4_E, f5_E = (
    np.reshape(measure['C'], (Sc, 1))
    for measure in (pac, ShR, StR, PTR, RDR)
)
cl_E = -np.ones((Sc, 1))  # class label -1

In [15]:
# Stack every feature block column-wise, one matrix per recording type. The
# order of the blocks must match the `features` name list, with the class
# label as the last column.
MftB = np.concatenate(
    (f1_B, f2_B, f3_B, f4_B, f5_B,
     rel_powerOff, abs_powerOff,
     meanFreqsOff, peakFreqsOff,
     statsOff, fractalOff, entOff,
     cl_B),
    axis=1)
MftC = np.concatenate(
    (f1_C, f2_C, f3_C, f4_C, f5_C,
     rel_powerOn, abs_powerOn,
     meanFreqsOn, peakFreqsOn,
     statsOn, fractalOn, entOn,
     cl_C),
    axis=1)
MftE = np.concatenate(
    (f1_E, f2_E, f3_E, f4_E, f5_E,
     rel_powerCtl, abs_powerCtl,
     meanFreqsCtl, peakFreqsCtl,
     statsCtl, fractalCtl, entCtl,
     cl_E),
    axis=1)

In [16]:
# Column names of the feature matrix, in the order the feature blocks are
# concatenated: PAC and shape ratios, relative then absolute band powers,
# spectral mean/peak frequency, descriptive statistics, fractal measures and
# entropies.
features = (
    ['PAC', 'ShR', 'StR', 'PtT', 'RtF']
    + [prefix + band
       for prefix in ('rel_', 'abs_')
       for band in ('delta', 'theta', 'alpha', 'beta', 'gamma')]
    + ['meanFreq', 'peakFreq']
    + ['mean', 'std', 'skewness', 'kurtosis', 'maximum', 'minimum']
    + ['{}th perc'.format(pct) for pct in (5, 25, 75, 95)]
    + ['median', 'variance', 'RMS']
    + ['detrended_fluctuation', 'higuchi_fd', 'katz_fd', 'petrosian_fd']
    + ['perm_entropy', 'svd_entropy']
)

In [17]:
# Wrap each feature matrix in a DataFrame whose columns are the feature names
# followed by the 'class' label column.
FCM_B, FCM_C, FCM_E = (
    pd.DataFrame(matrix, columns=[*features, 'class'])
    for matrix in (MftB, MftC, MftE)
)

In [18]:
# Build the two-class dataset for the selected comparison.
#   TotalDataset : the two relevant frames stacked with a fresh index
#   visDat       : an independent copy whose numeric 'class' values are
#                  replaced by readable names (for plotting)

if comparison == 3:
    # Patients off-medication (label 1) vs patients on-medication (label 0)
    TotalDataset = pd.concat([FCM_B,FCM_C],ignore_index=True)
    visDat = TotalDataset.copy(deep=True)
    visDat['class'] = visDat['class'].map({1:'off_med',0:'on_med'})

elif comparison == 2:
    # Patients on-medication vs healthy controls (label -1). The on-medication
    # rows carry label 1 for this comparison, not 0.
    TotalDataset = pd.concat([FCM_C,FCM_E],ignore_index=True)
    visDat = TotalDataset.copy(deep=True)
    visDat['class'] = visDat['class'].map({-1:'control',1:'on_med'})

elif comparison == 1:
    # Patients off-medication (label 1) vs healthy controls (label -1)
    TotalDataset = pd.concat([FCM_E,FCM_B],ignore_index=True)
    visDat = TotalDataset.copy(deep=True)
    visDat['class'] = visDat['class'].map({-1:'control',1:'off_med'})

else:
    # Without this branch an unsupported value would silently keep whatever
    # TotalDataset/visDat a previous run left in the kernel.
    raise ValueError(
        "comparison must be 1, 2 or 3, got {!r}".format(comparison))

In [19]:
# Convert the selected feature columns and the label column to NumPy arrays.
# The label is selected with a list (['class']), so y keeps a second axis:
# it is a single-column 2-D array, not a flat vector.
X = np.asarray(TotalDataset[features])
y = np.asarray(TotalDataset[['class']])

In [26]:
# Generate the class/group data
# n_points = 100
# X = np.random.randn(100, 10)

# percentiles_classes = [.1, .3, .6]
# y = np.hstack([[ii] * int(100 * perc)
#                for ii, perc in enumerate(percentiles_classes)])

# One integer group id per sample: N_GROUPS consecutive blocks of
# SAMPLES_PER_GROUP samples each (0,0,...,0,1,1,...).
# NOTE(review): presumably each block is one subject's samples -- confirm, and
# update these two constants whenever `comparison` or `dataset` changes.
N_GROUPS = 81
SAMPLES_PER_GROUP = 8
groups = np.repeat(np.arange(N_GROUPS), SAMPLES_PER_GROUP)

# The plots and the splitters below need exactly one group id per row of X;
# check it here so a mismatch is reported clearly instead of failing inside
# matplotlib or scikit-learn.
if len(groups) != len(X):
    raise ValueError(
        "groups has {} entries but X has {} rows; adjust N_GROUPS / "
        "SAMPLES_PER_GROUP".format(len(groups), len(X)))


def visualize_groups(classes, groups, name):
    """Plot the group and class of every sample as two coloured rows.

    A new figure is created. The lower row (y = 0.5) is coloured by
    ``groups`` and the upper row (y = 3.5) by ``classes``, both using the
    module-level ``cmap_data`` colormap. The number of points drawn is
    ``len(groups)``.

    Parameters
    ----------
    classes : per-sample values used to colour the upper row.
    groups : per-sample values used to colour the lower row.
    name : accepted but not used by this function.
    """
    n_samples = len(groups)
    sample_idx = range(n_samples)

    fig, ax = plt.subplots()
    # Same marker styling for both rows; only the height and colours differ.
    for row_height, colours in ((.5, groups), (3.5, classes)):
        ax.scatter(sample_idx, [row_height] * n_samples, c=colours,
                   marker='_', lw=50, cmap=cmap_data)

    ax.set(ylim=[-1, 5],
           yticks=[.5, 3.5],
           yticklabels=['Data\ngroup', 'Data\nclass'],
           xlabel="Sample index")


# Draw the group row and the class row for the assembled dataset. The third
# argument is not used by visualize_groups, so the text 'no groups' has no
# effect on the figure.
visualize_groups(y, groups, 'no groups')

## Define a function to visualize cross-validation behavior

We'll define a function that lets us visualize the behavior of each
cross-validation object. We'll perform `n_splits` splits of the data (10 in
this notebook). On each split, we'll visualize the indices chosen for the
training set (in blue) and the test set (in red).



In [27]:
def plot_cv_indices(cv, X, y, group, ax, n_splits, lw=10):
    """Create a sample plot for indices of a cross-validation object.

    One row is drawn per split produced by ``cv``: samples in the test set are
    given the value 1, samples in the training set 0, and samples in neither
    stay NaN. Two more rows, coloured by ``y`` and by ``group``, are drawn
    after the split rows.

    Parameters
    ----------
    cv : object with a ``split(X=..., y=..., groups=...)`` method yielding
        ``(train_indices, test_indices)`` pairs.
    X : sized collection of samples; ``len(X)`` sets the number of points per
        row and the x-axis limit.
    y : per-sample class values, used to colour the class row.
    group : per-sample group values, passed to ``cv.split`` and used to colour
        the group row.
    ax : matplotlib axes to draw on.
    n_splits : number of split rows to label on the y-axis.
    lw : line width of the markers.

    Returns
    -------
    The ``ax`` that was passed in.
    """
    n_samples = len(X)
    sample_idx = range(n_samples)

    # Count the rows actually drawn, so the class/group rows can be placed
    # even if the splitter yields nothing.
    n_rows = 0

    # Generate the training/testing visualizations for each CV split
    for ii, (tr, tt) in enumerate(cv.split(X=X, y=y, groups=group)):
        # Fill in indices with the training/test groups
        indices = np.full(n_samples, np.nan)
        indices[tt] = 1
        indices[tr] = 0

        # Visualize the results
        ax.scatter(sample_idx, [ii + .5] * n_samples,
                   c=indices, marker='_', lw=lw, cmap=cmap_cv,
                   vmin=-.2, vmax=1.2)
        n_rows = ii + 1

    # Plot the data classes and groups at the end
    ax.scatter(sample_idx, [n_rows + .5] * n_samples,
               c=y, marker='_', lw=lw, cmap=cmap_data)

    ax.scatter(sample_idx, [n_rows + 1.5] * n_samples,
               c=group, marker='_', lw=lw, cmap=cmap_data)

    # Formatting. The x-limit comes from this call's own data rather than
    # from a module-level `groups` variable, so the function works for any
    # input it is given. The y-axis is inverted (split 0 at the top).
    yticklabels = list(range(n_splits)) + ['class', 'group']
    ax.set(yticks=np.arange(n_splits+2) + .5, yticklabels=yticklabels,
           xlabel='Sample index', ylabel="CV iteration",
           ylim=[n_splits+2.2, -.2], xlim=[0, n_samples])
    ax.set_title('{}'.format(type(cv).__name__), fontsize=15)
    return ax

Let's see how it looks for the `sklearn.model_selection.KFold`
cross-validation object:



In [28]:
# Plot the splits of a plain KFold with n_splits folds on a fresh figure.
# The call is the last expression, so the returned axes is shown as the
# cell output.
fig, ax = plt.subplots()
cv = KFold(n_splits=n_splits)
plot_cv_indices(cv=cv, X=X, y=y, group=groups, ax=ax, n_splits=n_splits)

<AxesSubplot:title={'center':'KFold'}, xlabel='Sample index', ylabel='CV iteration'>

As you can see, by default the KFold cross-validation iterator does not
take either datapoint class or group into consideration. We can change this
by using the ``StratifiedKFold`` like so.



In [24]:
# Same plot for StratifiedKFold with n_splits folds on a fresh figure.
# The call is the last expression, so the returned axes is shown as the
# cell output.
fig, ax = plt.subplots()
cv = StratifiedKFold(n_splits=n_splits)
plot_cv_indices(cv=cv, X=X, y=y, group=groups, ax=ax, n_splits=n_splits)

<AxesSubplot:title={'center':'StratifiedKFold'}, xlabel='Sample index', ylabel='CV iteration'>

In this case, the cross-validation retained the same ratio of classes across
each CV split. Next we'll visualize this behavior for a number of CV
iterators.

## Visualize cross-validation indices for many CV objects

Let's visually compare the cross validation behavior for many
scikit-learn cross-validation objects. Below we will loop through several
common cross-validation objects, visualizing the behavior of each.

Note how some use the group/class information while others do not.



In [25]:
# Cross-validation classes to compare; each is instantiated below with the
# same n_splits.
cvs = [KFold, GroupKFold, ShuffleSplit, StratifiedKFold,
       GroupShuffleSplit, StratifiedShuffleSplit, TimeSeriesSplit]


# One figure per splitter. Note that the loop variable reuses the name `cv`,
# so after this cell `cv` refers to a class rather than to the instance
# created in the earlier cells.
for cv in cvs:
    this_cv = cv(n_splits=n_splits)
    fig, ax = plt.subplots(figsize=(6, 3))
    plot_cv_indices(this_cv, X, y, groups, ax, n_splits)

    # Legend patches take their colours from the train/test colormap.
    ax.legend([Patch(color=cmap_cv(.8)), Patch(color=cmap_cv(.02))],
              ['Testing set', 'Training set'], loc=(1.02, .8))
    # Make the legend fit
    plt.tight_layout()
    # Shrink the axes after tight_layout so the legend, placed outside the
    # axes at x = 1.02, has room on the right.
    fig.subplots_adjust(right=.7)
plt.show()