# Import libraries

In [1]:
import os
os.environ["KERAS_BACKEND"] = "tensorflow"
import pandas as pd
import numpy as np
from IPython.display import SVG, display, Markdown
from datetime import datetime
from sklearn import preprocessing
from sklearn.utils import shuffle
from sklearn.neighbors import KNeighborsClassifier
from keras.utils import np_utils
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score
import pickle

Using TensorFlow backend.


# Load training dataset

In [2]:
# Location of the training CSV, relative to the notebook's working directory.
train_csv_path = 'dataset/1. istanbul/train_data.csv'
train_dataframe = pd.read_csv(train_csv_path)

# Visualize some training data

In [3]:
# Preview the first five rows to sanity-check the columns that were loaded.
train_dataframe.head(n=5)

Unnamed: 0,Subject ID,Jitter (local),"Jitter (local, absolute)",Jitter (rap),Jitter (ppq5),Jitter (ddp),Shimmer (local),"Shimmer (local, dB)",Shimmer (apq3),Shimmer (apq5),...,Maximum pitch,Number of pulses,Number of periods,Mean period,Standard deviation of period,Fraction of locally unvoiced frames,Number of voice breaks,Degree of voice breaks,UPDRS,Class information
0,1,1.488,9e-05,0.9,0.794,2.699,8.334,0.779,4.517,4.609,...,187.576,160,159,0.006065,0.000416,0.0,0,0.0,23,1
1,1,0.728,3.8e-05,0.353,0.376,1.059,5.864,0.642,2.058,3.18,...,234.505,170,169,0.005181,0.000403,2.247,0,0.0,23,1
2,1,1.22,7.4e-05,0.732,0.67,2.196,8.719,0.875,4.347,5.166,...,211.442,1431,1427,0.006071,0.000474,10.656,1,0.178,23,1
3,1,2.502,0.000123,1.156,1.634,3.469,13.513,1.273,5.263,8.771,...,220.23,94,92,0.00491,0.00032,0.0,0,0.0,23,1
4,1,3.509,0.000167,1.715,1.539,5.145,9.112,1.04,3.102,4.927,...,225.162,117,114,0.004757,0.00038,18.182,1,13.318,23,1


# Prepare training data

In [4]:
# Build the training feature matrix by dropping the columns that are not
# used as model inputs.
# `.values` replaces `DataFrame.as_matrix()`, which was deprecated in
# pandas 0.23 and removed in pandas 1.0; it returns the same NumPy array.
non_feature_columns = ['Subject ID', 'Class information', 'UPDRS']
train_x = train_dataframe.drop(non_feature_columns, axis=1).values
print(train_x.shape)

(1040, 26)


# Prepare training labels

In [5]:
# Extract the class labels as a 1-D NumPy array.
# `.values` replaces `Series.as_matrix()`, which was deprecated in
# pandas 0.23 and removed in pandas 1.0.
train_y = train_dataframe['Class information'].values
print(train_y.shape)
# One-hot encode the labels. The class count is passed explicitly (and
# positionally, since the keyword name differs between Keras releases) so
# the output always has two columns regardless of which labels are present.
train_y = np_utils.to_categorical(train_y, 2)
print(train_y.shape)

(1040,)
(1040, 2)


# Prepare test data

In [6]:
# Load the test CSV and build its feature matrix.
test_dataframe = pd.read_csv('dataset/1. istanbul/test_data.csv')
# Only two columns are dropped here (the training cell also drops 'UPDRS').
# NOTE(review): presumably the test CSV has no 'UPDRS' column, since the
# recorded output still shows 26 features - confirm against the file.
# `.values` replaces `DataFrame.as_matrix()`, which was deprecated in
# pandas 0.23 and removed in pandas 1.0.
test_x = test_dataframe.drop(['Subject ID', 'Class information'], axis=1).values
print(test_x.shape)

(168, 26)


# Prepare test labels

In [7]:
# Extract the test class labels as a 1-D NumPy array.
# `.values` replaces `Series.as_matrix()`, which was deprecated in
# pandas 0.23 and removed in pandas 1.0.
test_y = test_dataframe['Class information'].values
print(test_y.shape)
# One-hot encode with an explicit class count (passed positionally because
# the keyword name differs between Keras releases). Without it the width is
# inferred from the largest label present, so a test set lacking class 1
# would yield a single column and could not be stacked with the two-column
# training labels.
test_y = np_utils.to_categorical(test_y, 2)
print(test_y.shape)

(168,)
(168, 2)


# Combine train and test set

In [8]:
# Pool the original train and test rows into a single feature matrix and a
# single one-hot label matrix (training rows first, test rows after).
x = np.concatenate((train_x, test_x), axis=0)
y = np.concatenate((train_y, test_y), axis=0)
for pooled in (x, y):
    print(pooled.shape)
# Last expression: rich preview of the pooled feature matrix.
pd.DataFrame(x).head()

(1208, 26)
(1208, 2)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,1.488,9e-05,0.9,0.794,2.699,8.334,0.779,4.517,4.609,6.802,...,10.421,142.229,187.576,160.0,159.0,0.006065,0.000416,0.0,0.0,0.0
1,0.728,3.8e-05,0.353,0.376,1.059,5.864,0.642,2.058,3.18,7.194,...,14.773,159.515,234.505,170.0,169.0,0.005181,0.000403,2.247,0.0,0.0
2,1.22,7.4e-05,0.732,0.67,2.196,8.719,0.875,4.347,5.166,7.548,...,12.981,146.445,211.442,1431.0,1427.0,0.006071,0.000474,10.656,1.0,0.178
3,2.502,0.000123,1.156,1.634,3.469,13.513,1.273,5.263,8.771,16.779,...,10.853,182.713,220.23,94.0,92.0,0.00491,0.00032,0.0,0.0,0.0
4,3.509,0.000167,1.715,1.539,5.145,9.112,1.04,3.102,4.927,12.823,...,11.499,182.821,225.162,117.0,114.0,0.004757,0.00038,18.182,1.0,13.318


# Normalize data

In [9]:
# Min-max scale every feature column into the range [lo, hi] = [0, 1].
# NOTE: the per-column minima and maxima are computed over the pooled
# train+test rows, i.e. before the hold-out split made later in this
# notebook, so the held-out rows contribute to the scaling statistics.
lo, hi = 0.0, 1.0
mins = np.min(x, axis=0)
maxs = np.max(x, axis=0)
rng = maxs - mins
# A constant column has zero range, and dividing by it would produce NaN
# (0/0). Substituting 1 for the divisor avoids that; such a column then
# maps to `hi` because (maxs - x) is 0 for every one of its rows.
safe_rng = np.where(rng == 0, 1.0, rng)
x = hi - ((hi - lo) * (maxs - x)) / safe_rng
pd.DataFrame(x).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,0.096023,0.110551,0.106928,0.054303,0.106923,0.186514,0.266893,0.162585,0.056981,0.142044,...,0.034343,0.193353,0.204351,0.107383,0.106783,0.378729,0.062423,0.0,0.0,0.0
1,0.042716,0.042501,0.038278,0.023294,0.038318,0.12526,0.215176,0.065922,0.037237,0.150903,...,0.049169,0.238354,0.295334,0.114094,0.113499,0.295624,0.060411,0.025488,0.0,0.0
2,0.077225,0.089595,0.085843,0.045104,0.085882,0.196062,0.303133,0.155902,0.064677,0.158903,...,0.043064,0.204329,0.250621,0.960403,0.958361,0.379296,0.071557,0.120874,0.083333,0.002575
3,0.167146,0.152809,0.139056,0.116617,0.139134,0.314949,0.453379,0.19191,0.114487,0.367528,...,0.035815,0.298746,0.267658,0.063087,0.061786,0.270109,0.047233,0.0,0.0,0.0
4,0.237778,0.209959,0.209212,0.10957,0.209245,0.205808,0.365421,0.106962,0.061375,0.278121,...,0.038016,0.299027,0.27722,0.078523,0.076561,0.255738,0.056711,0.206243,0.083333,0.192688


# Shuffle data

In [10]:
# Shuffle features and labels together (rows stay aligned).
# A fixed seed makes the row order reproducible under Restart & Run All.
# The seeded split that follows selects rows by position, so without this
# seed the saved train/test files would differ on every run.
x, y = shuffle(x, y, random_state=1)
pd.DataFrame(y).head()

Unnamed: 0,0,1
0,0.0,1.0
1,0.0,1.0
2,1.0,0.0
3,0.0,1.0
4,1.0,0.0


# Split data

In [11]:
# One stratified split with 10% of the rows held out, using a fixed seed.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=1)
# With n_splits=1 the generator yields exactly one (train, test) index pair.
train_index, test_index = next(sss.split(x, y))
X_train = x[train_index]
X_test = x[test_index]
Y_train = y[train_index]
Y_test = y[test_index]

# Save the split datasets

In [12]:
# Folder in which the pickled train/test arrays are stored.
root_directory = 'dataset/6. normalized-10/'
# Create the folder on first run so the writes below do not fail with a
# missing-directory error.
if not os.path.isdir(root_directory):
    os.makedirs(root_directory)
# Write each array through a context manager so every file handle is
# flushed and closed deterministically (the bare open() calls were never
# closed).
for file_name, array in (
    ('x_train.p', X_train),
    ('x_test.p', X_test),
    ('y_train.p', Y_train),
    ('y_test.p', Y_test),
):
    with open(root_directory + file_name, "wb") as out_file:
        pickle.dump(array, out_file)

# The End