In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection  import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
SEED=42

In [2]:
df = pd.read_csv("../../datasets/clean-ds.csv")
df.head()

Unnamed: 0,Heart Beat 0.8~2.5hz from Avg,Peak Hz in 2.6~10hz,Peak Hz in 21~30hz,Peak Hz in 31~40hz,Peak Hz in 41~50hz,RMS avg,Zero crossing rate avg,Spectral flatness avg,Spectral rolloff avg,Spectral centroid avg,Poly features avg,Spectral bandwidth avg,Not Normal
0,69.0,5.5,23.33,31.0,43.67,0.2528,0.028,0.0202,3931.6415,3655.5029,0.8441,5149.4551,1
1,78.0,3.12,30.0,40.0,43.25,0.1949,0.0372,0.0211,5369.9529,4887.3982,0.695,6042.2007,0
2,78.0,3.0,24.71,39.5,41.5,0.206,0.0275,0.0148,5252.5443,4598.1605,0.663,6013.8062,1
3,76.0,4.0,25.0,40.0,50.0,0.2054,0.072,0.047,7429.2183,7041.157,1.1817,6818.168,0
4,83.0,5.78,22.4,38.78,45.0,0.244,0.0008,0.0,71.4086,128.8602,0.4138,605.7551,1


In [3]:
X, y = df.drop(['Not Normal'],axis=1), df['Not Normal']

In [4]:
standard_scaler = StandardScaler()
X = standard_scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=SEED)

In [5]:
dt_classifier = DecisionTreeClassifier(max_depth=2, criterion='entropy', splitter='random', random_state=SEED)
dt_classifier.fit(X_train, y_train)
y_pred = dt_classifier.predict(X_test)

In [6]:
score = accuracy_score(y_test, y_pred)
print('Voting Classifier: {:.2f}'.format(score))

Voting Classifier: 0.50


In [7]:
print('    Classification report for DecisionTreeClassifier')
print('-----------------------------------------------------------')

target_names = ['Normal', 'Not Normal']
print(metrics.classification_report(y_test, y_pred, digits=3, target_names=target_names))

    Classification report for DecisionTreeClassifier
-----------------------------------------------------------
              precision    recall  f1-score   support

      Normal      0.667     0.500     0.571         4
  Not Normal      0.333     0.500     0.400         2

    accuracy                          0.500         6
   macro avg      0.500     0.500     0.486         6
weighted avg      0.556     0.500     0.514         6



##### Parameter tuning

In classification technique, there are some parameters that can be tuned to optimize the classification. In DecessionTreeClassifier we can tune

Decision tree is max depth (the depth of the tree)
max feature (the feature used to classify)
criterion
splitter
Grid Search explores a range of parameters and finds the best combination of parameters. Then repeat the process several times until the best parameters are discovered. lets use grid search to get best params



In [8]:
from sklearn.model_selection import StratifiedKFold

dt_classifier = DecisionTreeClassifier()

parameter_grid = {
                  'criterion': ['gini', 'entropy'],
                  'splitter': ['best', 'random'],
                  'max_depth': [2, 3, 4],
                  'max_features': [4, 5, 6]
                 }

cross_validation = StratifiedKFold(n_splits=5)

grid_search = GridSearchCV(dt_classifier, param_grid=parameter_grid, cv=cross_validation)

grid_search.fit(X_train, y_train)
print('Best score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))

best_dt_classifier = grid_search.best_estimator_


Best score: 0.6
Best parameters: {'criterion': 'gini', 'max_depth': 4, 'max_features': 4, 'splitter': 'random'}


In [9]:
best_dt_classifier.fit(X_train, y_train)
y_pred = best_dt_classifier.predict(X_test)
score = accuracy_score(y_test, y_pred)
print('Classifier: {:.2f}'.format(score))


Classifier: 0.67


In [10]:
df_importance = df[['Zero crossing rate avg', 'Heart Beat 0.8~2.5hz from Avg', 'Peak Hz in 31~40hz', 'Spectral rolloff avg', 'Spectral centroid avg', 'Spectral bandwidth avg','Peak Hz in 21~30hz', 'Peak Hz in 41~50hz','Not Normal']]
X, y = df_importance.drop(['Not Normal'],axis=1), df_importance['Not Normal']
standard_scaler = StandardScaler()
X = standard_scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=SEED)

In [11]:
dt_classifier = DecisionTreeClassifier(max_depth=2, criterion='entropy', splitter='best', max_features=5 ,random_state=SEED)
dt_classifier.fit(X_train, y_train)
y_pred = dt_classifier.predict(X_test)
score = accuracy_score(y_test, y_pred)
print('Classifier: {:.2f}'.format(score))

Classifier: 0.67
