In [None]:
import numpy as np
import os
import json
import utils

from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix

from ml_statistical_features import load_data, get_all_scores, get_rf_grid_params, eval_classifier_paper
from notebook_md_utils import get_md_data_distribution_string, get_md_mean_accuracy_grid, get_md_test_accuracy_grid, get_md_confusion_matrix_grid

from IPython.display import Markdown, display

In [None]:
%%html
<style>
  /* Remove the left margin of every table; presumably meant to left-align
     the rendered Markdown tables in this notebook. */
  table {margin-left: 0 !important;}
</style>

# Daten laden

In [None]:
# Fetch the five values returned by the project's load_data helper.
# NOTE(review): the units of segment_length / overlap_amount are defined in
# ml_statistical_features.load_data -- confirm there.
x, y, mean_error, coverage, patient_id = load_data(
    segment_length=10,
    overlap_amount=0,
)

# Splitten der Daten in G1 und G2

## Labelverteilung

### Im Paper:
58% informativ

42% nicht-informativ


In [None]:
# Per-label segment counts (index 0 = nicht-informativ, index 1 = informativ).
# minlength=2 keeps both entries addressable even if one of the two labels
# does not occur in y; a plain bincount would then return a shorter array and
# distribution[1] would raise an IndexError.
distribution = np.bincount(y, minlength=2)

# Percentage share contributed by a single segment.
percent_per_segment = 100 / len(y)

display(Markdown("### Vorliegende Daten"))
display(Markdown("%i %% informativ" % round(percent_per_segment * distribution[1])))
display(Markdown("%i %% nicht-informativ" % round(percent_per_segment * distribution[0])))


## Gruppenverteilung
Segmente zufällig in 2 Gruppen unterteilt
### Im Paper


|    | informativ | nicht-informativ | gesamt     |
|:---|:----------:|:----------------:|:----------:|
| G1 |1296 (62%)  | 789 (38%)        | 2085 (57%)|
| G2 |813 (53%)   | 733 (47%)        | 1546 (43%)|

In [None]:
# Stratified split into G1 (57 %) and G2 (43 %), seeded so that the split is
# the same on every run.
x_g1, x_g2, y_g1, y_g2 = train_test_split(
    x,
    y,
    test_size=0.43,
    stratify=y,
    random_state=1,
)

display(Markdown("### Vorliegende Daten"))
group_distribution_md = get_md_data_distribution_string(y_g1, y_g2)
display(Markdown(group_distribution_md))

## TODO: Coverage + Mean Error

## 10-Fold cross validation mean accuracy for G1 and G2

### Im Paper

(RF: ntrees=50, SVM: rbf kernel, NN: 50 hidden neurons)

|    | RF    | SVM   | NN    | LDA   | DT    |
|:---|:-----:|:-----:|:-----:|:-----:|:-----:|
| G1 | 98.13 | 93.38 | 91.61 | 89.26 | 97.51 |
| G2 | 92.30 | 90.49 | 85.89 | 79.37 | 89.39 |

In [None]:
# Scores as returned by the project helper; they are reused by the test
# accuracy cell further down.
# NOTE(review): reconstruct=False presumably reuses previously stored results
# instead of recomputing them -- confirm in ml_statistical_features.
scores = get_all_scores(reconstruct=False)

display(Markdown("### Vorliegende Daten"))
mean_accuracy_md = get_md_mean_accuracy_grid(scores)
display(Markdown(mean_accuracy_md))

## Accuracy results for testing G2 vs. G1 (Exp1) and testing G1 vs. G2 (Exp2)

### Im Paper

|      | RF    | SVM   | NN    | LDA   | DT    |
|:-----|:-----:|:-----:|:-----:|:-----:|:-----:|
| Exp1 | 100   | 94.44 | 92.28 | 89.40 | 97.51 |
| Exp2 | 97.99 | 97.46 | 87.10 | 90.26 | 97.41 |
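| Mean | 98.995| 95.95 | 89.69 | 89.83 | 98.41 |

*Hinweis: Der DT-Mittelwert (98.41) passt nicht zu den Zeilen Exp1/Exp2 (Mittel aus 97.51 und 97.41 = 97.46); Werte mit dem Paper abgleichen.*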

In [None]:
# Render the test accuracy table from the `scores` computed in the
# mean accuracy cell above.
display(Markdown("### Vorliegende Daten"))
test_accuracy_md = get_md_test_accuracy_grid(scores)
display(Markdown(test_accuracy_md))

## Confusion matrix of random forest for Exp2

### Im Paper

|             |                  | Actual        |                  |
|:------------|:----------------:|:-------------:|:----------------:|
|             |                  | informativ    | nicht-informativ |
|             |                  |               |                  |
|**Predicted**| informativ       | 1270          | 26               |
|             | nicht-informativ | 18            | 771              |

In [None]:
# Only the first value returned by get_rf_grid_params is used here.
rf, _ = get_rf_grid_params()

# Of the seven values returned by eval_classifier_paper only the last two
# (predicted and true labels) are needed for the confusion matrix.
_, _, _, _, _, y_pred, y_true = eval_classifier_paper(
    x,
    y,
    clf=rf,
    grid_folder_name='RF_0717',
)

# sklearn puts the true labels on the rows and the predictions on the columns,
# with labels in sorted order.
# NOTE(review): the paper table above uses predicted labels as rows and lists
# "informativ" first -- confirm that get_md_confusion_matrix_grid accounts for
# this orientation and label order.
conf_mat = confusion_matrix(y_true, y_pred)

display(Markdown("### Vorliegende Daten"))
confusion_matrix_md = get_md_confusion_matrix_grid(conf_mat)
display(Markdown(confusion_matrix_md))