In [None]:
import os
import sys
# Put the parent of the current working directory on the import path so the
# `src.*` imports used by this notebook can be resolved.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path += [module_path]
    
# (Re)load the autoreload extension and enable mode 2 so edited modules are
# re-imported automatically before each cell executes.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Imports

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.metrics import PrecisionRecallDisplay
from sklearn.metrics import precision_score, recall_score, roc_curve, auc, roc_auc_score
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

from src.ml.transformers import Catch22Transformer
from src.ml.predict import get_predictions, get_predictproba
from src.ml.metrics import StratifiedKFoldHandler

# Load data

In [None]:
# Directory holding the raw CSV files (relative to the notebook's working
# directory; the trailing slash matters because paths are built by string
# concatenation).
data_dir = "../data/raw/"
# Filename prefix shared by the time-series and label files of this dataset.
group1_name = "is20016_zwf1egf"

In [None]:
# The two CSVs share one path prefix and differ only in their suffix.
filepath1 = f"{data_dir}{group1_name}"
timeseries1_filepath = f"{filepath1}_timeseries.csv"
labels1_filepath = f"{filepath1}_labels.csv"

# Read both tables the same way: the first three columns become the row index,
# which is what later lets labels be looked up by the time-series index.
timeseries_df, labels_df = (
    pd.read_csv(csv_path, index_col=[0, 1, 2])
    for csv_path in (timeseries1_filepath, labels1_filepath)
)

In [None]:
# Display the raw time-series table (rows are indexed by the first three CSV columns).
timeseries_df

In [None]:
# Discard every row that contains at least one missing value, then show how
# many rows remain.
timeseries_dropna = timeseries_df.dropna(axis=0, how="any")
timeseries_dropna.shape[0]

In [None]:
# Display the label table (indexed by the same three leading CSV columns as the time series).
labels_df

# Construct pipeline

In [None]:
# Scratch check: three log-spaced values from 1e-3 to 1e3. The result is only
# displayed, never stored or used by later cells.
# NOTE(review): presumably a preview of candidate hyperparameter values — confirm or remove.
np.logspace(-3, 3, 3)

In [None]:
# Three-stage pipeline: featurise each time series, standardise the resulting
# features, then classify with a support vector machine.
# NOTE(review): Catch22Transformer is a project class; presumably it computes
# catch22 features per time series — confirm against src.ml.transformers.
binary_pipeline = Pipeline(
    [
        ("featurise", Catch22Transformer()),
        ("scaler", StandardScaler()),
        ("classifier", SVC(
            C=10.0,
            gamma='auto',
            # Needed for predict_proba; scikit-learn fits the probability
            # model with an internal, shuffled cross-validation.
            probability=True,
            # Seed that internal shuffling so predict_proba is reproducible
            # across "Restart & Run All" (same seed as the train/test split).
            random_state=69,
        )),
    ]
)

# Manipulate data variables to create data and target matrices

In [None]:
# Feature matrix: the time series with incomplete rows removed (an alias, not a copy).
features = timeseries_dropna

In [None]:
# Select the labels for exactly the rows kept in `features`, in the same order,
# so features and targets stay aligned after the NaN rows were dropped.
targets = labels_df.loc[features.index]

In [None]:
# Share of samples carrying label '1' — a quick gauge of class imbalance.
targets["score"].sum() / targets.shape[0]

# Train-test split

In [None]:
# Fraction of samples used for training; the remainder is held out for testing.
train_size = 0.75
features_train, features_test, targets_train, targets_test = train_test_split(
    features,
    targets,
    train_size=train_size,
    random_state=69,
    # Keep the class ratio of `score` the same in both partitions; with
    # imbalanced labels an unstratified split can leave the test set with a
    # skewed (or nearly empty) minority class.
    stratify=targets.score,
)

# Predict and get results

In [None]:
# Fit the whole pipeline on the training split. The label frame is flattened
# to a 1-D array before being passed as the target.
binary_pipeline.fit(features_train, targets_train.to_numpy().ravel())

## Binary classifier

In [None]:
# Ground-truth labels of the held-out set as a flat 1-D array, for comparison
# against the model's predictions below.
true_targets = targets_test.to_numpy().ravel()
true_targets

In [None]:
# Project helper from src.ml.predict, given the fitted pipeline, the test
# features and the distinct label values found in `targets.score`.
# NOTE(review): the helper's return structure is not visible here; the
# variable name suggests a dict — confirm against src.ml.predict.
predictions_dict = get_predictions(
    binary_pipeline, features_test, pd.unique(targets.score)
)

In [None]:
# Display what get_predictions returned.
predictions_dict

In [None]:
# Hard class predictions for the held-out set, straight from the fitted pipeline.
predicted_targets = binary_pipeline.predict(features_test)

In [None]:
# Display the predicted labels for visual comparison with `true_targets`.
predicted_targets

### Precision-recall

In [None]:
# Precision on the held-out set, from the hard predictions.
precision = precision_score(y_true=true_targets, y_pred=predicted_targets)
print(precision)
# Recall on the same predictions.
recall = recall_score(y_true=true_targets, y_pred=predicted_targets)
print(recall)

In [None]:
# Inspect the pipeline's raw decision-function scores for the held-out set.
binary_pipeline.decision_function(features_test)

In [None]:
# Precision-recall curve for the held-out set. Continuous decision-function
# scores (rather than hard predictions) are used so the curve can sweep over
# decision thresholds.
y_score = binary_pipeline.decision_function(features_test)
y_test = true_targets

# Named `pr_display` rather than `display` so IPython's built-in `display()`
# is not shadowed for the rest of the session.
pr_display = PrecisionRecallDisplay.from_predictions(
    y_test,
    y_score,
    name="SVC",  # the pipeline's classifier is sklearn.svm.SVC, not LinearSVC
    plot_chance_level=True,
)
# The trailing semicolon hides the returned Text repr without rebinding `_`,
# which IPython uses for the last cell output.
pr_display.ax_.set_title("2-class Precision-Recall curve");

### k-fold cross-validation

In [None]:
# Number of cross-validation folds handed to StratifiedKFoldHandler below.
n_splits = 5

In [None]:
# Project class from src.ml.metrics, given the pipeline, the full (unsplit)
# features and targets, and the fold count.
# NOTE(review): presumably runs stratified k-fold cross-validation on
# construction, since `kf_scores` is read immediately afterwards — confirm.
kfold = StratifiedKFoldHandler(binary_pipeline, features, targets, n_splits)

In [None]:
# Display the handler's stored cross-validation scores.
kfold.kf_scores

In [None]:
# Ask the handler to print its own summary of the scores.
# NOTE(review): exact output format is defined in src.ml.metrics.
kfold.pretty_print()

In [None]:
# Let the handler draw its bar plot on a fresh axes, then pin the y-axis to
# the full 0-1 range.
fig, ax = plt.subplots()
kfold.barplot(ax)
ax.set_ylim(bottom=0, top=1)

## Probability

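(Note: the class implied by `predict_proba` may differ from the output of `predict`. With `probability=True`, `SVC` obtains probabilities from a separate, internally cross-validated calibration (Platt scaling), so they can be inconsistent with the decision function that `predict` uses — see the `scikit-learn` `SVC` documentation.)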

In [None]:
# Project helper from src.ml.predict, given the fitted pipeline and the test features.
# NOTE(review): presumably wraps `predict_proba` and returns a DataFrame with
# one column per class label (column 1 is used below) — confirm.
predictproba_df = get_predictproba(
    binary_pipeline, features_test
)

In [None]:
# Show the table ordered by the column labelled 1 (ascending); the result is
# only displayed, `predictproba_df` itself is left unsorted.
predictproba_df.sort_values(by=1)

In [None]:
# Histogram of the values in column 1 of the probability table, in bins of
# width 0.05.
fig, ax = plt.subplots()
sns.histplot(data=predictproba_df, x=1, binwidth=0.05, ax=ax)
# Pad the x-range slightly beyond [0, 1] so the outermost bars are not clipped.
ax.set_xlim(-0.05, 1.05)
ax.set_xlabel("Probability of oscillation")