Aims:
- Import flavin signals
- Process them
- Label them as oscillating, non-oscillating, or unsure (only if they have not been labelled already)
- Train SVM and evaluate its performance

Paradigms:
- Leverage `pandas`: ease transition to `stoa`, easier to manipulate with `scikit-learn`, cleaner code
- Discard unnecessary information (including births) & processes
- The ultimate goal is to collect the parameters of each cell into a `dict` and put the code in a pipeline, like everything in `stoa`

**IMPORTANT NOTE: USE THE `aliby` VIRTUAL ENVIRONMENT**

In [None]:
# Select the inline matplotlib backend (figures are rendered in the cell output).
%matplotlib inline

In [None]:
%matplotlib qt
import matplotlib
plt.rcParams.update({'font.size': 18})
plt.rcParams.update({'font.family': 'Noto Sans'})
matplotlib.rcParams['axes.unicode_minus'] = False

In [None]:
# pandas is imported here because this cell runs before the data-import cell
# that otherwise provides `pd`.
import pandas as pd

# Truncate displayed DataFrames to at most 10 rows.
pd.set_option('display.max_rows', 10)

# Import data

In [None]:
import numpy as np
import pandas as pd
import csv

# PARAMETERS
#filename_prefix = './data/arin/Omero19979_'
filename_prefix = './data/arin/Omero20016_'
#

# Load the flavin signals.  Every 0 in the table is turned into NaN
# (the CSV is constructed that way -- presumably 0 marks a missing value).
signal = pd.read_csv(filename_prefix + 'flavin.csv').replace(0, np.nan)

# Look-up table: position -> strain (would prefer to go directly CSV -> dict)
strainlookup_df = pd.read_csv(filename_prefix + 'strains.csv')
strainlookup_dict = dict(zip(strainlookup_df.position, strainlookup_df.strain))

# Swap position identifiers for the (more informative) strain names, and
# discard the column that is not needed downstream.
signal = (
    signal
    .replace({'position': strainlookup_dict})
    .rename(columns={'position': 'strain'})
    .drop(columns=['distfromcentre'])
)

# Re-build as a (strain, cellID)-indexed frame.  Everything from the third
# column onwards is taken as the data, and the original column labels are
# replaced by a default integer range because the values go through numpy.
multiindex = pd.MultiIndex.from_frame(signal[['strain', 'cellID']])
signal_temp = signal.iloc[:, 2:]
signal = pd.DataFrame(signal_temp.to_numpy(), index=multiindex)

signal

# Choose a list of cells as working data

List strains

In [None]:
# Distinct values of index level 0 (the 'strain' level of the MultiIndex), as a list.
signal.index.get_level_values(0).unique().to_list()

Define `signal_wd` as working data

In [None]:
# Working data: only the rows whose first index level is 'by4741'.
signal_wd = signal.loc[['by4741']]

signal_wd

In [None]:
# Alternative to the strain selection above: use all of `signal` as working data.
signal_wd = signal

In [None]:
# Number of rows (cells) per value of index level 0 (strain).
# NOTE(review): `signal_processed` is first assigned in the "Range" cell further
# down, so on a fresh top-to-bottom run this raises NameError -- confirm whether
# `signal_wd` was intended here, or run this cell after the "Range" cell.
signal_processed.index.get_level_values(0).value_counts()

# Processing time series

## Range

Chop up time series according to `interval_start` and `interval_end`, then remove cells that have NaNs.  Print number of cells.

In [None]:
# PARAMETERS
interval_start = 25
interval_end = 168
#

# Keep the columns at positions [interval_start, interval_end) ...
signal_window = signal_wd.iloc[:, interval_start:interval_end]
# ... and drop every row (cell) that has a NaN anywhere inside that window.
signal_processed = signal_window.dropna()

signal_processed

## Detrend

Using sliding window (Alán)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from postprocessor.core.processes.detrend import detrendParameters, detrend


def _show_heatmap(data, title):
    """Draw `data` as a seaborn heatmap on a new figure with the given title."""
    plt.subplots()
    sns.heatmap(data)
    plt.title(title)
    plt.show()


_show_heatmap(signal_processed, 'Before detrending')

# Detrend with the post-processor's default parameters
detrend_runner = detrend(detrendParameters.default())
signal_norm = detrend_runner.run(signal_processed)

_show_heatmap(signal_norm, 'After detrending')

# From here on, the working signal is the detrended one
signal_processed = signal_norm

signal_processed

## Normalisation

Option 1: ...

In [None]:
# code

# Assign labels (if not already done)

Assign labels by scoring oscillations (human), and save scores

In [None]:
%matplotlib inline

# PARAMETERS
filename_category = 'test.csv'
#

category_list = []
for timeseries in signal_processed.to_numpy():
    plt.plot(timeseries)
    plt.show(block=False)
    category = input('Is this oscillatory?: ')
    category_list.append(category)
category_df = pd.DataFrame(category_list, index = signal_processed.index)
category_df.to_csv(filename_category, index=True)

Or, randomise scores and save them

In [None]:
# PARAMETERS
filename_category = 'random.csv'
#category_labels = [0,1,2]
#weights = [51/294, 135/294, 108/294]
category_labels = [0,1]
weights = [345/678, 333/678]
#

# Draw one label per row of `signal_processed` in a single vectorised call,
# with label i chosen with probability weights[i].  This replaces a Python loop
# that called the RNG once per row and wrapped each draw in a 1-element array.
random_labels = np.random.choice(category_labels, size=len(signal_processed), p=weights)

# Single-column frame sharing the row index of the signals, saved with the index
category_df = pd.DataFrame(random_labels, index = signal_processed.index)
category_df.to_csv(filename_category, index=True)

# Featurisation

TODO: Make choice of feature some kind of parameter within the overarching pipeline

Option 1: Use `catch22`

In [None]:
from postprocessor.core.processes.catch22 import catch22Parameters, catch22

# Featurise every time series with catch22, using the default parameters
catch22_params = catch22Parameters.default()
catch22_processor = catch22(catch22_params)
features = catch22_processor.run(signal_processed)

sns.heatmap(features)

Additionally, choose a subset of the `catch22` features

In [None]:
# Names of the feature columns to keep (commented-out entries are excluded)
features_subset = [
    'PD_PeriodicityWang_th0_01',
    'FC_LocalSimple_mean1_tauresrat',
    'SB_MotifThree_quantile_hh',
    'CO_Embed2_Dist_tau_d_expfit_meandiff',
    'SC_FluctAnal_2_rsrangefit_50_1_logi_prop_r1',
    #'CO_FirstMin_ac',
    #'CO_f1ecac',
]
# Column selection by label; all rows are kept
features = features.loc[:, features_subset]

sns.heatmap(features)

Option 2: FFT spectrum

(Caution: there may be slight variations between this and the old notebook -- could be yet-to-be-debugged different behaviour in `postprocessor.core.processes.fft`)

In [None]:
from postprocessor.core.processes.fft import fftParameters, fft

# Run the FFT post-process with default parameters.  `run` returns a pair;
# only its second element is used as the feature table.
fft_params = fftParameters.default()
fft_processor = fft(fft_params)
fft_output = fft_processor.run(signal_processed)
_, features = fft_output

sns.heatmap(features)

Option 3: concatenate both

In [None]:
from postprocessor.core.processes.catch22 import catch22Parameters, catch22
from postprocessor.core.processes.fft import fftParameters, fft

# catch22 features
catch22_processor = catch22(catch22Parameters.default())
catch22_features = catch22_processor.run(signal_processed)

# FFT features (second element of the pair returned by `run`)
fft_processor = fft(fftParameters.default())
_, fft_features = fft_processor.run(signal_processed)

# Put the two feature tables side by side (column-wise concatenation)
feature_tables = [catch22_features, fft_features]
features = pd.concat(feature_tables, axis=1)

sns.heatmap(features)

# Classifier pipeline

- Import targets (labels, e.g. oscillatory vs non-oscillatory)
- Construct pipeline: detrend, featurise with `catch22`, classifier/model
- Grid search and apply best hyperparameters

In [None]:
# Current intention: run this after import and chopping up time series.
# The `signal_processed` variable should be defined at this point.
# Refactoring (splitting each part into its own script) will come later...

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedKFold, KFold
from sklearn.metrics import precision_score, recall_score, roc_curve, auc, roc_auc_score

from postprocessor.core.processes.detrend import detrendParameters, detrend
from postprocessor.core.processes.catch22 import catch22Parameters, catch22

# PARAMETERS
filename_targets = 'categories_20016_detrend.csv'
#

# (add import, chopping up time series)

# Read the target values: header-less CSV whose first column becomes the index
targets_df = pd.read_csv(filename_targets, header = None, index_col = 0)
targets_df.index.names = ['cellID']

# Look up the target of each row of `signal_processed` by its 'cellID' index
# level, in the same order as the rows, and flatten to a 1-D array
cell_ids = signal_processed.index.get_level_values('cellID')
targets = targets_df.loc[cell_ids].to_numpy().flatten()
# Converts whatever the target values are to 0 and 1
#targets = np.array([np.argwhere(np.unique(targets) == element).flatten()[0] for element in targets])

## TODO: option to remove class 2 ones if three classes defined but I want a binary classifier

# Wrap post-processes into objects that scikit-learn can make a pipeline from
class DetrendTransformer(BaseEstimator, TransformerMixin):
    """Wrap the `detrend` post-process so it can be a scikit-learn pipeline step.

    Parameters
    ----------
    window : default 45
        Value handed to `detrendParameters` when transforming
        (presumably the sliding-window size -- confirm against `detrend`).
    """

    def __init__(self, window = 45):
        self.window = window

    def fit(self, x, y = None):
        """Keep references to `x` and `y` on the instance and return `self`."""
        self.x, self.y = x, y
        return self

    def transform(self, x, y = None):
        """Return `x` after running `detrend` configured with `self.window`."""
        return detrend(detrendParameters(self.window)).run(x)
    
class Catch22Transformer(BaseEstimator, TransformerMixin):
    """Wrap the `catch22` post-process so it can be a scikit-learn pipeline step."""

    def __init__(self):
        """No hyperparameters: `catch22` is always run with its defaults."""

    def fit(self, x, y = None):
        """Keep references to `x` and `y` on the instance and return `self`."""
        self.x, self.y = x, y
        return self

    def transform(self, x):
        """Return the result of running `catch22` (default parameters) on `x`."""
        return catch22(catch22Parameters.default()).run(x)

# Hyperparameter values explored by the grid search
window_range = [45]
#C_range = [1] #np.logspace(-3, 3, 3)
#gamma_range = np.logspace(-3, 3, 3)

# Keys follow scikit-learn's '<step name>__<parameter>' convention
param_grid = [
    {
        'detrend__window': window_range,
        'classifier__kernel': ['rbf'],
        #'classifier__C': C_range,
        #'classifier__gamma': gamma_range
    },
]

# Pipeline steps in execution order; the last one is the classifier
pipeline_steps = [
    ('detrend', DetrendTransformer(window = 45)),
    ('featurise', Catch22Transformer()),
    ('scaler', StandardScaler()),
    ('classifier', SVC(probability=True)),
    #RandomForestClassifier(),
]
my_pipeline = Pipeline(pipeline_steps)

# The pipeline takes the time series themselves as input and does its own
# detrending and featurisation
features = signal_processed

# Switch: fit the pipeline as-is, without searching hyperparameters
if False:
    my_pipeline.fit(features, targets)

# Grid search (takes a while...)
grid_pipeline = GridSearchCV(my_pipeline, param_grid, cv = 5)
grid_pipeline.fit(features, targets)

# Copy the best hyperparameters found onto the pipeline
my_pipeline.set_params(**grid_pipeline.best_params_)

- Training, evaluate using precision & recall
- Histogram of probabilities (if SVM)
- k-fold cross-validation
- ROC curve

In [None]:
# PARAMETERS
train_size = 0.75# * len(features)
#

# Split training & testing
features_train, features_test, targets_train, targets_test = train_test_split(
    features, targets,
    train_size = train_size,
)

## TODO: implement expanded testing list in the case I want a binary classifier but the data was classified into three classes.

# Fit on the training split
my_pipeline.fit(features_train, targets_train)

# Predict categories for the held-out split
targets_predicted = my_pipeline.predict(features_test)
# Index entries of the test rows predicted to be in each category
# (np.unique gives the class labels in sorted order)
predictions_dict = {
    class_label: features_test.iloc[targets_predicted == class_label].index.to_numpy()
    for class_label in np.unique(targets)
}
print('Predictions')
print(predictions_dict)

# Get probabilities (one column per class; rows in the order of `features_test`)
targets_proba = my_pipeline.predict_proba(features_test)
#pd.set_option('display.max_rows', None)
targets_proba_df = pd.DataFrame(targets_proba, index = features_test.index)
# Sorted by the second probability column (taken below as the probability of
# oscillation).  `sort_values` returns a new frame, so the result has to be
# stored to be usable.  It is kept under a separate name on purpose:
# `targets_proba_df` must stay in the row order of `targets_test` for the ROC
# curve at the end of this cell.
targets_proba_sorted = targets_proba_df.sort_values(by=[1])

# Plot histogram of probabilities
fig, ax = plt.subplots()
plt.hist(targets_proba_df.iloc[:,1], 40,
        color = '#3714b0')
plt.title('Histogram of probabilities')
plt.xlabel('Probability of oscillation')
plt.ylabel('Number of time series')

# Verify by doing it again with k-fold cross-validation
# NOTE: `my_pipeline` is refit in place on every fold, so after this loop it
# holds the model of the last fold, not the one fitted on `features_train`.
# The predictions and probabilities above were computed before the loop and
# are not affected.
n_splits = 5
kf = StratifiedKFold(n_splits = n_splits)
print('k-fold cross-validation')
kf_scores = []
for train_index, test_index in kf.split(features, targets):
    # Split training-testing
    features_train_kf, features_test_kf = features.iloc[train_index], features.iloc[test_index]
    targets_train_kf, targets_test_kf = targets[train_index], targets[test_index]

    # Train & predict
    my_pipeline.fit(features_train_kf, targets_train_kf)
    targets_predicted_kf = my_pipeline.predict(features_test_kf)

    # Compute measures
    kf_precision = precision_score(targets_test_kf, targets_predicted_kf, average='weighted')
    kf_recall = recall_score(targets_test_kf, targets_predicted_kf, average='weighted')

    print(f'Precision {kf_precision:.4f} Recall {kf_recall:.4f}')

    kf_scores.append([kf_precision, kf_recall])

# Grouped bar chart: one bar per fold, grouped by measure
# (bars of fold `split` are shifted right by split/8)
fig, ax = plt.subplots()
for split, fold_scores in enumerate(kf_scores):
    x_pos = np.arange(2) + split/8
    plt.bar(
        x_pos,
        fold_scores,
        width = 0.125
    )
plt.xticks([0.25, 1.25], ['Precision', 'Recall'])
plt.ylabel('Value')
plt.title(f'{n_splits}-fold cross-validation')

# ROC curve, from the held-out split of the train/test split above
scores = targets_proba_df.iloc[:,1]
false_positive_rate, true_positive_rate, _ = roc_curve(targets_test, scores)
fig, ax = plt.subplots()
plt.plot(false_positive_rate, true_positive_rate)
plt.title('ROC curve')
plt.xlabel('false positive rate')
plt.ylabel('true positive rate')
print(f'ROC curve: area under curve is {auc(false_positive_rate, true_positive_rate):.4f}')

If using a random forest classifier, get feature importances

In [None]:
# The classifier is the pipeline step registered under the name 'classifier'.
# Feature importances only exist if that step is a fitted model that exposes
# `feature_importances_` (e.g. RandomForestClassifier instead of SVC).
fitted_classifier = my_pipeline.named_steps['classifier']
if not hasattr(fitted_classifier, 'feature_importances_'):
    raise AttributeError(
        "The 'classifier' step of my_pipeline has no feature_importances_: "
        "fit the pipeline with e.g. RandomForestClassifier as its classifier."
    )

# The classifier is trained on the output of the 'featurise' step, not on the
# time series held in `features`, so the feature names have to come from the
# featurised table.
# NOTE(review): assumes the catch22 post-process returns a DataFrame whose
# columns are the feature names -- confirm.
featurised = my_pipeline.named_steps['featurise'].transform(
    my_pipeline.named_steps['detrend'].transform(features)
)

feature_importances = pd.DataFrame(
    fitted_classifier.feature_importances_,
    index = featurised.columns,
    columns = ['importance'],
)
# Most important feature first
feature_importances_sorted = feature_importances.sort_values(
    ['importance'],
    ascending = False,
)
feature_importances_sorted

In [None]:
# Cumulative importance, features taken in the order of the sorted table
cumulative_importance = np.cumsum(feature_importances_sorted.to_numpy())
fig, axs = plt.subplots()
plt.plot(cumulative_importance)
plt.xlabel('Feature')
plt.ylabel('Cumulative importance')
plt.show()

# Importance of each feature, in the (unsorted) order of `feature_importances`
bar_positions = np.arange(len(feature_importances))
bar_heights = feature_importances.to_numpy().T[0]
fig, axs = plt.subplots()
plt.bar(
    x = bar_positions,
    height = bar_heights,
    tick_label = feature_importances.index,
)
plt.xticks(rotation = 'vertical')
plt.xlabel('Feature')
plt.ylabel('Importance')
plt.show()

TODO: Test block for training on one dataset and testing on another