Aims:
- Import flavin signals
- Process them
- Label them as oscillating, non-oscillating, or unsure (if they have not been labelled already)
- Train a classifier (SVM or random forest) and evaluate its performance

Paradigms:
- Leverage `pandas`: ease transition to `stoa`, easier to manipulate with `scikit-learn`, cleaner code
- Discard unnecessary information (including births) & processes
- Ultimate goal: gather the parameters from each cell into a `dict` and put the code in a pipeline, like everything in `stoa`

**IMPORTANT NOTE: USE THE `stoa` VIRTUAL ENVIRONMENT**

In [None]:
%matplotlib inline

In [None]:
# Imported here as well: this cell comes before the data-import cell, so on a
# fresh kernel `pd` would otherwise be undefined at this point.
import pandas as pd

# Cap DataFrame displays at 10 rows to keep the notebook output short
pd.set_option('display.max_rows', 10)

# Import data

In [None]:
import numpy as np
import pandas as pd
import csv

# PARAMETERS
filename_prefix = './data/arin/Omero19979_'
#filename_prefix = './data/arin/Omero20016_'
#

# Load the flavin signals; zeros are turned into NaN because of how the CSV is constructed.
# NOTE(review): the replacement applies to every column, identifier columns included --
# confirm that no position/cellID is legitimately 0.
signal = pd.read_csv(filename_prefix + 'flavin.csv').replace(0, np.nan)

# Look-up table position -> strain (built via a DataFrame; a direct CSV -> dict would be nicer)
strainlookup_df = pd.read_csv(filename_prefix + 'strains.csv')
strainlookup_dict = dict(zip(strainlookup_df.position, strainlookup_df.strain))

# Swap positions for the more informative strain names and drop the unused column
signal = (
    signal
    .replace({'position': strainlookup_dict})
    .rename(columns = {"position": "strain"})
    .drop(columns = ['distfromcentre'])
)

# Re-build as a (strain, cellID)-indexed dataframe.
# Everything from the third column onwards is taken as the data values.
multiindex = pd.MultiIndex.from_frame(signal[['strain', 'cellID']])
signal_temp = signal.iloc[:, 2:]
signal = pd.DataFrame(signal_temp.to_numpy(), index = multiindex)

signal

# Choose a list of cells as working data

List strains

In [None]:
# Distinct values of the first index level, in order of first appearance
signal.index.unique(level = 0).to_list()

Define `signal_wd` as working data

In [None]:
# Working data: only the rows of `signal` selected by the label 'rim11_Del'
signal_wd = signal.loc['rim11_Del']

# Display the selection
signal_wd

In [None]:
# Working data: the whole of `signal` (rebinds `signal_wd` if a subset was chosen before)
signal_wd = signal

# Processing time series

## Range

Chop up time series according to `interval_start` and `interval_end`, then remove cells that have NaNs.  Print number of cells.

In [None]:
# PARAMETERS
interval_start = 25
interval_end = 168
#

# Keep the columns at positions interval_start .. interval_end - 1,
# then drop every row that still contains a NaN inside that window
signal_windowed = signal_wd.iloc[:, interval_start:interval_end]
signal_processed = signal_windowed.dropna()

signal_processed

## Detrend

Using sliding window (Alán)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# PARAMETERS
window = 45
#

# Heatmap of the signals as they stand before the trend is removed
fig, ax = plt.subplots()
sns.heatmap(signal_processed, ax = ax)
ax.set_title('Before detrending')
plt.show()

def moving_average(input_timeseries,
                  window = 3):
    """
    Return the moving average of a time series over `window` consecutive samples.

    Parameters
    ----------
    input_timeseries : array-like
        Samples to average.  `np.cumsum` is called without an axis, so a
        multi-dimensional input is flattened first.
    window : int
        Number of consecutive samples in each average.

    Returns
    -------
    numpy.ndarray
        Float array with one value per full window: for a 1-D input of
        length n >= window it has n - window + 1 elements.
    """
    # Prefix the running total with 0 so that every window sum is simply the
    # difference between two running totals `window` samples apart.
    running_total = np.concatenate(
        ([0.0], np.cumsum(input_timeseries, dtype=float))
    )
    window_sums = running_total[window:] - running_total[:-window]
    return window_sums / window

# Scale every row by its own mean
row_means = signal_processed.mean(axis = 1)
signal_processed = signal_processed.div(row_means, axis = 0)

# Row-wise moving average: the trend estimate
signal_movavg = signal_processed.apply(
    lambda row: pd.Series(moving_average(row.values, window)),
    axis = 1,
)

# Divide the signal by its trend.  The signal loses window//2 leading columns and
# the trailing columns selected away by -window//2 (i.e. (-window)//2, which floors),
# `window` columns in total; dropping the last moving-average column gives the same width.
signal_centre = signal_processed.iloc[:, window//2 : -window//2]
trend_values = signal_movavg.iloc[:, :-1].to_numpy()
signal_norm = signal_centre / trend_values

# Heatmap of the detrended signals
fig, ax = plt.subplots()
sns.heatmap(signal_norm, ax = ax)
ax.set_title('After detrending')
plt.show()

# Detrended signals become the processed signals used from here on
signal_processed = signal_norm

signal_processed

## Normalisation

Option 1: ...

In [None]:
# code

# Assign labels (if not already done)

Assign labels by scoring oscillations (human), and save scores

In [None]:
%matplotlib inline

# PARAMETERS
filename_category = 'test.csv'
#

# Show each processed time series in turn and record the label typed in for it
category_list = []
for timeseries in signal_processed.to_numpy():
    plt.plot(timeseries)
    plt.show(block=False)
    category = input('Is this oscillatory?: ')
    category_list.append(category)

# One label per row, keyed by the index of `signal_processed`
category_df = pd.DataFrame(category_list, index = signal_processed.index)

# Write WITHOUT a header row: the classifier cells read label files with
# `header = None, index_col = 0`, so a header line would be loaded as a data row.
# NOTE(review): if `signal_processed` has a two-level (strain, cellID) index, the first
# CSV column is the strain, whereas the reader treats the first column as cellID -- confirm
# which layout the label files are meant to have.
category_df.to_csv(filename_category, index=True, header=False)

Or, randomise scores and save them

In [None]:
# PARAMETERS
filename_category = 'random.csv'
#category_labels = [0,1,2]
#weights = [51/294, 135/294, 108/294]
category_labels = [0,1]
weights = [345/678, 333/678]
#

# Draw one random label per row in a single vectorised call
# (labels are drawn from `category_labels` with probabilities `weights`)
category_df = pd.DataFrame(
    np.random.choice(category_labels, size = len(signal_processed), p = weights),
    index = signal_processed.index,
)

# Write WITHOUT a header row: the classifier cells read label files with
# `header = None, index_col = 0`, so a header line would be loaded as a data row.
# NOTE(review): if `signal_processed` has a two-level (strain, cellID) index, the first
# CSV column is the strain, whereas the reader treats the first column as cellID -- confirm
# which layout the label files are meant to have.
category_df.to_csv(filename_category, index=True, header=False)

# Featurisation

TODO: Make choice of feature some kind of parameter within the overarching pipeline

Option 1: Use `catch22`

In [None]:
from postprocessor.core.processes.catch22 import catch22Parameters, catch22

# Run the catch22 processor, configured with its default parameters, on the processed signals
catch22_params = catch22Parameters.default()
catch22_processor = catch22(catch22_params)
features = catch22_processor.run(signal_processed)

# Overview of the resulting features
sns.heatmap(features)

Additionally, choose a subset of the `catch22` features

In [None]:
# Columns of `features` to keep
features_subset = [
    'CO_Embed2_Dist_tau_d_expfit_meandiff',
    'SP_Summaries_welch_rect_area_5_1',
    'SB_MotifThree_quantile_hh',
    'FC_LocalSimple_mean1_tauresrat',
    #'CO_f1ecac',
]
# Restrict the feature table to those columns (every row is kept)
features = features.loc[:, features_subset]

sns.heatmap(features)

Option 2: FFT spectrum

(Caution: there may be slight variations between this and the old notebook -- could be yet-to-be-debugged different behaviour in `postprocessor.core.processes.fft`)

In [None]:
from postprocessor.core.processes.fft import fftParameters, fft

# Run the fft processor, configured with its default parameters, on the processed signals.
# `run` returns a pair; only its second element is used as the features.
fft_params = fftParameters.default()
fft_processor = fft(fft_params)
_, features = fft_processor.run(signal_processed)

# Overview of the resulting features
sns.heatmap(features)

Option 3: concatenate both

In [None]:
from postprocessor.core.processes.catch22 import catch22Parameters, catch22
from postprocessor.core.processes.fft import fftParameters, fft

# catch22 features (default parameters)
catch22_params = catch22Parameters.default()
catch22_processor = catch22(catch22_params)
catch22_features = catch22_processor.run(signal_processed)

# fft features (default parameters); only the second element of the returned pair is used
fft_params = fftParameters.default()
fft_processor = fft(fft_params)
_, fft_features = fft_processor.run(signal_processed)

# Place both feature sets side by side, column-wise
features = pd.concat([catch22_features, fft_features], axis = 'columns')

sns.heatmap(features)

# Classifier pipeline

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import precision_score, recall_score, roc_curve, auc, roc_auc_score

# PARAMETERS
filename_targets = 'categories_19979_detrend.csv'
#filename_targets = 'random.csv'
train_size = 150
#

# (add import, processing)

# (add featurisation)

# Used to give every cross-validation fold its own unfitted copy of the pipeline
from sklearn.base import clone

# Import target values: the file is read without a header row, with its first column
# as the index (named cellID), and then re-ordered to follow the rows of `features`
targets_df = pd.read_csv(filename_targets, header = None, index_col = 0)
targets_df.index.names = ['cellID']
targets = targets_df.loc[features.index.get_level_values('cellID')].to_numpy().flatten()

## TODO: option to remove class 2 ones if three classes defined but I want a binary classifier

# Create classifier (pipeline)
classifier = make_pipeline(
    StandardScaler(),
    #SVC(gamma='auto', probability=True),
    RandomForestClassifier(),
)

# Split training & testing
features_train, features_test, targets_train, targets_test = train_test_split(
    features, targets,
    train_size = train_size,
)

## TODO: implement expanded testing list in the case I want a binary classifier but the data was classified into three classes.

# Fit
classifier.fit(features_train, targets_train)

# Predict categories
targets_predicted = classifier.predict(features_test)
# Print cellIDs predicted to be in each category
predictions_dict = {}
for class_label in set(targets):
    predictions_dict[class_label] = features_test.iloc[targets_predicted == class_label].index.to_numpy()
print('Predictions')
print(predictions_dict)

# Get probabilities
targets_proba = classifier.predict_proba(features_test)
#pd.set_option('display.max_rows', None)
targets_proba_df = pd.DataFrame(targets_proba, index = features_test.index)
# Sorted by probability of oscillation.  Kept in a separate variable (the result used to
# be discarded): `targets_proba_df` must stay in the row order of `targets_test`, because
# the ROC curve below pairs the two position by position.
targets_proba_sorted = targets_proba_df.sort_values(by=[1])
# Plot histogram of probabilities
fig, ax = plt.subplots()
plt.hist(targets_proba_df.iloc[:,1], 40)
plt.title('Histogram of probabilities')
plt.xlabel('Probability of oscillation')
plt.ylabel('Frequency')

## TODO: visualisations of time series & feature vectors of each group

# Verify by doing it again with k-fold cross-validation
kf = StratifiedKFold(n_splits = 5)
print('k-fold cross-validation')
for train_index, test_index in kf.split(features, targets):
    # Split training-testing
    features_train_kf, features_test_kf = features.iloc[train_index], features.iloc[test_index]
    targets_train_kf, targets_test_kf = targets[train_index], targets[test_index]

    # Train & predict on a fresh copy, so that `classifier` remains the model fitted on
    # the training split above (the one evaluated by the ROC curve and used by later cells)
    classifier_kf = clone(classifier)
    classifier_kf.fit(features_train_kf, targets_train_kf)
    targets_predicted_kf = classifier_kf.predict(features_test_kf)

    # Compute measures
    print(
        'Precision ' +
        '%.4f' % precision_score(targets_test_kf, targets_predicted_kf, average='weighted') +
        ' Recall ' +
        '%.4f' % recall_score(targets_test_kf, targets_predicted_kf, average='weighted')
        )

# ROC curve, from the hold-out probabilities computed before the cross-validation
scores = targets_proba_df.iloc[:,1]
false_positive_rate, true_positive_rate, _ = roc_curve(targets_test, scores)
fig, ax = plt.subplots()
plt.plot(false_positive_rate, true_positive_rate)
plt.title('ROC curve')
plt.xlabel('false positive rate')
plt.ylabel('true positive rate')
print('ROC curve: area under curve is ' + '%.4f' % auc(false_positive_rate, true_positive_rate))

If using a random forest classifier, get feature importances

In [None]:
# Pull the random-forest step out of the pipeline by its step name
forest_step = classifier.named_steps['randomforestclassifier']

# One importance value per feature column
feature_importances = pd.DataFrame(
    {'importance': forest_step.feature_importances_},
    index = features.columns,
)

# Most important features first
feature_importances_sorted = feature_importances.sort_values(
    by = 'importance',
    ascending = False,
)
feature_importances_sorted

In [None]:
# Cumulative importance, adding features from most to least important
fig, axs = plt.subplots()
axs.plot(np.cumsum(feature_importances_sorted['importance'].to_numpy()))
axs.set_xlabel('Feature')
axs.set_ylabel('Cumulative importance')
plt.show()

# Importance of each feature, in the original column order
fig, axs = plt.subplots()
axs.bar(
    x = np.arange(len(feature_importances)),
    height = feature_importances['importance'].to_numpy(),
    tick_label = feature_importances.index,
)
plt.xticks(rotation = 'vertical')
axs.set_xlabel('Feature')
axs.set_ylabel('Importance')
plt.show()

Test block for training on one dataset and testing on another

In [None]:
# PARAMETERS
filename_targets = 'categories_zwf_detrend.csv'
#filename_targets = 'random.csv'
train_size = 150
#

# (add import, processing)

# (add featurisation)

# Used to give every cross-validation fold its own unfitted copy of the pipeline
from sklearn.base import clone

# Import target values: the file is read without a header row, with its first column
# as the index (named cellID), and then re-ordered to follow the rows of `features`
# (same alignment as in the classifier-pipeline cell; without it the labels are only
# correct if the file happens to list exactly the rows of `features`, in the same order)
targets_df = pd.read_csv(filename_targets, header = None, index_col = 0)
targets_df.index.names = ['cellID']
targets = targets_df.loc[features.index.get_level_values('cellID')].to_numpy().flatten()

## TODO: option to remove class 2 ones if three classes defined but I want a binary classifier

# Split training & testing
# (`classifier` is not fitted in this cell; only the test part is used for the predictions below)
features_train, features_test, targets_train, targets_test = train_test_split(
    features, targets,
    train_size = train_size,
)


#######################

# Predict categories
targets_predicted = classifier.predict(features_test)
# Print cellIDs predicted to be in each category
predictions_dict = {}
for class_label in set(targets):
    predictions_dict[class_label] = features_test.iloc[targets_predicted == class_label].index.to_numpy()
print('Predictions')
print(predictions_dict)

# Get probabilities
targets_proba = classifier.predict_proba(features_test)
pd.set_option('display.max_rows', None)
targets_proba_df = pd.DataFrame(targets_proba, index = features_test.index)
# Sorted by probability of oscillation.  Kept in a separate variable (the result used to
# be discarded): `targets_proba_df` must stay in the row order of `targets_test`, because
# the ROC curve below pairs the two position by position.
targets_proba_sorted = targets_proba_df.sort_values(by=[1])
# Plot histogram of probabilities
fig, ax = plt.subplots()
plt.hist(targets_proba_df.iloc[:,1], 40)
plt.title('Histogram of probabilities')
plt.xlabel('Probability of oscillation')
plt.ylabel('Frequency')

## TODO: visualisations of time series & feature vectors of each group

# Verify by doing it again with k-fold cross-validation
kf = StratifiedKFold(n_splits = 5)
print('k-fold cross-validation')
for train_index, test_index in kf.split(features, targets):
    # Split training-testing
    features_train_kf, features_test_kf = features.iloc[train_index], features.iloc[test_index]
    targets_train_kf, targets_test_kf = targets[train_index], targets[test_index]

    # Train & predict on a fresh copy, so that the already-trained `classifier` being
    # tested in this cell is not re-fitted on this cell's data
    classifier_kf = clone(classifier)
    classifier_kf.fit(features_train_kf, targets_train_kf)
    targets_predicted_kf = classifier_kf.predict(features_test_kf)

    # Compute measures
    print(
        'Precision ' +
        '%.4f' % precision_score(targets_test_kf, targets_predicted_kf, average='weighted') +
        ' Recall ' +
        '%.4f' % recall_score(targets_test_kf, targets_predicted_kf, average='weighted')
        )

# ROC curve, from the probabilities computed before the cross-validation
scores = targets_proba_df.iloc[:,1]
false_positive_rate, true_positive_rate, _ = roc_curve(targets_test, scores)
fig, ax = plt.subplots()
plt.plot(false_positive_rate, true_positive_rate)
plt.title('ROC curve')
plt.xlabel('false positive rate')
plt.ylabel('true positive rate')
print('ROC curve: area under curve is ' + '%.4f' % auc(false_positive_rate, true_positive_rate))

In [None]:
# Do something with this thing
# NOTE(review): `Wlist_backup` and `Traininglist` are not defined anywhere in the visible
# part of this notebook -- presumably leftovers from an earlier, object-based version;
# confirm where they come from before running this cell.

# Collect the `MATLABid` attribute of every object in each of the two lists
Wlist_backup_MATLABids = [cell.MATLABid for cell in Wlist_backup]
Traininglist_MATLABids = [cell.MATLABid for cell in Traininglist]

# Objects of `Wlist_backup` whose MATLABid does not occur in `Traininglist`
Testinglist_expanded = [cell for cell in Wlist_backup if cell.MATLABid not in Traininglist_MATLABids]

# Stack the `flavin.feature_vector` of each of those objects into one array
testing_data_expanded = np.array([cell.flavin.feature_vector for cell in Testinglist_expanded])

# Number of distinct objects in the expanded list
len(set(Testinglist_expanded))