**Purpose:** Compute similarities between the flavin time series of different same-length sections of the same experiment.

**Aims:**
- Import flavin signals from multiple strains in the same experiment (and thus same nutrient conditions).
- Process data: cut time series to duration of interest, detrend flavin signals.
- Featurise data: use `catch22`
- Compute the mutual information between pairs of strains, treating mutual information as any other machine learning measure.
  - Mutual information asks the question: can you tell apart a typical time series from dataset A and a typical time series from dataset B? 0 means 'no', 1 means 'yes', and intermediate values can be used as similarity measures.

**Paradigms:**
- Use `aliby`-style data structures and `postprocessor` processes for featurisation.

In [None]:
%matplotlib inline

# Import data

In [None]:
import numpy as np
import pandas as pd
import csv

# PARAMETERS
# Common path prefix of the experiment's CSV files; swap in one of the
# commented-out alternatives to analyse a different experiment.
filename_prefix = './data/arin/Omero19972_'
#filename_prefix = './data/arin/Omero20071_'
#filename_prefix = './data/arin/Omero20212_'
#

# Read the flavin signals and turn zeros into NaN (the CSV is constructed
# like that).
# NOTE(review): the replacement touches every column, identifier columns
# included -- confirm that no position/cellID is legitimately 0.
signal = pd.read_csv(filename_prefix + 'flavin.csv').replace(0, np.nan)

# Build the position -> strain look-up table from its own CSV
# (would prefer to go directly CSV -> dict)
strainlookup_df = pd.read_csv(filename_prefix + 'strains.csv')
strainlookup_dict = dict(
    zip(strainlookup_df['position'], strainlookup_df['strain'])
)

# Swap position labels for the more informative strain names, rename the
# column to match, and discard the 'distfromcentre' column
signal = (
    signal.replace({'position': strainlookup_dict})
    .rename(columns={'position': 'strain'})
    .drop(columns=['distfromcentre'])
)

# Re-index by (strain, cellID); everything from the third column onwards is
# kept as the data, with default integer column labels
multiindex = pd.MultiIndex.from_frame(signal[['strain', 'cellID']])
signal_temp = signal.iloc[:, 2:]
signal = pd.DataFrame(signal_temp.to_numpy(), index=multiindex)

signal

# Choose a list of cells as working data

## Strains

List strains

In [None]:
# Unique values of index level 0 (the 'strain' level of the MultiIndex), as a plain list
signal.index.get_level_values(0).unique().to_list()

Define `signal_wd` as working data

In [None]:
# Pick the strain(s) to analyse: the active line keeps a single strain, the
# commented-out line is an alternative that keeps several.
# The list-style indexer keeps the result a DataFrame with the full MultiIndex.
#signal_wd = signal.loc[['htb2_mCherry_CRISPR', 'CEN_PK_Mat_A_Koetter', 'rim11_Del', 'swe1_Del', 'tsa1_Del_tsa2_Del']]
signal_wd = signal.loc[['htb2_mCherry_CRISPR']]

signal_wd

In [None]:
# Use every strain as working data. Running this cell replaces any earlier
# strain selection stored in `signal_wd`; it binds the same object as
# `signal` rather than a copy.
signal_wd = signal

## Oscillatory/Non-oscillatory

Load labels

In [None]:
# CSV of per-cell oscillation labels: no header row, first column is used as the index.
# NOTE(review): the file name says 19979 while `filename_prefix` points at
# Omero19972 -- confirm the labels belong to the same experiment.
filename_targets = 'categories_19979_detrend.csv'

labels_df = pd.read_csv(filename_targets, index_col=0, header=None)
# Name the index and the single label column so later cells can refer to them
labels_df = labels_df.rename_axis(index='cellID')
labels_df.columns = ['osc_category']

#labels_df

Specify whether to include:
- non-oscillatory cells only ([0])
- oscillatory cells only ([1])
- all cells ([0, 1])

In [None]:
# Categories to keep (see the options listed above this cell)
osc_categories_to_include = [1]

# cellIDs whose label is one of the requested categories
is_included = labels_df['osc_category'].isin(osc_categories_to_include)
indices_by_osc = labels_df.loc[is_included].index

# Keep only the cells that appear both in the working data and in that set,
# across all strains
cell_ids = signal_wd.index.get_level_values('cellID')
indices_intersect = cell_ids.intersection(indices_by_osc)
signal_wd = signal_wd.loc[pd.IndexSlice[:, indices_intersect], :]

signal_wd

# Processing time series

## Range

Define the two durations to compare (they should be same length, but I haven't built in validation here yet), remove NaNs, and re-shape the `DataFrame` accordingly.

In [None]:
# PARAMETERS
# Column positions of the two sections to compare (start inclusive, end exclusive)
interval1_start = 0
interval1_end = 84
interval2_start = 96
interval2_end = 180
#

# The two sections are stacked on a shared set of columns below, so they must
# span the same number of time points; fail early with a clear message.
if interval1_end - interval1_start != interval2_end - interval2_start:
    raise ValueError(
        'The two intervals must have the same length, got {} and {}'.format(
            interval1_end - interval1_start, interval2_end - interval2_start
        )
    )

# Cut out each section and drop the cells (rows) that have a NaN inside it
signal_processed1 = signal_wd.iloc[:, interval1_start:interval1_end].dropna()
signal_processed2 = signal_wd.iloc[:, interval2_start:interval2_end].dropna()

# Length of the first interval (not used further in this cell)
shift = interval1_end - interval1_start
# Give the second section the column labels of the first so rows can be stacked
signal_processed2.columns = signal_processed1.columns
# Tag EVERY strain of the second section with '_shift'. Renaming only the
# first strain (as a substring replacement) left the other strains of the
# second section indistinguishable from their first-section rows whenever the
# working data held more than one strain.
signal_processed2.index = signal_processed2.index.set_levels(
    ['{}_shift'.format(name) for name in signal_processed2.index.levels[0]],
    level=0,
)

# Stack both sections: each strain now appears as '<strain>' and '<strain>_shift'
signal_processed = pd.concat([signal_processed1, signal_processed2])

signal_processed

## Detrend

Using sliding window

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# PARAMETERS
# Width (in columns, i.e. time points) of the moving-average window
window = 45
#

# Heatmap of the signals as they are before detrending, drawn on its own figure
fig, ax = plt.subplots()
sns.heatmap(signal_processed, ax=ax)
ax.set_title('Before detrending')
plt.show()

def moving_average(input_timeseries, window=3):
    """Return the trailing moving average of a 1-D sequence.

    Element ``i`` of the result is the mean of
    ``input_timeseries[i : i + window]``, so the output has
    ``len(input_timeseries) - window + 1`` entries (it is empty when the
    window is longer than the input).

    Parameters
    ----------
    input_timeseries : array-like
        Values to smooth; accumulated as floats.
    window : int, default 3
        Number of consecutive values averaged together.

    Returns
    -------
    numpy.ndarray
        Float array of window means.
    """
    running_total = np.cumsum(input_timeseries, dtype=float)
    # Sum over a window = running total at its end minus the running total
    # just before its start (the first window needs no subtraction).
    window_sums = running_total.copy()
    window_sums[window:] -= running_total[:-window]
    return window_sums[window - 1:] / window

# Scale each time series (row) by its own mean
signal_processed = signal_processed.div(signal_processed.mean(axis = 1), axis = 0)
# Row-wise moving average; each row of the result has
# (n_timepoints - window + 1) entries when window <= n_timepoints
signal_movavg = signal_processed.apply(lambda x: pd.Series(moving_average(x.values, window)), axis = 1)
# Divide the signal by its moving average to remove the trend.
# NOTE: `-window//2` parses as `(-window)//2` (floor division), e.g. -23 for
# window = 45, so the column slice keeps (n_timepoints - window) columns.
# The last moving-average column is dropped so both operands have that same
# width, and `.values` makes the division positional instead of aligning on
# column labels. The result keeps the column labels of the sliced signal.
signal_norm = signal_processed.iloc(axis = 1)[window//2: -window//2] / signal_movavg.iloc[:,0:signal_movavg.shape[1]-1].values

# Heatmap of the detrended signals, for comparison with the plot above
fig, ax = plt.subplots()
sns.heatmap(signal_norm)
plt.title('After detrending')
plt.show()

# From here on, `signal_processed` refers to the detrended (and shorter) series
signal_processed = signal_norm

signal_processed

# Featurisation

Option 1: use `catch22`

In [None]:
from postprocessor.core.processes.catch22 import catch22Parameters, catch22

# Build the catch22 post-processor from its default parameters and run it on
# the processed signals; the result becomes the feature table
catch22_params = catch22Parameters.default()
catch22_processor = catch22(catch22_params)
features = catch22_processor.run(signal_processed)

# Quick visual check of the feature table
sns.heatmap(features)

Option 2: use time points

In [None]:
# Use the processed time points themselves as the features. Running this cell
# replaces whatever `features` held before (e.g. the catch22 output of Option 1).
features = signal_processed

# Quick visual check of the feature table
sns.heatmap(features)

# Mutual information bit

## Use all strains in dataframe

Convert `DataFrame` to list of arrays as input for `estimateMI`, then compute mutual information

In [None]:
from postprocessor.core.processes.mi import miParameters, mi

# Start from the processor's default parameters and switch `overtime` off
# NOTE(review): the meaning of `overtime` is defined in postprocessor's mi
# module, not here -- confirm against that module.
mi_params = miParameters.default()
mi_params.overtime = False
mi_processor = mi(mi_params)
# Run on the whole feature table at once
# (presumably groups are taken from the 'strain' index level -- confirm)
results = mi_processor.run(features)

In [None]:
# Display the output of `mi_processor.run(features)`
results

## Distance matrix based on pairwise combinations of strains

Compute distance matrix

In [None]:
import itertools
from postprocessor.core.processes.mi import miParameters, mi

# There is probably a smarter way to compute a distance matrix using a custom distance metric --
# maybe there's a scipy/numpy/scikit-learn routine that simplifies code, and I can put the
# MI-computing bit as a function.

# Default MI parameters, with `overtime` off and 100 bootstraps
mi_params = miParameters.default()
mi_params.overtime = False
mi_params.n_bootstraps = 100
mi_processor = mi(mi_params)

strain_list = features.index.get_level_values('strain').unique().to_numpy()
# Using itertools.combinations_with_replacement instead of two 'for' loops,
# one for each axis, because mi.run() is computationally expensive.
# Plus, I only need the upper triangular (and the diagonal) anyway; the lower
# triangle of the matrix is left at zero.
distance_matrix = np.zeros((len(strain_list), len(strain_list)))
# Enumerating the strains gives the matrix position of each pair directly,
# instead of searching `strain_list` twice per iteration.
for (row, strain1), (col, strain2) in itertools.combinations_with_replacement(
    enumerate(strain_list), 2
):
    if strain1 == strain2:
        # Diagonal: compare the strain against a relabelled copy of itself.
        # Only the level that is exactly `strain1` is renamed; a substring
        # replacement would also alter other strains whose names contain it
        # (e.g. '<strain>_shift') and treats the name as a regex on older pandas.
        features_copy = features.loc[[strain1]]
        features_copy.index = features_copy.index.set_levels(
            [
                '{}_copy'.format(name) if name == strain1 else name
                for name in features_copy.index.levels[0]
            ],
            level=0,
        )
        features_subset = pd.concat([features.loc[[strain1]], features_copy])
    else:
        features_subset = features.loc[[strain1, strain2]]
    results = mi_processor.run(features_subset)
    # Element [0][1] of the output is used as the median MI
    # (NOTE(review): layout of `results` comes from postprocessor's mi -- confirm)
    median_mi = results[0][1]
    distance_matrix[row, col] = median_mi

# Visualise
sns.heatmap(
    pd.DataFrame(
        data = distance_matrix,
        index = strain_list,
        columns = strain_list,
    )
)

In [None]:
# Display the raw matrix; only the diagonal and upper triangle are filled in
distance_matrix

Hierarchical clustering

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage
import matplotlib.pyplot as plt

# linkage() takes the distances as a flat (condensed) vector: the strictly
# upper-triangular entries of the matrix, read row by row
upper_triangle = np.triu_indices(len(strain_list), k=1)
condensed_distances = distance_matrix[upper_triangle]

# 'average' selects the linkage algorithm
linkage_matrix = linkage(condensed_distances, method='average')

# Draw the tree sideways, one leaf per strain
dendrogram(
    linkage_matrix,
    labels=strain_list,
    orientation='left',
)
plt.xlabel('Distance')
plt.show()