In [None]:
import os
import sys
# Put the parent directory of the working directory on the import path so that
# project-local packages located there (e.g. `src`, imported further down) resolve.
# Guarded so that re-running the cell does not append the same entry twice.
module_path = os.path.abspath('..')
if module_path not in sys.path:
    sys.path.append(module_path)
    
%reload_ext autoreload
%autoreload 2
%matplotlib inline

Imports

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import umap

from postprocessor.core.processes.catch22 import catch22
from postprocessor.core.processes.standardscaler import standardscaler

In [None]:
from src.umapper import umapper

# Load data

In [None]:
# Directory holding the raw CSV exports, relative to the notebook's working directory.
data_dir = "../data/raw/"
# File-name stems of the two datasets; "_timeseries.csv" and "_labels.csv" are
# appended to each stem when the tables are loaded.
group1_name = "st01253_tsa1tsa2morgan"
group2_name = "st01253_by4742swain"

In [None]:
# Read the time-series and label tables of both groups. In every file the first
# three CSV columns are used as the row MultiIndex.

# --- Group 1 ---
filepath1 = f"{data_dir}{group1_name}"
timeseries1_filepath = f"{filepath1}_timeseries.csv"
labels1_filepath = f"{filepath1}_labels.csv"
timeseries1_df = pd.read_csv(timeseries1_filepath, index_col=[0, 1, 2])
labels1_df = pd.read_csv(labels1_filepath, index_col=[0, 1, 2])

# --- Group 2 ---
filepath2 = f"{data_dir}{group2_name}"
timeseries2_filepath = f"{filepath2}_timeseries.csv"
labels2_filepath = f"{filepath2}_labels.csv"
timeseries2_df = pd.read_csv(timeseries2_filepath, index_col=[0, 1, 2])
labels2_df = pd.read_csv(labels2_filepath, index_col=[0, 1, 2])

Join dataframes

In [None]:
# Stack the two groups row-wise (group 1 rows first, then group 2) so the rest of
# the notebook works on a single time-series table and a single labels table.
timeseries_df = pd.concat([timeseries1_df, timeseries2_df])
labels_df = pd.concat([labels1_df, labels2_df])

In [None]:
# Preview the combined time-series table (rendered as the cell output).
timeseries_df

In [None]:
# Preview the combined labels table (rendered as the cell output).
labels_df

In [None]:
# Persist the combined labels for use outside this notebook.
# NOTE(review): assumes the ../data/processed/ directory already exists — confirm.
labels_df.to_csv("../data/processed/labels_df.csv")

# Featurise (time series --> catch22)

In [None]:
# Turn the combined time-series table into a catch22 feature table using the
# postprocessor `catch22` process.
# NOTE(review): presumably one row of features per input time series — confirm
# against postprocessor.core.processes.catch22.
features_df = catch22.as_function(timeseries_df)

In [None]:
# Preview the catch22 feature table (rendered as the cell output).
features_df

# Scale (standardscaler)

In [None]:
# Standardise the catch22 features. The table is transposed before scaling and
# transposed back afterwards, so the result has the same orientation as features_df.
# NOTE(review): presumably needed because standardscaler normalises along the
# opposite axis to the one wanted here — confirm against postprocessor.
features_scaled = standardscaler.as_function(features_df.T).T

In [None]:
# Preview the scaled feature table (rendered as the cell output).
features_scaled

In [None]:
# Persist the scaled features for use outside this notebook.
# NOTE(review): assumes the ../data/processed/ directory already exists — confirm.
features_scaled.to_csv("../data/processed/features_scaled.csv")

# UMAP

## Old way

In [None]:
# Bodge: add strain so it plays well with old umapper code
# This prepends an outer "strain" index level with the single value
# 'tsa1tsa2morgan' to every row.
# NOTE(review): features_df is computed from the concatenation of BOTH groups, so
# rows from the by4742swain group are also tagged 'tsa1tsa2morgan' here; it is also
# the unscaled feature table, not features_scaled. Confirm both are intended.
features_strain_df = pd.concat({'tsa1tsa2morgan': features_df}, names=['strain'])

In [None]:
# Old approach: let the project's umapper helper compute and draw the UMAP in one
# call, onto a 6x6 inch axes, labelling points by the "strain" index level.
# NOTE(review): no seed is passed here; whether umap_plot accepts or sets one is
# not visible from this notebook — check src.umapper if reproducibility matters.
fig_umap, ax_umap = plt.subplots(figsize=(6,6))
umapper.umap_plot(
    data=features_strain_df,
    n_neighbors=20,
    min_dist=0.5,
    n_components=2,
    label_index="strain",
    ax=ax_umap
)

## Refactor: broken down, for more flexibility

In [None]:
# Configure the 2-D UMAP reducer used by all scatter plots below.
# UMAP is stochastic, so the seed is fixed: without it the embedding (and every
# figure derived from it) changes on each Restart & Run All.
reducer = umap.UMAP(
    n_neighbors=10,
    min_dist=0.05,
    n_components=2,
    random_state=42,
)

In [None]:
# Fit UMAP on the scaled features and keep the fitted object around as `mapper`.
mapper = reducer.fit(features_scaled)
# Low-dimensional coordinates, one row per row of features_scaled; the plots below
# use column 0 as x and column 1 as y.
embedding = mapper.embedding_

Dummy colour scheme

In [None]:
# Dummy colour scheme: placeholder labels that split the points into two
# arbitrary groups ('foo' for the first 300 rows, 'bar' for the remainder).
# The split is sized from the embedding instead of being hard-coded to 300 + 53,
# so the hue list always has exactly one entry per plotted point.
n_points = embedding.shape[0]
n_foo = min(300, n_points)
test_list = ['foo'] * n_foo + ['bar'] * (n_points - n_foo)
test_palette_map = {
    'foo': 'r',
    'bar': 'k',
}

fig, ax = plt.subplots(figsize=(6,6))
sns.scatterplot(
    x=embedding[:,0],
    y=embedding[:,1],
    hue=test_list,
    palette=test_palette_map,
    ax=ax,
)

Colour dots by strain

In [None]:
# Colour each point by strain. The strain is taken to be the part of the
# "position" index level before the first underscore; known prefixes are mapped
# to display names and unknown prefixes pass through unchanged.
strain_relabel_lookup = {
    'tsa1tsa2morgan': 'tsa1Δ tsa2Δ',
    'by4742swain': 'BY4742',
}
strain_palette_map = {
    'tsa1Δ tsa2Δ': 'C0',
    'BY4742': 'C1',
}
position_list = features_scaled.index.get_level_values("position").to_list()
strain_list = [
    strain_relabel_lookup.get(prefix, prefix)
    for prefix in (position.split("_")[0] for position in position_list)
]

fig, ax = plt.subplots(figsize=(6, 6))
sns.scatterplot(
    x=embedding[:, 0],
    y=embedding[:, 1],
    hue=strain_list,
    palette=strain_palette_map,
    ax=ax,
)

Colour dots by score

In [None]:
# Colour each point by its oscillation score.
#
# The scores are aligned to the rows of features_scaled with `reindex`, so the hue
# list always has one entry per embedded point, in the same row order. Selecting
# only the rows present in both tables would give a shorter list whenever a
# feature row has no label, and the hue would then not match the embedding.
common_idx = features_scaled.index.intersection(labels_df.index)  # kept for any later use
scores_aligned = labels_df["score"].reindex(features_scaled.index)
# NOTE(review): the mapping below treats 0 as oscillatory and 1 as
# non-oscillatory — confirm this matches the encoding used in the labels files.
scores_relabel_lookup = {
    0: 'Oscillatory',
    1: 'Non-oscillatory',
}
# Rows without a label get their own category instead of being dropped; other
# unrecognised score values pass through unchanged.
scores_list = [
    'Unlabelled' if pd.isna(item) else scores_relabel_lookup.get(item, item)
    for item in scores_aligned
]
scores_palette_map = {
    'Oscillatory': 'C2',
    'Non-oscillatory': 'C4',
    'Unlabelled': 'C7',
}

fig, ax = plt.subplots(figsize=(6,6))
sns.scatterplot(
    x=embedding[:,0],
    y=embedding[:,1],
    hue=scores_list,
    palette=scores_palette_map,
    ax=ax,
)

Combining strains and scores

In [None]:
# Combine strain and score into one label per point: oscillatory points are
# labelled by strain, non-oscillatory points share a single grey category.
# Every (strain, score) pair must contribute exactly one label. Skipping pairs
# with an unexpected score would make label_list shorter than the embedding and
# put the hue out of step with the plotted points, so they go to 'Other' instead.
label_list = []
for strain, score in zip(strain_list, scores_list):
    if score == 'Oscillatory':
        label_list.append(strain)
    elif score == 'Non-oscillatory':
        label_list.append(score)
    else:
        label_list.append('Other')

label_palette_map = {
    'Non-oscillatory': 'lightgrey',
    'tsa1Δ tsa2Δ': 'C0',
    'BY4742': 'C1',
    'Other': 'darkgrey',
}

fig, ax = plt.subplots(figsize=(6,6))
sns.scatterplot(
    x=embedding[:,0],
    y=embedding[:,1],
    hue=label_list,
    palette=label_palette_map,
    ax=ax,
)

# Graph-based clustering

Distance matrix

In [None]:
# Show the scaled feature table again: it is the input to the distance matrix below.
features_scaled

In [None]:
# Old: sklearn doesn't produce a symmetric distance matrix
# Kept for reference only: `distance_matrix` is reassigned by the scipy-based
# cell that follows, so this result is not used further down.
from sklearn.metrics.pairwise import euclidean_distances
distance_matrix = euclidean_distances(features_scaled)

In [None]:
# New: scipy produces a symmetric distance matrix and offers more flexibility
from scipy.spatial.distance import pdist, squareform
# Pairwise Euclidean distances between the rows of features_scaled, in scipy's
# condensed (1-D) form.
distances = pdist(features_scaled, metric="euclidean")
# Expand the condensed vector into the full square distance matrix.
distance_matrix = squareform(distances)

Prune

In [None]:
from src.utils.utils import graph_prune

In [None]:
# Prune the full distance matrix with the project helper before building a graph.
# NOTE(review): the meaning of the second argument (7) is not visible here —
# presumably the number of neighbours kept per node; confirm against
# src.utils.utils.graph_prune.
distance_matrix_pruned = graph_prune(distance_matrix, 7)

In [None]:
# Sanity check: shows True when the pruned matrix equals its own transpose,
# i.e. pruning kept the matrix symmetric.
np.array_equal(distance_matrix_pruned, distance_matrix_pruned.T)

Graph

In [None]:
import igraph as ig

In [None]:
# Build an undirected, weighted igraph graph from the pruned distance matrix
# (passed as nested Python lists), using the matrix entries as edge weights.
graph = ig.Graph.Weighted_Adjacency(distance_matrix_pruned.tolist(), mode="undirected")

Draw

In [None]:
# Draw the graph onto a matplotlib axes so that it renders inline in the notebook.
fig, ax = plt.subplots()
ig.plot(graph, target=ax)