In [None]:
import os
import sys
# Put the parent directory of this notebook on the import path, so that
# packages located one level up can be imported from here.
module_path = os.path.abspath(os.pardir)
already_importable = module_path in sys.path
if not already_importable:
    sys.path.append(module_path)
    
# Pick up edits to imported project modules without restarting the kernel
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook
%matplotlib inline

Imports

In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import pandas as pd
import seaborn as sns

from postprocessor.core.multisignal.crosscorr import crosscorr
from postprocessor.routines.median_plot import median_plot
from postprocessor.core.processes.findpeaks import findpeaks

# Load data

In [None]:
# Directory holding the raw CSV files, relative to this notebook
data_dir = "../data/raw/"
# File-name prefixes of the two groups whose time series are loaded below
group1_name, group2_name = "is20016_zwf1egf", "is20016_by4741"

In [None]:
# Path prefix shared by all files of each group
filepath1 = data_dir + group1_name
filepath2 = data_dir + group2_name

# Full file paths. The labels path is only built here; it is not read in
# the cells shown in this notebook.
timeseries1_filepath = filepath1 + "_timeseries.csv"
labels1_filepath = filepath1 + "_labels.csv"
timeseries2_filepath = filepath2 + "_timeseries.csv"

# Read both time-series tables; the first three CSV columns form the row
# MultiIndex of each frame.
timeseries1_df, timeseries2_df = (
    pd.read_csv(csv_path, index_col=[0, 1, 2])
    for csv_path in (timeseries1_filepath, timeseries2_filepath)
)

Join dataframes

In [None]:
# Stack the rows of the two groups into a single frame
timeseries_df = pd.concat((timeseries1_df, timeseries2_df), axis=0)

In [None]:
# Display the joined frame for inspection
timeseries_df

In [None]:
# Keep only the rows that contain no missing value at all
timeseries_dropna = timeseries_df.dropna(axis="index", how="any")

# First-approach period estimation

Using autocorrelation function

In [None]:
# Strain to analyse: rows are kept when this text occurs anywhere in their
# 'position' index label (substring match).
strain_name = "zwf1egf"
position_labels = timeseries_dropna.index.get_level_values('position')
strain_mask = [strain_name in label for label in position_labels]
# Boolean list -> positional row selection
timeseries_strain = timeseries_dropna.iloc[strain_mask]

In [None]:
# Options passed to the correlation routine. Only one set of time series is
# supplied, so this is presumably an autocorrelation (as the section heading
# says) - confirm against the `crosscorr` documentation.
acf_options = dict(stationary=False, normalised=True, only_pos=True)
autocorr_df = crosscorr.as_function(timeseries_strain, **acf_options)

In [None]:
fig, ax = plt.subplots()

# Draw the autocorrelation functions on `ax`
# (see `median_plot` for how the individual rows are summarised).
median_plot(
    autocorr_df,
    xlabel="Lag (time points)",
    ylabel="Autocorrelation function",
    ax=ax,
)
# Reference lines through the origin. They are drawn on `ax` explicitly so
# they cannot land on a different figure if the plotting helper changes
# pyplot's implicit "current axes".
ax.axhline(0, color="k")
ax.axvline(0, color="k");  # trailing ';' hides the Line2D repr in the output

In [None]:
# Average the autocorrelation functions into a single-row frame and run the
# peak finder on it. The result is used as a mask below: non-zero entries
# mark the lags flagged as peaks.
mean_acf_df = autocorr_df.mean().to_frame().T
peaks_df = findpeaks.as_function(mean_acf_df)

# datatype conversions
lagaxis = mean_acf_df.columns.to_numpy().astype(float)
mean_acf = mean_acf_df.to_numpy()[0]
peaks_mask = peaks_df.to_numpy()[0] != 0

# Candidate periods: lags flagged as peaks. Non-positive lags are excluded
# because a period of zero (or less) is meaningless and would break the
# `24 / est_period` rescaling further down.
peak_lags = lagaxis[peaks_mask]
peak_lags = peak_lags[peak_lags > 0]
if peak_lags.size == 0:
    raise ValueError(
        "No peak at a positive lag was found in the mean autocorrelation "
        "function; cannot estimate a period."
    )

# Get location of first peak, as an estimate of period
est_period = peak_lags[0]
print(est_period)

# Scale time axis

Target: rescale the time axis so that its units are hours and the estimated oscillation period corresponds to 24 hours.

In [None]:
# Display the strain subset before its time axis is rescaled
timeseries_strain

In [None]:
# Original time axis, taken from the column labels
timeaxis = timeseries_strain.columns.to_numpy().astype(float)
n_timepoints = len(timeaxis)

# One estimated period (est_period steps) is mapped onto 24 units of the new
# axis, so each step between consecutive time points is 24/est_period long.
step_length = 24/est_period
stop = (n_timepoints-1)*step_length
scaled_timeaxis = np.linspace(0, stop, num=n_timepoints)

# Relabel the columns on a copy; `timeseries_strain` keeps its original axis.
timeseries_scaled = timeseries_strain.copy()
timeseries_scaled.columns = scaled_timeaxis

timeseries_scaled

## Scaling back

Note: the original phase shift will be lost (i.e. the time series will start from 0 rather than from its original starting time), but this information is not needed.

In [None]:
# Read the time axis back from the column labels of `timeseries_scaled`
timeaxis_scaled = timeseries_scaled.columns.to_numpy()

In [None]:
# Sampling period of the original recording
# NOTE(review): the unit is not stated in this notebook (presumably minutes) - confirm.
sampling_pd = 5

# Multiplying by est_period/24 undoes the rescaling (back to a count of
# sampling steps); multiplying by sampling_pd converts steps to time.
scaling_factor = (est_period/24) * sampling_pd
timeaxis_unscaled = np.multiply(timeaxis_scaled, scaling_factor)

In [None]:
# Display the time axis after scaling back
timeaxis_unscaled

## Alternatively, skip scaling

In [None]:
# Running the notebook top to bottom always reaches this cell, so the choice
# between the rescaled and the original time axis is made explicit here
# instead of depending on which cells were executed by hand.
#   True  -> discard the rescaling and continue with the original time axis
#            (this is what the cell did unconditionally before);
#   False -> keep `timeseries_scaled` as produced by the scaling cell above.
skip_scaling = True
if skip_scaling:
    timeseries_scaled = timeseries_strain.copy()

# Change data format

In [None]:
# Transpose so that time runs down the rows and each time series is a column
timeseries_out = timeseries_scaled.T
# Every column gets the strain name as its header
new_columns = [strain_name for _ in range(len(timeseries_out.columns))]
timeseries_out.columns = new_columns

In [None]:
# Build the output name from `strain_name` so the file is labelled with the
# strain that was actually exported (the previous hard-coded name said
# "by4741" even though the selected strain is "zwf1egf").
excel_filepath = "../data/interim/is20016_" + strain_name + "_timeseries_biodare2.xlsx"
timeseries_out.to_excel(excel_filepath)

In [None]:
# Alternatively, CSV
# The file name is derived from `strain_name` so it always matches the
# exported strain (for "zwf1egf" this is the same path as before).
csv_filepath = "../data/interim/is20016_" + strain_name + "_timeseries_biodare2.csv"
timeseries_out.to_csv(csv_filepath)