### Connect Drive

In [None]:
# Mount the user's Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): the CSV paths used by the loading cells start at "/" rather
# than under /content/drive - confirm how they are expected to resolve.
from google.colab import drive
drive.mount('/content/drive') 

### Package Imports

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import math
import pandas as pd
import tensorflow as tf
from scipy.signal import savgol_filter
from collections import Counter
from collections import defaultdict
from scipy.optimize import curve_fit
from scipy.signal import filtfilt
from scipy.spatial import distance
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics.pairwise import euclidean_distances, manhattan_distances, cosine_distances
from sklearn.neighbors import KNeighborsClassifier
import seaborn as sns
import pickle

### GPU Device

In [2]:
!nvidia-smi -L

GPU 0: Tesla T4 (UUID: GPU-19e4dd9f-2cfa-fc78-1ab0-23174ea3a8e2)


In [3]:
# Device name of the GPU TensorFlow can see (printed below for reference).
# The name is reused by the `with tf.device(gpu):` scopes in the loading cells.
# NOTE(review): TensorFlow documents an empty string when no GPU is found -
# confirm the notebook still behaves as intended in that case.
gpu = tf.test.gpu_device_name()
print(gpu)

/device:GPU:0


### Pre-Processing Helper Functions

In [None]:
def decaying_exp(x, a, b):
    """Saturating-exponential drift model ``a * (1 - exp(-b * x))``.

    The curve starts at 0 for ``x = 0`` and, for positive ``b``, approaches
    ``a`` as ``x`` grows.

    Parameters
    ----------
    x : ndarray or float
        Times at which the model is evaluated
    a : double
        Asymptotic value reached for large ``x``
    b : double
        Rate constant of the exponential (the slope at ``x = 0`` is ``a * b``)

    Returns
    -------
    ndarray
        Model values, with the same shape as ``x``
    """
    decay = np.exp(-b * x)
    return a * (1 - decay)


def fit_pixels_interpolate(time, X, interpolate_idx):
    """Fit the drift model ``decaying_exp`` to the start of every pixel trace.

    Each column of ``X`` is first smoothed with a zero-phase 10-tap moving
    average (``filtfilt``). The first ``interpolate_idx`` samples of the
    smoothed trace are then fitted with ``decaying_exp`` using ``curve_fit``,
    starting from the initial guess ``a = -10``, ``b = 0.1``.

    Parameters
    ----------
    time : ndarray
        1D array with the T sampling times
    X : ndarray
        TxNM array of pixel traces (one column per pixel)
    interpolate_idx : int
        Only samples ``[0, interpolate_idx)`` take part in the fit

    Returns
    -------
    popt : ndarray
        Fitted parameters with shape 2xNM: row 0 holds ``a``, row 1 holds ``b``.
        A column is NaN when the fit for that pixel failed.

    Notes
    -----
    Errors raised by ``filtfilt`` itself (e.g. a trace that is too short for
    its default padding) are not caught and propagate to the caller.
    """
    # NaN marks "no fit"; a column is only overwritten by a successful fit.
    popt = np.full((2, X.shape[1]), np.nan)

    # Loop-invariant pieces of the fit.
    fit_time = time[:interpolate_idx]
    kernel = np.ones(10) / 10

    # for every pixel
    for i in range(X.shape[1]):
        data = filtfilt(b=kernel, a=[1], x=X[:, i])

        try:
            popt[:, i], _ = curve_fit(decaying_exp, fit_time, data[:interpolate_idx], p0=[-10, 0.1])
        except Exception:
            # Best effort: a pixel whose fit fails is reported as NaN. Catching
            # Exception (not a bare except) keeps Ctrl-C / kernel interrupts
            # working while thousands of pixels are being fitted.
            popt[:, i] = np.nan

    return popt

In [None]:
def filter_by_drift(df, interpolate_idx, max_drift_error=30):
  """ Filters pixels by how well they follow the fitted drift model

  The drift model ``decaying_exp`` is fitted to the first ``interpolate_idx``
  samples of every pixel and extrapolated over the whole time axis. A pixel is
  kept only when its signal stays within ``max_drift_error`` of that
  extrapolated curve at every sample.

  Parameters
  ----------
  df : pandas.DataFrame
    DataFrame with pixel data (index = sampling times, one column per pixel)
  interpolate_idx : int
    The fit is performed on the samples before this index
  max_drift_error : float, optional
    Largest allowed absolute deviation from the drift curve, in mV. Default is 30

  Returns
  -------
  df : pandas.DataFrame
    DataFrame with only the data from the active pixels
  drift_avg : numpy.array
    Average of the drift curves of the active pixels at each time stamp.
    All NaN when no pixel is active.
  """

  times = np.asarray(df.index, dtype=float)
  values = df.values  # read once; df.values can copy on every access

  popt = fit_pixels_interpolate(times, values, interpolate_idx)

  drift_sum = np.zeros(df.shape[0])
  active = np.zeros(df.shape[1], dtype=bool)

  for idx in range(df.shape[1]):
    a, b = popt[:, idx]

    # a failed fit is reported as NaN parameters -> pixel stays inactive
    if np.isnan(a) or np.isnan(b):
      continue

    # extrapolate the drift curve over the full time axis in one call
    drift = decaying_exp(times, a, b)
    drift_error = np.abs(values[:, idx] - drift)

    # only keep pixels that never leave the allowed band around the drift
    if (drift_error < max_drift_error).all():
      drift_sum += drift
      active[idx] = True

  pix_count = int(active.sum())
  if pix_count:
    drift_avg = drift_sum / pix_count
  else:
    # no pixel survived: the average drift is undefined (previously the same
    # NaNs came out of a 0/0 division together with a RuntimeWarning)
    drift_avg = np.full(df.shape[0], np.nan)

  return df.loc[:, active], drift_avg

In [None]:
def filter_by_vref(X, v_thresh=70):
    '''
    Flags pixels that show a large upward step within the first 10 samples

    Parameters
    ---------
    X : ndarray
        Input 2D array (T x NM). T = time samples, NM = total number of pixels
    v_thresh : int, optional
        Minimum size of the step d(i) = X(i+1) - X(i), in mV. Default is 70

    Returns
    -------
    ndarray
        1D array of bool with dimension (NM). True for a pixel when at least one
        of the consecutive differences among its first 10 samples (9 differences
        at most) is strictly greater than v_thresh
    '''
    early_steps = np.diff(X[:10, :], axis=0)  # sample-to-sample change at the start
    return np.any(early_steps > v_thresh, axis=0)

In [None]:
def filter_by_vrange(X, v_range=(100, 900)):
    '''
    Flags pixels whose samples all lie strictly inside v_range

    Parameters
    ---------
    X : ndarray
        Input 2D array (T x NM). T = time samples, NM = total number of pixels
    v_range : (int, int), optional
        (minimum, maximum) allowable voltage in mV; both bounds are exclusive.
        Default is (100, 900)

    Returns
    -------
    ndarray
        1D array of bool with dimension (NM). True for a pixel only when every
        one of its samples is above the minimum and below the maximum
    '''
    lowest = v_range[0]
    highest = v_range[1]
    inside = (X > lowest) & (X < highest)  # per-sample range check
    return np.all(inside, axis=0)


In [None]:
def filter_by_derivative(X, vthresh=5):
    """ Flags pixels whose sample-to-sample change always stays below vthresh

    Parameters
    ----------
    X : ndarray
        input 2D array of shape TxNM
    vthresh : int
        upper limit (exclusive) for the absolute difference between two
        consecutive samples. Default is 5

    Returns
    -------
    ndarray
        1D array of bool with dimension (NM). True for a pixel when the absolute
        value of every consecutive difference is below vthresh
    """
    deltas = np.diff(X, axis=0)
    within_limit = np.absolute(deltas) < vthresh
    return np.all(within_limit, axis=0)

In [None]:
def filter_active_pixels(df, v_thresh_ref=50, v_range=(100, 900), v_thresh_deriv=5):
  """ Keeps only the pixels that pass the reference-step, range and derivative checks

  Parameters
  ----------
  df : pandas.DataFrame
    Dataframe containing pixel data which will be filtered (one column per pixel)
  v_thresh_ref : int, optional
    Threshold passed to filter_by_vref (reference electrode voltage step). Default is 50
  v_range : (int, int), optional
    Tuple with the minimum and maximum allowable voltage in mV, passed to filter_by_vrange. Default is (100, 900)
  v_thresh_deriv : int, optional
    Threshold passed to filter_by_derivative. Default is 5

  Returns
  -------
  df : pandas.DataFrame
    DataFrame restricted to the pixels that satisfy all three checks
  """

  samples = df.values

  passes_vref = filter_by_vref(samples, v_thresh_ref)
  passes_range = filter_by_vrange(samples, v_range)
  passes_deriv = filter_by_derivative(samples, v_thresh_deriv)

  # a pixel is active only when every individual check accepts it
  keep = passes_vref & passes_range & passes_deriv
  return df.loc[:, keep]

In [None]:
def filter_active_pixels_deriv(df, v_thresh_deriv=5):
  """ Keeps only the pixels that pass the derivative check

  Parameters
  ----------
  df : pandas.DataFrame
    Dataframe containing pixel data which will be filtered (one column per pixel)
  v_thresh_deriv : int, optional
    Threshold passed to filter_by_derivative. Default is 5

  Returns
  -------
  df : pandas.DataFrame
    DataFrame restricted to the pixels accepted by filter_by_derivative
  """

  keep = filter_by_derivative(df.values, v_thresh_deriv)
  return df.loc[:, keep]

In [None]:
def filter_active_pixels_range(df, v_range=(100, 900)):
  """ Keeps only the pixels that pass the voltage range check

  Parameters
  ----------
  df : pandas.DataFrame
    Dataframe containing pixel data which will be filtered (one column per pixel)
  v_range : (int, int), optional
    Tuple with the minimum and maximum allowable voltage in mV, passed to filter_by_vrange. Default is (100, 900)

  Returns
  -------
  df : pandas.DataFrame
    DataFrame restricted to the pixels accepted by filter_by_vrange
  """

  keep = filter_by_vrange(df.values, v_range)
  return df.loc[:, keep]

In [None]:
def reshape_data(df, rows, cols):
  """ Unfolds TxNM pixel data into a TxNxM stack of frames

  T = number of time samples, N = number of rows, M = number of columns.
  The pixels of one time sample are laid out column by column (Fortran order):
  flat pixel k ends up at row k % rows, column k // rows.

  Parameters
  ----------
  df : pandas.DataFrame
    Dataframe containing pixel data which will be reshaped
  rows : int
    Number of rows
  cols : int
    Number of columns

  Returns
  -------
  X : numpy.ndarray
    3D array containing the reshaped data
  """

  frame_shape = (-1, rows, cols)  # -1 lets numpy infer the number of time samples
  return np.reshape(df.values, frame_shape, order='F')

In [None]:
def filter_chemical_pixels(df, arr_rows, arr_cols):
  """ Removes all the temperature pixels from the data

  Temperature pixels sit at every third row and every third column of the
  array, starting from row 1 / column 1. The mask is built from the array
  geometry alone, so a chemical pixel is never dropped because of its values
  (e.g. NaN samples).

  Parameters
  ----------
  df : pandas.DataFrame
    Dataframe containing pixel data which will be filtered. Must have exactly
    arr_rows * arr_cols columns, laid out column by column (same layout as
    reshape_data)
  arr_rows : int
    Number of rows of the pixel array
  arr_cols : int
    Number of columns of the pixel array

  Returns
  -------
  df : pandas.DataFrame
    DataFrame after temperature pixels are removed

  Raises
  ------
  ValueError
    If the number of columns of df does not match arr_rows * arr_cols
  """

  n_pixels = arr_rows * arr_cols
  if df.shape[1] != n_pixels:
    raise ValueError(
        "expected %d pixel columns (%d x %d), got %d"
        % (n_pixels, arr_rows, arr_cols, df.shape[1]))

  is_chemical = np.ones((arr_rows, arr_cols), dtype=bool)
  is_chemical[1::3, 1::3] = False  # temperature pixel positions

  # flatten column by column so the mask lines up with the column order of df
  return df.loc[:, is_chemical.flatten('F')]


In [None]:
def time_to_index(times, time_vect):
    '''Maps each requested time to the index of the nearest sampled time

    Parameters
    ---------
    times : list
        desired times
    time_vect : nparray
        times at which the values were sampled

    Returns
    -------
    list
        one entry per element of times: the index into time_vect of the sample
        closest to that time (the first such index when two samples are equally close)
    '''
    return [np.argmin(np.abs(time_vect - wanted)) for wanted in times]


def find_loading_time(time_vect, X, bounds=(600, 900)):  # for v2
    ''' Finds the point at which the signal has settled after loading (v2 chip data)

    The loading event is taken to be the largest rise of the pixel-averaged
    signal inside the search window; the signal is considered settled 10
    samples after it.

    Parameters
    ----------
    time_vect : ndarray
        1D array with dimension T containing the sampling times
    X : ndarray
        2D array with dimension TxNM containing the sampled data
    bounds : (int, int), optional
        (start, end) of the window in which the loading time is searched,
        expressed in the same units as time_vect. Default is (600, 900)

    Returns
    -------
    tuple
        - settled_index : index at which the settling occurs
        - settled_time : time at which the settling occurs
    '''

    # translate the window limits from times to sample indices
    window_start, window_end = time_to_index(bounds, time_vect)

    mean_trace = np.mean(X, axis=1)  # average over all pixels, one value per sample
    mean_steps = np.diff(mean_trace)  # sample-to-sample change of that average

    # steepest rise inside the window; +1 selects the sample right after the step
    steepest = np.argmax(mean_steps[window_start:window_end])
    loading_index = window_start + steepest + 1

    settled_index = loading_index + 10  # allow 10 samples of settling
    return settled_index, time_vect[settled_index]

In [None]:
def preprocess_data(df, deriv_thresh, deriv_thresh_bgsub=5, arr_rows=78, arr_cols=56):
  """Applies all pre-processing steps to single well experimental data

  Steps, in order: drop temperature pixels, drop inactive pixels, cut the data
  at the settling point, subtract each pixel's first sample (background),
  filter by derivative again, keep the first 400 samples, re-zero the time
  axis, filter by drift model, then build the averaged, drift-compensated and
  smoothed output.

  Parameters
  ----------
  df : pandas.DataFrame
    Dataframe containing pixel data which will be pre-processed
    (index = sampling times, one column per pixel)
  deriv_thresh : int
    Threshold for filtering by derivative
  deriv_thresh_bgsub : int, optional
    Threshold for filtering by derivative after background subtraction step
  arr_rows : int, optional
    Number of rows of the pixel array. Default is 78
  arr_cols : int, optional
    Number of columns of the pixel array. Default is 56

  Returns
  -------
  df : pandas.DataFrame
    DataFrame with only data from active pixels after pre-processing, plus the
    columns 'Average Output' and 'Average Drift'

  Notes
  -----
  The final smoothing uses a 101-sample Savitzky-Golay window, so fewer than
  101 remaining samples make ``savgol_filter`` raise.
  """

  df = filter_chemical_pixels(df, arr_rows, arr_cols) # drop the temperature pixels

  df = filter_active_pixels(df=df, v_thresh_deriv=deriv_thresh, v_range=(100,900))

  settle_idx, _ = find_loading_time(df.index, df, bounds=(600, 900)) # find settling point
  # skip 10 more samples on top of the settling margin find_loading_time already adds
  df = df.iloc[settle_idx + 10:, :]

  df = df.sub(df.iloc[0, :], axis='columns') # background subtraction: remove each pixel's first sample

  # apply the post-subtraction derivative filter only if it leaves any pixel
  deriv_filtered = filter_active_pixels_deriv(df=df, v_thresh_deriv=deriv_thresh_bgsub)
  if(len(deriv_filtered.columns) != 0):
    df = deriv_filtered

  df = df.iloc[0:150+250, :] # take only 400 samples after settling point (approx 19-20mins)

  df.index = df.index - df.index[0] # set the first time value to 0

  drift_filtered, drift = filter_by_drift(df, 40) # filter by fitting of pixel to drift model

  # fall back to the unfiltered pixels when the drift filter rejects everything
  drift_ok = len(drift_filtered.columns) != 0
  # explicit copy: the new columns below must not be written onto a slice/view
  df = (drift_filtered if drift_ok else df).copy()

  average_output = df.mean(axis=1) # mean over the remaining pixels
  if(drift_ok):
    average_output = average_output - drift # drift compensation

  # smooth out the noise, then store the summary columns
  df['Average Output'] = savgol_filter(average_output, 101, 3)
  df['Average Drift'] = drift

  return df

In [None]:
def preprocess_partial_data(df, deriv_thresh, deriv_thresh_bgsub=5):
  """Applies all pre-processing steps to double well experimental data

  Steps, in order: drop pixels outside the voltage range, filter by
  derivative, subtract each pixel's first sample (background), filter by
  derivative again, keep the first 400 samples, re-zero the time axis, filter
  by drift model, then build the averaged, drift-compensated and smoothed
  output.

  Parameters
  ----------
  df : pandas.DataFrame
    Dataframe containing pixel data which will be pre-processed
    (index = sampling times, one column per pixel)
  deriv_thresh : int
    Threshold for filtering by derivative
  deriv_thresh_bgsub : int, optional
    Threshold for filtering by derivative after background subtraction step

  Returns
  -------
  df : pandas.DataFrame
    DataFrame with only data from active pixels after pre-processing, plus the
    columns 'Average Output' and 'Average Drift'

  Notes
  -----
  The final smoothing uses a 101-sample Savitzky-Golay window, so fewer than
  101 remaining samples make ``savgol_filter`` raise.
  """

  df = filter_active_pixels_range(df=df, v_range=(100,900)) # filter by range in case of any saturation

  df = filter_active_pixels_deriv(df=df, v_thresh_deriv=deriv_thresh) # filter pixels by deriv

  df = df.sub(df.iloc[0, :], axis='columns') # background subtraction: remove each pixel's first sample

  # apply the post-subtraction derivative filter only if it leaves any pixel
  deriv_filtered = filter_active_pixels_deriv(df=df, v_thresh_deriv=deriv_thresh_bgsub)
  if(len(deriv_filtered.columns) != 0):
    df = deriv_filtered

  df = df.iloc[0:150+250, :] # take only the first 400 samples (approx 19-20mins)

  df.index = df.index - df.index[0] # set the first time value to 0

  drift_filtered, drift = filter_by_drift(df, 40) # filter by fitting of pixel to drift model

  # fall back to the unfiltered pixels when the drift filter rejects everything
  drift_ok = len(drift_filtered.columns) != 0
  # explicit copy: the new columns below must not be written onto a slice/view
  df = (drift_filtered if drift_ok else df).copy()

  average_output = df.mean(axis=1) # mean over the remaining pixels
  if(drift_ok):
    average_output = average_output - drift # drift compensation

  # smooth out the noise, then store the summary columns
  df['Average Output'] = savgol_filter(average_output, 101, 3)
  df['Average Drift'] = drift

  return df

### Data Loading Helper Functions

In [None]:
def load_partial_covid_exp(filepath):
  """ Loading in double well experimental data from csv files

  The two wells are stored next to each other in files that share the stem of
  ``filepath``: ``<stem>_bot.csv`` (positive well) and ``<stem>_top.csv``
  (negative well). ``filepath`` itself is never opened.

  Parameters
  ----------
  filepath : string
    Path whose stem (the path without its extension) identifies the experiment,
    e.g. ".../exp_summary_118.csv"

  Returns
  -------
  df_pos : pandas.DataFrame
    DataFrame with pixel data from the positive well (``_bot`` file)
  df_neg : pandas.DataFrame
    DataFrame with pixel data from the negative well (``_top`` file)
  """
  import os  # local import: the notebook's import cell does not provide os

  # splitext drops the real extension instead of blindly cutting 4 characters,
  # so paths without a ".csv"-sized suffix are no longer mangled
  stem, _ = os.path.splitext(filepath)

  ## load in 2 sheets (first row = header, first column = index)
  df_neg = pd.read_csv(stem + "_top.csv", header=0, index_col=0)
  df_pos = pd.read_csv(stem + "_bot.csv", header=0, index_col=0)

  return df_pos, df_neg

In [None]:
def save_data(filepath, dictionary):
  """ Pickles a dictionary to disk

  An existing file at ``filepath`` is overwritten.

  Parameters
  ----------
  filepath : string
    Path of the pickle file to write
  dictionary : dict(pandas.DataFrame)
    Dictionary of DataFrames that will be saved
  """
  with open(filepath, 'wb') as handle:
    pickle.dump(dictionary, handle)

In [None]:
def load_data(filepath):
  """ Reads back a dictionary that was pickled to disk

  Only use this on files you trust: unpickling can execute arbitrary code.

  Parameters
  ----------
  filepath : string
    Path of the pickle file to read

  Returns
  -------
  loaded_dict : dict(pandas.DataFrame)
    Dictionary of DataFrames that is loaded from the file
  """
  with open(filepath, 'rb') as handle:
    return pickle.load(handle)

### Evaluation Metric Helper Functions

In [None]:
def accuracy(classifications):
  """ Returns the value of the accuracy from predicted outputs and true outputs

  Parameters
  ----------
  classifications : dictionary(tuple(int,int))
    Dictionary whose values are pairs holding the predicted output and the
    true output of one classification (the order does not matter here).
    A pair containing None is an inconclusive prediction.

  Returns
  -------
  accuracy : double
    Fraction of entries whose two labels agree. Inconclusive entries are never
    counted as correct but still count towards the total. NaN when
    classifications is empty.
  """
  total = len(classifications)
  if total == 0:
    return float('nan')  # accuracy of nothing is undefined

  total_correct = 0
  for pair in classifications.values():

    if pair[0] is None or pair[1] is None:  ## prediction is inconclusive
      continue

    if pair[0] == pair[1]:
      total_correct += 1

  return total_correct / total

In [None]:
def sensitivity(classifications):
  """ Returns the value of the sensitivity from predicted outputs and true outputs

  Sensitivity = TP / (TP + FN), i.e. the fraction of truly positive samples
  that were predicted positive.

  Parameters
  ----------
  classifications : dictionary(tuple(int,int))
    Dictionary whose values are pairs (predicted output, true output); labels
    are 1 for positive and 0 for negative. A pair containing None is an
    inconclusive prediction and is ignored.

  Returns
  -------
  sensitivity : double
    Classification sensitivity. NaN when there is no conclusive entry with a
    positive true label.
  """
  true_pos = 0
  false_neg = 0

  for pair in classifications.values():

    if pair[0] is None or pair[1] is None:  # inconclusive, same rule as accuracy
      continue

    predicted = int(pair[0])
    true_label = int(pair[1])

    if true_label != 1:
      continue

    if predicted == 1:
      true_pos += 1
    elif predicted == 0:
      false_neg += 1

  positives = true_pos + false_neg
  if positives == 0:
    return float('nan')  # undefined without positive samples

  return true_pos / positives

In [None]:
def specificity(classifications):
  """ Returns the value of the specificity from predicted outputs and true outputs

  Specificity = TN / (TN + FP), i.e. the fraction of truly negative samples
  that were predicted negative.

  Parameters
  ----------
  classifications : dictionary(tuple(int,int))
    Dictionary whose values are pairs (predicted output, true output); labels
    are 1 for positive and 0 for negative. A pair containing None is an
    inconclusive prediction and is ignored.

  Returns
  -------
  specificity : double
    Classification specificity. NaN when there is no conclusive entry with a
    negative true label.
  """

  true_neg = 0
  false_pos = 0

  for pair in classifications.values():

    if pair[0] is None or pair[1] is None:  # inconclusive, same rule as accuracy
      continue

    predicted = int(pair[0])
    true_label = int(pair[1])

    if true_label != 0:
      continue

    if predicted == 0:
      true_neg += 1
    elif predicted == 1:
      false_pos += 1

  negatives = true_neg + false_pos
  if negatives == 0:
    return float('nan')  # undefined without negative samples

  return true_neg / negatives

In [None]:
def precision(classifications):
  """ Returns the value of the precision from predicted outputs and true outputs

  Precision = TP / (TP + FP), i.e. the fraction of positive predictions that
  are truly positive.

  Parameters
  ----------
  classifications : dictionary(tuple(int,int))
    Dictionary whose values are pairs (predicted output, true output); labels
    are 1 for positive and 0 for negative. A pair containing None is an
    inconclusive prediction and is ignored.

  Returns
  -------
  precision : double
    Classification precision. NaN when there is no conclusive positive
    prediction.
  """
  true_pos = 0
  false_pos = 0

  for pair in classifications.values():

    if pair[0] is None or pair[1] is None:  # inconclusive, same rule as accuracy
      continue

    predicted = int(pair[0])
    true_label = int(pair[1])

    if predicted != 1:
      continue

    if true_label == 1:
      true_pos += 1
    elif true_label == 0:
      false_pos += 1

  predicted_pos = true_pos + false_pos
  if predicted_pos == 0:
    return float('nan')  # undefined without positive predictions

  return true_pos / predicted_pos

In [None]:
def f1(classifications):
  """ Returns the value of the F1 score from predicted outputs and true outputs

  F1 is the harmonic mean of precision and sensitivity:
  2 * precision * sensitivity / (precision + sensitivity).

  Parameters
  ----------
  classifications : dictionary(tuple(int,int))
    Dictionary whose values are pairs (predicted output, true output), passed
    unchanged to precision and sensitivity

  Returns
  -------
  double
    Classification F1 score. 0.0 when precision and sensitivity are both 0
    (no true positives), where the plain formula would divide by zero.
  """
  # evaluate each metric once instead of once per use
  prec = precision(classifications)
  sens = sensitivity(classifications)

  denominator = prec + sens
  if denominator == 0:
    return 0.0

  return 2*prec*sens/denominator

### Array Dims

In [None]:
# Dimensions of the pixel array. The loading cells keep the first
# arr_rows * arr_cols columns of every per-pixel (vsChem) export.
arr_rows = 78
arr_cols = 56

### Load Data

#### Positive Samples

In [None]:
## Average pixel value for all samples
# Reads the summary export (*_data_export.csv) of every positive sample, using
# the first row of each file as the header. In the cells visible here only the
# "Time Elapsed" column of these frames is used, as the time index of the
# matching per-pixel frame.
# NOTE(review): this block contains only pandas reads, so the tf.device scope
# presumably has no effect on them - confirm before relying on it.

with tf.device(gpu):
  ## Gamma 1
  avg_data_g1_file = "/DNAPositives/100921_DNA/100921_DNA/Data/gamma1.app.1e5/gamma1.app.1e5_data_export.csv"
  avg_g1 = pd.read_csv(avg_data_g1_file, header=0)

  ## Gamma 2
  avg_data_g2_file = "/DNAPositives/100921_DNA/100921_DNA/Data/gamma2.app.1e4/gamma2.app.1e4_data_export.csv"
  avg_g2 = pd.read_csv(avg_data_g2_file, header=0)

  ## Gamma 3
  avg_data_g3_file = "/DNAPositives/100921_DNA/100921_DNA/Data/gamma3.app.1e5/gamma3.app.1e5_data_export.csv"
  avg_g3 = pd.read_csv(avg_data_g3_file, header=0)
  
  ## Gamma 5 
  avg_data_g5_file = "/DNAPositives/100921_DNA/100921_DNA/Data/gamma5.app.1e4/gamma5.app.1e4_data_export.csv"
  avg_g5 = pd.read_csv(avg_data_g5_file, header=0)

  ## 22RV1.ap1
  avg_data_22rv1_ap1_file = "/DNAPositives/22RV1.ap1/22RV1.ap1_data_export.csv"
  avg_22rv1_ap1 = pd.read_csv(avg_data_22rv1_ap1_file, header=0)

  ## 22RV1.ap2
  avg_data_22rv1_ap2_file = "/DNAPositives/22RV1.ap2/22RV1.ap2_data_export.csv"
  avg_22rv1_ap2 = pd.read_csv(avg_data_22rv1_ap2_file, header=0)

  ## 22RV1y.p3
  avg_data_22rv1y_p3_file = "/DNAPositives/22Rv1y.p3/22Rv1y.p3_data_export.csv"
  avg_22rv1y_p3 = pd.read_csv(avg_data_22rv1y_p3_file, header=0)

  ## 22RV1y.p4
  avg_data_22rv1y_p4_file = "/DNAPositives/22Rv1y.p4/22Rv1y.p4_data_export.csv"
  avg_22rv1y_p4 = pd.read_csv(avg_data_22rv1y_p4_file, header=0)

  ## ARV7.p1
  # the first data row is dropped and the row labels are renumbered from 0
  avg_data_arv7_p1_file = "/DNAPositives/ARV7.p1/ARV7.p1_data_export.csv"
  avg_arv7_p1 = pd.read_csv(avg_data_arv7_p1_file, header=0).iloc[1:, :].reset_index(drop=True) # row 0 was NAN

  ## ARV7.p3
  avg_data_arv7_p3_file = "/DNAPositives/ARV7.p3/ARV7.p3_data_export.csv"
  avg_arv7_p3 = pd.read_csv(avg_data_arv7_p3_file, header=0)

  ## ARV7.p4
  avg_data_arv7_p4_file = "/DNAPositives/ARV7.p4/ARV7.p4_data_export.csv"
  avg_arv7_p4 = pd.read_csv(avg_data_arv7_p4_file, header=0)

  ## Beta 1
  avg_data_b1_file = "/DNAPositives/100921_DNA/100921_DNA/Data/beta1.app.1e4/beta1.app.1e4_data_export.csv"
  avg_b1 = pd.read_csv(avg_data_b1_file, header=0)

  ## Beta 2
  avg_data_b2_file = "/DNAPositives/100921_DNA/100921_DNA/Data/beta2.app.1e5/beta2.app.1e5_data_export.csv"
  avg_b2 = pd.read_csv(avg_data_b2_file, header=0)

  ## Beta 5
  avg_data_b5_file = "/DNAPositives/100921_DNA/100921_DNA/Data/beta5.app.1e5/beta5.app.1e5_data_export.csv"
  avg_b5 = pd.read_csv(avg_data_b5_file, header=0)
  

In [None]:
## All pixel values for each time stamp
# Reads the per-pixel export (*_vsChem_export.csv) of every positive sample.
# Each file is read without a header row, cut down to its first
# arr_rows*arr_cols columns, and indexed by the "Time Elapsed" column of the
# matching summary frame (avg_*). The index assignment requires the two frames
# to have the same number of rows.

with tf.device(gpu):
  ## Gamma 1
  g1_file = "/DNAPositives/100921_DNA/100921_DNA/Data/gamma1.app.1e5/gamma1.app.1e5_vsChem_export.csv"
  g1 = pd.read_csv(g1_file, header=None).iloc[:, :(arr_rows*arr_cols)]
  g1.index = avg_g1["Time Elapsed"]

  ## Gamma 2
  g2_file = "/DNAPositives/100921_DNA/100921_DNA/Data/gamma2.app.1e4/gamma2.app.1e4_vsChem_export.csv"
  g2 = pd.read_csv(g2_file, header=None).iloc[:, :(arr_rows*arr_cols)]
  g2.index = avg_g2["Time Elapsed"]

  ## Gamma 3
  g3_file = "/DNAPositives/100921_DNA/100921_DNA/Data/gamma3.app.1e5/gamma3.app.1e5_vsChem_export.csv"
  g3 = pd.read_csv(g3_file, header=None).iloc[:, :(arr_rows*arr_cols)]
  g3.index = avg_g3["Time Elapsed"]

  ## Gamma 5
  g5_file = "/DNAPositives/100921_DNA/100921_DNA/Data/gamma5.app.1e4/gamma5.app.1e4_vsChem_export.csv"
  g5 = pd.read_csv(g5_file, header=None).iloc[:, :(arr_rows*arr_cols)]
  g5.index = avg_g5["Time Elapsed"]

  ## 22RV1.ap1
  rv1_ap1_file = "/DNAPositives/22RV1.ap1/22RV1.ap1_vsChem_export.csv"
  rv1_ap1 = pd.read_csv(rv1_ap1_file, header=None).iloc[:, :(arr_rows*arr_cols)]
  rv1_ap1.index = avg_22rv1_ap1['Time Elapsed']

  ## 22RV1.ap2
  rv1_ap2_file = "/DNAPositives/22RV1.ap2/22RV1.ap2_vsChem_export.csv"
  rv1_ap2 = pd.read_csv(rv1_ap2_file, header=None).iloc[:, :(arr_rows*arr_cols)]
  rv1_ap2.index = avg_22rv1_ap2['Time Elapsed']

  ## 22RV1y.p3
  rv1y_p3_file = "/DNAPositives/22Rv1y.p3/22Rv1y.p3_vsChem_export.csv"
  rv1y_p3 = pd.read_csv(rv1y_p3_file, header=None).iloc[:, :(arr_rows*arr_cols)]
  rv1y_p3.index = avg_22rv1y_p3['Time Elapsed']

  ## 22RV1y.p4
  rv1y_p4_file = "/DNAPositives/22Rv1y.p4/22Rv1y.p4_vsChem_export.csv"
  rv1y_p4 = pd.read_csv(rv1y_p4_file, header=None).iloc[:, :(arr_rows*arr_cols)]
  rv1y_p4.index = avg_22rv1y_p4['Time Elapsed']

  ## ARV7.p1 
  arv7_p1_file = "/DNAPositives/ARV7.p1/ARV7.p1_vsChem_export.csv"
  arv7_p1 = pd.read_csv(arv7_p1_file, header=None).iloc[:, :(arr_rows*arr_cols)] 
  arv7_p1.index = avg_arv7_p1["Time Elapsed"]

  ## ARV7.p3 
  arv7_p3_file = "/DNAPositives/ARV7.p3/ARV7.p3_vsChem_export.csv"
  arv7_p3 = pd.read_csv(arv7_p3_file, header=None).iloc[:, :(arr_rows*arr_cols)]
  arv7_p3.index = avg_arv7_p3["Time Elapsed"]

  ## ARV7.p4 
  arv7_p4_file = "/DNAPositives/ARV7.p4/ARV7.p4_vsChem_export.csv"
  arv7_p4 = pd.read_csv(arv7_p4_file, header=None).iloc[:, :(arr_rows*arr_cols)]
  arv7_p4.index = avg_arv7_p4["Time Elapsed"]

  ## Beta 1
  b1_file = "/DNAPositives/100921_DNA/100921_DNA/Data/beta1.app.1e4/beta1.app.1e4_vsChem_export.csv"
  b1 = pd.read_csv(b1_file, header=None).iloc[:, :(arr_rows*arr_cols)]
  b1.index = avg_b1["Time Elapsed"]

  ## Beta 2
  b2_file = "/DNAPositives/100921_DNA/100921_DNA/Data/beta2.app.1e5/beta2.app.1e5_vsChem_export.csv"
  b2 = pd.read_csv(b2_file, header=None).iloc[:, :(arr_rows*arr_cols)]
  b2.index = avg_b2["Time Elapsed"]

  ## Beta 5
  b5_file = "/DNAPositives/100921_DNA/100921_DNA/Data/beta5.app.1e5/beta5.app.1e5_vsChem_export.csv"
  b5 = pd.read_csv(b5_file, header=None).iloc[:, :(arr_rows*arr_cols)]
  b5.index = avg_b5["Time Elapsed"]

#### Negative Samples

In [7]:
## Average pixel value for all samples
# Reads the summary export (*_data_export.csv) of every negative sample, using
# the first row of each file as the header. In the cells visible here only the
# "Time Elapsed" column of these frames is used, as the time index of the
# matching per-pixel frame.
# NOTE(review): yap1.n2 and yap1.n1.1 call reset_index() without drop=True, so
# the old row labels are kept as an extra "index" column (the ARV7.p1 positive
# sample uses drop=True) - confirm nothing depends on that column.

with tf.device(gpu):  
  ## ARV7.n1
  avg_data_arv7_file = "/DNANegatives/ARV7.n1/ARV7.n1_data_export.csv"
  avg_arv7 = pd.read_csv(avg_data_arv7_file, header=0)

  ## Yap.n2
  avg_data_yap_file = "/DNANegatives/yap.n2/yap.n2_data_export.csv"
  avg_yap = pd.read_csv(avg_data_yap_file, header=0)

  ## Yap1.n2
  avg_data_yap1_file = "/DNANegatives/yap1.n2/yap1.n2_data_export.csv"
  avg_yap1 = pd.read_csv(avg_data_yap1_file, header=0).iloc[1:, :].reset_index() # row 0 was NAN

  ## Yap1.n1.1 
  avg_data_yap1n1_file = "/DNANegatives/yap1.n1.1/yap1.n1.1_data_export.csv"
  avg_yap1n1 = pd.read_csv(avg_data_yap1n1_file, header=0).iloc[1:, :].reset_index() # row 0 was NAN

  ## ARV7.n2
  avg_data_arv72_file = "/DNANegatives/ARV7.n2/ARV7.n2_data_export.csv"
  avg_arv72 = pd.read_csv(avg_data_arv72_file, header=0)

  ## ARV7.n3
  avg_data_arv73_file = "/DNANegatives/ARV7.n3/ARV7.n3_data_export.csv"
  avg_arv73 = pd.read_csv(avg_data_arv73_file, header=0)

  ## DU145y.n1
  avg_data_du145y_n1_file = "/DNANegatives/DU145y.n1/DU145y.n1_data_export.csv"
  avg_du145y_n1 = pd.read_csv(avg_data_du145y_n1_file, header=0)

In [None]:
## All pixel values for each time stamp
# Reads the per-pixel export (*_vsChem_export.csv) of every negative sample.
# Each file is read without a header row, cut down to its first
# arr_rows*arr_cols columns, and indexed by the "Time Elapsed" column of the
# matching summary frame (avg_*). The index assignment requires the two frames
# to have the same number of rows.

with tf.device(gpu):   
  ## ARV7.n1 
  arv7_file = "/DNANegatives/ARV7.n1/ARV7.n1_vsChem_export.csv"
  arv7 = pd.read_csv(arv7_file, header=None).iloc[:, :(arr_rows*arr_cols)]
  arv7.index = avg_arv7["Time Elapsed"]

  ## Yap.n2
  yap_file = "/DNANegatives/yap.n2/yap.n2_vsChem_export.csv"
  yap = pd.read_csv(yap_file, header=None).iloc[:, :(arr_rows*arr_cols)]
  yap.index = avg_yap["Time Elapsed"]

  ## Yap1.n2
  yap1_file = "/DNANegatives/yap1.n2/yap1.n2_vsChem_export.csv"
  yap1 = pd.read_csv(yap1_file, header=None).iloc[:, :(arr_rows*arr_cols)]
  yap1.index = avg_yap1["Time Elapsed"]

  ## Yap1.n1.1
  yap1n1_file = "/DNANegatives/yap1.n1.1/yap1.n1.1_vsChem_export.csv"
  yap1n1 = pd.read_csv(yap1n1_file, header=None).iloc[:, :(arr_rows*arr_cols)]
  yap1n1.index = avg_yap1n1["Time Elapsed"]

  ## ARV7.n2
  arv72_file = "/DNANegatives/ARV7.n2/ARV7.n2_vsChem_export.csv"
  arv72 = pd.read_csv(arv72_file, header=None).iloc[:, :(arr_rows*arr_cols)]
  arv72.index = avg_arv72["Time Elapsed"]

  ## ARV7.n3
  arv73_file = "/DNANegatives/ARV7.n3/ARV7.n3_vsChem_export.csv"
  arv73 = pd.read_csv(arv73_file, header=None).iloc[:, :(arr_rows*arr_cols)]
  arv73.index = avg_arv73["Time Elapsed"]

  ## DU145y.n1
  du145y_n1_file = "/DNANegatives/DU145y.n1/DU145y.n1_vsChem_export.csv"
  du145y_n1 = pd.read_csv(du145y_n1_file, header=None).iloc[:, :(arr_rows*arr_cols)]
  du145y_n1.index = avg_du145y_n1["Time Elapsed"]

#### Partial Covid Data

In [None]:
# Loads the double well (partial COVID) experiments. The path given to
# load_partial_covid_exp is not opened itself: the loader swaps its ".csv"
# suffix for "_bot.csv" and "_top.csv" and returns (positive well, negative
# well), where the positive frame comes from the "_bot" file and the negative
# frame from the "_top" file.

## 150520_2_118
avg_118_file = "/COVIDPartialData/150520_2_118/exp_summary_118.csv"
exp_118_pos, exp_118_neg = load_partial_covid_exp(avg_118_file)

## 150520_4_2_86
avg_86_file = "/COVIDPartialData/150520_4_2_86/exp_summary_86.csv"
exp_86_pos, exp_86_neg = load_partial_covid_exp(avg_86_file)

## 150520_5_129
avg_129_file = "/COVIDPartialData/150520_5_129/exp_summary_129.csv"
exp_129_pos, exp_129_neg = load_partial_covid_exp(avg_129_file)

## 180520_4_165
avg_165_file = "/COVIDPartialData/180520_4_165/exp_summary_165.csv"
exp_165_pos, exp_165_neg = load_partial_covid_exp(avg_165_file)

## 180520_6_35
avg_35_file = "/COVIDPartialData/180520_6_35/exp_summary_35.csv"
exp_35_pos, exp_35_neg = load_partial_covid_exp(avg_35_file)

## 190520_1_28
avg_28_file = "/COVIDPartialData/190520_1_28/exp_summary_28.csv"
exp_28_pos, exp_28_neg = load_partial_covid_exp(avg_28_file) 

## 190520_2_14
avg_14_file = "/COVIDPartialData/190520_2_14/exp_summary_14.csv"
exp_14_pos, exp_14_neg = load_partial_covid_exp(avg_14_file)

## 210520_2_40
avg_40_file = "/COVIDPartialData/210520_2_40/exp_summary_40.csv"
exp_40_pos, exp_40_neg = load_partial_covid_exp(avg_40_file)

## 210520_3_88
avg_88_file = "/COVIDPartialData/210520_3_88/exp_summary_88.csv"
exp_88_pos, exp_88_neg = load_partial_covid_exp(avg_88_file)

## 210520_6_27
avg_27_file = "/COVIDPartialData/210520_6_27/exp_summary_27.csv"
exp_27_pos, exp_27_neg = load_partial_covid_exp(avg_27_file)

## 250520_1_134
avg_134_file = "/COVIDPartialData/250520_1_134/exp_summary_134.csv"
exp_134_pos, exp_134_neg = load_partial_covid_exp(avg_134_file)

## 250520_2_97
avg_97_file = "/COVIDPartialData/250520_2_97/exp_summary_97.csv"
exp_97_pos, exp_97_neg = load_partial_covid_exp(avg_97_file)

## 250520_6_2D1
avg_2d1_file = "/COVIDPartialData/250520_6_2D1/exp_summary_2D1.csv"
exp_2d1_pos, exp_2d1_neg = load_partial_covid_exp(avg_2d1_file)

## 250520_7_64
avg_64_file = "/COVIDPartialData/250520_7_64/exp_summary_64.csv"
exp_64_pos, exp_64_neg = load_partial_covid_exp(avg_64_file)

### Preprocessing

#### Positive Samples

In [None]:
# Pre-process every positive single well experiment with a derivative
# threshold of 500 (deriv_thresh); the post-background-subtraction threshold
# keeps its default.
# NOTE: each name is rebound to its processed frame, so this cell is not
# idempotent - re-running it passes already-processed frames back into
# preprocess_data. Re-run the loading cells first.
g1 = preprocess_data(g1, 500)
g2 = preprocess_data(g2, 500)
g3 = preprocess_data(g3, 500)
g5 = preprocess_data(g5, 500)
rv1_ap1 = preprocess_data(rv1_ap1, 500)
rv1_ap2 = preprocess_data(rv1_ap2, 500)
rv1y_p3 = preprocess_data(rv1y_p3, 500)
rv1y_p4 = preprocess_data(rv1y_p4, 500)
arv7_p1 = preprocess_data(arv7_p1, 500)
arv7_p3 = preprocess_data(arv7_p3, 500)
arv7_p4 = preprocess_data(arv7_p4, 500)
b1 = preprocess_data(b1, 500)
b2 = preprocess_data(b2, 500)
b5 = preprocess_data(b5, 500)

#### Negative Samples

In [None]:
# Pre-process every negative single well experiment with a derivative
# threshold of 500 (deriv_thresh).
# NOTE: each name is rebound to its processed frame, so this cell is not
# idempotent - re-run the loading cells before running it again.
arv7 = preprocess_data(arv7, 500)
yap = preprocess_data(yap, 500)
yap1 = preprocess_data(yap1, 500)
yap1n1 = preprocess_data(yap1n1, 500)
arv72 = preprocess_data(arv72, 500)
arv73 = preprocess_data(arv73, 500)
du145y_n1 = preprocess_data(du145y_n1, 500)

#### Covid Partial Data

In [None]:
# Pre-process the positive well of every double well experiment with a
# derivative threshold of 500 (deriv_thresh).
# NOTE: each name is rebound to its processed frame, so this cell is not
# idempotent - re-run the loading cell before running it again.
exp_118_pos = preprocess_partial_data(exp_118_pos, 500)
exp_86_pos = preprocess_partial_data(exp_86_pos, 500)
exp_129_pos = preprocess_partial_data(exp_129_pos, 500)
exp_165_pos = preprocess_partial_data(exp_165_pos, 500)
exp_35_pos = preprocess_partial_data(exp_35_pos, 500)
exp_28_pos = preprocess_partial_data(exp_28_pos, 500)
exp_14_pos = preprocess_partial_data(exp_14_pos, 500)
exp_40_pos = preprocess_partial_data(exp_40_pos, 500)
exp_88_pos = preprocess_partial_data(exp_88_pos, 500)
exp_27_pos = preprocess_partial_data(exp_27_pos, 500)
exp_134_pos = preprocess_partial_data(exp_134_pos, 500)
exp_97_pos = preprocess_partial_data(exp_97_pos, 500)
exp_2d1_pos = preprocess_partial_data(exp_2d1_pos, 500)
exp_64_pos = preprocess_partial_data(exp_64_pos, 500)

In [None]:
# Pre-process the negative well of every double well experiment with a
# derivative threshold of 500 (deriv_thresh).
# NOTE: each name is rebound to its processed frame, so this cell is not
# idempotent - re-run the loading cell before running it again.
exp_118_neg = preprocess_partial_data(exp_118_neg, 500)
exp_86_neg = preprocess_partial_data(exp_86_neg, 500)
exp_129_neg = preprocess_partial_data(exp_129_neg, 500)
exp_165_neg = preprocess_partial_data(exp_165_neg, 500)
exp_35_neg = preprocess_partial_data(exp_35_neg, 500)
exp_28_neg = preprocess_partial_data(exp_28_neg, 500)
exp_14_neg = preprocess_partial_data(exp_14_neg, 500)
exp_40_neg = preprocess_partial_data(exp_40_neg, 500)
exp_88_neg = preprocess_partial_data(exp_88_neg, 500)
exp_27_neg = preprocess_partial_data(exp_27_neg, 500)
exp_134_neg = preprocess_partial_data(exp_134_neg, 500)
exp_97_neg = preprocess_partial_data(exp_97_neg, 500)
exp_2d1_neg = preprocess_partial_data(exp_2d1_neg, 500)
exp_64_neg = preprocess_partial_data(exp_64_neg, 500)

### Machine Learning - KNN Ensemble

#### Helper Functions

In [None]:
def get_training_data_knn(positive_samples, negative_samples, timestamp, test_samples=[]):
  """ Builds the KNN training matrix and labels from the first `timestamp` points of each experiment

  Parameters
  ----------
  positive_samples : dict(pandas.DataFrame)
    Positive experiments keyed by name; each DataFrame must have an 'Average Output' column
  negative_samples : dict(pandas.DataFrame)
    Negative experiments keyed by name; each DataFrame must have an 'Average Output' column
  timestamp : int
    Number of leading data-points kept from every experiment
  test_samples : array, optional
    Names (dictionary keys) of experiments to hold out of the training set, Default is []

  Returns
  -------
  numpy.ndarray
    One row per retained experiment: all positives first, then all negatives
  training_labels : numpy.array
    1D array of true labels in the same row order (1 for positive, 0 for negative)
  """

  def truncated_rows(samples):
    ## one truncated 'Average Output' trace per experiment that is not held out
    return [frame['Average Output'].to_numpy()[:timestamp]
            for name, frame in samples.items()
            if name not in test_samples]

  pos_rows = truncated_rows(positive_samples)
  neg_rows = truncated_rows(negative_samples)

  ## labels follow the row order: ones for the positives, then zeros for the negatives
  training_labels = np.concatenate((np.ones(len(pos_rows)), np.zeros(len(neg_rows))), axis=0)

  ## np.asarray turns the list of equal-length rows into a 2D array
  return np.asarray(pos_rows + neg_rows), training_labels

In [None]:
def get_test_data_knn(sample, timestamp):
  """ Extracts the first `timestamp` points of a test sample as a single-row 2D array

  Parameters
  ----------
  sample : pandas.DataFrame
    The test sample; must have an 'Average Output' column
  timestamp : int
    Number of leading data-points kept from the sample

  Returns
  -------
  numpy.ndarray
    Array with one row holding the truncated 'Average Output' values
  """
  truncated = sample['Average Output'].to_numpy()[:timestamp]

  ## wrap in a list so the result has shape (1, number of kept points)
  return np.asarray([truncated])

In [None]:
def get_time_index(timestamps, predictions):
  """ Finds the timestamp (in number of data points) at which an ensemble's majority vote is settled

  Parameters
  ----------
  timestamps : array
    Array of ints with the timestamp at which each classification is made
  predictions : array
    Prediction made by the classifier at each of the timestamps

  Returns
  -------
    int
      Timestamp of the vote that gives some label more than half of all votes,
      or -1 if no label ever reaches that count
  """

  ## a label wins once it holds strictly more than half of all the votes
  votes_needed = len(predictions) // 2 + 1

  tally = Counter()
  for position, label in enumerate(predictions):
    tally[label] += 1
    if tally[label] == votes_needed:
      return timestamps[position]

  return -1

#### Training Data

In [None]:
## Positive experiments keyed by name (28 entries).
## Insertion order matters: later cells build the true labels as len(positives) ones followed by
## len(negatives) zeros and zip them against the combined keys.
positives = {"exp_118_pos":exp_118_pos, "exp_86_pos":exp_86_pos,"exp_129_pos":exp_129_pos, "exp_165_pos":exp_165_pos, 
             "exp_35_pos":exp_35_pos, "exp_28_pos":exp_28_pos, "exp_14_pos":exp_14_pos, "exp_40_pos":exp_40_pos, 
             "exp_88_pos":exp_88_pos, "exp_27_pos":exp_27_pos, 
             "exp_134_pos":exp_134_pos, "exp_97_pos":exp_97_pos, "exp_2d1_pos":exp_2d1_pos, "exp_64_pos":exp_64_pos, 
             "g1":g1, "g2":g2, "g3":g3, "g5":g5, "rv1_ap1":rv1_ap1, "rv1_ap2":rv1_ap2,  
             "arv7_p3":arv7_p3,"rv1y_p3":rv1y_p3, "rv1y_p4":rv1y_p4, 
             "arv7_p1":arv7_p1, "arv7_p4":arv7_p4, "b1":b1, "b2":b2, "b5":b5}

## Negative experiments keyed by name (21 entries).
negatives = {"exp_118_neg":exp_118_neg, "exp_86_neg":exp_86_neg, "exp_129_neg":exp_129_neg, "exp_165_neg":exp_165_neg, 
             "exp_35_neg":exp_35_neg, "exp_28_neg":exp_28_neg, "exp_14_neg":exp_14_neg, "exp_40_neg":exp_40_neg, 
             "exp_88_neg":exp_88_neg, "exp_27_neg":exp_27_neg, "exp_134_neg":exp_134_neg, "exp_97_neg":exp_97_neg, 
             "exp_2d1_neg":exp_2d1_neg, "exp_64_neg":exp_64_neg, "yap":yap, "yap1":yap1, "yap1n1":yap1n1, "arv72":arv72, 
             "arv73":arv73, "du145y_n1":du145y_n1, "arv7":arv7}

#### Timestamps

In [None]:
## one classifier is evaluated at each of these evenly spaced cut-off points
number_of_timestamps = 50
number_of_samples = len(g1['Average Output'])

## spacing (in data points) between consecutive cut-offs
timestep = number_of_samples // number_of_timestamps

## cut-offs are timestep, 2*timestep, ... ; the range stops before number_of_samples + timestep
timestamps = list(range(timestep, number_of_samples + timestep, timestep))

#### Model

In [None]:
def KNN(k, test_sample, train_data, train_labels, distance_metric):

  """ Generates classification output for a test sample using the KNN classifier

  Parameters
  ----------
  k : int 
    The value of the hyper-parameter k in the KNN classifier
  test_sample : numpy.ndarray
    The test sample that the prediction is made for, as a 2D array with a single row
  train_data : numpy.ndarray
    Training data used for classification, one row per training sample
  train_labels : numpy.array
    Training labels used for classification, in the same order as the rows of train_data
  distance_metric : string
    Distance metric used for the KNN classifier (case-insensitive):
    'manhattan' / 'cityblock', 'euclidean' or 'cosine'

  Returns
  -------
  final_pred : double
    Final classification for the test sample (the most frequent label among the k nearest rows)

  Raises
  ------
  ValueError
    If distance_metric is not one of the supported metrics
  """ 

  metric = distance_metric.lower()

  if(metric in ('manhattan', 'cityblock')):
    distances = manhattan_distances(test_sample, train_data) # pair wise manhattan distance to every training row
  elif(metric == 'euclidean'):
    distances = euclidean_distances(test_sample, train_data) # pair wise euclidean distance to every training row
  elif(metric == 'cosine'):
    distances = cosine_distances(test_sample, train_data) # pair wise cosine distance to every training row
  else:
    # fail early with a clear message instead of an unrelated axis error from np.squeeze(None)
    raise ValueError(f"Unsupported distance metric '{distance_metric}'; expected 'manhattan', 'cityblock', 'euclidean' or 'cosine'")

  distances = np.squeeze(distances, axis=0) # remove redundant dimension (single test row)

  min_indexes = np.argsort(distances)[:k] # indexes of the k smallest distances

  knn_labels = list(train_labels[min_indexes]) # labels of the k nearest training rows

  # majority vote over the k labels
  final_pred = max(set(knn_labels), key=knn_labels.count)

  return final_pred

In [None]:
def KNN_auto(k, test_sample, train_data, train_labels, distance_metric):
  """ Generates classification output for a test sample using the scikit-learn KNN implementation

  Parameters
  ----------
  k : int 
    The value of the hyper-parameter k in the KNN classifier
  test_sample : numpy.ndarray
    The test sample that the prediction is made for, as a 2D array with a single row
  train_data : numpy.ndarray
    Training data used for classification
  train_labels : numpy.array
    Training labels used for classification
  distance_metric : string
    Distance metric used for the KNN classifier (case-insensitive); passed to
    KNeighborsClassifier as its `metric` argument, e.g. 'manhattan', 'euclidean', 'cosine'

  Returns
  -------
  final_pred : double
    Final classification for the test sample
  """ 
  # the metric was previously ignored, so the classifier always fell back to its default distance
  KnnClassifier = KNeighborsClassifier(n_neighbors=k, weights="uniform", metric=distance_metric.lower())
  KnnClassifier.fit(X=train_data, y=train_labels)
  pred = KnnClassifier.predict(X=test_sample)
  
  return pred[0] # pred is of type np.ndarray with one value so return indx 0 to get raw value


#### Model Predictions

##### Cross Validation

In [None]:
## Positive experiments keyed by name. Same contents as the dictionaries in the "Training Data"
## cell; presumably repeated so this section can be run on its own.
## Insertion order matters: the next cell builds the true labels as len(positives) ones followed by
## len(negatives) zeros and zips them against the combined keys.
positives = {"exp_118_pos":exp_118_pos, "exp_86_pos":exp_86_pos,"exp_129_pos":exp_129_pos, "exp_165_pos":exp_165_pos, 
             "exp_35_pos":exp_35_pos, "exp_28_pos":exp_28_pos, "exp_14_pos":exp_14_pos, "exp_40_pos":exp_40_pos, 
             "exp_88_pos":exp_88_pos, "exp_27_pos":exp_27_pos, 
             "exp_134_pos":exp_134_pos, "exp_97_pos":exp_97_pos, "exp_2d1_pos":exp_2d1_pos, "exp_64_pos":exp_64_pos, 
             "g1":g1, "g2":g2, "g3":g3, "g5":g5, "rv1_ap1":rv1_ap1, "rv1_ap2":rv1_ap2,  
             "arv7_p3":arv7_p3,"rv1y_p3":rv1y_p3, "rv1y_p4":rv1y_p4, 
             "arv7_p1":arv7_p1, "arv7_p4":arv7_p4, "b1":b1, "b2":b2, "b5":b5}

## Negative experiments keyed by name.
negatives = {"exp_118_neg":exp_118_neg, "exp_86_neg":exp_86_neg, "exp_129_neg":exp_129_neg, "exp_165_neg":exp_165_neg, 
             "exp_35_neg":exp_35_neg, "exp_28_neg":exp_28_neg, "exp_14_neg":exp_14_neg, "exp_40_neg":exp_40_neg, 
             "exp_88_neg":exp_88_neg, "exp_27_neg":exp_27_neg, "exp_134_neg":exp_134_neg, "exp_97_neg":exp_97_neg, 
             "exp_2d1_neg":exp_2d1_neg, "exp_64_neg":exp_64_neg, "yap":yap, "yap1":yap1, "yap1n1":yap1n1, "arv72":arv72, 
             "arv73":arv73, "du145y_n1":du145y_n1, "arv7":arv7}

In [None]:
## merge both dictionaries: all positives first, followed by all negatives
all_samples = {**positives, **negatives}

## map every sample name to its true label (1 for positives, 0 for negatives);
## the label order relies on the merged dictionary keeping positives before negatives
keys = [*all_samples]
true_labels = list(np.concatenate((np.ones(len(positives)), np.zeros(len(negatives)))))
true_label_dict = {name: label for name, label in zip(keys, true_labels)}

In [None]:
# NOTE(review): the loop body only calls the NumPy/scikit-learn based helpers defined in this
# notebook, so the tf.device context likely has no effect on where the work runs -- confirm.
with tf.device(gpu):

  final_classifications = {}   # sample name -> (predicted label, true label)
  final_predictions = []       # predicted label per sample, in all_samples order
  final_TTP = []               # time to positive in minutes per sample (NaN for negative predictions)
  prediction_correctness = []  # "Yes"/"No" per sample

  ## leave-one-out: use KNN to evaluate the prediction for each of the samples individually
  for key, value in all_samples.items():
    test_sample_name = key
    test_sample = value

    print(f"Testing sample {test_sample_name}")

    ## generate predictions for sample using KNN ensemble (one classifier per timestamp)
    predictions = []
    for t in timestamps:
      train_data, train_labels = get_training_data_knn(positive_samples=positives, negative_samples=negatives, timestamp=t, test_samples=[test_sample_name])
      test_data = get_test_data_knn(test_sample, t)
      pred = KNN(3, test_data, train_data, train_labels, 'cosine')
      predictions.append(pred)
    
    ## get time to result in seconds using majority voting
    time_index = get_time_index(timestamps, predictions) # number of data points after which the majority is reached

    ## get_time_index returns -1 when no label reaches a strict majority (e.g. an even split).
    ## The vote is then only settled once every classifier has voted, so use the last timestamp
    ## instead of letting -1 silently index the sample from the end.
    if(time_index == -1):
      time_index = timestamps[-1]

    time_to_result = test_sample.index[time_index-1] - test_sample.index[0] # elapsed experiment time at which the result is obtained

    ## get final prediction (on a tie, Counter returns the label encountered first)
    classification = Counter(predictions).most_common(1)[0][0] # final prediction

    ## update arrays 
    final_classifications[key] = (classification, true_label_dict[key])
    final_predictions.append(classification) 
    prediction_correctness.append("Yes" if classification == true_label_dict[key] else "No")
    print(f"Predicted Label: {classification} \t True Label: {true_label_dict[key]} \t Correct?: {classification == true_label_dict[key]}")

    ## if prediction is positive get TTP
    if(classification == 1.0):
      final_TTP.append(round((time_to_result+30)/60, 2)) # 30 added because sample was taken 30s after actual reaction start
      print(f"TTP: {time_to_result + 30}s \t {round((time_to_result+30)/60, 2)} mins")
    else:
      final_TTP.append(np.nan)

    print("")
    

In [None]:
## report every summary metric for the cross-validated predictions, one per line
for metric_name, metric_fn in (("Accuracy", accuracy),
                               ("Sensitivity/Recall", sensitivity),
                               ("Specificity", specificity),
                               ("Precision", precision),
                               ("F1 Score", f1)):
  print(f"{metric_name}: {metric_fn(final_classifications)}")

##### Confusion Matrix

In [None]:
## confusion matrix of true vs. predicted labels, with classes ordered [0, 1]
class_names = ["Negative", "Positive"]
cm = confusion_matrix(true_labels, final_predictions, labels=[0, 1])

fig, ax = plt.subplots(1,1,figsize=(7,5))
heatmap = sns.heatmap(
    cm,
    annot=True,
    annot_kws={"size": 15},
    linewidth=0.75,
    xticklabels=class_names,
    yticklabels=class_names,
    cbar=False,
    cmap='RdPu',
)

## enlarge the tick labels on both axes
heatmap.set_xticklabels(heatmap.get_xmajorticklabels(), fontsize = 15)
heatmap.set_yticklabels(heatmap.get_ymajorticklabels(), fontsize = 15)

ax.set_ylabel('True Output', fontsize=15, labelpad=15)
ax.set_xlabel('Predicted Output', fontsize=15, labelpad=15)

#### Elbow Plot

In [None]:
## Positive experiments keyed by name. Same contents as the dictionaries in the "Training Data"
## cell; presumably repeated so the elbow-plot section can be run on its own.
## Insertion order matters: the next cell builds the true labels as len(positives) ones followed by
## len(negatives) zeros and zips them against the combined keys.
positives = {"exp_118_pos":exp_118_pos, "exp_86_pos":exp_86_pos,"exp_129_pos":exp_129_pos, "exp_165_pos":exp_165_pos, 
             "exp_35_pos":exp_35_pos, "exp_28_pos":exp_28_pos, "exp_14_pos":exp_14_pos, "exp_40_pos":exp_40_pos, 
             "exp_88_pos":exp_88_pos, "exp_27_pos":exp_27_pos, 
             "exp_134_pos":exp_134_pos, "exp_97_pos":exp_97_pos, "exp_2d1_pos":exp_2d1_pos, "exp_64_pos":exp_64_pos, 
             "g1":g1, "g2":g2, "g3":g3, "g5":g5, "rv1_ap1":rv1_ap1, "rv1_ap2":rv1_ap2,  
             "arv7_p3":arv7_p3,"rv1y_p3":rv1y_p3, "rv1y_p4":rv1y_p4, 
             "arv7_p1":arv7_p1, "arv7_p4":arv7_p4, "b1":b1, "b2":b2, "b5":b5}

## Negative experiments keyed by name.
negatives = {"exp_118_neg":exp_118_neg, "exp_86_neg":exp_86_neg, "exp_129_neg":exp_129_neg, "exp_165_neg":exp_165_neg, 
             "exp_35_neg":exp_35_neg, "exp_28_neg":exp_28_neg, "exp_14_neg":exp_14_neg, "exp_40_neg":exp_40_neg, 
             "exp_88_neg":exp_88_neg, "exp_27_neg":exp_27_neg, "exp_134_neg":exp_134_neg, "exp_97_neg":exp_97_neg, 
             "exp_2d1_neg":exp_2d1_neg, "exp_64_neg":exp_64_neg, "yap":yap, "yap1":yap1, "yap1n1":yap1n1, "arv72":arv72, 
             "arv73":arv73, "du145y_n1":du145y_n1, "arv7":arv7}

In [None]:
## merge both dictionaries: all positives first, followed by all negatives
all_samples = {**positives, **negatives}

## map every sample name to its true label (1 for positives, 0 for negatives);
## the label order relies on the merged dictionary keeping positives before negatives
keys = [*all_samples]
true_labels = list(np.concatenate((np.ones(len(positives)), np.zeros(len(negatives)))))
true_label_dict = {name: label for name, label in zip(keys, true_labels)}

In [None]:
# NOTE(review): the loop body only calls the NumPy/scikit-learn based helpers defined in this
# notebook, so the tf.device context likely has no effect on where the work runs -- confirm.
with tf.device(gpu):
  accuracies = []  # leave-one-out accuracy for each k
  errors = []      # matching error rate (1 - accuracy)

  ## iterate over the odd values of k from 1 to 29
  for k in range(1,30,2):
    final_classifications = {}

    ## use KNN to evaluate the prediction for each of the samples individually
    for key, value in all_samples.items():
      test_sample_name = key
      test_sample = value

      ## get predictions for the sample from the KNN ensemble (one classifier per timestamp)
      predictions = []
      for t in timestamps:
        train_data, train_labels = get_training_data_knn(positive_samples=positives, negative_samples=negatives, timestamp=t, test_samples=[test_sample_name])
        test_data = get_test_data_knn(test_sample, t)
        pred = KNN(k, test_data, train_data, train_labels, 'cosine')
        predictions.append(pred)

      ## only the final label is needed here, so the time-to-result lookup is not performed
      classification = Counter(predictions).most_common(1)[0][0] # final prediction
      final_classifications[key] = (classification, true_label_dict[key])

    ## save accuracy for each K 
    acc = accuracy(final_classifications)
    accuracies.append(acc)
    errors.append(1-acc)
    print(f"K: {k} \t Accuracy: {acc}")


In [None]:
## error rate against k; x repeats the k values used in the loop that filled `errors`
x = np.arange(start=1, stop=30, step=2)
y = errors

fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(10, 5))
axes.set(xlabel="K", ylabel="Error Rate")
axes.plot(x, y)

### Confidence 

#### Helper Functions

In [None]:
def get_rH_00(true_vals, classifier_outputs):
  """ Calculate ratio rH(y=0|y'=0) where y is the true label and y' is the predicted label

  Parameters
  ----------
  true_vals : array
    True labels for each of the samples in the training data
  classifier_outputs : array
    Predicted output for each sample by a classifier at a given timestamp

  Returns
  -------
  double
    Fraction of the samples predicted as 0 whose true label is 0.
    Returns 0.0 when the classifier never predicted 0 (no evidence for this ratio).
  """
  correct_pred_0 = 0
  total_pred_0 = 0

  for idx, pred in enumerate(classifier_outputs):
    if(int(pred) != 0):
      continue
    total_pred_0 += 1
    if(true_vals[idx] == 0):
      correct_pred_0 += 1

  # avoid ZeroDivisionError when this classifier never output 0
  if(total_pred_0 == 0):
    return 0.0

  return correct_pred_0/total_pred_0


def get_rH_01(true_vals, classifier_outputs):
  """ Calculate ratio rH(y=0|y'=1) where y is the true label and y' is the predicted label

  Parameters
  ----------
  true_vals : array
    True labels for each of the samples in the training data
  classifier_outputs : array
    Predicted output for each sample by a classifier at a given timestamp

  Returns
  -------
  double
    Fraction of the samples predicted as 1 whose true label is 0.
    Returns 0.0 when the classifier never predicted 1 (no evidence for this ratio).
  """
  wrong_pred_1 = 0
  total_pred_1 = 0

  for idx, pred in enumerate(classifier_outputs):
    if(int(pred) != 1):
      continue
    total_pred_1 += 1
    if(true_vals[idx] == 0):
      wrong_pred_1 += 1

  # avoid ZeroDivisionError when this classifier never output 1
  if(total_pred_1 == 0):
    return 0.0

  return wrong_pred_1/total_pred_1

def get_rH_10(true_vals, classifier_outputs):
  """ Calculate ratio rH(y=1|y'=0) where y is the true label and y' is the predicted label

  Parameters
  ----------
  true_vals : array
    True labels for each of the samples in the training data
  classifier_outputs : array
    Predicted output for each sample by a classifier at a given timestamp

  Returns
  -------
  double
    Fraction of the samples predicted as 0 whose true label is 1.
    Returns 0.0 when the classifier never predicted 0 (no evidence for this ratio).
  """
  wrong_pred_0 = 0
  total_pred_0 = 0

  for idx, pred in enumerate(classifier_outputs):
    if(int(pred) != 0):
      continue
    total_pred_0 += 1
    if(true_vals[idx] == 1):
      wrong_pred_0 += 1

  # avoid ZeroDivisionError when this classifier never output 0
  if(total_pred_0 == 0):
    return 0.0

  return wrong_pred_0/total_pred_0

def get_rH_11(true_vals, classifier_outputs):
  """ Calculate ratio rH(y=1|y'=1) where y is the true label and y' is the predicted label

  Parameters
  ----------
  true_vals : array
    True labels for each of the samples in the training data
  classifier_outputs : array
    Predicted output for each sample by a classifier at a given timestamp

  Returns
  -------
  double
    Fraction of the samples predicted as 1 whose true label is 1.
    Returns 0.0 when the classifier never predicted 1 (no evidence for this ratio).
  """
  correct_pred_1 = 0
  total_pred_1 = 0

  for idx, pred in enumerate(classifier_outputs):
    if(int(pred) != 1):
      continue
    total_pred_1 += 1
    if(true_vals[idx] == 1):
      correct_pred_1 += 1

  # avoid ZeroDivisionError when this classifier never output 1
  if(total_pred_1 == 0):
    return 0.0

  return correct_pred_1/total_pred_1

In [None]:
def get_confidence_multipliers(sample_predictions, true_labels):

  """ Computes, for every classifier, the four ratios used to update the confidence score

  Parameters
  ----------
  sample_predictions : numpy.ndarray
    Predictions with one row per training sample and one column per classifier
  true_labels : numpy.array
    True output labels for each sample in the training set

  Returns
  -------
  multipliers_final : list
    One entry per classifier, each a list [rH_00, rH_01, rH_10, rH_11]
  """

  # rows are samples, columns are classifiers
  prediction_table = np.asarray(sample_predictions)

  # order matters: callers index the four ratios by position
  ratio_functions = (get_rH_00, get_rH_01, get_rH_10, get_rH_11)

  return [
      [ratio(true_labels, prediction_table[:, column]) for ratio in ratio_functions]
      for column in range(len(prediction_table[0]))
  ]

In [None]:
def get_confidence(preds, multipliers):
  """ Calculates the fused confidence for the most recent prediction in `preds`

  Parameters
  ----------
  preds : array(double)
    All predictions up to and including the one whose confidence is being calculated
  multipliers : numpy.ndarray
    Per-classifier multipliers [rH_00, rH_01, rH_10, rH_11], one entry for each prediction in `preds`

  Returns
  -------
  confidence : double
    1 minus the product of (1 - multiplier) over all predictions, where each multiplier is
    selected by the pair (latest prediction, that classifier's prediction)
  """

  # prediction at time t by classifier Ht -- the one being scored
  latest = preds[-1]

  # the latest prediction selects the first (0) or second (2) pair of multipliers
  if latest == 0:
    offset = 0
  elif latest == 1:
    offset = 2
  else:
    offset = None

  remaining = 1
  for position, earlier in enumerate(preds):
    # multipliers of classifier Hk, which made the prediction at time k
    row = multipliers[position]

    if offset is None:
      continue

    # within the selected pair, Hk's own prediction picks the entry
    if earlier == 0:
      remaining *= (1 - row[offset])
    elif earlier == 1:
      remaining *= (1 - row[offset + 1])

  return 1 - remaining
  

In [None]:
def generate_predictions_table(positives, negatives, timestamps, k=3, distance_metric='cosine'):

  """ Generate leave-one-out predictions for each sample in the training set, one sample at a time

  Parameters
  ----------
  positives : dict(pandas.DataFrame)
    Dictionary containing the data from the positive experiments 
  negatives : dict(pandas.DataFrame)
    Dictionary containing the data from the negative experiments 
  timestamps : array(int)
    Array of timestamps at which predictions are made 
  k : int, optional
    Number of neighbours used by each KNN classifier, Default is 3
  distance_metric : string, optional
    Distance metric passed to the KNN classifier, Default is 'cosine'
  
  Returns
  -------
  sample_predictions : list
    One list per experiment (positives first, then negatives), holding the prediction made
    at every timestamp when that experiment is held out of the training set
  true_labels : array(int)
    True class labels for every experiment, in the same order as sample_predictions
  """
  sample_predictions = []

  ## positives are inserted before negatives, matching the label order built below
  all_samples = {}
  all_samples.update(positives)
  all_samples.update(negatives)

  true_labels = list(np.concatenate((np.ones(len(positives)),np.zeros(len(negatives)))))

  ## use KNN to evaluate the prediction for each of the samples individually
  for test_sample_name, test_sample in all_samples.items():

    ## one prediction per timestamp, with the current sample excluded from the training data
    predictions = []
    for t in timestamps:
      train_data, train_labels = get_training_data_knn(positive_samples=positives, negative_samples=negatives, timestamp=t, test_samples=[test_sample_name])
      test_data = get_test_data_knn(test_sample, t)
      pred = KNN(k, test_data, train_data, train_labels, distance_metric)
      predictions.append(pred)
      
    sample_predictions.append(predictions)

  return sample_predictions, true_labels

#### Learning best threshold

In [None]:
## Positive experiments keyed by name. Same contents as the dictionaries in the "Training Data"
## cell; presumably repeated so the threshold-learning section can be run on its own.
## Insertion order matters: later cells build the true labels as len(positives) ones followed by
## len(negatives) zeros and zip them against the combined keys.
positives = {"exp_118_pos":exp_118_pos, "exp_86_pos":exp_86_pos,"exp_129_pos":exp_129_pos, "exp_165_pos":exp_165_pos, 
             "exp_35_pos":exp_35_pos, "exp_28_pos":exp_28_pos, "exp_14_pos":exp_14_pos, "exp_40_pos":exp_40_pos, 
             "exp_88_pos":exp_88_pos, "exp_27_pos":exp_27_pos, 
             "exp_134_pos":exp_134_pos, "exp_97_pos":exp_97_pos, "exp_2d1_pos":exp_2d1_pos, "exp_64_pos":exp_64_pos, 
             "g1":g1, "g2":g2, "g3":g3, "g5":g5, "rv1_ap1":rv1_ap1, "rv1_ap2":rv1_ap2,  
             "arv7_p3":arv7_p3,"rv1y_p3":rv1y_p3, "rv1y_p4":rv1y_p4, 
             "arv7_p1":arv7_p1, "arv7_p4":arv7_p4, "b1":b1, "b2":b2, "b5":b5}

## Negative experiments keyed by name.
negatives = {"exp_118_neg":exp_118_neg, "exp_86_neg":exp_86_neg, "exp_129_neg":exp_129_neg, "exp_165_neg":exp_165_neg, 
             "exp_35_neg":exp_35_neg, "exp_28_neg":exp_28_neg, "exp_14_neg":exp_14_neg, "exp_40_neg":exp_40_neg, 
             "exp_88_neg":exp_88_neg, "exp_27_neg":exp_27_neg, "exp_134_neg":exp_134_neg, "exp_97_neg":exp_97_neg, 
             "exp_2d1_neg":exp_2d1_neg, "exp_64_neg":exp_64_neg, "yap":yap, "yap1":yap1, "yap1n1":yap1n1, "arv72":arv72, 
             "arv73":arv73, "du145y_n1":du145y_n1, "arv7":arv7}

In [None]:
## one classifier is evaluated at each of these evenly spaced cut-off points
number_of_timestamps = 50
number_of_samples = len(g1['Average Output'])

## spacing (in data points) between consecutive cut-offs
timestep = number_of_samples // number_of_timestamps

## cut-offs are timestep, 2*timestep, ... ; the range stops before number_of_samples + timestep
timestamps = list(range(timestep, number_of_samples + timestep, timestep))

In [None]:
## merge both dictionaries: all positives first, followed by all negatives
all_samples = {**positives, **negatives}

## map every sample name to its true label (1 for positives, 0 for negatives);
## the label order relies on the merged dictionary keeping positives before negatives
keys = [*all_samples]
true_labels_array = list(np.concatenate((np.ones(len(positives)), np.zeros(len(negatives)))))
true_label_dict = {name: label for name, label in zip(keys, true_labels_array)}

##### Generating candidates

In [None]:
with tf.device(gpu):

  # leave-one-out predictions: one row per experiment in the training set, one column per classifier
  sample_predictions, true_labels = generate_predictions_table(positives, negatives, timestamps)

  # the four multipliers of every classifier
  multipliers_2d = get_confidence_multipliers(sample_predictions, true_labels)

  # every distinct confidence value observed over all samples and classifiers
  confidence_set = set()

  for sample_idx, test_sample_name in enumerate(all_samples):

    # KNN predictions for this sample
    predictions = sample_predictions[sample_idx]

    # fused confidence after each successive classifier has voted
    confidences = [
        get_confidence(predictions[:n_votes], multipliers_2d[:n_votes])
        for n_votes in range(1, len(predictions) + 1)
    ]

    confidence_set.update(confidences)

In [None]:
# sorted() returns a list, so confidence_set is a sorted list (no longer a set) from here on;
# the next cell indexes it by position
confidence_set = sorted(confidence_set)

In [None]:
# threshold candidates are the midpoints between neighbouring values of the sorted confidence list;
# the set removes duplicate midpoints and sorting is only for ordering purposes
threshold_candidates = sorted({
    0.5*(upper + lower)
    for lower, upper in zip(confidence_set, confidence_set[1:])
})

In [None]:
# number of candidate thresholds that will be evaluated
len(threshold_candidates)

##### Evaluating candidates

In [None]:
with tf.device(gpu):

  # array to hold cost function value for each candidate
  cost_function_values = []

  # accuracy obtained with each candidate
  accs = []

  # create nN predictions using each sample in the training dataset as the test sample (n = no of exps in train set, N = no of classifiers)
  sample_predictions, true_labels = generate_predictions_table(positives, negatives, timestamps)

  # create multipliers for every classifier
  multipliers_2d = get_confidence_multipliers(sample_predictions, true_labels)

  # the fused confidences depend only on the predictions and the multipliers, not on the threshold,
  # so compute them once here instead of once per candidate
  confidence_table = [
      [get_confidence(predictions[:i+1], multipliers_2d[:i+1]) for i in range(len(predictions))] # i+1 needed because slicing does not include last index
      for predictions in sample_predictions
  ]

  # alpha weights the error term against the earliness term in the cost function
  alpha = 0.85

  # evaluate every candidate
  for th in threshold_candidates:

    print(f"Candidate: {th} ")

    # array to hold earliness values for the samples (conclusive samples only)
    earliness = []  

    # dict to hold predictions vs true values for the samples  
    final_classifications = {}

    for sample_idx, test_sample_name in enumerate(all_samples):

      # KNN predictions for the sample
      predictions = sample_predictions[sample_idx]

      for i, c in enumerate(confidence_table[sample_idx]):

        if(c >= th): # first classifier whose confidence is at or above the threshold decides

          time_index = timestamps[i] # number of data points used when the decision is made

          # predicted class for the sample is given by the prediction which led to the given confidence value
          final_classifications[test_sample_name] = (predictions[i], true_label_dict[test_sample_name])

          # earliness is the fraction of the full recording that was needed
          earliness.append(time_index/timestamps[-1])

          break
      else:
        # threshold is never met -- result is inconclusive
        final_classifications[test_sample_name] = (None, true_label_dict[test_sample_name])

    # get avg accuracy and avg earliness for this threshold
    if(len(final_classifications) > 0):
      avg_accuracy = accuracy(final_classifications)
      avg_earliness = sum(earliness)/len(earliness)
      accs.append(avg_accuracy)
      
      # compute value of cost function and add to array 
      cf_score = alpha*(1-avg_accuracy) + (1-alpha)*avg_earliness
      cost_function_values.append(cf_score)
      print(f"Score: {cf_score}")
      print("")


In [None]:
# aim is to minimise cost function -- find index in array where this is the case
lowest_cf_score = np.min(np.array(cost_function_values))
index_best_th = np.argmin(np.array(cost_function_values))

In [None]:
# smallest cost function value over all threshold candidates
lowest_cf_score

In [None]:
# position of that smallest value within cost_function_values
index_best_th

In [None]:
# look up the candidate at the winning index; this relies on cost_function_values holding exactly
# one entry per threshold candidate, in the same order
best_th = list(threshold_candidates)[index_best_th]
best_th

#### Testing with best threshold

In [None]:
with tf.device(gpu):

  # per-sample results, all kept in all_samples order
  final_classifications = {}   # sample name -> (predicted label or None, true label)
  final_predictions = []       # predicted label, None when inconclusive
  final_TTP = []               # time to positive in minutes, NaN when not positive
  earliness = []               # fraction of the recording used (conclusive samples only)
  prediction_correctness = []  # "Yes"/"No"

  # create nN predictions using each sample in the training dataset as the test sample (n = no of exps in train set, N = no of classifiers)
  sample_predictions, true_labels = generate_predictions_table(positives, negatives, timestamps)

  # create multipliers for every classifier
  multipliers_2d = get_confidence_multipliers(sample_predictions, true_labels)

  # sample index
  sample_idx = 0

  # count inconclusive results
  inconc_count = 0
  
  for key, value in all_samples.items():
    test_sample_name = key
    test_sample = value

    print(f"Sample {test_sample_name}")  
    predictions = sample_predictions[sample_idx]

    # get confidence for each prediction
    for i in range(len(predictions)):

      c = get_confidence(predictions[:i+1], multipliers_2d[:i+1]) # i+1 needed because slicing does not include last index 

      # check if confidence meets threshold
      if(c >= best_th): # best confidence threshold from cost function
        time_index = timestamps[i] # number of data points used when the decision is made
        time_to_result = test_sample.index[time_index-1] - test_sample.index[0] # elapsed experiment time at which the result is obtained

        # final prediction
        pred = predictions[i]
        
        # update arrays with final output
        final_classifications[test_sample_name] = (pred, true_label_dict[test_sample_name])
        final_predictions.append(pred)
        prediction_correctness.append("Yes" if pred == true_label_dict[test_sample_name] else "No")
        print(f"Predicted Label: {pred} \t True Label: {true_label_dict[test_sample_name]} \t Correct?: {pred == true_label_dict[test_sample_name]}")

        earliness.append(time_index/timestamps[-1])

        ## if final output is positive then get TTPs
        if(pred == 1.0):
          final_TTP.append(round((time_to_result+30)/60, 2)) # 30 added because sample was taken 30s after actual reaction start
          print(f"TTP: {time_to_result + 30}s \t {round((time_to_result+30)/60, 2)} mins")
        else:
          final_TTP.append(np.nan)


        break

      # if confidence is never met then the result is inconclusive
      if(i == len(predictions)-1):
        final_classifications[test_sample_name] = (None, true_label_dict[test_sample_name])
        final_predictions.append(None)
        # keep the per-sample lists the same length as final_predictions:
        # an inconclusive result is not a correct prediction and has no time to positive
        prediction_correctness.append("No")
        final_TTP.append(np.nan)
        print("Inconclusive")
        inconc_count += 1
    
    sample_idx += 1
    print("")

  print(f"Accuracy: {accuracy(final_classifications)}")
  print(f"Sensitivity/Recall: {sensitivity(final_classifications)}")
  print(f"Specificity: {specificity(final_classifications)}")
  print(f"Precision: {precision(final_classifications)}")
  print(f"F1 Score: {f1(final_classifications)}")
  print(f"Average Earliness: {sum(earliness)/len(earliness)}")
  print(f"Total Inconclusive: {inconc_count}/{len(sample_predictions)}")

#### Confusion Matrix

In [None]:
# inconclusive samples are stored as None in final_predictions, which is not a valid class label
# for confusion_matrix, so only the samples that received a prediction are counted
conclusive_pairs = [(true, pred) for true, pred in zip(true_labels, final_predictions) if pred is not None]
conclusive_true = [true for true, _ in conclusive_pairs]
conclusive_pred = [pred for _, pred in conclusive_pairs]

cm = confusion_matrix(conclusive_true, conclusive_pred, labels=[0, 1])
fig, ax = plt.subplots(1,1,figsize=(7,5))
heatmap = sns.heatmap(cm, annot=True, annot_kws={"size": 15}, linewidth=0.75, 
            xticklabels=["Negative", "Positive"], yticklabels=["Negative", "Positive"], cbar=False, cmap='RdPu')

# enlarge the tick labels on both axes
heatmap.set_xticklabels(heatmap.get_xmajorticklabels(), fontsize = 15)
heatmap.set_yticklabels(heatmap.get_ymajorticklabels(), fontsize = 15)

ax.set_ylabel('True Output', fontsize=15, labelpad=15)
ax.set_xlabel('Predicted Output', fontsize=15, labelpad=15)

#### Testing with different alpha values

In [None]:
# Positive experiments, keyed by experiment name. The values are the per-experiment
# objects created earlier in the notebook (presumably pre-processed DataFrames -- later
# cells read `.index` and the 'Average Output' column; TODO confirm).
# NOTE: insertion order matters. Later cells zip the keys against labels built as
# "all positives first, then all negatives" and look up prediction rows by position.
positives = {"exp_118_pos":exp_118_pos, "exp_86_pos":exp_86_pos,"exp_129_pos":exp_129_pos, "exp_165_pos":exp_165_pos, 
             "exp_35_pos":exp_35_pos, "exp_28_pos":exp_28_pos, "exp_14_pos":exp_14_pos, "exp_40_pos":exp_40_pos, 
             "exp_88_pos":exp_88_pos, "exp_27_pos":exp_27_pos, 
             "exp_134_pos":exp_134_pos, "exp_97_pos":exp_97_pos, "exp_2d1_pos":exp_2d1_pos, "exp_64_pos":exp_64_pos, 
             "g1":g1, "g2":g2, "g3":g3, "g5":g5, "rv1_ap1":rv1_ap1, "rv1_ap2":rv1_ap2,  
             "arv7_p3":arv7_p3,"rv1y_p3":rv1y_p3, "rv1y_p4":rv1y_p4, 
             "arv7_p1":arv7_p1, "arv7_p4":arv7_p4, "b1":b1, "b2":b2, "b5":b5}

# Negative experiments, keyed by experiment name (same value type as the positives).
negatives = {"exp_118_neg":exp_118_neg, "exp_86_neg":exp_86_neg, "exp_129_neg":exp_129_neg, "exp_165_neg":exp_165_neg, 
             "exp_35_neg":exp_35_neg, "exp_28_neg":exp_28_neg, "exp_14_neg":exp_14_neg, "exp_40_neg":exp_40_neg, 
             "exp_88_neg":exp_88_neg, "exp_27_neg":exp_27_neg, "exp_134_neg":exp_134_neg, "exp_97_neg":exp_97_neg, 
             "exp_2d1_neg":exp_2d1_neg, "exp_64_neg":exp_64_neg, "yap":yap, "yap1":yap1, "yap1n1":yap1n1, "arv72":arv72, 
             "arv73":arv73, "du145y_n1":du145y_n1, "arv7":arv7}

In [None]:
# Sample counts at which the successive classifiers are evaluated.
# g1 is used as the reference experiment for the length of a recording
# (assumes every experiment has at least this many samples -- TODO confirm).
number_of_samples = len(g1['Average Output'])
number_of_timestamps = 50

# Spacing between timestamps; at least 1 so that range() never receives a zero step
# when there are fewer samples than requested timestamps.
timestep = max(1, number_of_samples // number_of_timestamps)

# Evenly spaced timestamps. Any value beyond the number of samples is dropped: the
# plain range overshoots when number_of_samples is not a multiple of
# number_of_timestamps, and such a timestamp cannot be used to index a sample
# (`sample.index[time_index-1]` would be out of range).
timestamps = [t for t in range(timestep, number_of_samples + timestep, timestep) if t <= number_of_samples]

In [None]:
# show the sample counts at which a classification is attempted
print(timestamps)

In [None]:
## merge the two sample dicts into one -- positives are inserted first, negatives after
all_samples = {**positives, **negatives}

## pair every sample name with its true label: 1.0 for each positive followed by
## 0.0 for each negative, matching the insertion order of all_samples
keys = list(all_samples)
label_blocks = (np.ones(len(positives)), np.zeros(len(negatives)))
true_labels_array = list(np.concatenate(label_blocks))
true_label_dict = {name: label for name, label in zip(keys, true_labels_array)}

In [None]:
with tf.device(gpu):

  # nN prediction table: every sample of the training set is used in turn as the test
  # sample (n = no of exps in train set, N = no of classifiers)
  sample_predictions, true_labels = generate_predictions_table(positives, negatives, timestamps)

  # one multiplier per classifier
  multipliers_2d = get_confidence_multipliers(sample_predictions, true_labels)

  # every confidence value observed for any sample after any number of classifiers
  confidence_set = set()

  # prediction rows are looked up by position while walking all_samples in insertion order
  for sample_idx, test_sample_name in enumerate(all_samples):

    # KNN predictions for this sample, one per classifier
    predictions = sample_predictions[sample_idx]

    # confidence after the first 1, 2, ..., N classifiers
    confidences = [
        get_confidence(predictions[:n_used], multipliers_2d[:n_used])
        for n_used in range(1, len(predictions) + 1)
    ]

    # fold this sample's confidence values into the global set
    confidence_set.update(confidences)

In [None]:
# order the confidence values ascending so that neighbouring values can be paired by index
# (note: this rebinds confidence_set from a set to a list)
confidence_set = sorted(confidence_set)

In [None]:
# threshold candidates are made of the mean of every pair of neighbouring values in the
# sorted confidence list; collecting them in a set removes duplicate midpoints
threshold_candidates = {
    0.5*(upper + lower)
    for lower, upper in zip(confidence_set, confidence_set[1:])
}

# turn the candidates into an ascending list (only for ordering purposes)
threshold_candidates = sorted(threshold_candidates)

In [None]:
# number of threshold candidates that will be evaluated
len(threshold_candidates)

In [None]:
with tf.device(gpu):

  # accuracy / earliness obtained with the best threshold of every alpha value
  acc = []
  ear = []

  # create nN predictions using each sample in the training dataset as the test sample
  # (n = no of exps in train set, N = no of classifiers). The table and the multipliers
  # depend only on the data and the timestamps, not on alpha or the threshold, so they are
  # built once instead of twice per alpha. This assumes repeated calls return the same
  # table, which the notebook already relies on: the threshold candidates were derived
  # from an earlier call.
  sample_predictions, true_labels = generate_predictions_table(positives, negatives, timestamps)

  # create multipliers for every classifier
  multipliers_2d = get_confidence_multipliers(sample_predictions, true_labels)

  # sample names in the order used to index the rows of sample_predictions
  sample_names = list(all_samples.keys())

  # confidence reached after each successive classifier, for every sample
  # (independent of alpha and of the threshold, so computed once)
  sample_confidences = []
  for sample_idx in range(len(sample_names)):
    predictions = sample_predictions[sample_idx]
    sample_confidences.append(
        [get_confidence(predictions[:step+1], multipliers_2d[:step+1]) # step+1 needed because slicing does not include last index
         for step in range(len(predictions))]
    )

  def evaluate_threshold(th):
    """ Classifies every sample at the first timestamp whose confidence reaches th

    Parameters
    ----------
    th : double
        confidence threshold

    Returns
    -------
    classifications : dict
        sample name -> (prediction, true label); prediction is None when the
        threshold is never met (inconclusive)
    earliness_values : list
        fraction of the time series used, one entry per conclusive sample
    inconclusive : int
        number of samples for which the threshold was never met
    """
    classifications = {}
    earliness_values = []
    inconclusive = 0

    for sample_idx, sample_name in enumerate(sample_names):
      predictions = sample_predictions[sample_idx]

      for step, c in enumerate(sample_confidences[sample_idx]):
        if(c >= th): # confidence is at or above the threshold
          # predicted class is the prediction which led to this confidence value
          classifications[sample_name] = (predictions[step], true_label_dict[sample_name])
          earliness_values.append(timestamps[step]/timestamps[-1])
          break
      else:
        # threshold is never met -- result is inconclusive
        classifications[sample_name] = (None, true_label_dict[sample_name])
        inconclusive += 1

    return classifications, earliness_values, inconclusive

######################################################## Optimal Threshold Calculation #############################################################
  # Accuracy and earliness of a threshold do not depend on alpha (only their weighting
  # in the cost function does), so every candidate is evaluated exactly once.
  candidate_list = list(threshold_candidates)
  threshold_metrics = []
  for th in candidate_list:
    classifications, earliness_values, _ = evaluate_threshold(th)
    if earliness_values:
      threshold_metrics.append((accuracy(classifications), sum(earliness_values)/len(earliness_values)))
    else:
      # no sample reached this threshold: keep a placeholder so that positions in
      # threshold_metrics stay aligned with candidate_list
      threshold_metrics.append(None)

  # iterate over possible alpha values
  for alpha_percent in range(0,100,5):

    # alpha
    alpha = alpha_percent/100

    print(f"Alpha: {alpha}")

    # cost function value for each candidate (infinite for candidates no sample reached)
    cost_function_values = [
        np.inf if metrics is None else alpha*(1-metrics[0]) + (1-alpha)*metrics[1]
        for metrics in threshold_metrics
    ]

    index_best_th = np.argmin(np.array(cost_function_values))
    best_th = candidate_list[index_best_th]

###################################################### Testing with optimal theshold ############################################################

    final_classifications, earliness, inconc_count = evaluate_threshold(best_th)

    avg_accuracy = accuracy(final_classifications)
    avg_earliness = sum(earliness)/len(earliness) if earliness else float("nan")

    print(f"Avg Accuracy: {avg_accuracy}")
    print(f"Avg Earliness: {avg_earliness}")
    print("")
    acc.append(avg_accuracy)
    ear.append(avg_earliness)

In [None]:
# earliness against accuracy: one point per alpha value, joined in the order the
# alpha values were evaluated
fig, axes = plt.subplots(1, 1, figsize=(10, 5))

axes.set_ylabel("Earliness")
axes.set_xlabel("Accuracy")

x, y = acc, ear
axes.plot(x, y, '-o')

### Dummy Data

#### Experimental Data (only use after pre-processing done)

In [None]:
# Positive experiments, keyed by experiment name. The values are the per-experiment
# objects created earlier in the notebook (presumably pre-processed DataFrames -- the
# cells below read `.index` and the 'Average Drift' column; TODO confirm).
# NOTE: insertion order matters. The cells below build labels and split the fitted
# parameters assuming "all positives first, then all negatives".
positives = {"exp_118_pos":exp_118_pos, "exp_86_pos":exp_86_pos,"exp_129_pos":exp_129_pos, "exp_165_pos":exp_165_pos, 
             "exp_35_pos":exp_35_pos, "exp_28_pos":exp_28_pos, "exp_14_pos":exp_14_pos, "exp_40_pos":exp_40_pos, 
             "exp_88_pos":exp_88_pos, "exp_27_pos":exp_27_pos, 
             "exp_134_pos":exp_134_pos, "exp_97_pos":exp_97_pos, "exp_2d1_pos":exp_2d1_pos, "exp_64_pos":exp_64_pos, 
             "g1":g1, "g2":g2, "g3":g3, "g5":g5, "rv1_ap1":rv1_ap1, "rv1_ap2":rv1_ap2,  
             "arv7_p3":arv7_p3,"rv1y_p3":rv1y_p3, "rv1y_p4":rv1y_p4, 
             "arv7_p1":arv7_p1, "arv7_p4":arv7_p4, "b1":b1, "b2":b2, "b5":b5}

# Negative experiments, keyed by experiment name (same value type as the positives).
negatives = {"exp_118_neg":exp_118_neg, "exp_86_neg":exp_86_neg, "exp_129_neg":exp_129_neg, "exp_165_neg":exp_165_neg, 
             "exp_35_neg":exp_35_neg, "exp_28_neg":exp_28_neg, "exp_14_neg":exp_14_neg, "exp_40_neg":exp_40_neg, 
             "exp_88_neg":exp_88_neg, "exp_27_neg":exp_27_neg, "exp_134_neg":exp_134_neg, "exp_97_neg":exp_97_neg, 
             "exp_2d1_neg":exp_2d1_neg, "exp_64_neg":exp_64_neg, "yap":yap, "yap1":yap1, "yap1n1":yap1n1, "arv72":arv72, 
             "arv73":arv73, "du145y_n1":du145y_n1, "arv7":arv7}

In [None]:
## merge the two sample dicts into one -- positives are inserted first, negatives after
all_samples = {**positives, **negatives}

## pair every sample name with its true label: 1.0 for each positive followed by
## 0.0 for each negative, matching the insertion order of all_samples
keys = list(all_samples)
label_blocks = (np.ones(len(positives)), np.zeros(len(negatives)))
true_labels = list(np.concatenate(label_blocks))
true_label_dict = {name: label for name, label in zip(keys, true_labels)}

In [None]:
# overlay the average drift curve of every experiment (calculated during the
# pre-processing stage) on a single set of axes
fig, axes = plt.subplots(1, 1, figsize=(10, 5))

for sample in all_samples.values():
  drift = np.array(sample['Average Drift'])
  axes.plot(sample.index, drift)

#### Get drift params from average drift curves

In [None]:
fig, axes = plt.subplots(1,1, figsize=(10,5))

# fitted drift parameters: row 0 holds a, row 1 holds b, one column per sample
# (columns follow the insertion order of all_samples)
opt = np.zeros((2, len(keys)))

for idx, sample in enumerate(all_samples.values()):
  try:
    # fit the decaying exponential to the average drift curve to estimate its params
    opt[:, idx], _ = curve_fit(decaying_exp, sample.index, np.array(sample['Average Drift']), p0=[-10, 0.1])
  except (RuntimeError, ValueError):
    # curve_fit raises RuntimeError when the least-squares fit fails and ValueError
    # when the data contain NaNs; mark the sample as "no fit" so that the
    # nan-aware statistics computed later ignore it. Any other error is a real
    # problem and is allowed to surface instead of being silently swallowed.
    opt[:, idx] = np.nan
    continue

  # plot the fitted curve; kept outside the try block so that a plotting problem
  # cannot overwrite a successful fit with NaN
  axes.plot(sample.index, decaying_exp(sample.index, *opt[:, idx]))

#### Plotting drift params and corresponding outcomes

In [None]:
# scatter of the fitted drift parameters (a against b), coloured by true class

fig, axes = plt.subplots(1, 1, figsize=(10, 5))
axes.set_title("Drift model parameters for positives and negatives")
axes.set_xlabel("a")
axes.set_ylabel("b")

# the columns of opt follow all_samples: positives first, negatives after
no_pos_samples = len(positives)
pos_params = opt[:, :no_pos_samples]
neg_params = opt[:, no_pos_samples:]

axes.scatter(pos_params[0], pos_params[1], color='red', alpha=0.5, label="Positives")
axes.scatter(neg_params[0], neg_params[1], color='blue', alpha=0.5, label="Negatives")

plt.legend(fontsize=12)

#### Mean and standard deviation of drift params (calculated from experimental data)

In [None]:
# fitted values of each drift parameter across all samples (NaN where no fit was stored)
a_fits = opt[0, :]
b_fits = opt[1, :]

# NaN-aware statistics, so that samples without a fit are ignored
mean_a, std_a = np.nanmean(a_fits), np.nanstd(a_fits)
mean_b, std_b = np.nanmean(b_fits), np.nanstd(b_fits)

In [None]:
# report the statistics of the fitted drift parameters a and b
print(f"Mean a: {mean_a}")
print(f"Mean b: {mean_b}")
print(f"Std a: {std_a}")
print(f"Std b: {std_b}")

#### Dummy Negatives

In [None]:
# mean and std for the drift params used for data generation
# (each synthetic curve draws a and b from normal distributions with these values)

# mean of a is taken from the fits to the experimental drift curves; its std is set by hand
mean_a_dummy_data = mean_a
std_a_dummy_data = 10

# mean and std of b are both set by hand rather than taken from the experimental fits
mean_b_dummy_data = 0.0025
std_b_dummy_data = 0.001

In [None]:
# times at which every synthetic curve is generated: 0, 3, ..., 1197
x_range = list(range(0, 1200, 3))

# synthetic negative (drift only) samples for training, keyed "Train_Neg<i>"
dummy_negatives = {}

for sample_no in range(100):

  # draw the drift params a and b. They are kept in a (2,1) array so that unpacking it
  # hands decaying_exp two 1-element arrays, which broadcast against the x_range list
  drift_params = np.zeros((2,1))
  drift_params[0, 0] = np.random.normal(mean_a_dummy_data, std_a_dummy_data)
  drift_params[1, 0] = np.random.normal(mean_b_dummy_data, std_b_dummy_data)

  # drift curve, shifted down by 9 -- must be done to ensure both positives and
  # negatives start at the same y value
  drift_curve = decaying_exp(x_range, *drift_params) - 9

  # store the curve as a single-column dataframe indexed by time
  dummy_negatives[f"Train_Neg{sample_no}"] = pd.DataFrame(drift_curve, index=x_range, columns=['Average Output'])

# To inspect the curves, create `fig, ax = plt.subplots(1,1, figsize=(10,5))` before the
# loop and call `ax.plot(x_range, drift_curve)` inside it
# (x label "Time (s)", y label "Average Output (mV)", title "Dummy Negatives Train").

#### Dummy Negatives Test

In [None]:
# times at which every synthetic curve is generated: 0, 3, ..., 1197
x_range = list(range(0, 1200, 3))

# synthetic negative (drift only) samples for testing, keyed "Test_Neg<i>"
dummy_negatives_test = {}

for sample_no in range(100):

  # draw the drift params a and b. They are kept in a (2,1) array so that unpacking it
  # hands decaying_exp two 1-element arrays, which broadcast against the x_range list
  drift_params = np.zeros((2,1))
  drift_params[0, 0] = np.random.normal(mean_a_dummy_data, std_a_dummy_data)
  drift_params[1, 0] = np.random.normal(mean_b_dummy_data, std_b_dummy_data)

  # drift curve, shifted down by 9 -- must be done to ensure both positives and
  # negatives start at the same y value
  drift_curve = decaying_exp(x_range, *drift_params) - 9

  # store the curve as a single-column dataframe indexed by time
  dummy_negatives_test[f"Test_Neg{sample_no}"] = pd.DataFrame(drift_curve, index=x_range, columns=['Average Output'])

# To inspect the curves, create `fig, ax = plt.subplots(1,1, figsize=(10,5))` before the
# loop and call `ax.plot(x_range, drift_curve)` inside it
# (x label "Time (s)", y label "Average Output (mV)", title "Dummy Negatives Test").

#### Dummy Positives Model

In [None]:
def log_sigmoid(x, ttp, lower=10e-10, upper=10e-8, rate=0.02):

  """ Generate the log of a sigmoid to model the positive amplification curve

  The sigmoid rises from `lower` to `upper` and is centred on `ttp`:
  lower + (upper - lower) / (1 + exp(-rate * (x - ttp)))

  Parameters
  ----------
  x : iterable of numbers
    Times at which the curve is to be generated for
  ttp : float
    The translation of the curve in the time axis (the sigmoid is halfway
    between `lower` and `upper` at x = ttp)
  lower : float, optional
    Value the sigmoid starts from (default 10e-10, i.e. 10^-9)
  upper : float, optional
    Value the sigmoid saturates at (default 10e-8, i.e. 10^-7)
  rate : float, optional
    Steepness of the sigmoid (default 0.02)

  Returns
  -------
  sig : list
    The values generated when the sigmoid function is applied to the x values
  log_sig : ndarray
    The values generated when applying the sigmoid then taking log10
  """

  sig = []
  for val in x:
    try:
      growth = math.exp(-rate * (val - ttp))
    except OverflowError:
      # so far before the ttp that exp() exceeds the float range: the sigmoid sits
      # on its lower asymptote there, which an infinite denominator reproduces
      growth = math.inf

    s_val = lower + (upper - lower)/(1 + growth)
    sig.append(s_val)

  log_sig = np.log10(np.array(sig))

  return sig, log_sig

#### Dummy Positives

In [None]:
# mean and std for the ttp when being sampled from a normal distribution
# (presumably in seconds, the unit of the x_range time axis -- TODO confirm)

mean_ttp = 900
std_ttp = 100

In [None]:
# times at which every synthetic curve is generated: 0, 3, ..., 1197
x_range = list(range(0, 1200, 3))

# synthetic positive (drift + amplification) samples for training, keyed "Train_Pos<i>"
dummy_positives = {}

for sample_no in range(100):

  # draw the drift params a and b. They are kept in a (2,1) array so that unpacking it
  # hands decaying_exp two 1-element arrays, which broadcast against the x_range list
  drift_params = np.zeros((2,1))
  drift_params[0, 0] = np.random.normal(mean_a_dummy_data, std_a_dummy_data) # a for drift
  drift_params[1, 0] = np.random.normal(mean_b_dummy_data, std_b_dummy_data) # b for drift

  ## drift component
  drift_curve = decaying_exp(x_range, *drift_params)

  ## amplification component: sigmoid centred on a randomly drawn ttp
  ## (original note: mean of 15 mins with std of 5/3 mins, so ttps fall within
  ## 10-20 mins at 3 std away)
  ttp = np.random.normal(mean_ttp, std_ttp)
  y_sig_vals, y_log_sig_vals = log_sigmoid(x_range, ttp)

  ## positive curve = drift + log10 of the sigmoid
  amplified_curve = drift_curve + y_log_sig_vals

  ## store the curve as a single-column dataframe indexed by time
  dummy_positives[f"Train_Pos{sample_no}"] = pd.DataFrame(amplified_curve, index=x_range, columns=['Average Output'])

# To inspect the curves, create `fig, ax = plt.subplots(3,1, figsize=(10,15))` before
# the loop and, inside it, plot against x_range:
#   ax[0]: y_sig_vals       (title "Sigmoid")
#   ax[1]: y_log_sig_vals   (title "Log10 Sigmoid")
#   ax[2]: amplified_curve  (title "Dummy Positives Train", y label "Average Output (mV)")
# with x label "Time (s)" on every axis.


#### Dummy Positives Test

In [None]:
# times at which every synthetic curve is generated: 0, 3, ..., 1197
x_range = list(range(0, 1200, 3))

# synthetic positive (drift + amplification) samples for testing, keyed "Test_Pos<i>"
dummy_positives_test = {}

for sample_no in range(100):

  # draw the drift params a and b. They are kept in a (2,1) array so that unpacking it
  # hands decaying_exp two 1-element arrays, which broadcast against the x_range list
  drift_params = np.zeros((2,1))
  drift_params[0, 0] = np.random.normal(mean_a_dummy_data, std_a_dummy_data) # a for drift
  drift_params[1, 0] = np.random.normal(mean_b_dummy_data, std_b_dummy_data) # b for drift

  ## drift component
  drift_curve = decaying_exp(x_range, *drift_params)

  ## amplification component: sigmoid centred on a randomly drawn ttp
  ## (original note: mean of 15 mins with std of 5/3 mins, so ttps fall within
  ## 10-20 mins at 3 std away)
  ttp = np.random.normal(mean_ttp, std_ttp)
  y_sig_vals, y_log_sig_vals = log_sigmoid(x_range, ttp)

  ## positive curve = drift + log10 of the sigmoid
  amplified_curve = drift_curve + y_log_sig_vals

  ## store the curve as a single-column dataframe indexed by time
  dummy_positives_test[f"Test_Pos{sample_no}"] = pd.DataFrame(amplified_curve, index=x_range, columns=['Average Output'])

# To inspect the curves, create `fig, ax = plt.subplots(3,1, figsize=(10,15))` before
# the loop and, inside it, plot against x_range:
#   ax[0]: y_sig_vals       (title "Sigmoid")
#   ax[1]: y_log_sig_vals   (title "Log10 Sigmoid")
#   ax[2]: amplified_curve  (title "Dummy Positives Test", y label "Average Output (mV)")
# with x label "Time (s)" on every axis.


#### Load Synthetic Data (if needed)

In [None]:
# pos_train_file = '/SyntheticData/Iteration6/PosTrain.pkl'
# pos_test_file = '/SyntheticData/Iteration6/PosTest.pkl'

# neg_train_file = '/SyntheticData/Iteration6/NegTrain.pkl'
# neg_test_file = '/SyntheticData/Iteration6/NegTest.pkl'

# dummy_positives = load_data(pos_train_file)
# dummy_positives_test = load_data(pos_test_file)
# dummy_negatives = load_data(neg_train_file)
# dummy_negatives_test = load_data(neg_test_file)

#### Learning Best Threshold

In [None]:
# times at which the synthetic curves were generated: 0, 3, ..., 1197
x_range = [*range(0,1200,3)]

number_of_samples = len(x_range)
number_of_timestamps = 50

# Spacing between timestamps; at least 1 so that range() never receives a zero step
# when there are fewer samples than requested timestamps.
timestep = max(1, number_of_samples // number_of_timestamps)

# Sample counts at which the successive classifiers are evaluated. Any value beyond the
# number of samples is dropped: the plain range overshoots when number_of_samples is not
# a multiple of number_of_timestamps, and such a timestamp cannot be used to index a
# sample (`sample.index[time_index-1]` would be out of range).
timestamps = [t for t in range(timestep, number_of_samples + timestep, timestep) if t <= number_of_samples]

In [None]:
## merge the synthetic training dicts into one -- positives are inserted first, negatives after
all_samples = {**dummy_positives, **dummy_negatives}

## pair every training sample name with its true label: 1.0 for each positive followed
## by 0.0 for each negative, matching the insertion order of all_samples
keys = list(all_samples)
label_blocks = (np.ones(len(dummy_positives)), np.zeros(len(dummy_negatives)))
true_labels_array = list(np.concatenate(label_blocks))
true_label_dict = {name: label for name, label in zip(keys, true_labels_array)}

##### Generating candidates

In [None]:
with tf.device(gpu):

  # nN prediction table: every sample of the training set is used in turn as the test
  # sample (n = no of exps in train set, N = no of classifiers)
  sample_predictions, true_labels = generate_predictions_table(dummy_positives, dummy_negatives, timestamps)

  # one multiplier per classifier
  multipliers_2d = get_confidence_multipliers(sample_predictions, true_labels)

  # every confidence value observed for any sample after any number of classifiers
  confidence_set = set()

  # prediction rows are looked up by position while walking all_samples in insertion order
  for sample_idx, test_sample_name in enumerate(all_samples):

    # KNN predictions for this sample, one per classifier
    predictions = sample_predictions[sample_idx]

    # confidence after the first 1, 2, ..., N classifiers
    confidences = [
        get_confidence(predictions[:n_used], multipliers_2d[:n_used])
        for n_used in range(1, len(predictions) + 1)
    ]

    # fold this sample's confidence values into the global set
    confidence_set.update(confidences)

In [None]:
# order the confidence values ascending so that neighbouring values can be paired by index
# (note: this rebinds confidence_set from a set to a list)
confidence_set = sorted(confidence_set)

In [None]:
# threshold candidates are the set of the mean of every pair of neighbouring values in
# the sorted confidence list; collecting them in a set removes duplicate midpoints
threshold_candidates = {
    0.5*(upper + lower)
    for lower, upper in zip(confidence_set, confidence_set[1:])
}

# turn the candidates into an ascending list (only for ordering purposes)
threshold_candidates = sorted(threshold_candidates)

In [None]:
# number of threshold candidates that will be evaluated
len(threshold_candidates)

##### Evaluating candidates

In [None]:
with tf.device(gpu):

  # cost function value and accuracy of each evaluated candidate
  cost_function_values = []
  accs = []

  # nN prediction table: every sample of the training set is used in turn as the test
  # sample (n = no of exps in train set, N = no of classifiers)
  sample_predictions, true_labels = generate_predictions_table(dummy_positives, dummy_negatives, timestamps)

  # one multiplier per classifier
  multipliers_2d = get_confidence_multipliers(sample_predictions, true_labels)

  # weight of the accuracy term in the cost function (earliness gets 1 - alpha)
  alpha = 0.85

  # score every threshold candidate
  for th in threshold_candidates:

    print(f"Candidate: {th} ")

    # earliness of every conclusive sample, and prediction vs true label of every sample
    earliness = []
    final_classifications = {}

    # prediction rows are looked up by position while walking all_samples in insertion order
    for sample_idx, (test_sample_name, test_sample) in enumerate(all_samples.items()):

      # KNN predictions for the sample
      predictions = sample_predictions[sample_idx]
      last_step = len(predictions) - 1

      for step in range(len(predictions)):

        # confidence after the first step+1 classifiers (slicing excludes the end index)
        c = get_confidence(predictions[:step+1], multipliers_2d[:step+1])

        if not (c >= th):
          # threshold not reached at this step; if it was the last one the threshold
          # is never met -- result is inconclusive
          if step == last_step:
            final_classifications[test_sample_name] = (None, true_label_dict[test_sample_name])
          continue

        # threshold reached: sample number at which the sample needs to be indexed, and
        # the actual time according to the experiment at which the result is obtained
        time_index = timestamps[step]
        time_to_result = test_sample.index[time_index-1] - test_sample.index[0]

        # predicted class is the prediction which led to this confidence value
        pred = predictions[step]
        final_classifications[test_sample_name] = (pred, true_label_dict[test_sample_name])

        # fraction of the time series that was needed
        earliness.append(time_index/timestamps[-1])

        break

    # avg accuracy and avg earliness for this threshold, combined into the cost function
    if final_classifications:
      avg_accuracy = accuracy(final_classifications)
      avg_earliness = sum(earliness)/len(earliness)
      accs.append(avg_accuracy)

      cf_score = alpha*(1-avg_accuracy) + (1-alpha)*avg_earliness
      cost_function_values.append(cf_score)
      print(f"Score: {cf_score}")
      print("")


In [None]:
# aim is to minimise the cost function -- locate the candidate with the lowest score
cost_array = np.array(cost_function_values)
index_best_th = np.argmin(cost_array)
lowest_cf_score = np.min(cost_array)

In [None]:
# lowest cost function value over all candidates
lowest_cf_score

In [None]:
# position of the lowest cost function value in cost_function_values
index_best_th

In [None]:
# threshold candidate at the position of the lowest cost function value
# (relies on cost_function_values holding one entry per candidate, in candidate order)
best_th = list(threshold_candidates)[index_best_th]
best_th

#### Testing with best threshold (on test set)

In [None]:
## merge the synthetic test dicts into one -- positives are inserted first, negatives after
all_samples_test = {**dummy_positives_test, **dummy_negatives_test}

## pair every test sample name with its true label: 1.0 for each positive followed by
## 0.0 for each negative, matching the insertion order of all_samples_test
keys_test = list(all_samples_test)
label_blocks_test = (np.ones(len(dummy_positives_test)), np.zeros(len(dummy_negatives_test)))
true_labels_test_array = list(np.concatenate(label_blocks_test))
true_label_test_dict = {name: label for name, label in zip(keys_test, true_labels_test_array)}

In [None]:
# Early classification of every sample in the synthetic test set using the best threshold:
# at each timestamp a KNN prediction is made from the training set, the confidence in
# the predictions so far is computed, and the sample is classified as soon as the
# confidence reaches best_th (otherwise it is reported as inconclusive).
with tf.device(gpu):

  final_classifications = {}  # sample name -> (prediction or None, true label)
  ttps = []  # NOTE(review): initialised but never appended to in this cell
  earliness = []  # fraction of the time series used, one entry per conclusive sample

  # create nN predictions using each sample in the training dataset as the test sample (n = no of exps in train set, N = no of classifiers)
  # (only used here to derive the classifier multipliers)
  sample_predictions, true_labels_train = generate_predictions_table(dummy_positives, dummy_negatives, timestamps)

  # create multipliers for every classifier
  multipliers_2d = get_confidence_multipliers(sample_predictions, true_labels_train)

  # sample index (only used to count the evaluated samples for the final summary)
  sample_idx = 0

  # count inconclusive results
  inconc_count = 0
  
  # evaluate every experiment in the test set
  for key, value in all_samples_test.items():
    test_sample_name = key
    test_sample = value

    print(f"Sample {test_sample_name}")  
    predictions = []

    # get the predictions for each timestamp
    for i, t in enumerate(timestamps):

      # KNN (k=3, cosine distance) prediction for the test sample at timestamp t
      train_data, train_labels = get_training_data_knn(positive_samples=dummy_positives, negative_samples=dummy_negatives, timestamp=t, test_samples=[test_sample_name])
      test_data = get_test_data_knn(test_sample, t)
      pred = KNN(3, test_data, train_data, train_labels, 'cosine')
      predictions.append(pred)

      # get confidence in the current prediction
      c = get_confidence(predictions[:i+1], multipliers_2d[:i+1]) # i+1 needed because slicing does not include last index 

      # check if confidence meets threshold
      if(c >= best_th): # best confidence threshold from cost function
      
        time_index = timestamps[i] # get the value of the sample number at which the sample needs to be indexed
        time_to_result = test_sample.index[time_index-1] - test_sample.index[0] # get actual time according to the experiment at which the result is obtained

        # final prediction
        pred = predictions[i]
        
        # update arrays with final outcomes and time to result
        final_classifications[test_sample_name] = (pred, true_label_test_dict[test_sample_name])

        print(f"Predicted Label: {pred} \t True Label: {true_label_test_dict[test_sample_name]} \t Correct?: {pred == true_label_test_dict[test_sample_name]}")

        earliness.append(time_index/timestamps[-1])

        # if final output is positive get ttp
        if(pred == 1.0):
          print(f"TTP: {time_to_result}s \t {round((time_to_result)/60, 2)} mins")

        # stop at the first timestamp that meets the threshold
        break

      # if threshold is never met then result is inconclusive
      if(i == len(timestamps)-1):
        final_classifications[test_sample_name] = (None, true_label_test_dict[test_sample_name])
        print("Inconclusive")
        inconc_count += 1
    
    sample_idx += 1
    print("")

  # summary metrics over the whole test set (earliness averages conclusive samples only)
  print(f"Accuracy: {accuracy(final_classifications)}")
  print(f"Sensitivity/Recall: {sensitivity(final_classifications)}")
  print(f"Specificity: {specificity(final_classifications)}")
  print(f"Precision: {precision(final_classifications)}")
  print(f"F1 Score: {f1(final_classifications)}")
  print(f"Average Earliness: {sum(earliness)/len(earliness)}")
  print(f"Total Inconclusive: {inconc_count}/{sample_idx}")

In [None]:
# plotting the positive samples that have been wrongly classified as negative

# x_range = [*range(0,1200,3)]

# fig, ax = plt.subplots(1,1, figsize=(10,5))

# ax.set_xlabel("Time (s)")
# ax.set_ylabel("Average Output (mV)")
# ax.set_title("Mis-classified Positives")

# for k,v in final_classifications.items():
#   if(v[0] != v[1] and v[1] == 1.0):
#     df = dummy_positives_test[k]
#     y_vals = df['Average Output']
#     ax.plot(x_range, y_vals)

#### Save Synthetic Data (for replication of results)

In [None]:
# pos_train_file = '/SyntheticData/Iteration10/PosTrain.pkl'
# pos_test_file = '/SyntheticData/Iteration10/PosTest.pkl'

# neg_train_file = '/SyntheticData/Iteration10/NegTrain.pkl'
# neg_test_file = '/SyntheticData/Iteration10/NegTest.pkl'

# save_data(pos_train_file, dummy_positives)
# save_data(pos_test_file, dummy_positives_test)
# save_data(neg_train_file, dummy_negatives)
# save_data(neg_test_file, dummy_negatives_test)

### Github Commands

In [None]:
# mount Google Drive so that the repository stored on it is reachable under /content/drive
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# show the current working directory of the notebook's shell
!pwd

In [None]:
username = "adityag16"
git_token = "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"
repository = "/content/drive/MyDrive/Final-Year-Project"
%cd {repository}
!git status

In [None]:
# stage this notebook and show the resulting repository status
!git add 'Early Time Series Classification - Average Ouput KNN.ipynb'
!git status

In [None]:
# identity recorded on the commit (set globally for this runtime)
!git config --global user.email "aditya.gupta18@imperial.ac.uk"
!git config --global user.name "adityag16"

# commit the staged changes and push them to the main branch of origin
!git commit -m "All documentation done -- need to sort out file paths"
!git push origin main