### Connect Drive

In [3]:
# Mount Google Drive at /content/drive so the CSV exports referenced by the
# loading cells (all under /content/drive/MyDrive/...) can be read.
from google.colab import drive
drive.mount('/content/drive') 

Mounted at /content/drive


### Package Imports

In [None]:
import math
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from scipy.optimize import curve_fit
from scipy.signal import filtfilt
from scipy.signal import savgol_filter

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Activation
from scipy.spatial import distance
from sklearn.metrics.pairwise import euclidean_distances, manhattan_distances, cosine_distances

### GPU Device

In [None]:
!nvidia-smi -L

GPU 0: Tesla T4 (UUID: GPU-0a827671-bfd2-c73a-d984-1813501eb611)


In [None]:
# Name of the GPU device TensorFlow can see; reused by the `with tf.device(gpu)` blocks below.
gpu = tf.test.gpu_device_name()
print(gpu)

/device:GPU:0


### Pre-Processing Helper Functions

In [None]:
def decaying_exp(x, a, b):
    """ Returns the saturating exponential a * (1 - exp(-b * x))
    Parameters
    ----------
    x : ndarray
        times
    a : double
        asymptotic value (value approached as x -> inf for b > 0)
    b : double
        rate constant; the slope at x = 0 is a * b
    Returns
    -------
    ndarray
        y-axis values of the function
    """
    return a * (1 - np.exp(-b * x))


def fit_pixels_interpolate(time, X, interpolate_idx):
    """ Fits `decaying_exp` to the beginning of every pixel's curve
    Each column of X is first smoothed with a zero-phase 10-point moving
    average (filtfilt) and the model is then fitted to the first
    `interpolate_idx` samples of the smoothed curve.
    Parameters
    ----------
    time : ndarray
        times, one per row of X
    X : ndarray
        TxNM array to be fitted (one column per pixel)
    interpolate_idx : int
        the fit only uses samples before this index
    Returns
    -------
    popt : ndarray
        optimal (a, b) parameters of each pixel, with shape 2xNM. Columns of
        pixels whose fit failed are filled with NaN.
    """
    # NaN by default so that a pixel is only "fitted" once curve_fit succeeded
    popt = np.full((2, X.shape[1]), np.nan)

    # loop-invariant moving-average kernel
    smoothing_kernel = np.ones(10) / 10

    # for every pixel
    for i in range(X.shape[1]):

        data = filtfilt(b=smoothing_kernel, a=[1], x=X[:, i])

        # Fit the curve (interpolate)
        try:
            popt[:, i], _ = curve_fit(decaying_exp, time[:interpolate_idx], data[:interpolate_idx], p0=[-10, 0.1])
        except (RuntimeError, ValueError, TypeError):
            # Only the failures curve_fit itself signals (no convergence,
            # non-finite data, too few points) mark the pixel as unfitted.
            # Programming errors such as a missing import are no longer hidden.
            popt[:, i] = np.nan

    return popt

In [None]:
def filter_by_drift(df, interpolate_idx, max_drift_error=12):
  """ Keeps only the pixels whose curve stays close to its fitted drift model
  Parameters
  ----------
  df : DataFrame
      T x NM frame, indexed by time, one column per pixel
  interpolate_idx : int
      number of leading samples used to fit the drift of each pixel
  max_drift_error : float, optional
      a pixel is kept only if |signal - fitted drift| is below this value at
      every sample. Default is 12 (same units as the data)
  Returns
  -------
  tuple
      - df : the input frame restricted to the pixels that were kept
      - drift_avg : ndarray of length T with the mean fitted drift of the kept
        pixels (all NaN when no pixel was kept)
  """
  times = np.asarray(df.index, dtype=float)
  signals = df.values

  popt = fit_pixels_interpolate(times, signals, interpolate_idx)

  drift_sum = np.zeros(df.shape[0])
  active = np.zeros(df.shape[1], dtype=bool)

  for idx in range(df.shape[1]):

    # pixels whose drift fit failed have NaN parameters and are dropped
    if np.isnan(popt[:, idx]).any():
      continue

    # evaluate the fitted drift curve at every time stamp in one call
    drift = decaying_exp(times, popt[0, idx], popt[1, idx])

    # distance between the measured signal and the extrapolated drift
    drift_error = np.abs(signals[:, idx] - drift)

    # only keep pixels that never leave the drift curve by max_drift_error or more
    if (drift_error < max_drift_error).all():
      drift_sum += drift
      active[idx] = True

  pix_count = int(active.sum())
  if pix_count:
    drift_avg = drift_sum / pix_count
  else:
    # nothing to average: same NaN result as 0/0, without the RuntimeWarning
    drift_avg = np.full(df.shape[0], np.nan)

  return df.loc[:, active], drift_avg

In [None]:
def filter_by_vref(X, v_thresh=70):
    '''
    Flags pixels that show a large upward step near the start of the recording
    Parameters
    ---------
    X : ndarray
        Input 2D array (T x NM). T = time samples, NM = total number of pixels
    v_thresh : int, optional
        Minimum value of the step d(i)=X(i+1)-X(i) in mV. Default is 70
    Returns
    -------
    ndarray
        1D array of bool with dimension (NM). True for a pixel when at least one
        consecutive difference within the first 10 samples (i.e. up to 9
        differences) is strictly greater than v_thresh
    '''
    early_steps = np.diff(X[:10, :], axis=0)  # differences between the first 10 samples
    return np.any(early_steps > v_thresh, axis=0)

In [None]:
def filter_by_vrange(X, v_range=(100, 900)):
    '''
    Flags pixels whose values never leave the open interval v_range
    Parameters
    ---------
    X : ndarray
        Input 2D array (T x NM). T = time samples, NM = total number of pixels
    v_range : (int, int), optional
        tuple containing the minimum and maximum allowable voltage in mV. Default is (100, 900)
    Returns
    -------
    ndarray
        1D array of bool with dimension (NM). True for a pixel when every sample is
        strictly greater than v_range[0] and strictly smaller than v_range[1]
    '''
    lower, upper = v_range[0], v_range[1]
    inside = np.logical_and(X > lower, X < upper)  # per-sample range check
    return inside.all(axis=0)  # a pixel passes only if all of its samples pass


In [None]:
def filter_by_derivative(X, vthresh=5):
    """ Flags pixels whose sample-to-sample change always stays small
    Parameters
    ----------
    X : ndarray
        input 2D array of shape TxNM
    vthresh : int
        upper bound (exclusive) on the absolute difference between consecutive
        samples. Default is 5
    Returns
    -------
    ndarray
        1D array of bool with dimension (NM). True for a pixel when every absolute
        consecutive difference is strictly below vthresh
    """
    steps = np.diff(X, axis=0)
    return np.all(np.abs(steps) < vthresh, axis=0)

In [None]:
def filter_active_pixels_drop(df, v_thresh_ref=50, v_range=(100, 900), v_thresh_deriv=5): #v_thresh_ref changed from 70 to 50
  """ Drops every pixel (column) that fails any of the three activity checks
  Parameters
  ----------
  df : DataFrame
      T x NM frame, one column per pixel
  v_thresh_ref : int, optional
      threshold handed to filter_by_vref. Default is 50
  v_range : (int, int), optional
      allowed range handed to filter_by_vrange. Default is (100, 900)
  v_thresh_deriv : int, optional
      threshold handed to filter_by_derivative. Default is 5
  Returns
  -------
  DataFrame
      df restricted to the columns that pass all three checks
  """
  values = df.values
  checks = (
      filter_by_vref(values, v_thresh_ref),
      filter_by_vrange(values, v_range),
      filter_by_derivative(values, v_thresh_deriv),
  )

  # a pixel is kept only when every check is True for it
  keep = np.logical_and.reduce(checks)
  return df.loc[:, keep]

In [None]:
def filter_active_pixels_deriv(df, v_thresh_deriv=5):
  """ Drops the pixels (columns) that fail filter_by_derivative
  Parameters
  ----------
  df : DataFrame
      T x NM frame, one column per pixel
  v_thresh_deriv : int, optional
      threshold handed to filter_by_derivative. Default is 5
  Returns
  -------
  DataFrame
      df restricted to the columns for which filter_by_derivative is True
  """
  keep = filter_by_derivative(df.values, v_thresh_deriv)
  return df.loc[:, keep]

In [None]:
def filter_active_pixels_range(df, v_range=(100, 900)):
  """ Drops the pixels (columns) that fail filter_by_vrange
  Parameters
  ----------
  df : DataFrame
      T x NM frame, one column per pixel
  v_range : (int, int), optional
      allowed range handed to filter_by_vrange. Default is (100, 900)
  Returns
  -------
  DataFrame
      df restricted to the columns for which filter_by_vrange is True
  """
  keep = filter_by_vrange(df.values, v_range)
  return df.loc[:, keep]

In [None]:
def reshape_data(df, rows, cols):
  """ Returns the frame's values as a (T, rows, cols) array
  The reshape uses Fortran (column-major) order, so flat pixel p of a sample
  ends up at position [p % rows, p // rows] of that sample's 2D array.
  Parameters
  ----------
  df : DataFrame
      T x (rows*cols) frame
  rows : int
      number of rows of the pixel array
  cols : int
      number of columns of the pixel array
  Returns
  -------
  ndarray
      array of shape (T, rows, cols)
  """
  flat = df.values  # numpy view of the frame, shape T x (rows*cols)
  return np.reshape(flat, (-1, rows, cols), order='F')

In [None]:
def filter_chemical_pixels(df, arr_rows, arr_cols):
  """ Drops the temperature pixels and keeps the chemical ones
  Every third pixel in both directions, starting at row 1 / column 1 of the
  arr_rows x arr_cols array, is treated as a temperature pixel and removed.
  Because the mask is derived from the per-pixel time average, a pixel whose
  average is NaN is removed as well.
  Parameters
  ----------
  df : DataFrame
      T x (arr_rows*arr_cols) frame, one column per pixel
  arr_rows : int
      number of rows of the pixel array (78 in this notebook)
  arr_cols : int
      number of columns of the pixel array (56 in this notebook)
  Returns
  -------
  DataFrame
      df restricted to the chemical pixels
  """
  # T x arr_rows x arr_cols view of the data (column-major pixel order)
  frames = df.values.reshape(-1, arr_rows, arr_cols, order='F')

  # time average per pixel, then blank out the temperature positions
  pixel_mean = frames.mean(axis=0)
  pixel_mean[1::3, 1::3] = np.nan

  # back to one flag per column of df (arr_rows*arr_cols = 4368 for the 78 x 56 array)
  is_chemical = np.logical_not(np.isnan(pixel_mean.flatten('F')))

  return df.loc[:, is_chemical]


In [None]:
def time_to_index(times, time_vect):
    '''
    Returns index of the times closest to the desired ones time_vect
    Arguments
    ---------
    times : list
        list containing the desired times
    time_vect : nparray
        array of the times at which the values are sampled
    Returns
    -------
    list
        for each element in the input list times, return an element in the output list
        with the index of the sample closest to the desired time
    '''
    sampled = np.asarray(time_vect)
    # for each desired time, index of the sampled time with the smallest absolute distance
    return [np.argmin(np.abs(sampled - time)) for time in times]


def find_loading_time(time_vect, X, bounds=(600, 900), settle_offset=10):  # for v2
    ''' Finds loading and settling time for the data of v2 chip
    The loading point is the sample with the largest positive step of the
    pixel-averaged signal inside the search window; the settling point is
    `settle_offset` samples after it.
    Parameters
    ----------
    time_vect : ndarray
        1D array with dimension T containing the sampling times
    X : ndarray
        2D array with dimension TxNM containing the sampled data
    bounds : list, optional
        tuple containing the minimum and maximum times (same units as time_vect)
        where the loading time has to be searched. Default is (600, 900)
    settle_offset : int, optional
        number of samples added to the loading index to allow the signal to
        settle. Default is 10
    Returns
    -------
    tuple
        - settled_index : index at which the settling occurs
        - settled_time : time at which the settling occurs
    Raises
    ------
    ValueError
        if bounds map to an empty range of samples
    '''

    # for each time in bounds, index of the closest sample in time_vect
    search_start, search_end = time_to_index(bounds, time_vect)
    if search_end <= search_start:
        raise ValueError(
            "bounds {} select no samples (indices {} to {})".format(bounds, search_start, search_end))

    X_mean = np.mean(X, axis=1)  # for each sample, calculate the mean of all pixels
    X_mean_diff = np.diff(X_mean)  # step between consecutive samples

    # diff[k] = X_mean[k+1] - X_mean[k], hence the +1 to point at the sample after the largest step
    loading_index = np.argmax(X_mean_diff[search_start:search_end]) + search_start + 1
    settled_index = loading_index + settle_offset  # add settling time
    settled_time = time_vect[settled_index]  # find the time that index corresponds to

    return settled_index, settled_time

In [None]:
def preprocess_data(df, deriv_thresh, deriv_thresh_bgsub=5, arr_rows=78, arr_cols=56, n_samples=400):
  """ Full pre-processing of one whole-array experiment
  Steps: keep chemical pixels -> keep active pixels (vref / range / derivative)
  -> cut everything before the settling point -> subtract each pixel's first
  sample -> derivative filter on the background-subtracted data -> keep the
  first n_samples samples -> drift filter -> Savitzky-Golay smoothing.
  The derivative filter after background subtraction and the drift filter are
  only applied when they leave at least one pixel.
  Parameters
  ----------
  df : DataFrame
      T x (arr_rows*arr_cols) frame indexed by elapsed time, one column per pixel
  deriv_thresh : int
      derivative threshold used for the initial active-pixel filter
  deriv_thresh_bgsub : int, optional
      derivative threshold used after background subtraction. Default is 5
  arr_rows, arr_cols : int, optional
      dimensions of the pixel array. Defaults are 78 and 56
  n_samples : int, optional
      number of samples kept after the settling point. Default is 400
  Returns
  -------
  DataFrame
      pixels x time frame (transposed), time starting at 0
  """
  df = filter_chemical_pixels(df, arr_rows, arr_cols) # filter all chemical pixels

  df = filter_active_pixels_drop(df=df, v_thresh_deriv=deriv_thresh, v_range=(100,900)) # filter pixels by range, vref and deriv

  settle_idx, _ = find_loading_time(df.index, df, bounds=(600, 900)) # find settling point
  df = df.iloc[settle_idx + 10:, :] # skip 10 more samples after the settling index to allow the reaction to have settled

  df = df.sub(df.iloc[0, :], axis='columns') # subtract each pixel's first sample from that pixel (background subtraction)

  # run the derivative filter once and only adopt the result if pixels survive
  deriv_filtered = filter_active_pixels_deriv(df=df, v_thresh_deriv=deriv_thresh_bgsub)
  if(len(deriv_filtered.columns) != 0):
    df = deriv_filtered

  # explicit copy: the frame is modified below and must not be a view of a slice
  df = df.iloc[:n_samples, :].copy() # take only n_samples samples after settling point

  df.index = df.index - df.index[0] # set the first time value to 0

  drift_filtered, _ = filter_by_drift(df, 40) # filter by fitting of pixel to drift model

  if(len(drift_filtered.columns) != 0):
    df = drift_filtered.copy()

  for col in df.columns:
    df[col] = savgol_filter(df[col], 101, 3) # smooth each pixel (window of 101 samples, cubic polynomial)

  return df.T

In [None]:
def preprocess_partial_data(df, deriv_thresh, deriv_thresh_bgsub=5, n_samples=400):
  """ Pre-processing of one partial (already cropped) experiment
  Steps: range filter -> derivative filter -> subtract each pixel's first
  sample -> derivative filter on the background-subtracted data -> keep the
  first n_samples samples -> drift filter -> Savitzky-Golay smoothing.
  The derivative filter after background subtraction and the drift filter are
  only applied when they leave at least one pixel.
  Parameters
  ----------
  df : DataFrame
      T x NM frame indexed by time, one column per pixel
  deriv_thresh : int
      derivative threshold used for the initial derivative filter
  deriv_thresh_bgsub : int, optional
      derivative threshold used after background subtraction. Default is 5
  n_samples : int, optional
      number of samples kept. Default is 400
  Returns
  -------
  DataFrame
      pixels x time frame (transposed), time starting at 0
  """
  df = filter_active_pixels_range(df=df, v_range=(100,900)) # filter by range in case of any saturation

  df = filter_active_pixels_deriv(df=df, v_thresh_deriv=deriv_thresh) # filter pixels by deriv

  df = df.sub(df.iloc[0, :], axis='columns') # subtract each pixel's first sample from that pixel (background subtraction)

  # run the derivative filter once and only adopt the result if pixels survive
  deriv_filtered = filter_active_pixels_deriv(df=df, v_thresh_deriv=deriv_thresh_bgsub)
  if(len(deriv_filtered.columns) != 0):
    df = deriv_filtered

  # explicit copy: the frame is modified below and must not be a view of a slice
  df = df.iloc[:n_samples, :].copy() # take only the first n_samples samples

  df.index = df.index - df.index[0] # set the first time value to 0

  drift_filtered, _ = filter_by_drift(df, 40) # filter by fitting of pixel to drift model

  if(len(drift_filtered.columns) != 0):
    df = drift_filtered.copy()

  for col in df.columns:
    df[col] = savgol_filter(df[col], 101, 3) # smooth each pixel (window of 101 samples, cubic polynomial)

  return df.T

In [None]:
def normalise_data(series):
  """ Min-max scales `series` so that its minimum maps to 0 and its maximum to 1 """
  lowest = series.min()
  span = series.max() - lowest
  return (series - lowest) / span

### Evaluation Metric Helper Functions

In [None]:
def accuracy(classification):
  """ Percentage of entries whose prediction equals the true label
  Parameters
  ----------
  classification : dict
      maps a sample key to a pair whose element 0 is the prediction and whose
      element 1 is the true label
  Returns
  -------
  float
      accuracy in percent; 0.0 when classification is empty
  """
  total = len(classification)
  if total == 0:
    # nothing was classified: avoid dividing by zero
    return 0.0

  total_correct = sum(1 for entry in classification.values() if entry[0] == entry[1])

  return (total_correct/total)*100

In [None]:
def sensitivity(classification):
  """ True-positive rate TP / (TP + FN), in percent
  Parameters
  ----------
  classification : dict
      maps a sample key to a pair whose element 0 is the prediction and whose
      element 1 is the true label (both convertible to int, 1 = positive)
  Returns
  -------
  float
      sensitivity in percent; 0.0 when there is no entry with true label 1
      (the ratio is undefined in that case)
  """
  true_pos = 0
  false_neg = 0

  for entry in classification.values():

    predicted = int(entry[0])
    true_label = int(entry[1])

    if true_label != 1:
      continue

    if predicted == 1:
      true_pos += 1
    elif predicted == 0:
      false_neg += 1

  actual_positives = true_pos + false_neg
  if actual_positives == 0:
    # e.g. a held-out set made only of negative samples
    return 0.0

  return 100*(true_pos/actual_positives)

In [None]:
def specificity(classification):
  """ True-negative rate TN / (TN + FP), in percent
  Parameters
  ----------
  classification : dict
      maps a sample key to a pair whose element 0 is the prediction and whose
      element 1 is the true label (both convertible to int, 0 = negative)
  Returns
  -------
  float
      specificity in percent; 0.0 when there is no entry with true label 0
      (the ratio is undefined in that case)
  """
  true_neg = 0
  false_pos = 0

  for entry in classification.values():

    predicted = int(entry[0])
    true_label = int(entry[1])

    if true_label != 0:
      continue

    if predicted == 0:
      true_neg += 1
    elif predicted == 1:
      false_pos += 1

  actual_negatives = true_neg + false_pos
  if actual_negatives == 0:
    # e.g. a held-out set made only of positive samples
    return 0.0

  return 100*(true_neg/actual_negatives)

In [None]:
def precision(classification):
  """ Positive predictive value TP / (TP + FP), in percent
  Parameters
  ----------
  classification : dict
      maps a sample key to a pair whose element 0 is the prediction and whose
      element 1 is the true label (both convertible to int, 1 = positive)
  Returns
  -------
  float
      precision in percent; 0.0 when nothing was predicted positive
      (the ratio is undefined in that case)
  """
  true_pos = 0
  false_pos = 0

  for entry in classification.values():

    predicted = int(entry[0])
    true_label = int(entry[1])

    if predicted != 1:
      continue

    if true_label == 1:
      true_pos += 1
    elif true_label == 0:
      false_pos += 1

  predicted_positives = true_pos + false_pos
  if predicted_positives == 0:
    # the classifier never answered "positive"
    return 0.0

  return 100*(true_pos/predicted_positives)

In [None]:
def f1(classification):
  """ F1 score: harmonic mean of precision and sensitivity, in percent
  Parameters
  ----------
  classification : dict
      same structure as expected by precision() and sensitivity()
  Returns
  -------
  float
      2*P*S / (P + S); 0.0 when precision or sensitivity is undefined or when
      both are 0
  """
  try:
    # each metric walks the whole dict, so compute them once
    prec = precision(classification)
    sens = sensitivity(classification)
  except ZeroDivisionError:
    # an undefined precision/sensitivity means there is no true positive
    return 0.0

  denominator = prec + sens
  if denominator == 0:
    return 0.0

  return 2*prec*sens/denominator

### Data Loading Helper Functions

In [None]:
def load_partial_covid_exp(filepath):
  """ Loads the two halves of a partial COVID experiment
  The last four characters of `filepath` (the ".csv" extension of the paths
  used in this notebook) are replaced by "_top.csv" and "_bot.csv".
  Parameters
  ----------
  filepath : str
      path of the experiment summary csv
  Returns
  -------
  tuple
      - df_pos : frame read from the "_bot.csv" file
      - df_neg : frame read from the "_top.csv" file
      Both use the first row as header and the first column as index.
  """
  stem = filepath[:-4]  # path without its 4-character extension

  ## top sheet holds the negative data, bottom sheet the positive data
  df_neg = pd.read_csv(stem + "_top.csv", header=0, index_col=0)
  df_pos = pd.read_csv(stem + "_bot.csv", header=0, index_col=0)

  return df_pos, df_neg

### Array Dims

In [None]:
# Dimensions of the pixel array; the loading cells keep the first
# arr_rows*arr_cols columns of every vsChem export.
arr_rows = 78
arr_cols = 56

### Load Data

#### Positive Samples

In [None]:
## Average pixel value for all samples 
# One `*_data_export.csv` per positive experiment, read with the first row as header.
# The pixel-loading cell uses the "Time Elapsed" column of these frames as time index.
# NOTE(review): pandas.read_csv runs on the CPU, so the tf.device scope presumably
# has no effect on these reads - confirm before removing it.

with tf.device(gpu):
  ## Gamma 1
  avg_data_g1_file = "/content/drive/MyDrive/Final-Year-Project/DNAPositives/100921_DNA/100921_DNA/Data/gamma1.app.1e5/gamma1.app.1e5_data_export.csv"
  avg_g1 = pd.read_csv(avg_data_g1_file, header=0)

  ## Gamma 2
  avg_data_g2_file = "/content/drive/MyDrive/Final-Year-Project/DNAPositives/100921_DNA/100921_DNA/Data/gamma2.app.1e4/gamma2.app.1e4_data_export.csv"
  avg_g2 = pd.read_csv(avg_data_g2_file, header=0)

  ## Gamma 3
  avg_data_g3_file = "/content/drive/MyDrive/Final-Year-Project/DNAPositives/100921_DNA/100921_DNA/Data/gamma3.app.1e5/gamma3.app.1e5_data_export.csv"
  avg_g3 = pd.read_csv(avg_data_g3_file, header=0)
  
  ## Gamma 5 
  avg_data_g5_file = "/content/drive/MyDrive/Final-Year-Project/DNAPositives/100921_DNA/100921_DNA/Data/gamma5.app.1e4/gamma5.app.1e4_data_export.csv"
  avg_g5 = pd.read_csv(avg_data_g5_file, header=0)

  ## 22RV1.ap1
  avg_data_22rv1_ap1_file = "/content/drive/MyDrive/Final-Year-Project/DNAPositives/22RV1.ap1/22RV1.ap1_data_export.csv"
  avg_22rv1_ap1 = pd.read_csv(avg_data_22rv1_ap1_file, header=0)

  ## 22RV1.ap2
  avg_data_22rv1_ap2_file = "/content/drive/MyDrive/Final-Year-Project/DNAPositives/22RV1.ap2/22RV1.ap2_data_export.csv"
  avg_22rv1_ap2 = pd.read_csv(avg_data_22rv1_ap2_file, header=0)

  ## 22RV1y.p3
  avg_data_22rv1y_p3_file = "/content/drive/MyDrive/Final-Year-Project/DNAPositives/22Rv1y.p3/22Rv1y.p3_data_export.csv"
  avg_22rv1y_p3 = pd.read_csv(avg_data_22rv1y_p3_file, header=0)

  ## 22RV1y.p4
  avg_data_22rv1y_p4_file = "/content/drive/MyDrive/Final-Year-Project/DNAPositives/22Rv1y.p4/22Rv1y.p4_data_export.csv"
  avg_22rv1y_p4 = pd.read_csv(avg_data_22rv1y_p4_file, header=0)

  ## ARV7.p1
  # first data row is dropped and the index renumbered from 0
  avg_data_arv7_p1_file = "/content/drive/MyDrive/Final-Year-Project/DNAPositives/ARV7.p1/ARV7.p1_data_export.csv"
  avg_arv7_p1 = pd.read_csv(avg_data_arv7_p1_file, header=0).iloc[1:, :].reset_index(drop=True) # row 0 was NAN

  ## ARV7.p3
  avg_data_arv7_p3_file = "/content/drive/MyDrive/Final-Year-Project/DNAPositives/ARV7.p3/ARV7.p3_data_export.csv"
  avg_arv7_p3 = pd.read_csv(avg_data_arv7_p3_file, header=0)

  ## ARV7.p4
  avg_data_arv7_p4_file = "/content/drive/MyDrive/Final-Year-Project/DNAPositives/ARV7.p4/ARV7.p4_data_export.csv"
  avg_arv7_p4 = pd.read_csv(avg_data_arv7_p4_file, header=0)

  ## Beta 1
  avg_data_b1_file = "/content/drive/MyDrive/Final-Year-Project/DNAPositives/100921_DNA/100921_DNA/Data/beta1.app.1e4/beta1.app.1e4_data_export.csv"
  avg_b1 = pd.read_csv(avg_data_b1_file, header=0)

  ## Beta 2
  avg_data_b2_file = "/content/drive/MyDrive/Final-Year-Project/DNAPositives/100921_DNA/100921_DNA/Data/beta2.app.1e5/beta2.app.1e5_data_export.csv"
  avg_b2 = pd.read_csv(avg_data_b2_file, header=0)

  ## Beta 5
  avg_data_b5_file = "/content/drive/MyDrive/Final-Year-Project/DNAPositives/100921_DNA/100921_DNA/Data/beta5.app.1e5/beta5.app.1e5_data_export.csv"
  avg_b5 = pd.read_csv(avg_data_b5_file, header=0)
  

In [None]:
## All pixel values for each time stamp
# One header-less `*_vsChem_export.csv` per positive experiment. Only the first
# arr_rows*arr_cols columns are kept and the matching avg_* frame's
# "Time Elapsed" column becomes the row index.
# NOTE(review): assigning the index requires the avg_* frame and the vsChem export
# to have the same number of rows - confirm for ARV7.p1, whose avg frame had a row dropped.

with tf.device(gpu):
  ## Gamma 1
  g1_file = "/content/drive/MyDrive/Final-Year-Project/DNAPositives/100921_DNA/100921_DNA/Data/gamma1.app.1e5/gamma1.app.1e5_vsChem_export.csv"
  g1 = pd.read_csv(g1_file, header=None).iloc[:, :(arr_rows*arr_cols)]
  g1.index = avg_g1["Time Elapsed"]

  ## Gamma 2
  g2_file = "/content/drive/MyDrive/Final-Year-Project/DNAPositives/100921_DNA/100921_DNA/Data/gamma2.app.1e4/gamma2.app.1e4_vsChem_export.csv"
  g2 = pd.read_csv(g2_file, header=None).iloc[:, :(arr_rows*arr_cols)]
  g2.index = avg_g2["Time Elapsed"]

  ## Gamma 3
  g3_file = "/content/drive/MyDrive/Final-Year-Project/DNAPositives/100921_DNA/100921_DNA/Data/gamma3.app.1e5/gamma3.app.1e5_vsChem_export.csv"
  g3 = pd.read_csv(g3_file, header=None).iloc[:, :(arr_rows*arr_cols)]
  g3.index = avg_g3["Time Elapsed"]

  ## Gamma 5
  g5_file = "/content/drive/MyDrive/Final-Year-Project/DNAPositives/100921_DNA/100921_DNA/Data/gamma5.app.1e4/gamma5.app.1e4_vsChem_export.csv"
  g5 = pd.read_csv(g5_file, header=None).iloc[:, :(arr_rows*arr_cols)]
  g5.index = avg_g5["Time Elapsed"]

  ## 22RV1.ap1
  rv1_ap1_file = "/content/drive/MyDrive/Final-Year-Project/DNAPositives/22RV1.ap1/22RV1.ap1_vsChem_export.csv"
  rv1_ap1 = pd.read_csv(rv1_ap1_file, header=None).iloc[:, :(arr_rows*arr_cols)]
  rv1_ap1.index = avg_22rv1_ap1['Time Elapsed']

  ## 22RV1.ap2
  rv1_ap2_file = "/content/drive/MyDrive/Final-Year-Project/DNAPositives/22RV1.ap2/22RV1.ap2_vsChem_export.csv"
  rv1_ap2 = pd.read_csv(rv1_ap2_file, header=None).iloc[:, :(arr_rows*arr_cols)]
  rv1_ap2.index = avg_22rv1_ap2['Time Elapsed']

  ## 22RV1y.p3
  rv1y_p3_file = "/content/drive/MyDrive/Final-Year-Project/DNAPositives/22Rv1y.p3/22Rv1y.p3_vsChem_export.csv"
  rv1y_p3 = pd.read_csv(rv1y_p3_file, header=None).iloc[:, :(arr_rows*arr_cols)]
  rv1y_p3.index = avg_22rv1y_p3['Time Elapsed']

  ## 22RV1y.p4
  rv1y_p4_file = "/content/drive/MyDrive/Final-Year-Project/DNAPositives/22Rv1y.p4/22Rv1y.p4_vsChem_export.csv"
  rv1y_p4 = pd.read_csv(rv1y_p4_file, header=None).iloc[:, :(arr_rows*arr_cols)]
  rv1y_p4.index = avg_22rv1y_p4['Time Elapsed']

  ## ARV7.p1 
  arv7_p1_file = "/content/drive/MyDrive/Final-Year-Project/DNAPositives/ARV7.p1/ARV7.p1_vsChem_export.csv"
  arv7_p1 = pd.read_csv(arv7_p1_file, header=None).iloc[:, :(arr_rows*arr_cols)] 
  arv7_p1.index = avg_arv7_p1["Time Elapsed"]

  ## ARV7.p3 
  arv7_p3_file = "/content/drive/MyDrive/Final-Year-Project/DNAPositives/ARV7.p3/ARV7.p3_vsChem_export.csv"
  arv7_p3 = pd.read_csv(arv7_p3_file, header=None).iloc[:, :(arr_rows*arr_cols)]
  arv7_p3.index = avg_arv7_p3["Time Elapsed"]

  ## ARV7.p4 
  arv7_p4_file = "/content/drive/MyDrive/Final-Year-Project/DNAPositives/ARV7.p4/ARV7.p4_vsChem_export.csv"
  arv7_p4 = pd.read_csv(arv7_p4_file, header=None).iloc[:, :(arr_rows*arr_cols)]
  arv7_p4.index = avg_arv7_p4["Time Elapsed"]

  ## Beta 1
  b1_file = "/content/drive/MyDrive/Final-Year-Project/DNAPositives/100921_DNA/100921_DNA/Data/beta1.app.1e4/beta1.app.1e4_vsChem_export.csv"
  b1 = pd.read_csv(b1_file, header=None).iloc[:, :(arr_rows*arr_cols)]
  b1.index = avg_b1["Time Elapsed"]

  ## Beta 2
  b2_file = "/content/drive/MyDrive/Final-Year-Project/DNAPositives/100921_DNA/100921_DNA/Data/beta2.app.1e5/beta2.app.1e5_vsChem_export.csv"
  b2 = pd.read_csv(b2_file, header=None).iloc[:, :(arr_rows*arr_cols)]
  b2.index = avg_b2["Time Elapsed"]

  ## Beta 5
  b5_file = "/content/drive/MyDrive/Final-Year-Project/DNAPositives/100921_DNA/100921_DNA/Data/beta5.app.1e5/beta5.app.1e5_vsChem_export.csv"
  b5 = pd.read_csv(b5_file, header=None).iloc[:, :(arr_rows*arr_cols)]
  b5.index = avg_b5["Time Elapsed"]



#### Negative Samples

In [None]:
## Average pixel value for all samples 
# One `*_data_export.csv` per negative experiment, read with the first row as header.
# The pixel-loading cell uses the "Time Elapsed" column of these frames as time index.
# NOTE(review): pandas.read_csv runs on the CPU, so the tf.device scope presumably
# has no effect on these reads - confirm before removing it.

with tf.device(gpu):  
  ## ARV7.n1
  avg_data_arv7_file = "/content/drive/MyDrive/Final-Year-Project/DNANegatives/ARV7.n1/ARV7.n1_data_export.csv"
  avg_arv7 = pd.read_csv(avg_data_arv7_file, header=0)

  ## Yap.n2
  avg_data_yap_file = "/content/drive/MyDrive/Final-Year-Project/DNANegatives/yap.n2/yap.n2_data_export.csv"
  avg_yap = pd.read_csv(avg_data_yap_file, header=0)

  ## Yap1.n2
  # drop=True so the old row labels are discarded instead of being kept as an
  # extra "index" column (same handling as ARV7.p1 in the positive samples)
  avg_data_yap1_file = "/content/drive/MyDrive/Final-Year-Project/DNANegatives/yap1.n2/yap1.n2_data_export.csv"
  avg_yap1 = pd.read_csv(avg_data_yap1_file, header=0).iloc[1:, :].reset_index(drop=True) # row 0 was NAN

  ## Yap1.n1.1 
  avg_data_yap1n1_file = "/content/drive/MyDrive/Final-Year-Project/DNANegatives/yap1.n1.1/yap1.n1.1_data_export.csv"
  avg_yap1n1 = pd.read_csv(avg_data_yap1n1_file, header=0).iloc[1:, :].reset_index(drop=True) # row 0 was NAN

  ## ARV7.n2
  avg_data_arv72_file = "/content/drive/MyDrive/Final-Year-Project/DNANegatives/ARV7.n2/ARV7.n2_data_export.csv"
  avg_arv72 = pd.read_csv(avg_data_arv72_file, header=0)

  ## ARV7.n3
  avg_data_arv73_file = "/content/drive/MyDrive/Final-Year-Project/DNANegatives/ARV7.n3/ARV7.n3_data_export.csv"
  avg_arv73 = pd.read_csv(avg_data_arv73_file, header=0)

  ## DU145y.n1
  avg_data_du145y_n1_file = "/content/drive/MyDrive/Final-Year-Project/DNANegatives/DU145y.n1/DU145y.n1_data_export.csv"
  avg_du145y_n1 = pd.read_csv(avg_data_du145y_n1_file, header=0)

In [None]:
## All pixel values for each time stamp
# One header-less `*_vsChem_export.csv` per negative experiment. Only the first
# arr_rows*arr_cols columns are kept and the matching avg_* frame's
# "Time Elapsed" column becomes the row index.
# NOTE(review): assigning the index requires the avg_* frame and the vsChem export
# to have the same number of rows - confirm for yap1 and yap1n1, whose avg frames had a row dropped.

with tf.device(gpu):   
  ## ARV7.n1 
  arv7_file = "/content/drive/MyDrive/Final-Year-Project/DNANegatives/ARV7.n1/ARV7.n1_vsChem_export.csv"
  arv7 = pd.read_csv(arv7_file, header=None).iloc[:, :(arr_rows*arr_cols)]
  arv7.index = avg_arv7["Time Elapsed"]

  ## Yap.n2
  yap_file = "/content/drive/MyDrive/Final-Year-Project/DNANegatives/yap.n2/yap.n2_vsChem_export.csv"
  yap = pd.read_csv(yap_file, header=None).iloc[:, :(arr_rows*arr_cols)]
  yap.index = avg_yap["Time Elapsed"]

  ## Yap1.n2
  yap1_file = "/content/drive/MyDrive/Final-Year-Project/DNANegatives/yap1.n2/yap1.n2_vsChem_export.csv"
  yap1 = pd.read_csv(yap1_file, header=None).iloc[:, :(arr_rows*arr_cols)]
  yap1.index = avg_yap1["Time Elapsed"]

  ## Yap1.n1.1
  yap1n1_file = "/content/drive/MyDrive/Final-Year-Project/DNANegatives/yap1.n1.1/yap1.n1.1_vsChem_export.csv"
  yap1n1 = pd.read_csv(yap1n1_file, header=None).iloc[:, :(arr_rows*arr_cols)]
  yap1n1.index = avg_yap1n1["Time Elapsed"]

  ## ARV7.n2
  arv72_file = "/content/drive/MyDrive/Final-Year-Project/DNANegatives/ARV7.n2/ARV7.n2_vsChem_export.csv"
  arv72 = pd.read_csv(arv72_file, header=None).iloc[:, :(arr_rows*arr_cols)]
  arv72.index = avg_arv72["Time Elapsed"]

  ## ARV7.n3
  arv73_file = "/content/drive/MyDrive/Final-Year-Project/DNANegatives/ARV7.n3/ARV7.n3_vsChem_export.csv"
  arv73 = pd.read_csv(arv73_file, header=None).iloc[:, :(arr_rows*arr_cols)]
  arv73.index = avg_arv73["Time Elapsed"]

  ## DU145y.n1
  du145y_n1_file = "/content/drive/MyDrive/Final-Year-Project/DNANegatives/DU145y.n1/DU145y.n1_vsChem_export.csv"
  du145y_n1 = pd.read_csv(du145y_n1_file, header=None).iloc[:, :(arr_rows*arr_cols)]
  du145y_n1.index = avg_du145y_n1["Time Elapsed"]

#### Covid Partial Data

In [None]:
# Each experiment is loaded through load_partial_covid_exp, which swaps the last
# four characters of the summary path for "_bot.csv" (returned first, stored as
# *_pos) and "_top.csv" (returned second, stored as *_neg).

## 150520_2_118
avg_118_file = "/content/drive/MyDrive/Final-Year-Project/COVIDPartialData/150520_2_118/exp_summary_118.csv"
exp_118_pos, exp_118_neg = load_partial_covid_exp(avg_118_file)

## 150520_4_2_86
avg_86_file = "/content/drive/MyDrive/Final-Year-Project/COVIDPartialData/150520_4_2_86/exp_summary_86.csv"
exp_86_pos, exp_86_neg = load_partial_covid_exp(avg_86_file)

## 150520_5_129
avg_129_file = "/content/drive/MyDrive/Final-Year-Project/COVIDPartialData/150520_5_129/exp_summary_129.csv"
exp_129_pos, exp_129_neg = load_partial_covid_exp(avg_129_file)

## 180520_4_165
avg_165_file = "/content/drive/MyDrive/Final-Year-Project/COVIDPartialData/180520_4_165/exp_summary_165.csv"
exp_165_pos, exp_165_neg = load_partial_covid_exp(avg_165_file)

## 180520_6_35
avg_35_file = "/content/drive/MyDrive/Final-Year-Project/COVIDPartialData/180520_6_35/exp_summary_35.csv"
exp_35_pos, exp_35_neg = load_partial_covid_exp(avg_35_file)

## 190520_1_28
avg_28_file = "/content/drive/MyDrive/Final-Year-Project/COVIDPartialData/190520_1_28/exp_summary_28.csv"
exp_28_pos, exp_28_neg = load_partial_covid_exp(avg_28_file) 

## 190520_2_14
avg_14_file = "/content/drive/MyDrive/Final-Year-Project/COVIDPartialData/190520_2_14/exp_summary_14.csv"
exp_14_pos, exp_14_neg = load_partial_covid_exp(avg_14_file)

## 210520_2_40
avg_40_file = "/content/drive/MyDrive/Final-Year-Project/COVIDPartialData/210520_2_40/exp_summary_40.csv"
exp_40_pos, exp_40_neg = load_partial_covid_exp(avg_40_file)

## 210520_3_88
avg_88_file = "/content/drive/MyDrive/Final-Year-Project/COVIDPartialData/210520_3_88/exp_summary_88.csv"
exp_88_pos, exp_88_neg = load_partial_covid_exp(avg_88_file)

## 210520_6_27
avg_27_file = "/content/drive/MyDrive/Final-Year-Project/COVIDPartialData/210520_6_27/exp_summary_27.csv"
exp_27_pos, exp_27_neg = load_partial_covid_exp(avg_27_file)

## 250520_1_134
avg_134_file = "/content/drive/MyDrive/Final-Year-Project/COVIDPartialData/250520_1_134/exp_summary_134.csv"
exp_134_pos, exp_134_neg = load_partial_covid_exp(avg_134_file)

## 250520_2_97
avg_97_file = "/content/drive/MyDrive/Final-Year-Project/COVIDPartialData/250520_2_97/exp_summary_97.csv"
exp_97_pos, exp_97_neg = load_partial_covid_exp(avg_97_file)

## 250520_6_2D1
avg_2d1_file = "/content/drive/MyDrive/Final-Year-Project/COVIDPartialData/250520_6_2D1/exp_summary_2D1.csv"
exp_2d1_pos, exp_2d1_neg = load_partial_covid_exp(avg_2d1_file)

## 250520_7_64
avg_64_file = "/content/drive/MyDrive/Final-Year-Project/COVIDPartialData/250520_7_64/exp_summary_64.csv"
exp_64_pos, exp_64_neg = load_partial_covid_exp(avg_64_file)

### Preprocessing

#### Positive Samples

In [None]:
# Pre-process every positive experiment with an initial derivative threshold of 500.
# Each name is overwritten with the transposed (pixels x time) result, so this cell
# is not idempotent: re-running it without reloading the raw frames feeds already
# processed data back into preprocess_data.
g1 = preprocess_data(g1, 500)
g2 = preprocess_data(g2, 500)
g3 = preprocess_data(g3, 500)
g5 = preprocess_data(g5, 500)
rv1_ap1 = preprocess_data(rv1_ap1, 500)
rv1_ap2 = preprocess_data(rv1_ap2, 500)
rv1y_p3 = preprocess_data(rv1y_p3, 500)
rv1y_p4 = preprocess_data(rv1y_p4, 500)
arv7_p1 = preprocess_data(arv7_p1, 500)
arv7_p3 = preprocess_data(arv7_p3, 500)
arv7_p4 = preprocess_data(arv7_p4, 500)
b1 = preprocess_data(b1, 500)
b2 = preprocess_data(b2, 500)
b5 = preprocess_data(b5, 500)

#### Negative Samples

In [None]:
# Pre-process every negative experiment with an initial derivative threshold of 500.
# Each name is overwritten with the transposed (pixels x time) result, so this cell
# is not idempotent: reload the raw frames before re-running it.
arv7 = preprocess_data(arv7, 500)
yap = preprocess_data(yap, 500)
yap1 = preprocess_data(yap1, 500)
yap1n1 = preprocess_data(yap1n1, 500)
arv72 = preprocess_data(arv72, 500)
arv73 = preprocess_data(arv73, 500)
du145y_n1 = preprocess_data(du145y_n1, 500)

#### Covid Partial Data

In [None]:
# Pre-process the positive half of every partial COVID experiment (derivative threshold 500).
# Names are overwritten with the transposed (pixels x time) result; reload the raw
# frames before re-running this cell.
exp_118_pos = preprocess_partial_data(exp_118_pos, 500)
exp_86_pos = preprocess_partial_data(exp_86_pos, 500)
exp_129_pos = preprocess_partial_data(exp_129_pos, 500)
exp_165_pos = preprocess_partial_data(exp_165_pos, 500)
exp_35_pos = preprocess_partial_data(exp_35_pos, 500)
exp_28_pos = preprocess_partial_data(exp_28_pos, 500)
exp_14_pos = preprocess_partial_data(exp_14_pos, 500)
exp_40_pos = preprocess_partial_data(exp_40_pos, 500)
exp_88_pos = preprocess_partial_data(exp_88_pos, 500)
exp_27_pos = preprocess_partial_data(exp_27_pos, 500)
exp_134_pos = preprocess_partial_data(exp_134_pos, 500)
exp_97_pos = preprocess_partial_data(exp_97_pos, 500)
exp_2d1_pos = preprocess_partial_data(exp_2d1_pos, 500)
exp_64_pos = preprocess_partial_data(exp_64_pos, 500)

In [None]:
# Pre-process the negative half of every partial COVID experiment (derivative threshold 500).
# Names are overwritten with the transposed (pixels x time) result; reload the raw
# frames before re-running this cell.
exp_118_neg = preprocess_partial_data(exp_118_neg, 500)
exp_86_neg = preprocess_partial_data(exp_86_neg, 500)
exp_129_neg = preprocess_partial_data(exp_129_neg, 500)
exp_165_neg = preprocess_partial_data(exp_165_neg, 500)
exp_35_neg = preprocess_partial_data(exp_35_neg, 500)
exp_28_neg = preprocess_partial_data(exp_28_neg, 500)
exp_14_neg = preprocess_partial_data(exp_14_neg, 500)
exp_40_neg = preprocess_partial_data(exp_40_neg, 500)
exp_88_neg = preprocess_partial_data(exp_88_neg, 500)
exp_27_neg = preprocess_partial_data(exp_27_neg, 500)
exp_134_neg = preprocess_partial_data(exp_134_neg, 500)
exp_97_neg = preprocess_partial_data(exp_97_neg, 500)
exp_2d1_neg = preprocess_partial_data(exp_2d1_neg, 500)
exp_64_neg = preprocess_partial_data(exp_64_neg, 500)

### Machine Learning - Neural Network Ensemble

#### Helper Functions

In [None]:
def get_training_data(positive_samples, negative_samples, timestamp, test_samples=()):
  """ Builds the training matrix and labels from the sample dictionaries
  Every sample that is not held out contributes all of its rows, truncated to
  the first `timestamp` columns. Rows of positive samples come first (label 1),
  followed by the rows of negative samples (label 0).
  Parameters
  ----------
  positive_samples : dict
      maps a sample name to a DataFrame (rows = curves, columns = time samples)
  negative_samples : dict
      same structure, for the negative class
  timestamp : int
      number of leading columns kept from every curve
  test_samples : collection of str, optional
      names of the samples to leave out of the training set. Default is empty
  Returns
  -------
  tuple
      - training_data : 2D ndarray with one row per curve (empty 1D array when
        no sample was selected)
      - training_labels : 1D ndarray of 1.0 / 0.0 labels, aligned with the rows
  """
  data_chunks = []
  label_chunks = []

  ## positives first, then negatives, so labels line up with the stacked rows
  for label, samples in ((1.0, positive_samples), (0.0, negative_samples)):
    for key, sample in samples.items():

      ## if dataset is test data do not add to training set
      if key in test_samples:
        continue

      ## truncate sample to length t = timestamp (keep all rows and truncate columns)
      data_chunks.append(sample.to_numpy()[:, 0:timestamp])
      label_chunks.append(np.full(len(sample), label))

  if not data_chunks:
    return np.asarray([]), np.asarray([])

  ## a single concatenation avoids re-copying the growing array for every sample
  ## and also works when no positive sample is part of the training set
  return np.concatenate(data_chunks), np.concatenate(label_chunks)

In [None]:
def get_test_data(sample, timestamp):
  """ Returns the first `timestamp` columns of every row of `sample` as an ndarray """
  head = slice(0, timestamp)
  values = sample.to_numpy()
  return np.asarray(values[:, head])

#### Training Data

In [None]:
# Name -> pre-processed frame for every positive sample. The keys are the names
# compared against `test_samples` in get_training_data to hold a sample out.
positives = {"exp_118_pos":exp_118_pos, "exp_86_pos":exp_86_pos,"exp_129_pos":exp_129_pos, "exp_165_pos":exp_165_pos, 
             "exp_35_pos":exp_35_pos, "exp_28_pos":exp_28_pos, "exp_14_pos":exp_14_pos, "exp_40_pos":exp_40_pos, 
             "exp_88_pos":exp_88_pos, "exp_27_pos":exp_27_pos, 
             "exp_134_pos":exp_134_pos, "exp_97_pos":exp_97_pos, "exp_2d1_pos":exp_2d1_pos, "exp_64_pos":exp_64_pos, 
             "g1":g1, "g2":g2, "g3":g3, "g5":g5, "rv1_ap1":rv1_ap1, "rv1_ap2":rv1_ap2,  
             "arv7_p3":arv7_p3,"rv1y_p3":rv1y_p3, "rv1y_p4":rv1y_p4, 
             "arv7_p1":arv7_p1, "arv7_p4":arv7_p4, "b1":b1, "b2":b2, "b5":b5}

# Name -> pre-processed frame for every negative sample.
negatives = {"exp_118_neg":exp_118_neg, "exp_86_neg":exp_86_neg, "exp_129_neg":exp_129_neg, "exp_165_neg":exp_165_neg, 
             "exp_35_neg":exp_35_neg, "exp_28_neg":exp_28_neg, "exp_14_neg":exp_14_neg, "exp_40_neg":exp_40_neg, 
             "exp_88_neg":exp_88_neg, "exp_27_neg":exp_27_neg, "exp_134_neg":exp_134_neg, "exp_97_neg":exp_97_neg, 
             "exp_2d1_neg":exp_2d1_neg, "exp_64_neg":exp_64_neg, "yap":yap, "yap1":yap1, "yap1n1":yap1n1, "arv72":arv72, 
             "arv73":arv73, "du145y_n1":du145y_n1, "arv7":arv7}

#### Model Specs

In [None]:
# g1 is the transposed output of preprocess_data (pixels x time), so its
# columns are the time samples of a curve.
number_of_samples = len(g1.columns)
number_of_classifiers = 20

# Curve lengths seen by the classifiers: timestep, 2*timestep, ... Classifier i
# is trained on the first timestamps[i] samples of every curve.
# NOTE(review): the list has exactly number_of_classifiers entries only when
# number_of_samples is a multiple of number_of_classifiers - confirm for other data.
timestep = int(number_of_samples/number_of_classifiers)
timestamps = [*range(timestep, number_of_samples+timestep, timestep)]

# Training hyper-parameters passed to create_ensemble.
batch_size = 100
epochs = 15
loss_function = 'binary_crossentropy'
optimiser = 'adam'

In [None]:
# Show the curve lengths the classifiers will be trained on.
print(timestamps)

[20, 40, 60, 80, 100, 120, 140, 160, 180, 200, 220, 240, 260, 280, 300, 320, 340, 360, 380, 400]


#### Creating Ensemble

In [None]:
def create_ensemble(number_of_classifiers, batch_size, epochs, loss_function, optimiser, timestamps, test_samples, positives, negatives):
  """ Builds and trains one dense binary classifier per time window
  Parameters
  ----------
  number_of_classifiers : int
      number of networks to build; network i uses timestamps[i]
  batch_size : int
      batch size passed to fit()
  epochs : int
      maximum number of epochs passed to fit() (early stopping may end sooner)
  loss_function : str
      loss passed to compile()
  optimiser : str
      optimizer passed to compile()
  timestamps : list of int
      input width of each network (number of leading time points it sees)
  test_samples : str
      name of the held-out sample; forwarded to get_training_data wrapped in a list
      (presumably so that sample is left out of the training set -- confirm in get_training_data)
  positives : dict
      positive-class samples keyed by name
  negatives : dict
      negative-class samples keyed by name
  Returns
  -------
  list
      the trained models, in the same order as `timestamps`
  """
  ## widths of the hidden layers that follow the 32-unit input layer
  hidden_widths = (64, 128, 512, 1024, 2048)

  neural_nets = []

  for i in range(number_of_classifiers):
    window = timestamps[i]

    ## make model: 32 -> 64 -> 128 -> 512 -> 1024 -> 2048 -> 1 (sigmoid)
    model = Sequential()
    model.add(Dense(32, activation='relu', input_dim = window))
    for width in hidden_widths:
      model.add(Dense(width, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    ## compile model
    model.compile(loss=loss_function, optimizer=optimiser, metrics=['accuracy'])

    ## training data truncated to this network's time window
    training_data, training_label = get_training_data(positive_samples=positives, negative_samples=negatives, timestamp=window, test_samples=[test_samples])

    ## train model; stop once the training loss has not improved for 3 epochs
    callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3, restore_best_weights=True)
    model.fit(training_data, training_label,  batch_size=batch_size, epochs=epochs, shuffle=True, callbacks=[callback], verbose=0)

    neural_nets.append(model)

  return neural_nets

#### Evaluating Ensemble

In [None]:
def get_predictions(number_of_classifiers, ensemble, test_sample, timestamps):
  """ Classifies one sample by majority vote over classifiers, then over pixels
  Parameters
  ----------
  number_of_classifiers : int
      number of networks to query; must be at least 1
  ensemble : list
      trained models; ensemble[i] is given the first timestamps[i] time points
  test_sample : object exposing .to_numpy()
      sample to classify (one row per pixel is assumed -- TODO confirm)
  timestamps : list of int
      input width of each network
  Returns
  -------
  float
      0.0 or 1.0, the label predicted for the majority of pixels
  Raises
  ------
  ValueError
      if number_of_classifiers is smaller than 1
  """
  if number_of_classifiers < 1:
    raise ValueError("number_of_classifiers must be at least 1")

  per_net_preds = []

  for i in range(number_of_classifiers):
    ## create test data that will be predicted by each neural net (will be for every pixel)
    test_data = get_test_data(test_sample, timestamps[i])

    ## make prediction for every pixel
    ## (use the `ensemble` argument, not a notebook-level global, so the function
    ## does not depend on hidden kernel state)
    per_net_preds.append(ensemble[i].predict(test_data))

  ## array of predictions for each pixel (x axis is the ith neural net, y axis is pixel);
  ## each predict() output is expected to be a single column
  pred = np.concatenate(per_net_preds, axis = 1)

  ## round to 0/1; numpy rounds halves to even, so exactly 0.5 becomes 0
  pred = pred.round()

  ## for each pixel take the label most networks voted for
  ## (on a tie, Counter returns the label that was seen first, i.e. the earliest network's vote)
  final_preds = [Counter(pixel_pred).most_common(1)[0][0] for pixel_pred in pred]

  ## make final prediction for the sample based on majority pixel prediction
  final_prediction = Counter(final_preds).most_common(1)[0][0]

  return final_prediction

In [None]:
## merge both classes into one dict: negatives first, then positives
all_samples = {**negatives, **positives}

## true label per sample name (0 = negative, 1 = positive); labels are assigned by
## position, which relies on the negatives-then-positives order used above
keys = list(all_samples.keys())
true_labels = [*np.zeros(len(negatives)), *np.ones(len(positives))]
true_label_dict = dict(zip(keys, true_labels))

In [None]:
## sample name -> (predicted label, true label)
final_classifications = {}

## Evaluation loop: for every sample a fresh ensemble is trained with that sample's
## name passed as the test sample, and the sample is then classified by that ensemble.
## This retrains number_of_classifiers networks per sample, so the cell is slow.
with tf.device(gpu):
  for key, value in all_samples.items():
    test_sample_name = key
    test_sample = value
    print(f"Testing sample: {test_sample_name}...")

    en = create_ensemble(number_of_classifiers, batch_size, epochs, loss_function, optimiser, timestamps, test_sample_name, positives, negatives)
    sample_classification = get_predictions(number_of_classifiers, en, test_sample, timestamps)

    final_classifications[key] = (sample_classification, true_label_dict[key])
    print(f"Predicted Label: {sample_classification} \t True Label: {true_label_dict[key]} \t Correct?: {sample_classification ==true_label_dict[key]} \n")

Testing sample: exp_118_neg...
Predicted Label: 1.0 	 True Label: 0.0 	 Correct?: False 

Testing sample: exp_86_neg...
Predicted Label: 0.0 	 True Label: 0.0 	 Correct?: True 

Testing sample: exp_129_neg...
Predicted Label: 0.0 	 True Label: 0.0 	 Correct?: True 

Testing sample: exp_165_neg...
Predicted Label: 0.0 	 True Label: 0.0 	 Correct?: True 

Testing sample: exp_35_neg...
Predicted Label: 1.0 	 True Label: 0.0 	 Correct?: False 

Testing sample: exp_28_neg...
Predicted Label: 0.0 	 True Label: 0.0 	 Correct?: True 

Testing sample: exp_14_neg...
Predicted Label: 1.0 	 True Label: 0.0 	 Correct?: False 

Testing sample: exp_40_neg...
Predicted Label: 1.0 	 True Label: 0.0 	 Correct?: False 

Testing sample: exp_88_neg...
Predicted Label: 0.0 	 True Label: 0.0 	 Correct?: True 

Testing sample: exp_27_neg...
Predicted Label: 1.0 	 True Label: 0.0 	 Correct?: False 

Testing sample: exp_134_neg...
Predicted Label: 0.0 	 True Label: 0.0 	 Correct?: True 

Testing sample: exp_97_

In [None]:
## display sample name -> (predicted label, true label)
final_classifications

{'arv7': (0.0, 0.0),
 'arv72': (1.0, 0.0),
 'arv73': (0.0, 0.0),
 'arv7_p1': (1.0, 1.0),
 'arv7_p3': (0.0, 1.0),
 'arv7_p4': (1.0, 1.0),
 'du145a_p1': (1.0, 1.0),
 'du145a_p2': (1.0, 1.0),
 'du145a_p3': (1.0, 1.0),
 'du145y_n1': (1.0, 0.0),
 'exp_118_neg': (1.0, 0.0),
 'exp_118_pos': (0.0, 1.0),
 'exp_129_neg': (0.0, 0.0),
 'exp_129_pos': (0.0, 1.0),
 'exp_134_neg': (0.0, 0.0),
 'exp_134_pos': (0.0, 1.0),
 'exp_14_neg': (1.0, 0.0),
 'exp_14_pos': (0.0, 1.0),
 'exp_165_neg': (0.0, 0.0),
 'exp_165_pos': (0.0, 1.0),
 'exp_27_neg': (1.0, 0.0),
 'exp_27_pos': (1.0, 1.0),
 'exp_28_neg': (0.0, 0.0),
 'exp_28_pos': (0.0, 1.0),
 'exp_2d1_neg': (1.0, 0.0),
 'exp_2d1_pos': (0.0, 1.0),
 'exp_35_neg': (1.0, 0.0),
 'exp_35_pos': (0.0, 1.0),
 'exp_40_neg': (1.0, 0.0),
 'exp_40_pos': (0.0, 1.0),
 'exp_64_neg': (1.0, 0.0),
 'exp_64_pos': (0.0, 1.0),
 'exp_86_neg': (0.0, 0.0),
 'exp_86_pos': (1.0, 1.0),
 'exp_88_neg': (0.0, 0.0),
 'exp_88_pos': (1.0, 1.0),
 'exp_97_neg': (0.0, 0.0),
 'exp_97_pos': (0.0,

In [None]:
## Summary metrics over all evaluated samples. The metric helpers are defined
## elsewhere in the notebook; judging by the recorded output they return percentages.
print(f"Accuracy: {accuracy(final_classifications)}")
print(f"Sensitivity/Recall: {sensitivity(final_classifications)}")
print(f"Specificity: {specificity(final_classifications)}")
print(f"Precision: {precision(final_classifications)}")
print(f"F1 Score: {f1(final_classifications)}")

Accuracy: 51.06382978723404
Sensitivity/Recall: 50.0
Specificity: 52.38095238095239
Precision: 56.52173913043478
F1 Score: 53.06122448979592


### Github Commands

In [1]:
## show the current working directory of the Colab runtime
!pwd

/content


In [None]:
username = "adityag16"
git_token = "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"
repository = "/content/drive/MyDrive/Final-Year-Project"
%cd {repository}
!git status

/content/drive/MyDrive/Final-Year-Project


In [None]:
## stage this notebook only, then show what will be committed
!git add "Early Time Series Classification - Pixel Data NN.ipynb"
!git status

In [None]:
## set the commit identity; --global writes to the runtime's global git config
!git config --global user.email "aditya.gupta18@imperial.ac.uk"
!git config --global user.name "adityag16"

## commit the staged notebook and push it to the main branch of origin
!git commit -m "removed some data samples"
!git push origin main