<a href="https://colab.research.google.com/github/Yiyuan80/MP/blob/main/data_filtering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; later cells read and write files under
# /content/drive/MyDrive/mesa.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import random
import os
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
from matplotlib.pyplot import MultipleLocator
import tensorflow as tf
from tensorflow.keras import datasets, layers, models
from tensorflow.keras.layers import Dense, Flatten, Conv2D, MaxPool2D, MaxPool3D, Dropout, AveragePooling3D, GlobalAveragePooling2D, GlobalMaxPool2D
from tensorflow.keras import Model
from tensorflow import keras
import time
import itertools
from itertools import combinations

In [None]:
def filter_name(filedir):
  """Filter the recordings in ``filedir`` and export wake/activity arrays.

  Every ``.csv`` file in ``filedir`` is processed as follows:

  1. Rows whose ``interval`` equals ``'EXCLUDED'`` are counted per
     ``daybymidnight``; days with more than 30 such rows are dropped.
     Day ``1`` and the last day listed in the file are always dropped as well
     (presumably because they are partial days -- confirm with the data spec).
  2. Remaining gaps are forward-filled.
  3. The file is kept only if ``detect_consecutive`` reports at least 5
     consecutive days among the remaining days.

  For each kept file the ``wake`` and ``activity`` columns are left-padded
  with NaN up to the epoch of the first remaining ``linetime`` value,
  right-padded with NaN to a common length that is a multiple of 2880, and
  reshaped to rows of 2880 values.

  Side effects:
    * writes the full path of every kept file, one per line, to
      ``/content/drive/MyDrive/mesa/filtered_id.txt``;
    * saves ``wake.npy`` (bool) and ``activity.npy`` (float32) in the current
      working directory.

  Relies on the notebook-level helpers ``detect_consecutive``,
  ``time_to_epoch`` and ``nanpad``.

  Args:
    filedir: Directory that contains the actigraphy ``.csv`` files.

  Returns:
    None.
  """
  filenames = [f for f in os.listdir(filedir) if f.endswith('.csv')]

  kept_paths = []
  all_wake = []
  all_act = []
  max_length = 0

  for filename in filenames:

    print('Parsing', filename)
    path = os.path.join(filedir, filename)
    df = pd.read_csv(path)
    last_day = df['daybymidnight'].unique()[-1]

    # Count EXCLUDED rows per day; days with more than 30 of them are dropped,
    # days with 30 or fewer are kept and imputed by the forward fill below.
    excluded_rows = df.loc[df['interval'] == 'EXCLUDED']
    excluded_per_day = excluded_rows.groupby('daybymidnight')['interval'].count()
    days_to_drop = list(excluded_per_day[excluded_per_day > 30].index)
    days_to_drop.append(1)
    days_to_drop.append(last_day)
    df_excluded = df[~df['daybymidnight'].isin(days_to_drop)]

    # DataFrame.ffill() replaces fillna(method='ffill'), which is deprecated
    # in recent pandas releases; the result is the same.
    df_imputed = df_excluded.ffill()

    # Keep only recordings with at least 5 consecutive remaining days.
    days = df_excluded['daybymidnight'].unique()
    if not detect_consecutive(days, 5):
      continue

    kept_paths.append(path)

    # Left-pad with NaN so that index 0 corresponds to epoch 0 of the first day.
    start_epoch = time_to_epoch(df_imputed['linetime'].values[0])
    start_pad = np.array([np.nan] * start_epoch)

    wake = np.concatenate([start_pad, df_imputed['wake'].values])
    act = np.concatenate([start_pad, df_imputed['activity'].values])

    max_length = max(max_length, len(wake))

    # NOTE: casting to bool turns the NaN padding into True.
    all_wake.append(wake.astype('bool'))
    all_act.append(act)

  # Save the paths of the kept files, one per line.
  with open("/content/drive/MyDrive/mesa/filtered_id.txt", "w") as id_file:
    for kept_path in kept_paths:
      id_file.write(kept_path + '\n')

  # Round the common length up to a whole number of 2880-epoch rows.
  if (max_length % 2880) > 0:
    max_length += 2880 - (max_length % 2880)

  all_wake = np.stack([
      nanpad(w, max_length).reshape(-1, 2880).astype('bool')
      for w in all_wake])
  all_act = np.stack([
      nanpad(a, max_length).reshape(-1, 2880).astype('float32')
      for a in all_act])

  print('Creating .npy files for wake and physical activity in the current directory.')

  np.save('wake.npy', all_wake)
  np.save('activity.npy', all_act)


def nanpad(arr, l):
    """Right-pad ``arr`` with NaN so that it holds ``l`` elements.

    If ``arr`` already has ``l`` or more elements nothing is appended. The
    result always goes through ``np.concatenate`` with a float NaN block, so
    a boolean or integer input comes back as a float array.

    Args:
        arr: 1-D array to pad.
        l: Target length.

    Returns:
        A new array made of ``arr`` followed by the NaN padding.
    """
    missing = max(l - len(arr), 0)
    return np.concatenate([arr, np.full(missing, np.nan)])


def time_to_epoch(timestring):
    """Convert a colon-separated ``hour:minute:second`` string to an epoch index.

    The index counts 30-second steps: 120 per hour, 2 per minute, and one per
    full 30 seconds (the fractional part is truncated).

    Args:
        timestring: Text made of exactly three integer fields separated by ':'.

    Returns:
        The epoch index as an int.

    Raises:
        ValueError: If the string does not consist of three integer fields.
    """
    parts = [int(piece) for piece in timestring.split(':')]
    hour, minute, second = parts
    epoch = hour * 120 + minute * 2 + second / 30
    return int(epoch)

In [None]:
def detect_consecutive(days, n):
  """Check whether ``days`` contains a run of at least ``n`` consecutive days.

  Two neighbouring entries are consecutive when the later one is exactly one
  greater than the earlier one. Only such +1 steps extend a run; any other
  step starts a new run. (Counting runs of equal non-+1 steps as well would
  make e.g. ``[1, 3, 5, 7, 9]`` look like five consecutive days.)

  Args:
    days: 1-D sequence of numeric day indices, assumed to be in ascending
      order (as produced by ``unique()`` on an ordered day column).
    n: Required run length in days.

  Returns:
    True if the longest run of consecutive days has length >= ``n``,
    otherwise False. An empty ``days`` always yields False.
  """
  days = np.asarray(days)
  if days.size == 0:
    return False

  longest = 1
  current = 1
  for is_next_day in (np.diff(days) == 1):
    # A +1 step extends the current run; anything else restarts it.
    current = current + 1 if is_next_day else 1
    longest = max(longest, current)

  return longest >= n

In [None]:
def select_consecutive(days, n=5):
  """Find the first day that starts a run of ``n`` consecutive days.

  Args:
    days: Iterable of numeric day indices belonging to one participant.
    n: Length of the required run in days. Defaults to 5.

  Returns:
    The first element ``d`` of ``days`` (in iteration order) for which
    ``d + 1`` ... ``d + n - 1`` are all present in ``days``, or None when no
    such day exists.
  """
  # Build the lookup set once instead of scanning ``days`` for every offset.
  available = set(days)
  for day in days:
    if all(day + offset in available for offset in range(1, n)):
      return day
  return None

In [None]:
def pos_combination(wake):
  """Generate all positive combinations and save them to disk.

  For each entry along the first axis of ``wake``, the days whose row sum
  differs from 2880 are collected (a sum of 2880 presumably marks a day that
  is entirely padding -- confirm against the code that builds ``wake``).
  Every unordered pair of those days, leaving out the last two, yields one
  sample: two 3-day windows starting at the paired days, stacked together.

  All samples are stacked into one array and written to
  ``/content/drive/MyDrive/mesa/saved_data/positive.npy``.

  Args:
    wake: 3-D array indexed as ``wake[participant, day, epoch]``.

  Returns:
    None.
  """
  n_participants = len(wake[:, 1, 1])
  samples = []

  for idx in range(n_participants):
    epochs_per_day = tf.reduce_sum(wake[idx, :, :], axis=1)
    usable_days = np.where(epochs_per_day != 2880)[0]

    # The last two usable days are never used as window starts.
    for first, second in combinations(usable_days[0:-2], 2):
      window_a = wake[idx, first:first + 3, :]
      window_b = wake[idx, second:second + 3, :]
      samples.append(np.stack([window_a, window_b]))

  np.save('/content/drive/MyDrive/mesa/saved_data/positive.npy', np.stack(samples))

In [None]:
def main():
    """Build ``wake.npy`` and ``activity.npy`` from the files in ``id_list``.

    ``id_list`` is read from the notebook's global namespace and must hold the
    paths of the ``.csv`` files to parse; it is not defined in this cell.

    For each file the ``wake`` and ``activity`` columns are left-padded with
    NaN up to the epoch of the first ``linetime`` value, right-padded with NaN
    to a common length that is a multiple of 2880, and reshaped to rows of
    2880 values. The stacked results are saved in the current working
    directory as ``wake.npy`` (bool) and ``activity.npy`` (float32).

    Relies on the notebook-level helpers ``time_to_epoch`` and ``nanpad``.

    Returns:
        None.
    """
    filenames = id_list

    all_wake = []
    all_act = []
    max_length = 0

    for path in filenames:

        print('Parsing', path)
        df = pd.read_csv(path)

        # Left-pad with NaN so that index 0 corresponds to epoch 0 of the first day.
        start_epoch = time_to_epoch(df['linetime'].values[0])
        start_pad = np.array([np.nan] * start_epoch)

        wake = np.concatenate([start_pad, df['wake'].values])
        act = np.concatenate([start_pad, df['activity'].values])

        max_length = max(max_length, len(wake))

        # NOTE: casting to bool turns the NaN padding into True.
        all_wake.append(wake.astype('bool'))
        all_act.append(act)

    # Round the common length up to a whole number of 2880-epoch rows.
    if (max_length % 2880) > 0:
        max_length += 2880 - (max_length % 2880)

    all_wake = np.stack([
        nanpad(w, max_length).reshape(-1, 2880).astype('bool')
        for w in all_wake])
    all_act = np.stack([
        nanpad(a, max_length).reshape(-1, 2880).astype('float32')
        for a in all_act])

    print('Creating .npy files for wake and physical activity in the current directory.')

    np.save('wake.npy', all_wake)
    np.save('activity.npy', all_act)


def nanpad(arr, l):
    """Append NaN values to ``arr`` until its length reaches ``l``.

    Nothing is appended when ``arr`` is already at least ``l`` long. Because
    the padding block is a float array, the concatenated result is float even
    for boolean or integer input.

    NOTE: a helper with the same name is also defined in an earlier cell of
    this notebook; this definition replaces it once the cell runs.

    Args:
        arr: 1-D array to pad.
        l: Target length.

    Returns:
        A new array consisting of ``arr`` followed by the NaN padding.
    """
    shortfall = l - len(arr)
    tail = np.repeat(np.nan, shortfall if shortfall > 0 else 0)
    return np.concatenate((arr, tail))


def time_to_epoch(timestring):
    """Return the 30-second epoch index for an ``hour:minute:second`` string.

    Each hour contributes 120 epochs, each minute 2, and every full 30 seconds
    one more; any fractional remainder is truncated.

    NOTE: a helper with the same name is also defined in an earlier cell of
    this notebook; this definition replaces it once the cell runs.

    Args:
        timestring: Text made of exactly three integer fields separated by ':'.

    Returns:
        The epoch index as an int.

    Raises:
        ValueError: If the string does not consist of three integer fields.
    """
    hour, minute, second = tuple(map(int, timestring.split(':')))
    return int(hour * 120 + minute * 2 + second / 30)

