In [2]:
import glob
import pandas as pd
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from pandas.plotting import register_matplotlib_converters
from matplotlib.patches import Rectangle
from matplotlib.lines import Line2D
import matplotlib.patches as patches
import os

%matplotlib auto
register_matplotlib_converters()

Using matplotlib backend: agg


# Check if the sleep times and the actigraph data align

In [8]:
destSleepPath = '/well/doherty/users/cxx579/project_data/raine/gen1_26/allSleepTime.csv'
sleepTimePath = '/well/doherty/users/cxx579/project_data/raine/gen1_26/sleepTime.csv'
sleepDf = pd.read_csv(sleepTimePath,  index_col=0)

In [9]:
data_dir = '/well/doherty/projects/raine/gen1_26/gt3x/wrist/oneDayOct15/epoch'
file_desp = data_dir + '/*.csv.gz'
epoch_files = glob.glob(file_desp)

In [10]:
sleepDf

Unnamed: 0,subject_id,start_time,end_time
0,830928,2017-04-17 22:29:14,2017-04-18 05:26:14
1,342948,2016-11-30 21:59:31,2016-12-01 05:06:01
2,343364,2016-05-26 22:05:51,2016-05-27 05:30:51
3,788310,2017-01-30 21:45:11,2017-01-31 06:20:41
4,906975,2015-10-29 22:23:09,2015-10-30 05:45:09
...,...,...,...
996,183043,2016-11-08 21:56:31,2016-11-09 05:52:01
997,281230,2016-06-17 22:30:51,2016-06-18 06:09:21
998,163041,2017-02-08 22:14:55,2017-02-09 06:02:55
999,369218,2016-12-12 22:14:48,2016-12-13 06:01:18


In [6]:
def get_subjectID(file_path):
    path2file = file_path.split('/')[-1]
    path2file = path2file.split('_')[0]
    return path2file

In [7]:
subject_ids = []
epoch_paths = []
epoch_start_times = []
epoch_end_times = []

for path2epoch in epoch_files:
    test_file = path2epoch
    my_id = get_subjectID(test_file)
    epochDf = pd.read_csv(test_file)
    startTime = epochDf.iloc[0]['time']
    endTime = epochDf.iloc[-1]['time']
    
    subject_ids.append(my_id)
    epoch_paths.append(path2epoch)
    epoch_start_times.append(startTime)
    epoch_end_times.append(endTime)

In [8]:
epochTimeFrames = {'subject_id': subject_ids,
                  'epoch_path': epoch_paths,
                  'epoch_start_time': epoch_start_times,
                  'epoch_end_time': epoch_end_times}
epochTimeDf = pd.DataFrame(epochTimeFrames)

In [9]:
epochTimeDf['subject_id'] = epochTimeDf['subject_id'].astype(int)

In [10]:
merged_df = sleepDf.join(epochTimeDf.set_index('subject_id'), on='subject_id')

In [11]:
merged_df.to_csv(destSleepPath)

In [13]:
merged_df

Unnamed: 0,subject_id,start_time,end_time,epoch_path,epoch_start_time,epoch_end_time
0,830928,2017-04-17 22:29:14,2017-04-18 05:26:14,/well/doherty/projects/raine/gen1_26/gt3x/wris...,2017-04-17 18:00:00.033+0800 [Australia/Perth],2017-04-18 17:59:30.033+0800 [Australia/Perth]
1,342948,2016-11-30 21:59:31,2016-12-01 05:06:01,/well/doherty/projects/raine/gen1_26/gt3x/wris...,2016-11-30 18:00:00.033+0800 [Australia/Perth],2016-12-01 17:59:30.033+0800 [Australia/Perth]
2,343364,2016-05-26 22:05:51,2016-05-27 05:30:51,/well/doherty/projects/raine/gen1_26/gt3x/wris...,2016-05-26 18:00:00.033+0800 [Australia/Perth],2016-05-27 17:59:30.033+0800 [Australia/Perth]
3,788310,2017-01-30 21:45:11,2017-01-31 06:20:41,/well/doherty/projects/raine/gen1_26/gt3x/wris...,2017-01-30 18:00:00.033+0800 [Australia/Perth],2017-01-31 17:59:30.033+0800 [Australia/Perth]
4,906975,2015-10-29 22:23:09,2015-10-30 05:45:09,/well/doherty/projects/raine/gen1_26/gt3x/wris...,2015-10-29 18:00:00.033+0800 [Australia/Perth],2015-10-30 17:59:30.033+0800 [Australia/Perth]
...,...,...,...,...,...,...
996,183043,2016-11-08 21:56:31,2016-11-09 05:52:01,/well/doherty/projects/raine/gen1_26/gt3x/wris...,2016-11-08 18:00:00.033+0800 [Australia/Perth],2016-11-09 17:59:30.033+0800 [Australia/Perth]
997,281230,2016-06-17 22:30:51,2016-06-18 06:09:21,/well/doherty/projects/raine/gen1_26/gt3x/wris...,2016-06-17 18:00:00.033+0800 [Australia/Perth],2016-06-18 17:59:30.033+0800 [Australia/Perth]
998,163041,2017-02-08 22:14:55,2017-02-09 06:02:55,/well/doherty/projects/raine/gen1_26/gt3x/wris...,2017-02-08 18:00:00.033+0800 [Australia/Perth],2017-02-09 17:59:30.033+0800 [Australia/Perth]
999,369218,2016-12-12 22:14:48,2016-12-13 06:01:18,/well/doherty/projects/raine/gen1_26/gt3x/wris...,2016-12-12 18:00:00.033+0800 [Australia/Perth],2016-12-13 17:59:30.033+0800 [Australia/Perth]


# Check if the times are within the range
The conditions are:
* the sleep labels occur at the same time or later than the actigraph time
* the sleep labels end at the same time or earlier than the actigraph time


In [43]:
yStartT = merged_df.iloc[0]['start_time']
yEndT = merged_df.iloc[0]['end_time']
xStartT = merged_df.iloc[0]['epoch_start_time']
xStartT = merged_df.iloc[0]['epoch_end_time']


In [44]:
yStartT

'2017-04-17 22:29:14'

In [45]:
xStartT

'2017-04-18 17:59:30.033+0800 [Australia/Perth]'

In [None]:
first_day = datetime.datetime.strptime(file_date, '%d%m%Y')

# Merge data with labels 

1. We first get a list of gt3x file names that we need to merge 
2. We find their sleep labels and merge them with x, together with their age info and subject_id 
3. For each merged DF with labels  plot the merge data and save it as an image 
4. Save the concatenated DFs into a single file

### 0. House keeping for file names. No need to run after if it is not the first time after parsing

In [98]:
# fix file names for having two recordings for the same subject at different dates 
# only the ones with postfix 1 have labels 
x_file_one ='/well/doherty/projects/raine/gen1_26/gt3x/wrist/sleepOct20/epoch/563679_wrist_1-epoch.csv.gz'
x_file_two ='/well/doherty/projects/raine/gen1_26/gt3x/wrist/sleepOct20/epoch/563679_wrist_2-epoch.csv.gz'
x_file_new ='/well/doherty/projects/raine/gen1_26/gt3x/wrist/sleepOct20/epoch/563679_wrist_2016-04-01-epoch.csv.gz'
os.rename(x_file_one, x_file_new)
os.remove(x_file_two)


x_file_one ='/well/doherty/projects/raine/gen1_26/gt3x/wrist/sleepOct20/epoch/953536_wrist_1-epoch.csv.gz'
x_file_two ='/well/doherty/projects/raine/gen1_26/gt3x/wrist/sleepOct20/epoch/953536_wrist_2-epoch.csv.gz'
x_file_new ='/well/doherty/projects/raine/gen1_26/gt3x/wrist/sleepOct20/epoch/953536_wrist_2016-04-01-epoch.csv.gz'
os.rename(x_file_one, x_file_new)
os.rename(x_file_two)

FileNotFoundError: [Errno 2] No such file or directory: '/well/doherty/projects/raine/gen1_26/gt3x/wrist/sleepOct20/epoch/563679_wrist_1-epoch.csv.gz' -> '/well/doherty/projects/raine/gen1_26/gt3x/wrist/sleepOct20/epoch/563679_wrist_2016-04-01-epoch.csv.gz'

### 1. get file names 

In [20]:
epoch_dir = '/well/doherty/projects/raine/gen1_26/gt3x/wrist/sleepOct20/epoch'
label_dir = '/well/doherty/users/cxx579/project_data/raine/gen1_26/sleepLabelsClean/'
image_dir = '/well/doherty/projects/raine/gen1_26/imgs/'

master_file = os.path.join('/well/doherty/projects/raine/gen1_26/gt3x/wrist/sleepOct20', 'master.csv.gz')

epoch_desp = epoch_dir + '/*csv.gz'
file_list = glob.glob(epoch_desp)

In [12]:
LABEL_COLORS = {
    0: "#f95a5d",
    1: "#fda354",
    2: "#8c9d43",
    3: "#1c93b7",
    4: "#887ea5",
    5: "#3a3547"
}
def merge_rows(sleep_df, current_label):
    sleep_df = sleep_df[sleep_df['sleep_stage']==current_label]
    startTimes = []
    endTimes = []
    current_stage = []
    currentStartTime = None
    preTime = None
    labels = []

    for index, row in sleep_df.iterrows():
        currentTime = row['time']
        if preTime == None:
            currentStartTime = currentTime
        else:
            if currentTime - preTime > timedelta(minutes=1):
                startTimes.append(currentStartTime)
                endTimes.append(preTime+timedelta(seconds=30))
                currentStartTime = currentTime
                labels.append(current_label)
        preTime = currentTime


    startTimes.append(currentStartTime)
    endTimes.append(preTime+timedelta(seconds=30))        
    labels.append(current_label)
    
    stage_blocks = {
    'start_time': startTimes,
    'end_time': endTimes,
    'label': labels
    }
    stage_df = pd.DataFrame(stage_blocks)
    return stage_df

def parse_file_name(file_path):
    file_name = file_path.split('/')[-1]
    file_name = file_name[:-4] # remove .csv extension
    subjectID = file_name.split('_')[-1]
    file_date = file_name.split('_')[0]

    first_day = datetime.strptime(file_date, '%d%m%Y')

    if subjectID == '942099' or subjectID == '687006':
        # labels start on the same day:
        second_day = first_day
    else:
        second_day = first_day + timedelta(days=1)

    first_day_str = first_day.strftime('%d%m%Y')
    second_day_str = second_day.strftime('%d%m%Y')
    return first_day_str, second_day_str, subjectID

In [13]:
def xDate2yDate(xDate):
    year = xDate[:4]
    month = xDate[5:7]
    date = xDate[-2:]
    return date + month + year 

def xName2yName(x_name, label_root):
    full_file_name = x_name.split('/')[-1]
    full_file_name = full_file_name[:-13]
    subject_id = full_file_name.split('_')[0]
    date_str = full_file_name.split('_')[-1]
    yDate = xDate2yDate(date_str)
    
    return subject_id, os.path.join(label_root, yDate + '_' + subject_id + '.csv')

In [14]:
def updateTimes(first_day_str, second_day_str, x_df, y_df):
    current_date = first_day_str

    newtimes = []
    isSecondDay = False
    preHour = -1
    for index, row in y_df.iterrows():
        hour = int(row['time'].split(':')[0])
        if isSecondDay == False and ((hour < preHour and hour != 12) or (hour == 12 and preHour < hour)):
            isSecondDay = True 
            current_date = second_day_str
        row_time = datetime.strptime(current_date+' '+row['time'], '%d%m%Y %I:%M:%S %p') 
        newtimes.append(row_time)
        preHour = hour
    y_df['time'] = newtimes
    x_df['time']=x_df['time'].apply(lambda x: x[:-27])
    x_df['time']=x_df['time'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
    y_df = y_df.join(x_df.set_index('time'), on='time')
    return x_df, y_df

In [21]:
import traceback

files_with_NA = []
labels_with_NA = []
acc_without_labels = []
isPlotting = False
master_DF = None
i = 0

for x_file_path in file_list:
    i += 1

    subject_id, y_file_path = xName2yName(x_file_path, label_dir)
    image_save_path = os.path.join(image_dir, subject_id + '.png')
    try:
        # 2. We find their sleep labels and merge them with x, together with their age info and subject_id 
        if os.path.isfile(y_file_path) is False:
            acc_without_labels.append(y_file_path)
            continue

        x_df = pd.read_csv(x_file_path)
        y_df = pd.read_csv(y_file_path)
        if subject_id == '585573':
            # fix for strange file
            y_df = pd.read_csv(y_file_path, header=None)
            y_df.columns = ['idx', 'time', 'sleep_stage', '3']
            y_df = y_df[['idx', 'time', 'sleep_stage']]
        else:
            y_df.reset_index(inplace=True)
            y_df.columns = ['idx', 'time', 'sleep_stage']
        y_df = y_df[y_df.sleep_stage != 'NS']
        y_df = y_df.dropna()

        first_day_str, second_day_str, subjectID = parse_file_name(y_file_path) 

        # 2.2 we need to add time to the y labels 
        x_df, y_df = updateTimes(first_day_str, second_day_str, x_df, y_df)
        y_df['pid'] = subject_id 
        if y_df.isnull().values.any():
            files_with_NA.append(x_file_path)
            labels_with_NA.append(y_file_path)

        x = y_df['time']
        y = y_df['enmoTrunc']*1000
        y_df['sleep_stage'] = y_df['sleep_stage'] .astype(int)
        if i == 0:
            master_DF = y_df
        else:
            master_DF = pd.concat([master_DF, y_df])
        
        
        if isPlotting:
            ## plotting
            fig,ax = plt.subplots(1,1,sharex=False, sharey=False, figsize=(18,12))
            ax.grid(True)
            ax.set_title('Participant = ' + str(subject_id), fontsize=16, fontweight='bold')                        
            #format x-axis                                                                  
            ax.xaxis.set_major_locator(mdates.HourLocator())                                 
            ax.xaxis.set_major_formatter(mdates.DateFormatter('%H'))
            ax.xaxis.set_minor_locator(mdates.HourLocator())                                
            ax.tick_params(axis='x', which='major', labelsize=20) 

            ax.tick_params(axis='y', which='major', labelsize=20)                           
            ax.set_ylabel('Mean acceleration (mg)', fontsize=24, fontweight='bold')
            #format plot area                                                               
            ax.spines['right'].set_visible(False)                                           
            ax.spines['top'].set_visible(False)                                             
            ax.xaxis.set_ticks_position('bottom')                                           
            ax.yaxis.set_ticks_position('left')                                             
            ax.plot(x, y, color='black', label='lab')


            # OVERLAY LABELS
            legendPatches = []
            legendLabels = []
            start = mdates.date2num(y_df['time'].min())
            end = mdates.date2num(y_df['time'].max())
            ax.add_patch(Rectangle((start, -50), end-start, 50, color='grey', hatch='x'))

            for label in sorted(y_df['sleep_stage'].unique()):
                if label not in LABEL_COLORS.keys():
                    continue
                legendPatches += [patches.Patch(color=LABEL_COLORS[label], label=label)]
                legendLabels += [label]

                stage_blocks_df = merge_rows(y_df, label)
                for ix, row in stage_blocks_df.iterrows():
                    start = mdates.date2num(pd.to_datetime(row['start_time']))
                    end = mdates.date2num(pd.to_datetime(row['end_time']))
                    duration = (row['end_time'] - row['start_time'])
                    if duration.total_seconds() < 10 * 3600: #make sure less than 10hrs
                        ax.add_patch(Rectangle((start, -50), end-start, 50, color=LABEL_COLORS[label]))
            # print legend
            ax.legend(legendPatches, legendLabels, fontsize=24)
            plt.savefig(image_save_path)
            plt.close()
    except Exception as e: 
        print(e)
        print(traceback.format_exc())
        print("File not working " + x_file_path)

In [None]:
master_DF.to_csv(master_file, compression='infer')

In [130]:
## Acc without labels
acc_without_labels

['/well/doherty/users/cxx579/project_data/raine/gen1_26/sleepLabelsClean/28092016_678179.csv',
 '/well/doherty/users/cxx579/project_data/raine/gen1_26/sleepLabelsClean/09032017_494998.csv',
 '/well/doherty/users/cxx579/project_data/raine/gen1_26/sleepLabelsClean/24082015_278758.csv',
 '/well/doherty/users/cxx579/project_data/raine/gen1_26/sleepLabelsClean/14092015_493916.csv',
 '/well/doherty/users/cxx579/project_data/raine/gen1_26/sleepLabelsClean/09052016_629384.csv',
 '/well/doherty/users/cxx579/project_data/raine/gen1_26/sleepLabelsClean/22072016_927034.csv',
 '/well/doherty/users/cxx579/project_data/raine/gen1_26/sleepLabelsClean/24102016_418746.csv',
 '/well/doherty/users/cxx579/project_data/raine/gen1_26/sleepLabelsClean/15032016_214570.csv',
 '/well/doherty/users/cxx579/project_data/raine/gen1_26/sleepLabelsClean/08102015_910554.csv',
 '/well/doherty/users/cxx579/project_data/raine/gen1_26/sleepLabelsClean/08092016_260664.csv',
 '/well/doherty/users/cxx579/project_data/raine/ge

In [None]:
len(acc_without_labels)

In [4]:
# write html file in plots dir to visualise all plots
image_dir = '/well/doherty/projects/raine/gen1_26/imgs/'


html = '<html><head><title>Annotation plots</title></head><body>'
viz_fileList = sorted([e for e in os.listdir(image_dir) if e.endswith('.png')])
for viz in viz_fileList:
    html += '<img src="' + viz + '" width=1400 height=500><br>'
html += '</body></html>'
w = open(image_dir + 'viz.html','w')
w.write(html)
w.close()