In [1]:
import pandas as pd
from pandasql import sqldf

# Import data

In [2]:
# file path to data folder
path = './Data'

In [3]:
# import physiological data
physio = pd.DataFrame()
for subjectID in range(1, 28):
    # load physiological data for subject
    file = path + f'/subjectID_{subjectID}.csv'
    physio_subject = pd.read_csv(file)

    n_entries = physio_subject.shape[0]
    physio_subject['SubjectID'] = [subjectID for i in range(n_entries)]

    # clean column names (dataset contains different column names per subject)
    if 'SkinTemperature.Value' in physio_subject.columns:
        physio_subject = physio_subject.rename(columns={'SkinTemperature.Value': 'SkinTemperature'}, errors='raise')

    # combine all subject data
    physio = pd.concat([physio, physio_subject])

# import fatigue (PROs) data
fatigue = pd.read_csv(path + '/fatiguePROs.csv')

# Convert data

# a) Fatigue (PROs) data

In [53]:
# check number of total questions asked (irresp. of subject)
query = '''
SELECT
COUNT(CASE WHEN PROquestion LIKE 'Describe fatigue on a scale of 1 to 10, where 1 means you don’t feel tired at all and 10 means the worst tiredness you can imagine' THEN 1 END) AS VAS,
COUNT(CASE WHEN PROquestion LIKE 'Physically, today how often did you feel exhausted?' THEN 1 END) AS phF,
COUNT(CASE WHEN PROquestion LIKE 'Mentally, today how often did you feel exhausted?' THEN 1 END) AS MF,
COUNT(CASE WHEN PROquestion LIKE 'Are you feeling better, worse or the same as yesterday?' THEN 1 END) AS ReIP,
COUNT(CASE WHEN PROquestion LIKE 'Did you do sport today?' THEN 1 END) AS sport
FROM (
    SELECT PROquestion
    FROM fatigue
)
'''
sqldf(query)

Unnamed: 0,VAS,phF,MF,ReIP,sport
0,526,526,526,526,167


In [54]:
# convert questions into fatigue variables
query = '''
SELECT SubjectID as subjectID, DateTime AS timestamp, Timezone AS timezone,
CASE
    WHEN PROquestion LIKE 'Describe fatigue on a scale of 1 to 10, where 1 means you don’t feel tired at all and 10 means the worst tiredness you can imagine' THEN PROanswer_value
END AS 'VAS',
CASE
    WHEN PROquestion LIKE 'Physically, today how often did you feel exhausted?' THEN PROanswer_choice
END AS 'phF',
CASE
    WHEN PROquestion LIKE 'Mentally, today how often did you feel exhausted?' THEN PROanswer_choice
END AS 'MF',
CASE
    WHEN PROquestion LIKE 'Are you feeling better, worse or the same as yesterday?' THEN PROanswer_choice
END AS 'ReIP',
CASE
    WHEN PROquestion LIKE 'Did you do sport today?' THEN PROanswer_choice
END AS 'sport'
FROM fatigue
'''
temp = sqldf(query)
temp = temp.fillna(value=pd.NA)
print('total number of questions:', temp.shape[0])

total number of questions: 2271


In [55]:
# distinguish different questionnaires with same timestamp (note: data is in chronological order)
temp = temp.reset_index(drop=False)
temp = temp.rename(columns={'index': 'id'})

# same id = same questionnaire
questionnaire = 0
questionnaire_timestamp = temp.loc[0, 'timestamp']
asked_questions = {'VAS': 0, 'phF': 0, 'MF': 0, 'ReIP': 0, 'sport': 0} # already asked?
for i, row in temp.iterrows():
    asked_questions = {key: value + int(not pd.isna(row[key])) for key, value in asked_questions.items()}

    # different timestamp? -> different questionnaire
    if row['timestamp'] != questionnaire_timestamp:
        questionnaire += 1
        questionnaire_timestamp = temp.loc[i, 'timestamp']
        asked_questions = {key: int(not pd.isna(row[key])) for key, value in asked_questions.items()}
    # same timestamp but same question again (vas already asked)? -> different questionnaire
    elif 2 in set(asked_questions.values()):
        questionnaire += 1
        questionnaire_timestamp = temp.loc[i, 'timestamp']
        asked_questions = {key: int(not pd.isna(row[key])) for key, value in asked_questions.items()}
    else:
        pass

    temp.loc[i, 'id'] = questionnaire

query = '''
SELECT id, subjectID, timestamp, GROUP_CONCAT(DISTINCT timezone) AS timezone, GROUP_CONCAT(VAS) AS VAS, GROUP_CONCAT(phF) AS phF, GROUP_CONCAT(MF) AS MF, GROUP_CONCAT(ReIP) AS ReIP, GROUP_CONCAT(sport) AS sport
FROM temp
GROUP BY id, subjectID, timestamp
'''
questionnaires = sqldf(query)
print('total number of separate questionnaires:', questionnaires.shape[0])

total number of separate questionnaires: 526


In [56]:
# incomplete questionnaires (ignore sport label as it's not asked in all questionnaires)
query = '''
SELECT *
FROM questionnaires
WHERE VAS IS NULL OR phF IS NULL OR MF IS NULL OR ReIP IS NULL
'''
temp = sqldf(query)

# discard incomplete questionnaires
query = '''
SELECT *
FROM questionnaires

EXCEPT

SELECT *
FROM questionnaires
WHERE VAS IS NULL OR phF IS NULL OR MF IS NULL OR ReIP IS NULL;
'''
questionnaires = sqldf(query)

print('number of discarded questionnaires:', temp.shape[0])
temp

number of discarded questionnaires: 3


Unnamed: 0,id,subjectID,timestamp,timezone,VAS,phF,MF,ReIP,sport
0,105,15,09.06.19 22:27,UTC,1.0,,,,
1,215,23,05.02.18 21:00,CET,,,,Better,
2,281,24,06.02.18 21:43,CET,1.0,,,,


In [57]:
# aggregate questionnaires into daily fatigue values
query = '''
SELECT subjectID, SUBSTRING(Timestamp, 1, 8) AS date, GROUP_CONCAT(DISTINCT timezone) AS timezone, GROUP_CONCAT(VAS) AS VAS, GROUP_CONCAT(ReIP) AS ReIP, GROUP_CONCAT(phF) AS phF, GROUP_CONCAT(MF) AS MF, GROUP_CONCAT(sport) AS sport, COUNT(*) AS n_answers
FROM questionnaires
GROUP BY subjectID, date
ORDER BY n_answers DESC;
'''
fatigue_daily = sqldf(query)

# check how many multiple questionnaires are filled out per day
query = '''
SELECT COUNT(*) AS same_day_questionnaires
FROM fatigue_daily
WHERE n_answers > 1;
'''
print('total number of same day questionnaires:', sqldf(query)['same_day_questionnaires'][0])

total number of same day questionnaires: 44


In [58]:
# convert fatigue answers into numeric classes
query = '''
SELECT subjectID, timestamp, timezone,
CASE
WHEN CAST(VAS AS INT) BETWEEN 1.0 AND 4.0 THEN 0
WHEN CAST(VAS AS INT) BETWEEN 5.0 AND 10.0 THEN 1
END AS 'VAS',
CASE
WHEN phF LIKE 'never' THEN 0
WHEN phF LIKE 'sometimes' OR phF LIKE 'regularly' OR phF LIKE 'often' OR phF LIKE 'always' THEN 1
END AS 'phF',
CASE
WHEN MF LIKE 'never' THEN 0
WHEN MF LIKE 'sometimes' OR MF LIKE 'regularly' OR MF LIKE 'often' OR MF LIKE 'always' THEN 1
END AS 'MF',
CASE
WHEN ReIP LIKE 'worse' THEN -1
WHEN ReIP LIKE 'same' THEN 0
WHEN ReIP LIKE 'better' THEN 1
END AS 'ReIP',
CASE
WHEN sport LIKE 'No' THEN 0
WHEN sport LIKE 'Yes' THEN 1
END AS 'sport'
FROM questionnaires
'''
temp = sqldf(query)

# aggregate questionnaires into daily fatigue values
query = '''
SELECT subjectID, SUBSTRING(Timestamp, 1, 8) AS date, GROUP_CONCAT(DISTINCT timezone) AS timezone, GROUP_CONCAT(VAS) AS VAS, GROUP_CONCAT(ReIP) AS ReIP, GROUP_CONCAT(phF) AS phF, GROUP_CONCAT(MF) AS MF, GROUP_CONCAT(sport) AS sport, COUNT(*) AS n_answers
FROM temp
GROUP BY subjectID, date
HAVING n_answers > 1
ORDER BY n_answers DESC;
'''
print('days with multiple questionnaires:')
sqldf(query).fillna(value=pd.NA)

days with multiple questionnaires:


Unnamed: 0,subjectID,date,timezone,VAS,ReIP,phF,MF,sport,n_answers
0,24,06.02.18,CET,1110,"0,0,0,-1",1111,1111,,4
1,24,07.02.18,CET,1,"0,0,-1",101,101,,3
2,4,30.03.19,UTC,0,11,0,0,0.0,2
3,5,04.04.19,UTC,10,11,10,10,"0.0,0.0",2
4,5,07.04.19,UTC,10,11,11,10,"1.0,1.0",2
5,6,05.04.19,UTC,0,10,11,11,"0.0,0.0",2
6,8,09.06.19,UTC,0,11,10,0,"0.0,0.0",2
7,10,06.05.19,UTC,0,10,11,0,"0.0,0.0",2
8,10,08.05.19,UTC,0,11,10,0,"0.0,0.0",2
9,10,10.05.19,UTC,0,-11,10,10,"0.0,0.0",2


In [59]:
# aggregate questionnaires into SINGLE daily fatigue values
# TODO: CHECK VALIDNESS OF AVERAGING
query = '''
SELECT subjectID, SUBSTRING(Timestamp, 1, 8) AS date, GROUP_CONCAT(DISTINCT timezone) AS timezone, ROUND(AVG(VAS)) AS VAS, ROUND(AVG(ReIP)) AS ReIP, ROUND(AVG(phF)) AS phF, ROUND(AVG(MF)) AS MF, ROUND(AVG(sport)) AS sport, COUNT(*) AS n_answers
FROM temp
GROUP BY subjectID, date
ORDER BY n_answers DESC;
'''
Y = sqldf(query).fillna(value=pd.NA)

# for visualization:
query = '''
SELECT *
FROM Y
WHERE n_answers > 1
ORDER BY n_answers DESC;
'''
print('days with multiple questionnaires - averaged to single values:')
sqldf(query).fillna(value=pd.NA)

days with multiple questionnaires - averaged to single values:


Unnamed: 0,subjectID,date,timezone,VAS,ReIP,phF,MF,sport,n_answers
0,24,06.02.18,CET,1.0,0.0,1.0,1.0,,4
1,24,07.02.18,CET,0.0,0.0,1.0,1.0,,3
2,4,30.03.19,UTC,0.0,1.0,0.0,0.0,0.0,2
3,5,04.04.19,UTC,1.0,1.0,1.0,1.0,0.0,2
4,5,07.04.19,UTC,1.0,1.0,1.0,1.0,1.0,2
5,6,05.04.19,UTC,0.0,1.0,1.0,1.0,0.0,2
6,8,09.06.19,UTC,0.0,1.0,1.0,0.0,0.0,2
7,10,06.05.19,UTC,0.0,1.0,1.0,0.0,0.0,2
8,10,08.05.19,UTC,0.0,1.0,1.0,0.0,0.0,2
9,10,10.05.19,UTC,0.0,0.0,1.0,1.0,0.0,2


In [60]:
print('total number of days with fatigue data:', Y.shape[0])

total number of days with fatigue data: 476


# b) Physiological data

In [48]:
physio = physio.fillna(value=pd.NA) # otherwise SQL will ignore None values

# nested dataframe (all physiological data of the same day as lists)
query = '''
SELECT SubjectID AS subjectID,
    SUBSTRING(Timestamp, 1, 8) AS date,
    GROUP_CONCAT(IFNULL(ActivityCounts, 'None')) AS ActivityCounts,
    GROUP_CONCAT(IFNULL(Barometer, 'None')) AS Barometer,
    GROUP_CONCAT(IFNULL(BloodPerfusion, 'None')) AS BloodPerfusion,
    GROUP_CONCAT(IFNULL(BloodPulseWave, 'None')) AS BloodPulseWave,
    GROUP_CONCAT(IFNULL(EnergyExpenditure, 'None')) AS EnergyExpenditure,
    GROUP_CONCAT(IFNULL(GalvanicSkinResponse, 'None')) AS GalvanicSkinResponse,
    GROUP_CONCAT(IFNULL(HR, 'None')) AS HR,
    GROUP_CONCAT(IFNULL(HRV, 'None')) AS HRV,
    GROUP_CONCAT(IFNULL(RESP, 'None')) AS RESP,
    GROUP_CONCAT(IFNULL(Steps, 'None')) AS Steps,
    GROUP_CONCAT(IFNULL(SkinTemperature, 'None')) AS SkinTemperature,
    GROUP_CONCAT(IFNULL(ActivityClass, 'None')) AS ActivityClass
FROM physio
GROUP BY subjectID, date;'''
X = sqldf(query)

# nested dataframe (mean of all physiological data of the same day)
query = '''
SELECT SubjectID AS subjectID,
    SUBSTRING(Timestamp, 1, 8) AS date,
    AVG(ActivityCounts) AS ActivityCounts,
    AVG(Barometer) AS Barometer,
    AVG(BloodPerfusion) AS BloodPerfusion,
    AVG(BloodPulseWave) AS BloodPulseWave,
    AVG(EnergyExpenditure) AS EnergyExpenditure,
    AVG(GalvanicSkinResponse) AS GalvanicSkinResponse,
    AVG(HR) AS HR,
    AVG(HRV) AS HRV,
    AVG(RESP) AS RESP,
    AVG(Steps) AS Steps,
    AVG(SkinTemperature) AS SkinTemperature,
    AVG(ActivityClass) AS ActivityClass
FROM physio
GROUP BY subjectID, date;'''
X_mean = sqldf(query)

print('total number of days:', X_mean.shape[0])

total number of days: 951


In [49]:
# days without sensory data
query = '''
SELECT *
FROM X_mean
WHERE ActivityCounts IS NULL AND Barometer IS NULL AND BloodPerfusion IS NULL AND BloodPulseWave IS NULL AND EnergyExpenditure IS NULL AND GalvanicSkinResponse IS NULL AND HR IS NULL AND HRV IS NULL AND RESP IS NULL AND STEPS IS NULL AND SkinTemperature IS NULL AND ActivityClass IS NULL
'''
temp = sqldf(query)
print('number of discarded days:', temp.shape[0])

# discard days without sensory data
query = '''
SELECT *
FROM X

EXCEPT

SELECT X.subjectID, X.date, X.ActivityCounts, X.Barometer, X.BloodPerfusion,
       X.BloodPulseWave, X.EnergyExpenditure, X.GalvanicSkinResponse, X.HR,
       X.HRV, X.RESP, X.Steps, X.SkinTemperature, X.ActivityClass
FROM X JOIN temp ON X.subjectID = temp.subjectID AND X.date = temp.date;
'''
X = sqldf(query)

# discard days without sensory data
query = '''
SELECT *
FROM X_mean

EXCEPT

SELECT *
FROM temp;
'''
X_mean = sqldf(query)

number of discarded days: 406


In [50]:
# exclude days with >80% of data missing

test = X.copy()
import numpy as np



THRESHOLD = 0.8
n_days, n_cols = test.shape
variables = ['ActivityCounts', 'Barometer', 'BloodPerfusion',
             'BloodPulseWave', 'EnergyExpenditure', 'GalvanicSkinResponse', 'HR',
             'HRV', 'RESP', 'Steps', 'SkinTemperature', 'ActivityClass']

# flag deficient rows
test.insert(n_cols, "deficient", [False for i in range(n_days)], True)
for day in range(n_days):
    for variable in variables:
        time_series = np.array([value if value != 'None' else None for value in test[variable][day].split(',')], dtype=float)

        '''missing_data = np.mean(np.where(np.isnan(time_series), 1.0, 0.0))
        if missing_data > THRESHOLD:
            test['deficient'][day] = True'''
        data_length = np.sum(np.where(np.isnan(time_series), 0.0, 1.0))
        if data_length < THRESHOLD * 24 * 60:
            test['deficient'][day] = True
        #test[variable][day] = data_length

# remove deficient rows
query = '''
SELECT *
FROM test
WHERE deficient = 0;'''
test = sqldf(query)
test = test.drop(columns=['deficient']) # deficiency column is no longer required
X = test
test

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['deficient'][day] = True


Unnamed: 0,subjectID,date,ActivityCounts,Barometer,BloodPerfusion,BloodPulseWave,EnergyExpenditure,GalvanicSkinResponse,HR,HRV,RESP,Steps,SkinTemperature,ActivityClass
0,23,01.02.18,"0.784313725,0.784313725,0.784313725,0.78431372...","975.7,975.7,975.7,975.6,975.7,975.7,975.7,975....","0.59,0.57,0.53,0.5,0.55,0.53,0.625,0.66,0.6,0....","3.3,3.29,3.2,3.02,3.12,3.08,3.0,2.94,2.82,3.02...","960.0,960.0,960.0,960.0,960.0,960.0,960.0,960....","1.61975,1.613955556,1.611683333,1.610822222,1....","51.10281304,51.60327148,51.56562889,51.9514778...","41.36766153,37.5,35.03333333,34.0,33.9829303,3...","19.2822605,19.39399806,18.77951729,17.05347061...","0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0....","36.16,36.22,36.25,36.28,36.28,36.31,36.38,36.4...","1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1...."
1,23,02.01.18,"1.568627451,0.784313725,1.568627451,0.78431372...","997.6,997.6,997.7,997.6,997.7,997.7,997.6,997....","0.775,0.81,0.69,0.38,0.525,0.71,0.75,0.77,0.72...","2.72,2.74,2.72,3.0,3.86,4.02,4.08,3.93,3.28,3....","960.0,942.0,720.0,1222.0,1097.0,960.0,960.0,96...","2.516494444,2.502794444,2.490383333,2.47836666...","61.88211485,62.59777898,65.09791196,75.9731398...","33.52402402,33.06666667,36.07876414,51.95,61.1...","13.87857143,13.74596376,13.34796023,14.1985670...","0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0....","34.19,34.28,34.34,34.34,34.13,34.13,34.09,34.3...","1.0,1.0,9.0,None,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1..."
2,23,03.02.18,"None,None,None,None,None,None,None,None,None,N...","None,None,None,None,None,None,None,None,None,N...","None,None,None,None,None,None,None,None,None,N...","None,None,None,None,None,None,None,None,None,N...","None,None,None,None,None,None,None,None,None,N...","None,None,None,None,None,None,None,None,None,N...","None,None,None,None,None,None,None,None,None,N...","None,None,None,None,None,None,None,None,None,N...","None,None,None,None,None,None,None,None,None,N...","None,None,None,None,None,None,None,None,None,N...","None,None,None,None,None,None,None,None,None,N...","None,None,None,None,None,None,None,None,None,N..."
3,23,05.01.18,"0.784313725,0.784313725,0.784313725,0.78431372...","974.4,974.5,974.5,974.5,974.4,974.5,974.5,974....","0.46,0.45,0.48,0.56,0.42,0.5,0.68,0.47,0.42,0....","2.36,2.38,2.2,2.26,2.72,2.16,2.16,2.9,2.34,2.2...","960.0,960.0,960.0,960.0,960.0,960.0,960.0,960....","2.9455,2.932344444,2.921,2.908911111,2.8994888...","56.21378955,57.09264458,57.23455132,59.6960025...","58.33333333,62.0,66.88333333,69.05,70.55,71.78...","16.94489247,15.44487179,15.68073136,16.0732273...","0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0....","34.19,34.19,34.22,34.25,34.25,34.0,33.91,33.81...","1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1...."
4,23,05.02.18,"0.784313725,0.784313725,0.784313725,0.78431372...","987.6,987.6,988.1,987.8,987.5,987.9,987.7,987....","1.035,1.05,1.11,1.11,1.11,1.13,0.73,0.81,0.78,...","2.64,2.73,2.52,2.72,2.68,2.6,2.74,2.92,2.5,2.4...","960.0,960.0,960.0,960.0,960.0,960.0,960.0,960....","1.356244444,1.354044444,1.350255556,1.34818333...","53.2735217,54.47929665,52.15482646,54.2955102,...","81.28333333,85.85,88.26666667,84.2,78.66666667...","12.19585879,16.11118465,17.70704321,18.7811643...","0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0....","36.56,36.59,36.63,36.66,36.69,36.75,36.75,36.8...","1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1...."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135,26,28.01.18,"0.784313725,0.784313725,0.784313725,0.78431372...","1005.1,1005.0,1005.1,1005.1,1005.05,1005.1,100...","0.83,0.82,0.8,0.815,0.84,0.865,0.785,0.77,0.85...","1.96,2.0,1.94,2.04,1.88,1.92,1.98,1.92,1.8,1.8...","960.0,960.0,960.0,960.0,960.0,960.0,960.0,960....","2.234005556,2.219355556,2.205,2.191105556,2.17...","48.83667906,47.7383781,47.32818044,47.8381026,...","58.88333333,53.99946121,58.21666667,64.9833333...","15.9505279,15.18362968,16.39126533,15.22462831...","0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0....","36.28,36.28,36.325,36.38,36.41,36.455,36.5,36....","1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1...."
136,26,28.02.18,"0.784313725,0.784313725,3.137254902,1.17647058...","984.6,984.7,984.5,984.5,984.45,984.4,984.4,984...","None,None,0.13,0.13,0.17,0.18,0.19,0.16,0.16,0...","None,None,2.38,1.75,2.32,2.18,2.74,2.12,2.46,2...","2491.141706,1328.89823,2093.385481,1202.641182...","None,None,None,None,None,None,None,None,None,N...","None,None,57.1987234,61.52075722,60.35287641,7...","None,None,None,None,58.40909091,56.71666667,57...","None,None,11.56316591,17.0158046,16.86451144,1...","0.0,0.0,0.0,0.0,0.0,17.0,0.0,0.0,0.0,0.0,12.0,...","32.47,32.19,31.84,32.16,32.25,32.34,32.47,32.4...","1.0,1.0,9.0,9.0,1.0,1.0,12.0,None,9.0,9.0,None..."
137,26,29.01.18,"0.784313725,0.784313725,0.784313725,0.78431372...","1006.7,1006.7,1006.8,1006.7,1006.8,1006.9,1006...","1.03,1.03,1.05,0.96,1.01,1.02,1.01,1.055,0.945...","1.86,1.84,1.92,1.87,1.88,1.85,1.9,1.88,1.76,1....","960.0,960.0,960.0,960.0,960.0,960.0,960.0,960....","3.820805556,3.758555556,3.720144444,3.67541666...","49.76529338,49.72980719,49.35934664,49.1355028...","71.60175055,74.18333333,73.36666667,70.5715166...","15.05574324,16.13167035,15.90028763,15.5412667...","0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0....","36.25,36.28,36.31,36.34,36.41,36.44,36.47,36.5...","1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1...."
138,26,30.01.18,"2.745098039,3.921568627,1.568627451,8.62745098...","1002.55,1002.5,1002.4,1002.5,1002.5,1002.6,100...","0.5,0.57,0.41,0.44,0.6,0.43,0.745,0.47,0.5,0.4...","2.24,2.06,2.02,2.16,3.18,0.0,2.2,2.13,3.83,4.4...","1622.0,699.0,701.0,928.0,2029.0,2445.0,3087.0,...","3.623661111,3.5611,3.5041,3.445161111,3.421588...","64.30637161,65.34692308,66.74287944,73.9073359...","50.43333333,50.01666667,49.75,50.0,52.23333333...","14.48012663,15.03758889,14.28263254,16.8743052...","0.0,0.0,0.0,0.0,23.0,21.0,11.0,54.0,116.0,114....","34.09,34.13,33.94,33.91,33.94,33.91,33.88,33.8...","10.0,9.0,9.0,9.0,12.0,12.0,11.0,12.0,11.0,11.0..."


In [51]:
print('total number of days with physiological data:', X.shape[0])

total number of days with physiological data: 140


# Join data

# a) unnested data (one row <=> one physiological measurement)

In [93]:
query = '''
SELECT SubjectID AS subjectID,
    SUBSTRING(Timestamp, 1, 8) AS date,
    SUBSTRING(Timestamp, 10, LENGTH(Timestamp)) AS time,
    ActivityCounts,
    Barometer,
    BloodPerfusion,
    BloodPulseWave,
    EnergyExpenditure,
    GalvanicSkinResponse,
    HR,
    HRV,
    RESP,
    Steps,
    SkinTemperature,
    ActivityClass
FROM physio;
'''
temp = sqldf(query).fillna(value=pd.NA)

query = '''
SELECT Y.subjectID,
    Y.date,
    temp.time,
    temp.ActivityCounts,
    temp.Barometer,
    temp.BloodPerfusion,
    temp.BloodPulseWave,
    temp.EnergyExpenditure,
    temp.GalvanicSkinResponse,
    temp.HR,
    temp.HRV,
    temp.RESP,
    temp.Steps,
    temp.SkinTemperature,
    temp.ActivityClass,
    y.timezone,
    y.VAS,
    y.phF,
    y.MF,
    y.ReIP,
    y.sport,
    y.n_answers
FROM temp JOIN Y ON temp.subjectID = Y.subjectID AND temp.date = Y.date;'''
data_unnested = sqldf(query)

print('number of discarded physiological measurements with no corresp. fatigue value:', temp.shape[0] - data_unnested.shape[0])

number of discarded physiological measurements with no corresp. fatigue value: 708404


In [94]:
print('total number of physiological measurements with corresp. fatigue value:', data_unnested.shape[0])

total number of physiological measurements with corresp. fatigue value: 617277


# b) nested data (one row <=> one day of physiological measurements)

In [61]:
query = '''
SELECT Y.subjectID, Y.date, X.ActivityCounts, X.Barometer, X.BloodPerfusion, X.BloodPulseWave, X.EnergyExpenditure, X.GalvanicSkinResponse, X.HR, X.HRV, X.RESP, X.Steps, X.SkinTemperature, X.ActivityClass, y.timezone, y.VAS, y.phF, y.MF, y.ReIP, y.sport, y.n_answers
FROM X JOIN Y ON X.subjectID = Y.subjectID AND X.date = Y.date;'''
data = sqldf(query)

print('number of discarded days with questionnaires but no physiological measurements:', Y.shape[0] - data.shape[0])

number of discarded days with questionnaires but no physiological measurements: 365


In [62]:
print('total number of days with physiological measurements + corresp. fatigue values:', data.shape[0])

total number of days with physiological measurements + corresp. fatigue values: 111


# c) nested data (one row <=> one day with mean of physiological measurements)

In [97]:
query = '''
SELECT Y.subjectID, Y.date, X_mean.ActivityCounts, X_mean.Barometer, X_mean.BloodPerfusion, X_mean.BloodPulseWave, X_mean.EnergyExpenditure, X_mean.GalvanicSkinResponse, X_mean.HR, X_mean.HRV, X_mean.RESP, X_mean.Steps, X_mean.SkinTemperature, X_mean.ActivityClass, y.timezone, y.VAS, y.phF, y.MF, y.ReIP, y.sport, y.n_answers
FROM X_mean JOIN Y ON X_mean.subjectID = Y.subjectID AND X_mean.date = Y.date;'''
data_mean = sqldf(query)

print('number of discarded days with questionnaires but no physiological measurements:', Y.shape[0] - data_mean.shape[0])

number of discarded days with questionnaires but no physiological measurements: 66


In [98]:
print('total number of days with physiological measurements + corresp. fatigue values:', data_mean.shape[0])

total number of days with physiological measurements + corresp. fatigue values: 410


# Export data

In [99]:
output_path = '/'.join(path.split('/')[:-1]) + '/Output'

In [100]:
data_unnested.to_csv(output_path + '/combined_data_unnested.csv')

In [101]:
data.to_csv(output_path + '/combined_data.csv')

In [102]:
 data_mean.to_csv(output_path + '/combined_data_mean.csv')