In [9]:
import pandas as pd
from pandasql import sqldf
import math

# Import data

In [2]:
paths = {'windows': 'C:/Users/jjung/iCloudDrive/ETH/MSc 3rd semester/Semester project/Data',
         'macOS': '/Users/janoschjungo/Library/Mobile Documents/com~apple~CloudDocs/ETH/MSc 3rd semester/Semester project/Data'}
path = paths['windows']

# import physiological data
physio = pd.DataFrame()
for subjectID in range(1, 28):
    # load physiological data for subject
    try:
        file = path + f'/subjectID_{subjectID}.csv'
        physio_subject = pd.read_csv(file)
    except FileNotFoundError:
        path = paths['macOS']
        file = path + f'/subjectID_{subjectID}.csv'
        physio_subject = pd.read_csv(file)
    n_entries = physio_subject.shape[0]
    physio_subject['SubjectID'] = [subjectID for i in range(n_entries)]

    # clean column names (dataset contains different column names per subject)
    if 'SkinTemperature.Value' in physio_subject.columns:
        physio_subject = physio_subject.rename(columns={'SkinTemperature.Value': 'SkinTemperature'}, errors='raise')

    # combine all subject data
    physio = pd.concat([physio, physio_subject])

# import fatigue (PROs) data
fatigue = pd.read_csv(path + '/fatiguePROs.csv')

# Convert data

In [58]:
# convert questionnaires into variables
data = {}
for _, row in fatigue.iterrows():
    # extract data
    subjectID, timestamp, timezone, question, VAS, answer = row
    fatigue_label = {'Physically, today how often did you feel exhausted?': 'phF',
                     'Mentally, today how often did you feel exhausted?': 'MF',
                     'Describe fatigue on a scale of 1 to 10, where 1 means you don’t feel tired at all and 10 means the worst tiredness you can imagine': 'VAS',
                     'Are you feeling better, worse or the same as yesterday?': 'ReIP',
                     'Did you do sport today?': 'Sport'}[question]
    fatigue_score = VAS if not math.isnan(VAS) else answer
    fatigue_score = str(fatigue_score)

    # combine same timestamp data
    if (subjectID, timestamp) in data.keys():
        # already other fatigue data for this timestamp
        try:
            data[(subjectID, timestamp)][fatigue_label] = data[(subjectID, timestamp)][fatigue_label] + ',' + fatigue_score
        except KeyError:
            data[(subjectID, timestamp)][fatigue_label] = fatigue_score
    else:
        # no data for this timestamp yet
        data[(subjectID, timestamp)] = {
            'subjectID': subjectID,
            'timestamp': timestamp,
            'timezone': timezone,
            fatigue_label: fatigue_score
        }

# Queries

Note that we sometimes have multiple questionnaires per day

In [62]:
Y = pd.DataFrame(list(data.values()))
Y

Unnamed: 0,subjectID,timestamp,timezone,VAS,ReIP,phF,MF,Sport
0,1,14.03.19 20:01,UTC,2.0,Worse,Never,Sometimes,No
1,1,15.03.19 20:01,UTC,2.0,Better,Sometimes,Sometimes,Yes
2,1,16.03.19 20:47,UTC,1.0,Same,Never,Never,No
3,1,17.03.19 20:01,UTC,1.0,Same,Sometimes,Never,Yes
4,1,18.03.19 20:13,UTC,2.0,Worse,Sometimes,Never,Yes
...,...,...,...,...,...,...,...,...
517,28,10.08.18 23:13,CEST,4.0,Same,Never,Sometimes,
518,28,13.08.18 21:39,CEST,4.0,Same,Sometimes,Regularly,
519,28,14.08.18 23:27,CEST,6.0,Worse,Never,Sometimes,
520,28,16.08.18 00:51,CEST,5.0,Better,Sometimes,Sometimes,


Multiple answers the same minute

In [64]:
query = '''
SELECT *
FROM Y
WHERE VAS LIKE '%,%' OR pHF LIKE '%,%' OR MF LIKE '%,%' OR Sport LIKE '%,%';
'''
sqldf(query)

Unnamed: 0,subjectID,timestamp,timezone,VAS,ReIP,phF,MF,Sport
0,24,06.02.18 21:43,CET,"1.0,5.0","nan,Same","nan,Sometimes","nan,Sometimes",
1,24,27.02.18 11:54,CET,"7.0,7.0","Same,Same","Often,Often","Often,Often",
2,24,03.05.18 07:53,CEST,"2.0,2.0","Same,Same","Sometimes,Sometimes","Never,Never",
3,27,06.02.18 21:45,CET,"4.0,3.0","Worse,Same","Often,Often","Regularly,Always",


Multiple answers the same day

In [129]:
query = '''
SELECT subjectID, SUBSTRING(timestamp, 1, 8) AS date, GROUP_CONCAT(VAS) AS VAS, GROUP_CONCAT(phF) AS phF, GROUP_CONCAT(MF) AS MF, GROUP_CONCAT(ReIP) AS ReIP, GROUP_CONCAT(Sport) AS Sport
FROM Y
GROUP BY subjectID, date
HAVING COUNT(*) > 1;'''
sqldf(query)

Unnamed: 0,subjectID,date,VAS,phF,MF,ReIP,Sport
0,4,30.03.19,"3.0,2.0","Never,Never","Never,Never","Better,Better","No,nan"
1,5,04.04.19,"6.0,2.0","Sometimes,Never","Sometimes,Never","Better,Better","No,No"
2,5,07.04.19,"9.0,3.0","Sometimes,Sometimes","Sometimes,Never","Better,Better","Yes,Yes"
3,6,05.04.19,"2.0,3.0","Sometimes,Sometimes","Sometimes,Sometimes","Better,Same","No,No"
4,8,09.06.19,"2.0,1.0","Sometimes,Never","Never,Never","Better,Better","No,No"
5,10,06.05.19,"2.0,2.0","Sometimes,Sometimes","Never,Never","Better,Same","No,No"
6,10,08.05.19,"2.0,1.0","Sometimes,Never","Never,Never","Better,Better","No,No"
7,10,10.05.19,"3.0,2.0","Sometimes,Never","Sometimes,Never","Worse,Better","No,No"
8,15,07.06.19,"1.0,7.0","Sometimes,Sometimes","Never,Never","Same,Same","Yes,Yes"
9,15,09.06.19,"1.0,1.0","Sometimes,nan","Never,nan","Worse,nan","Yes,nan"


# Join data

Physiological data

In [82]:
physio = physio.fillna(value='NaN') # otherwise SQL will ignore None values

In [88]:
query = '''
SELECT SubjectID AS subjectID, SUBSTRING(Timestamp, 1, 8) AS date, GROUP_CONCAT(ActivityCounts) AS ActivityCounts, GROUP_CONCAT(Barometer) AS Barometer, GROUP_CONCAT(BloodPerfusion) AS BloodPerfusion, GROUP_CONCAT(BloodPulseWave) AS BloodPulseWave, GROUP_CONCAT(EnergyExpenditure) AS EnergyExpenditure, GROUP_CONCAT(GalvanicSkinResponse) AS GalvanicSkinResponse, GROUP_CONCAT(HR) AS HR, GROUP_CONCAT(HRV) AS HRV, GROUP_CONCAT(RESP) AS RESP, GROUP_CONCAT(Steps) AS Steps, GROUP_CONCAT(SkinTemperature) AS SkinTemperature, GROUP_CONCAT(ActivityClass) AS ActivityClass
FROM physio
GROUP BY subjectID, date;'''
X = sqldf(query)

In [89]:
query = '''
SELECT *
FROM X
LIMIT 10;'''
sqldf(query)

Unnamed: 0,subjectID,date,ActivityCounts,Barometer,BloodPerfusion,BloodPulseWave,EnergyExpenditure,GalvanicSkinResponse,HR,HRV,RESP,Steps,SkinTemperature,ActivityClass
0,1,14.03.19,"0.784313725,1.176470588,2.352941176,1.17647058...","981.25,981.2,981.2,981.1,981.2,981.3,981.2,981...","0.0,0.175,0.19,0.11,0.1,0.09,NaN,NaN,NaN,NaN,N...","0.0,1.9,1.82,1.94,1.68,1.09,NaN,NaN,NaN,NaN,Na...","753.6365662,1256.060944,1256.060944,1256.06094...","NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,Na...","83.0,75.40363269,69.11265332,69.78116438,72.30...","NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,Na...","NaN,NaN,14.5625,12.17552786,12.531234,14.69469...","0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,NaN,0....","28.28,29.44,30.41,31.0,31.455,31.795,32.22,31....","NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,Na..."
1,1,15.03.19,"0.784313725,0.784313725,0.784313725,0.78431372...","984.4,984.5,984.3,984.4,984.3,984.4,984.3,984....","0.875,0.825,0.975,0.98,1.0,0.985,0.96,0.93,0.9...","2.27,2.18,2.19,2.18,2.02,2.27,2.27,2.49,2.17,2...","1311.85318,1311.85318,1311.85318,1311.85318,13...","2.471338889,2.46735,2.466372222,2.4621,2.45465...","67.54923531,57.26287697,55.2253739,58.43175416...","44.92312073,51.26666667,57.7,58.01666667,57.41...","14.34420472,14.21430416,12.45540139,14.6309642...","0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0....","36.13,36.19,36.25,36.31,36.44,36.5,36.59,36.69...","NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,Na..."
2,1,16.03.19,"0.784313725,1.176470588,5.098039216,0.78431372...","990.7,990.6,990.7,990.7,990.8,990.75,990.8,990...","1.045,0.79,0.84,0.585,0.415,0.74,0.835,0.835,0...","2.32,2.22,2.44,2.18,2.45,2.44,2.24,2.28,2.18,2...","1311.85318,1311.85318,1311.85318,1311.85318,13...","1.172788889,1.171305556,1.061122222,3.09794444...","64.65655738,63.83929024,63.86047157,62.6129328...","31.3815721,34.48333333,36.25,39.48333333,41.9,...","15.03026316,15.03544724,12.88235294,12.5884458...","0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0....","34.34,34.38,34.38,34.455,34.16,34.06,34.19,34....","NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,Na..."
3,1,17.03.19,"0.784313725,0.784313725,0.784313725,0.78431372...","982.15,982.05,982.0,982.0,982.0,982.0,982.0,98...","1.42,1.47,1.48,1.46,1.49,1.475,1.47,1.46,1.45,...","2.86,2.7,2.73,2.72,2.68,2.9,2.96,3.02,3.08,3.3...","1311.85318,1311.85318,1311.85318,1311.85318,13...","2.573777778,2.567694444,2.560622222,2.55622222...","69.66641006,70.49757838,70.17700916,69.3862999...","23.08598131,22.81666667,20.41666667,20.5166666...","14.23136959,13.68923699,16.7,16.6549496,13.494...","0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0....","35.63,35.63,35.44,35.41,35.38,35.38,35.38,35.3...","NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,Na..."
4,1,18.03.19,"0.784313725,1.176470588,0.784313725,0.78431372...","988.5,988.6,988.7,988.65,988.5,988.5,988.5,988...","1.48,1.49,1.155,1.525,1.38,1.45,1.555,1.41,1.6...","1.54,1.61,2.18,2.01,1.76,1.76,1.78,1.81,1.5,1....","1289.98896,1311.85318,1311.85318,1311.85318,13...","2.632423729,2.630555556,2.626594444,2.62318333...","57.42528736,59.38585964,65.7858185,58.39290306...","33.69491525,34.18333333,37.91988636,46.85,52.9...","15.35642063,14.75081392,14.21644803,10.5670285...","0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0....","36.94,36.97,36.97,36.69,36.53,36.34,36.19,35.9...","NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,Na..."
5,1,19.03.19,"0.784313725,0.784313725,0.784313725,0.78431372...","995.5,995.5,995.5,995.5,995.5,995.5,995.4,995....","0.92,0.97,0.97,0.99,0.995,0.975,0.855,0.83,0.9...","1.6,1.55,1.66,1.42,1.53,1.65,1.81,2.23,2.11,2....","1311.85318,1311.85318,1311.85318,1311.85318,13...","2.708977778,2.695766667,2.687044444,2.67635,2....","56.14445575,56.35464231,57.19795658,54.9835939...","46.43976494,46.36666667,47.43333333,44.6333333...","13.76580566,14.66740631,14.63070847,15.0659286...","0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0....","33.91,33.84,33.81,33.81,33.78,33.78,33.72,33.9...","NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,Na..."
6,1,20.03.19,"0.784313725,0.784313725,0.784313725,0.78431372...","1002.0,1001.9,1001.95,1002.0,1002.0,1002.0,100...","1.27,1.55,1.49,1.515,1.5,1.48,1.485,1.46,1.45,...","2.0,1.87,1.96,1.92,1.9,1.84,1.9,1.99,1.96,1.89...","1311.85318,1311.85318,1311.85318,1311.85318,13...","NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,Na...","52.63032548,53.35165125,50.74380601,50.8068331...","64.95,65.5,67.45484509,66.46666667,60.66617575...","12.61840121,13.39650074,13.60103118,14.1496332...","0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0....","35.72,35.56,35.36,35.19,35.0,34.84,34.75,34.59...","NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,Na..."
7,1,21.03.19,"0.784313725,0.784313725,0.784313725,0.78431372...","1003.5,1003.5,1003.5,1003.6,1003.6,1003.6,1003...","0.78,0.785,0.8,0.8,0.81,0.82,0.82,0.81,0.83,0....","2.54,2.51,2.54,2.54,2.44,2.6,2.62,2.4,2.5,2.51...","1311.85318,1311.85318,1311.85318,1311.85318,13...","NaN,NaN,4.994809524,4.985632653,4.950655556,4....","65.28477491,65.71710526,65.48634294,65.2672392...","21.6,22.0,22.0,22.1,22.0,21.38333333,20.55,20....","16.19938493,14.53333333,15.303368,16.1539437,1...","0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0....","36.59,36.69,36.75,36.81,36.91,36.97,37.03,37.0...","NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,Na..."
8,2,16.04.19,"1.176470588,1.176470588,0.784313725,3.52941176...","986.8,986.6,986.8,986.7,986.7,986.7,986.7,986....","0.44,0.46,0.71,0.7,0.745,0.78,0.88,0.735,0.81,...","3.0,2.96,2.97,3.24,2.54,2.64,2.62,3.15,3.5,3.1...","776.4142323,1330.995827,1330.995827,1330.99582...","NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,Na...","65.2020202,68.51652893,65.31147541,71.66792672...","NaN,NaN,55.60869565,56.63333333,54.81666667,54...","16.0,15.41666667,13.57201965,11.98377388,15.46...","0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,0.0,0.0,49.0,...","33.34,33.19,33.09,33.03,32.97,32.97,33.0,33.03...","NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,Na..."
9,3,21.03.19,"1.176470588,1.176470588,1.176470588,1.17647058...","997.9,997.9,998.0,998.0,998.0,998.0,998.0,NaN,...","NaN,0.05,0.07,0.07,0.05,0.04,0.04,NaN,NaN,NaN,...","NaN,1.8,1.44,1.45,1.58,1.73,1.72,NaN,NaN,NaN,N...","88.73305511,1330.995827,1330.995827,1330.99582...","NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,Na...","NaN,78.40410418,78.07865169,74.62928055,79.711...","NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,Na...","NaN,23.76666667,15.9504668,11.81666667,14.5336...","0.0,0.0,0.0,0.0,0.0,0.0,0.0,NaN,NaN,NaN,0.0,0....","31.205,31.455,32.31,32.75,33.19,33.425,33.56,N...","NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,Na..."


Fatigue (PROs) data

In [148]:
Y = Y.fillna(value='NaN') # otherwise SQL will ignore None values

In [106]:
query = '''
SELECT *
FROM Y
LIMIT 10;
'''
sqldf(query)

Unnamed: 0,subjectID,timestamp,timezone,VAS,ReIP,phF,MF,Sport
0,1,14.03.19 20:01,UTC,2.0,Worse,Never,Sometimes,No
1,1,15.03.19 20:01,UTC,2.0,Better,Sometimes,Sometimes,Yes
2,1,16.03.19 20:47,UTC,1.0,Same,Never,Never,No
3,1,17.03.19 20:01,UTC,1.0,Same,Sometimes,Never,Yes
4,1,18.03.19 20:13,UTC,2.0,Worse,Sometimes,Never,Yes
5,1,20.03.19 20:21,UTC,3.0,Worse,Sometimes,Sometimes,Yes
6,1,21.03.19 13:05,UTC,1.0,Better,Never,Never,No
7,2,16.04.19 20:03,UTC,3.0,Better,Never,Never,No
8,2,17.04.19 21:00,UTC,3.0,Same,Never,Sometimes,No
9,2,18.04.19 20:33,UTC,4.0,Same,Sometimes,Never,Yes


In [111]:
query = '''
SELECT *
FROM Y
WHERE VAS LIKE '%,%' OR pHF LIKE '%,%' OR MF LIKE '%,%' OR Sport LIKE '%,%';
'''
sqldf(query)

Unnamed: 0,subjectID,timestamp,timezone,VAS,ReIP,phF,MF,Sport
0,24,06.02.18 21:43,CET,"1.0,5.0","nan,Same","nan,Sometimes","nan,Sometimes",
1,24,27.02.18 11:54,CET,"7.0,7.0","Same,Same","Often,Often","Often,Often",
2,24,03.05.18 07:53,CEST,"2.0,2.0","Same,Same","Sometimes,Sometimes","Never,Never",
3,27,06.02.18 21:45,CET,"4.0,3.0","Worse,Same","Often,Often","Regularly,Always",


Combine same day data

In [167]:
query = '''
SELECT subjectID, SUBSTRING(Timestamp, 1, 8) AS date, GROUP_CONCAT(DISTINCT timezone) AS timezone, GROUP_CONCAT(VAS) AS VAS, GROUP_CONCAT(ReIP) AS ReIP, GROUP_CONCAT(phF) AS phF, GROUP_CONCAT(MF) AS MF, GROUP_CONCAT(Sport) AS Sport, COUNT(*) AS n_answers
FROM (
    SELECT SubjectID as subjectID, DateTime AS timestamp, Timezone AS timezone,
        CASE
            WHEN PROquestion LIKE 'Describe fatigue on a scale of 1 to 10, where 1 means you don’t feel tired at all and 10 means the worst tiredness you can imagine' THEN PROanswer_value
        END AS 'VAS',
        CASE
            WHEN PROquestion LIKE 'Physically, today how often did you feel exhausted?' THEN PROanswer_choice
        END AS 'phF',
        CASE
            WHEN PROquestion LIKE 'Mentally, today how often did you feel exhausted?' THEN PROanswer_choice
        END AS 'MF',
        CASE
            WHEN PROquestion LIKE 'Are you feeling better, worse or the same as yesterday?' THEN PROanswer_choice
        END AS 'ReIP',
        CASE
            WHEN PROquestion LIKE 'Did you do sport today?' THEN PROanswer_choice
        END AS 'sport'
    FROM fatigue
)
GROUP BY subjectID, date
ORDER BY n_answers DESC;
'''
Y_daily = sqldf(query)
Y_daily

Unnamed: 0,subjectID,date,timezone,VAS,ReIP,phF,MF,Sport,n_answers
0,24,06.02.18,CET,"5.0,5.0,1.0,5.0,2.0","Same,Same,Same,Worse","Regularly,Regularly,Sometimes,Sometimes","Sometimes,Sometimes,Sometimes,Sometimes",,20
1,24,07.02.18,CET,"4.0,3.0,5.0","Same,Same,Worse","Sometimes,Never,Regularly","Sometimes,Never,Sometimes",,12
2,4,30.03.19,UTC,"3.0,2.0","Better,Better","Never,Never","Never,Never",No,10
3,5,04.04.19,UTC,"6.0,2.0","Better,Better","Sometimes,Never","Sometimes,Never","No,No",10
4,5,07.04.19,UTC,"9.0,3.0","Better,Better","Sometimes,Sometimes","Sometimes,Never","Yes,Yes",10
...,...,...,...,...,...,...,...,...,...
472,27,31.08.18,CEST,3.0,Worse,Sometimes,Sometimes,,4
473,28,09.08.18,CEST,5.0,Same,Never,Never,,4
474,28,10.08.18,CEST,4.0,Same,Never,Sometimes,,4
475,28,13.08.18,CEST,4.0,Same,Sometimes,Regularly,,4


Combine same day data and convert to numerical classes

In [181]:
query = '''
WITH fatigue_scores AS (
    SELECT SubjectID as subjectID, DateTime AS timestamp, Timezone AS timezone,
    CASE
        WHEN PROquestion LIKE 'Describe fatigue on a scale of 1 to 10, where 1 means you don’t feel tired at all and 10 means the worst tiredness you can imagine' THEN PROanswer_value
    END AS 'VAS',
    CASE
        WHEN PROquestion LIKE 'Physically, today how often did you feel exhausted?' THEN PROanswer_choice
    END AS 'phF',
    CASE
        WHEN PROquestion LIKE 'Mentally, today how often did you feel exhausted?' THEN PROanswer_choice
    END AS 'MF',
    CASE
        WHEN PROquestion LIKE 'Are you feeling better, worse or the same as yesterday?' THEN PROanswer_choice
    END AS 'ReIP',
    CASE
        WHEN PROquestion LIKE 'Did you do sport today?' THEN PROanswer_choice
    END AS 'sport'
    FROM fatigue
),
fatigue_classes AS (
    SELECT subjectID, timestamp, timezone,
    CASE
        WHEN VAS BETWEEN 1 AND 4 THEN 0
        WHEN VAS BETWEEN 5 AND 10 THEN 1
    END AS 'VAS',
    CASE
        WHEN phF LIKE 'never' THEN 0
        WHEN phF LIKE 'sometimes' OR phF LIKE 'regularly' OR phF LIKE 'often' OR phF LIKE 'always' THEN 1
    END AS 'phF',
    CASE
        WHEN MF LIKE 'never' THEN 0
        WHEN MF LIKE 'sometimes' OR MF LIKE 'regularly' OR MF LIKE 'often' OR MF LIKE 'always' THEN 1
    END AS 'MF',
    CASE
        WHEN ReIP LIKE 'worse' THEN -1
        WHEN ReIP LIKE 'same' THEN 0
        WHEN ReIP LIKE 'better' THEN 1
    END AS 'ReIP',
    CASE
        WHEN sport LIKE 'No' THEN 0
        WHEN sport LIKE 'Yes' THEN 1
    END AS 'sport'
    FROM fatigue_scores
)
SELECT subjectID, SUBSTRING(Timestamp, 1, 8) AS date, GROUP_CONCAT(DISTINCT timezone) AS timezone, GROUP_CONCAT(VAS) AS VAS, GROUP_CONCAT(ReIP) AS ReIP, GROUP_CONCAT(phF) AS phF, GROUP_CONCAT(MF) AS MF, GROUP_CONCAT(sport) AS sport, COUNT(*) AS n_answers
FROM fatigue_classes
GROUP BY subjectID, date
ORDER BY n_answers DESC;
'''
Y_daily_classes = sqldf(query)
Y_daily_classes

Unnamed: 0,subjectID,date,timezone,VAS,ReIP,phF,MF,sport,n_answers
0,24,06.02.18,CET,11010,"0,0,0,-1",1111,1111,,20
1,24,07.02.18,CET,001,"0,0,-1",101,101,,12
2,4,30.03.19,UTC,00,11,00,00,0,10
3,5,04.04.19,UTC,10,11,10,10,00,10
4,5,07.04.19,UTC,10,11,11,10,11,10
...,...,...,...,...,...,...,...,...,...
472,27,31.08.18,CEST,0,-1,1,1,,4
473,28,09.08.18,CEST,1,0,0,0,,4
474,28,10.08.18,CEST,0,0,0,1,,4
475,28,13.08.18,CEST,0,0,1,1,,4


In [180]:
fatigue

Unnamed: 0,SubjectID,DateTime,Timezone,PROquestion,PROanswer_value,PROanswer_choice
0,1,14.03.19 20:01,UTC,"Describe fatigue on a scale of 1 to 10, where ...",2.0,
1,1,14.03.19 20:01,UTC,"Are you feeling better, worse or the same as y...",,Worse
2,1,14.03.19 20:01,UTC,"Physically, today how often did you feel exhau...",,Never
3,1,14.03.19 20:01,UTC,"Mentally, today how often did you feel exhausted?",,Sometimes
4,1,14.03.19 20:01,UTC,Did you do sport today?,,No
...,...,...,...,...,...,...
2266,28,16.08.18 00:51,CEST,"Mentally, today how often did you feel exhausted?",,Sometimes
2267,28,16.08.18 00:52,CEST,"Describe fatigue on a scale of 1 to 10, where ...",4.0,
2268,28,16.08.18 00:52,CEST,"Are you feeling better, worse or the same as y...",,Better
2269,28,16.08.18 00:52,CEST,"Physically, today how often did you feel exhau...",,Never


In [182]:
query = '''
WITH fatigue_scores AS (
    SELECT SubjectID as subjectID, DateTime AS timestamp, Timezone AS timezone,
    CASE
        WHEN PROquestion LIKE 'Describe fatigue on a scale of 1 to 10, where 1 means you don’t feel tired at all and 10 means the worst tiredness you can imagine' THEN PROanswer_value
    END AS 'VAS',
    CASE
        WHEN PROquestion LIKE 'Physically, today how often did you feel exhausted?' THEN PROanswer_choice
    END AS 'phF',
    CASE
        WHEN PROquestion LIKE 'Mentally, today how often did you feel exhausted?' THEN PROanswer_choice
    END AS 'MF',
    CASE
        WHEN PROquestion LIKE 'Are you feeling better, worse or the same as yesterday?' THEN PROanswer_choice
    END AS 'ReIP',
    CASE
        WHEN PROquestion LIKE 'Did you do sport today?' THEN PROanswer_choice
    END AS 'sport'
    FROM fatigue
),
fatigue_classes AS (
    SELECT subjectID, timestamp, timezone,
    CASE
        WHEN VAS BETWEEN 1 AND 4 THEN 0
        WHEN VAS BETWEEN 5 AND 10 THEN 1
    END AS 'VAS',
    CASE
        WHEN phF LIKE 'never' THEN 0
        WHEN phF LIKE 'sometimes' OR phF LIKE 'regularly' OR phF LIKE 'often' OR phF LIKE 'always' THEN 1
    END AS 'phF',
    CASE
        WHEN MF LIKE 'never' THEN 0
        WHEN MF LIKE 'sometimes' OR MF LIKE 'regularly' OR MF LIKE 'often' OR MF LIKE 'always' THEN 1
    END AS 'MF',
    CASE
        WHEN ReIP LIKE 'worse' THEN -1
        WHEN ReIP LIKE 'same' THEN 0
        WHEN ReIP LIKE 'better' THEN 1
    END AS 'ReIP',
    CASE
        WHEN sport LIKE 'No' THEN 0
        WHEN sport LIKE 'Yes' THEN 1
    END AS 'sport'
    FROM fatigue_scores
)
SELECT *
FROM fatigue_scores;
'''
sqldf(query)

Unnamed: 0,subjectID,timestamp,timezone,VAS,phF,MF,ReIP,sport
0,1,14.03.19 20:01,UTC,2.0,,,,
1,1,14.03.19 20:01,UTC,,,,Worse,
2,1,14.03.19 20:01,UTC,,Never,,,
3,1,14.03.19 20:01,UTC,,,Sometimes,,
4,1,14.03.19 20:01,UTC,,,,,No
...,...,...,...,...,...,...,...,...
2266,28,16.08.18 00:51,CEST,,,Sometimes,,
2267,28,16.08.18 00:52,CEST,4.0,,,,
2268,28,16.08.18 00:52,CEST,,,,Better,
2269,28,16.08.18 00:52,CEST,,Never,,,


In [210]:
query = '''
WITH fatigue_scores AS (
    SELECT SubjectID as subjectID, DateTime AS timestamp, Timezone AS timezone,
    CASE
        WHEN PROquestion LIKE 'Describe fatigue on a scale of 1 to 10, where 1 means you don’t feel tired at all and 10 means the worst tiredness you can imagine' THEN PROanswer_value
    END AS 'VAS',
    CASE
        WHEN PROquestion LIKE 'Physically, today how often did you feel exhausted?' THEN PROanswer_choice
    END AS 'phF',
    CASE
        WHEN PROquestion LIKE 'Mentally, today how often did you feel exhausted?' THEN PROanswer_choice
    END AS 'MF',
    CASE
        WHEN PROquestion LIKE 'Are you feeling better, worse or the same as yesterday?' THEN PROanswer_choice
    END AS 'ReIP',
    CASE
        WHEN PROquestion LIKE 'Did you do sport today?' THEN PROanswer_choice
    END AS 'sport'
    FROM fatigue
),
fatigue_by_timestamp AS (
    SELECT subjectID, timestamp, GROUP_CONCAT(DISTINCT timezone) AS timezone, GROUP_CONCAT(VAS) AS VAS, GROUP_CONCAT(ReIP) AS ReIP, GROUP_CONCAT(phF) AS phF, GROUP_CONCAT(MF) AS MF, GROUP_CONCAT(sport) AS sport, COUNT(*) AS n_answers
    FROM fatigue_scores
    GROUP BY timestamp
)
SELECT *
FROM fatigue_by_timestamp
'''
sqldf(query)

Unnamed: 0,subjectID,timestamp,timezone,VAS,ReIP,phF,MF,sport,n_answers
0,23,01.01.18 21:00,CET,"2.0,4.0","Better,Worse","Never,Regularly","Never,Regularly",,8
1,23,01.02.18 21:00,CET,"6.0,5.0,7.0","Better,Same,Same","Regularly,Sometimes,Sometimes","Sometimes,Sometimes,Regularly",,12
2,24,01.03.18 13:28,CET,4.0,Better,Regularly,Regularly,,4
3,25,01.03.18 20:43,CET,2.0,Better,Sometimes,Never,,4
4,27,01.03.18 21:51,CET,1.0,Better,Sometimes,Never,,4
...,...,...,...,...,...,...,...,...,...
474,22,31.05.19 22:39,UTC,2.0,Same,Sometimes,Never,No,5
475,26,31.07.18 01:38,CEST,3.0,Better,Sometimes,Sometimes,,4
476,26,31.07.18 20:57,CEST,3.0,Same,Sometimes,Sometimes,,4
477,27,31.08.18 20:24,CEST,3.0,Worse,Sometimes,Sometimes,,4


Same day questionnaires aggregation

In [153]:
query = '''
SELECT subjectID, SUBSTRING(Timestamp, 1, 8) AS date, GROUP_CONCAT(timezone) AS timezone, GROUP_CONCAT(VAS) AS VAS, GROUP_CONCAT(ReIP) AS ReIP, GROUP_CONCAT(phF) AS phF, GROUP_CONCAT(MF) AS MF, GROUP_CONCAT(Sport) AS Sport, COUNT(*) AS n_answers
FROM Y
GROUP BY subjectID, date
ORDER BY n_answers DESC;
'''
Y_daily = sqldf(query)
Y_daily

Unnamed: 0,subjectID,date,timezone,VAS,ReIP,phF,MF,Sport,n_answers
0,24,06.02.18,"CET,CET,CET,CET","5.0,5.0,1.0,5.0,2.0","Same,Same,nan,Same,Worse","Regularly,Regularly,nan,Sometimes,Sometimes","Sometimes,Sometimes,nan,Sometimes,Sometimes","NaN,NaN,NaN,NaN",4
1,24,07.02.18,"CET,CET,CET","4.0,3.0,5.0","Same,Same,Worse","Sometimes,Never,Regularly","Sometimes,Never,Sometimes","NaN,NaN,NaN",3
2,4,30.03.19,"UTC,UTC","3.0,2.0","Better,Better","Never,Never","Never,Never","No,nan",2
3,5,04.04.19,"UTC,UTC","6.0,2.0","Better,Better","Sometimes,Never","Sometimes,Never","No,No",2
4,5,07.04.19,"UTC,UTC","9.0,3.0","Better,Better","Sometimes,Sometimes","Sometimes,Never","Yes,Yes",2
...,...,...,...,...,...,...,...,...,...
472,27,31.08.18,CEST,3.0,Worse,Sometimes,Sometimes,,1
473,28,09.08.18,CEST,5.0,Same,Never,Never,,1
474,28,10.08.18,CEST,4.0,Same,Never,Sometimes,,1
475,28,13.08.18,CEST,4.0,Same,Sometimes,Regularly,,1


Convert to numerical classes

In [155]:
query = '''
WITH multiple_answers AS (
    SELECT *
    FROM Y
    WHERE VAS LIKE '%,%' OR pHF LIKE '%,%' OR MF LIKE '%,%' OR Sport LIKE '%,%'
),
single_answers AS (
    SELECT *
    FROM Y EXCEPT SELECT * FROM multiple_answers
)
SELECT subjectID, timestamp, timezone,
    CASE
        WHEN VAS BETWEEN 1 AND 4 THEN 0
        WHEN VAS BETWEEN 5 AND 10 THEN 1
        ELSE 'NaN'
    END AS 'VAS',
    CASE
        WHEN phF LIKE 'never' THEN 0
        WHEN phF LIKE 'sometimes' OR phF LIKE 'regularly' OR phF LIKE 'often' OR phF LIKE 'always' THEN 1
        ELSE 'NaN'
    END AS 'phF',
    CASE
        WHEN MF LIKE 'never' THEN 0
        WHEN MF LIKE 'sometimes' OR MF LIKE 'regularly' OR MF LIKE 'often' OR MF LIKE 'always' THEN 1
        ELSE 'NaN'
    END AS 'MF',
    CASE
        WHEN ReIP LIKE 'worse' THEN -1
        WHEN ReIP LIKE 'same' THEN 0
        WHEN ReIP LIKE 'better' THEN 1
        ELSE 'NaN'
    END AS 'ReIP',
    CASE
        WHEN Sport LIKE 'No' THEN 0
        WHEN Sport LIKE 'Yes' THEN 1
        ELSE 'NaN'
    END AS 'Sport'
FROM single_answers;
'''
Y_classes = sqldf(query)
Y_classes

Unnamed: 0,subjectID,timestamp,timezone,VAS,phF,MF,ReIP,Sport
0,1,14.03.19 20:01,UTC,0,0,1,-1,0
1,1,15.03.19 20:01,UTC,0,1,1,1,1
2,1,16.03.19 20:47,UTC,0,0,0,0,0
3,1,17.03.19 20:01,UTC,0,1,0,0,1
4,1,18.03.19 20:13,UTC,0,1,0,-1,1
...,...,...,...,...,...,...,...,...
513,28,10.08.18 23:13,CEST,,0,1,0,
514,28,13.08.18 21:39,CEST,,1,1,0,
515,28,14.08.18 23:27,CEST,,0,1,-1,
516,28,16.08.18 00:51,CEST,,1,1,1,


In [157]:
query = '''
SELECT subjectID, SUBSTRING(timestamp, 1, 8) AS date, GROUP_CONCAT(VAS) AS VAS, GROUP_CONCAT(phF) AS phF, GROUP_CONCAT(MF) AS MF, GROUP_CONCAT(ReIP) AS ReIP, GROUP_CONCAT(Sport) AS Sport, COUNT(*) AS n_answers
FROM Y_classes
GROUP BY subjectID, date
ORDER BY n_answers DESC;'''
sqldf(query)

Unnamed: 0,subjectID,date,VAS,phF,MF,ReIP,Sport,n_answers
0,24,06.02.18,"NaN,NaN,0",111,111,"0,0,-1","NaN,NaN,NaN",3
1,24,07.02.18,"NaN,0,NaN",101,101,"0,0,-1","NaN,NaN,NaN",3
2,4,30.03.19,00,00,00,11,"0,NaN",2
3,5,04.04.19,"NaN,0",10,10,11,00,2
4,5,07.04.19,"NaN,0",11,10,11,11,2
...,...,...,...,...,...,...,...,...
469,27,31.08.18,0,1,1,-1,,1
470,28,09.08.18,,0,0,0,,1
471,28,10.08.18,,0,1,0,,1
472,28,13.08.18,,1,1,0,,1
