# Goal of These Studies
The goal of this study was to detect the asymptomatic incubation period, which is the period during which we want to reduce viral transmission, and the intent of using a machine learning algorithm was to do so only using ECG physiological and electrophysiological data. This model was, in the vast majority of the cases, able to detect the asymptomatic state well before the onset of symptoms, like a fever.

In [2]:
import pandas as pd
import numpy as np
import os

def find_csv_filenames(path_to_dir, suffix=".csv"):
    filenames = os.listdir(path_to_dir)
    return [filename for filename in filenames if filename.endswith(suffix)]

tables = find_csv_filenames('/Users/camilledunning/Desktop/MachineLearningPython/scripps-data')
tables = [c for c in tables if c != 'SubjectMetaData.csv-Table 1.csv']
tables

['MARV-Cyno-IM.csv-Table 1.csv',
 'EBOV-Cyno-Aero-1.csv-Table 1.csv',
 'MARV-Rhesus-Aero.csv-Table 1.csv',
 'NIPAH-AGM-IT.csv-Table 1.csv',
 'LASSA-Cyno-Aero.csv-Table 1.csv',
 'EBOV-Cyno-Aero-2.csv-Table 1.csv',
 'Plague-AGM-Aero.csv-Table 1.csv']

# Marburg Virus - Cynomolgus Macaques
* 9 subjects
* For subject with the most data, 4 days before exposure, 10 days after exposure

In [4]:
mar1 = pd.read_csv(tables[0])
[sub + str(mar1[mar1['SubjectName'] == sub].shape) for sub in np.unique(mar1['SubjectName'])]
print(max(mar1[(mar1['SubjectName'] == 'mci003') & (mar1['Y'] == 0)]['Day']) + 1)
print(max(mar1[(mar1['SubjectName'] == 'mci003') & (mar1['Y'] == 1)]['Day']) + 1)

FileNotFoundError: [Errno 2] File b'MARV-Cyno-IM.csv-Table 1.csv' does not exist: b'MARV-Cyno-IM.csv-Table 1.csv'

## Some Data Type Meanings
* **AOPSystolic** - Aortic pressure, blood pressure at root of aorta, systolic: force of blood against artery walls while ventricles squeeze
* **AOPDiastolic** - Force of blood against artery walls as ventricles in heart relax
* **Resp** - Respiration rate
* **Temp** - Temperature measurement
* **PR** - PR interval on ECG, beginning of P wave (wave generated by depolarization front as it transits the atria)
* **QT** - Time from start of the Q wave to the end of the T wave
* **LVPSystolic/Diastolic** - Left ventricular pressure
* **Bazett** - Bazett formula, corrects measured QT interval to a value attributable to a heart rate of 60 BPM
* **Fridericia** - Fridericia formula, maybe more accurate than the Bazett formula, as that formula has been said to overcorrect at fast heart rates and undercorrect at slow heart rates

In [5]:
def gen_table(table):
    df = pd.read_csv(table)
    cols = df.columns[5:]
    _, idx = np.unique([col.split('_')[0] for col in cols], return_index=True)
    cols = cols[np.sort(idx)]
    tbl = pd.DataFrame(cols).set_index([0])
    tbl.index.rename('Data Types', inplace=True)
    tbl['Sampling Frequency'] = list(range(0, len(tbl)))

    s_freqs = []

    for sub in np.unique(df['SubjectName']):
        for y in np.unique(df['Y']):
            for d in np.unique(df[df['SubjectName'] == sub]['Day']):
                s_freqs.append(len(df[(df['SubjectName'] == sub) & (df['Y'] == y) & (df['Day'] == d)]))

    s_freqs = [f for f in s_freqs if f != 0]
    s_freqs

    median_s_freq = round(np.median(s_freqs), 0)
    tbl['Sampling Frequency'] = np.repeat(median_s_freq, len(tbl))

    num_missing_data = abs(sum(s_freqs - median_s_freq))
    percent_missing_data = round(num_missing_data / len(df), 3)
    tbl['Percent Data Missing'] = np.repeat(percent_missing_data, len(tbl))

    if all(df):
        tbl['# of Subjects Represented'] = np.repeat(len(np.unique(df['SubjectName'])), len(tbl))

    tbl['Median'] = [np.median(df[col]) for col in cols]
    tbl['Q25'] = [df[col].quantile(0.25) for col in cols]
    tbl['Q75'] = [df[col].quantile(0.75) for col in cols]
    tbl['Min'] = [min(df[col]) for col in cols]
    tbl['Max'] = [max(df[col]) for col in cols]
    return tbl
    
gen_table(tables[0])

UnboundLocalError: local variable 'cols' referenced before assignment

# Ebola - Cynomolgus Macaques #1

In [59]:
ebo1 = pd.read_csv(tables[1])
print(len(np.unique(ebo1['SubjectName'])))
print([sub + str(ebo1[ebo1['SubjectName'] == sub].shape) for sub in np.unique(ebo1['SubjectName'])])
print(max(ebo1[(ebo1['SubjectName'] == 'e1009') & (ebo1['Y'] == 0)]['Day']) + 1)
print(max(ebo1[(ebo1['SubjectName'] == 'e1009') & (ebo1['Y'] == 1)]['Day']) + 1)

6
['e1001(604, 45)', 'e1004(632, 45)', 'e1005(650, 45)', 'e1009(669, 45)', 'e1011(574, 45)', 'e1015(641, 45)']
8
8


* 6 subjects
* For subject with the most data, 8 days before exposure, 8 days after exposure

In [60]:
gen_table(tables[1])

Unnamed: 0_level_0,Sampling Frequency,Percent Data Missing,# of Subjects Represented,Median,Q25,Q75,Min,Max
Data Types,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AOPAMean_Mean,48.0,0.082,6,-0.044589,-0.601681,0.566622,-12.982389,9.683679
AOPDiastolic_Mean,48.0,0.082,6,-0.046605,-0.573127,0.537039,-12.121885,9.311269
AOPSystolic_Mean,48.0,0.082,6,-0.040211,-0.616388,0.58308,-13.789496,10.02746
Activity_Mean,48.0,0.082,6,0.000494,-0.053321,0.016397,-1.510156,2.104627
Bazett_Mean,48.0,0.082,6,-0.029136,-0.365756,0.255027,-4.087383,6.306805
Fridericia_Mean,48.0,0.082,6,-0.156367,-0.73105,0.293546,-6.192722,4.89496
HR_Mean,48.0,0.082,6,0.176912,-0.302573,0.919383,-2.790744,24.262667
PR_Mean,48.0,0.082,6,-0.047928,-0.444173,0.282391,-8.592986,6.672442
QRS_Mean,48.0,0.082,6,-0.075048,-0.446663,0.231754,-7.093461,4.266291
QT_Mean,48.0,0.082,6,-0.277198,-1.06148,0.334909,-10.98702,4.991559


# Marburg Virus - Rhesus Macaques

In [61]:
mar2 = pd.read_csv(tables[2])
print(len(np.unique(mar2['SubjectName'])))
print([sub + str(mar2[mar2['SubjectName'] == sub].shape) for sub in np.unique(mar2['SubjectName'])])
print(max(mar2[(mar2['SubjectName'] == 'mra002') & (mar2['Y'] == 0)]['Day']) + 1)
print(max(mar2[(mar2['SubjectName'] == 'mra002') & (mar2['Y'] == 1)]['Day']) + 1)

5
['mra001(704, 63)', 'mra002(705, 63)', 'mra003(685, 63)', 'mra004(688, 63)', 'mra005(666, 63)']
7
8


* 5 subjects
* For subject with the most data, 7 days before exposure, 8 days after exposure

In [62]:
gen_table(tables[2])

Unnamed: 0_level_0,Sampling Frequency,Percent Data Missing,# of Subjects Represented,Median,Q25,Q75,Min,Max
Data Types,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AOPSystolic_Mean,48.0,0.044,5,-0.101547,-0.629598,0.63412,-10.357273,8.875209
AOPDiastolic_Mean,48.0,0.044,5,-0.193828,-0.779765,0.399196,-10.93983,4.792388
RespMean_Mean,48.0,0.044,5,0.170465,-0.463914,0.865989,-4.857199,7.398475
Temp_Mean,48.0,0.044,5,0.426143,-0.334512,2.435568,-11.196541,32.095655
QT_Mean,48.0,0.044,5,-0.049829,-0.815337,0.648257,-8.53664,6.002094
PR_Mean,48.0,0.044,5,0.070935,-0.37314,0.579794,-7.064624,6.442386
QRS_Mean,48.0,0.044,5,0.076545,-0.31664,0.537907,-9.318058,6.822095
RR_Mean,48.0,0.044,5,0.200541,-0.406009,0.846467,-4.552082,5.568404
HR_Mean,48.0,0.044,5,-0.242146,-0.77473,0.377216,-3.415402,7.918673
LVPSystolic_Mean,48.0,0.044,5,-0.19701,-0.627188,0.355767,-10.608516,13.199828


# NIPAH Virus - African Green Monkey

In [63]:
ni1 = pd.read_csv(tables[3])
print(len(np.unique(ni1['SubjectName'])))
print([sub + str(ni1[ni1['SubjectName'] == sub].shape) for sub in np.unique(ni1['SubjectName'])])
print(max(ni1[(ni1['SubjectName'] == 'n1001') & (ni1['Y'] == 0)]['Day']) + 1)
print(max(ni1[(ni1['SubjectName'] == 'n1001') & (ni1['Y'] == 1)]['Day']) + 1)

7
['n1001(1199, 51)', 'n1002(633, 51)', 'n1003(713, 51)', 'n1004(1199, 51)', 'n2001(746, 51)', 'n2002(378, 51)', 'n2003(363, 51)']
6
21


* 7 subjects
* For subject with the most data, 6 days before exposure, 21 days after exposure

In [64]:
gen_table(tables[3])

Unnamed: 0_level_0,Sampling Frequency,Percent Data Missing,# of Subjects Represented,Median,Q25,Q75,Min,Max
Data Types,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AOPAMean_Mean,48.0,0.092,7,-0.129179,-0.782278,0.668341,-10.125261,11.523314
AOPDiastolic_Mean,48.0,0.092,7,-0.086859,-0.758263,0.689559,-8.435507,9.164154
AOPSystolic_Mean,48.0,0.092,7,-0.138715,-0.839969,0.586775,-11.386101,12.337743
Bazett_Mean,48.0,0.092,7,0.218998,-0.274254,0.905813,-3.125161,8.643236
Fridericia_Mean,48.0,0.092,7,0.042155,-0.555611,0.662946,-7.220124,8.555924
HR_Mean,48.0,0.092,7,0.358441,-0.326017,1.623131,-2.086436,14.786177
LVPDiastolic_Mean,48.0,0.092,7,-0.293824,-1.231214,0.173113,-137.094208,6.569729
LVPMean_Mean,48.0,0.092,7,-0.211752,-0.771987,0.388339,-4.022255,8.596418
LVPRate_Mean,48.0,0.092,7,0.346307,-0.321497,1.561563,-1.851359,99.298887
LVPSystolic_Mean,48.0,0.092,7,-0.158142,-0.66926,0.515249,-5.571174,11.963397


# Lassa Virus

In [65]:
la = pd.read_csv(tables[4])
print(len(np.unique(la['SubjectName'])))
print([sub + str(la[la['SubjectName'] == sub].shape) for sub in np.unique(la['SubjectName'])])
print(max(la[(la['SubjectName'] == 'l004') & (la['Y'] == 0)]['Day']) + 1)
print(max(la[(la['SubjectName'] == 'l004') & (la['Y'] == 1)]['Day']) + 1)

4
['l001(765, 63)', 'l002(868, 63)', 'l003(763, 63)', 'l004(2252, 63)']
7
41


* 4 subjects
* For subject with the most data, 7 days before exposure, 41 days after exposure

In [66]:
gen_table(tables[4])

Unnamed: 0_level_0,Sampling Frequency,Percent Data Missing,# of Subjects Represented,Median,Q25,Q75,Min,Max
Data Types,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AOPSystolic_Mean,48.0,0.053,4,-0.602288,-1.360249,-0.015445,-5.395446,4.789397
AOPDiastolic_Mean,48.0,0.053,4,-0.299385,-0.850813,0.260644,-4.077971,2.847689
RespMean_Mean,48.0,0.053,4,0.235189,-0.452365,1.302369,-3.641613,21.84154
Temp_Mean,48.0,0.053,4,1.086217,-0.227678,4.051556,-17.613714,28.416362
QT_Mean,48.0,0.053,4,-0.427122,-1.520196,0.539966,-8.987419,7.288494
PR_Mean,48.0,0.053,4,-0.482029,-1.494143,0.21198,-5.344537,4.722723
QRS_Mean,48.0,0.053,4,-0.048603,-0.597972,0.431852,-4.67235,3.382814
RR_Mean,48.0,0.053,4,-0.557252,-1.460536,0.205992,-5.148415,5.108762
HR_Mean,48.0,0.053,4,0.510774,-0.288236,1.805464,-3.037293,9.949064
LVPSystolic_Mean,48.0,0.053,4,-0.516158,-1.038237,-0.154436,-4.688526,5.629364


# Ebola - Cynomolgus Macaques #2

In [67]:
ebo2 = pd.read_csv(tables[5])
print(len(np.unique(ebo2['SubjectName'])))
print([sub + str(ebo2[ebo2['SubjectName'] == sub].shape) for sub in np.unique(ebo2['SubjectName'])])
print(max(ebo2[(ebo2['SubjectName'] == 'e2003') & (ebo2['Y'] == 0)]['Day']) + 1)
print(max(ebo2[(ebo2['SubjectName'] == 'e2003') & (ebo2['Y'] == 1)]['Day']) + 1)

8
['e2002(637, 45)', 'e2003(697, 45)', 'e2006(614, 45)', 'e2007(658, 45)', 'e2010(428, 45)', 'e2012(362, 45)', 'e2013(576, 45)', 'e2014(632, 45)']
7
8


* 8 subjects
* For subject with the most data, 7 days before exposure, 8 days after exposure

In [68]:
gen_table(tables[5])

Unnamed: 0_level_0,Sampling Frequency,Percent Data Missing,# of Subjects Represented,Median,Q25,Q75,Min,Max
Data Types,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AOPAMean_Mean,46.0,0.159,8,-0.003708,-0.514571,0.615087,-32.705997,22.009888
AOPDiastolic_Mean,46.0,0.159,8,0.02172,-0.460259,0.584153,-32.446644,22.13506
AOPSystolic_Mean,46.0,0.159,8,-0.037733,-0.554137,0.566964,-31.606238,20.505507
Activity_Mean,46.0,0.159,8,0.000847,-0.029624,0.018525,-0.923573,5.621855
Bazett_Mean,46.0,0.159,8,-0.10911,-0.538512,0.203893,-11.904234,28.178474
Fridericia_Mean,46.0,0.159,8,-0.274003,-1.11003,0.207119,-14.669014,19.217052
HR_Mean,46.0,0.159,8,0.26607,-0.276571,1.239282,-4.171943,80.900777
PR_Mean,46.0,0.159,8,-0.095962,-0.591451,0.299793,-17.577943,14.346717
QRS_Mean,46.0,0.159,8,-0.025106,-0.326806,0.269726,-5.101806,5.549823
QT_Mean,46.0,0.159,8,-0.450953,-1.690132,0.270463,-43.173638,10.317883


# Plague

In [69]:
pl = pd.read_csv(tables[6])
print(len(np.unique(pl['SubjectName'])))
print([sub + str(pl[pl['SubjectName'] == sub].shape) for sub in np.unique(pl['SubjectName'])])
print(max(pl[(pl['SubjectName'] == 'p002') & (pl['Y'] == 0)]['Day']) + 1)
print(max(pl[(pl['SubjectName'] == 'p002') & (pl['Y'] == 1)]['Day']) + 1)

4
['p001(422, 63)', 'p002(542, 63)', 'p003(472, 63)', 'p004(537, 63)']
7
6


* 4 subjects
* For subject with the most data, 7 days before exposure, 6 days after exposure

In [70]:
gen_table(tables[6])

Unnamed: 0_level_0,Sampling Frequency,Percent Data Missing,# of Subjects Represented,Median,Q25,Q75,Min,Max
Data Types,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AOPSystolic_Mean,47.0,0.12,4,0.110942,-0.425753,0.796712,-2.100714,18.915475
AOPDiastolic_Mean,47.0,0.12,4,0.234994,-0.342205,0.637629,-19.629225,6.44155
RespMean_Mean,47.0,0.12,4,-0.02804,-0.282588,0.309316,-6.269505,15.80354
Temp_Mean,47.0,0.12,4,0.151729,-0.594166,1.100494,-25.646292,37.77433
QT_Mean,47.0,0.12,4,-0.150204,-1.078289,0.401825,-11.964768,5.131973
PR_Mean,47.0,0.12,4,-0.18115,-0.816333,0.26299,-6.22611,4.764792
QRS_Mean,47.0,0.12,4,0.009656,-0.390772,0.338498,-4.012442,28.227331
RR_Mean,47.0,0.12,4,-0.203803,-0.950242,0.25013,-6.241178,2.844594
HR_Mean,47.0,0.12,4,0.149704,-0.248654,0.947132,-1.847398,14.367946
LVPSystolic_Mean,47.0,0.12,4,0.036734,-0.450513,0.727963,-11.430092,13.977909
