In [1]:
from bs4 import BeautifulSoup
import requests
import os
import threading

In [6]:
# Fetch the index of MIMIC-III waveform records (waveform database only) from PhysioNet.
ROOTURL = "https://physionet.org/files/mimic3wdb/1.0"
recordsRequest = requests.get(ROOTURL + "/RECORDS-waveforms")
# Fail loudly on an HTTP error: otherwise the lines of an error page would be
# split below and silently treated as record paths by the download cell.
recordsRequest.raise_for_status()

# Turn the response body into a list of record-path strings, one per line.
soup = BeautifulSoup(recordsRequest.text, 'html.parser')
# Only this window of the index is processed; widen the bounds to scrape more records.
RECORD_SLICE_START = 40000
RECORD_SLICE_STOP = 40010
all_record_paths = str(soup).splitlines()[RECORD_SLICE_START:RECORD_SLICE_STOP]

In [7]:
# Local path prefix that scrape_mimic prepends to a record path when checking
# whether a .dat/.hea file is already on disk.
folder_root = "physionet.org/files/mimic3wdb/1.0/"
# Filled by scrape_mimic with the path of every record that has both ABP and PLETH.
listOfGoodBoys = list()
# Not referenced by any code visible in this notebook.
idx = 0

def scrape_mimic_list(record_paths):
    """Run scrape_mimic on each entry of record_paths, one after another.

    Used as the thread target so that each worker handles its own batch of
    record paths sequentially. Returns None.
    """
    pending = iter(record_paths)
    for current_path in pending:
        scrape_mimic(record_path=current_path)

def scrape_mimic(record_path):
    """Download every record under one record folder that contains ABP and PLETH.

    For each name listed in the folder's RECORDS file, the record's .hea header
    is fetched and the last token of every line after the first is taken as a
    signal name. Records whose header lists both "ABP" and "PLETH" are appended
    to the module-level listOfGoodBoys and their .dat and .hea files are
    downloaded with wget, unless they already exist under folder_root.

    Parameters
    ----------
    record_path : str
        Folder path relative to ROOTURL, as read from RECORDS-waveforms
        (e.g. "36/3600314/"). It is concatenated directly with the record
        name, so it is expected to end with "/".

    Returns
    -------
    None
        Results are reported through listOfGoodBoys, the files written by
        wget, and progress messages on stdout.
    """
    # Local import: the notebook's import cell does not bring in subprocess.
    import subprocess

    print(record_path)

    # Look in each record's header for the presence of an ABP and a PLETH
    # waveform; if both exist the record's files are downloaded.
    records_response = requests.get(ROOTURL + "/" + record_path + "/RECORDS")
    if not records_response.ok:
        # Do not parse an HTTP error page as if it were a list of record names.
        print("SKIPPING -- could not read RECORDS for " + record_path)
        return
    all_records_in_folder = str(BeautifulSoup(records_response.text, 'html.parser')).splitlines()

    signal_matches = ["ABP", "PLETH"]

    for record in all_records_in_folder:
        record = record.strip()
        if not record:
            # A blank line would otherwise turn into a request for ".../.hea".
            continue

        header_response = requests.get(ROOTURL + "/" + record_path + record + ".hea")
        if not header_response.ok:
            # No usable header for this entry, so its signals cannot be checked.
            continue

        # The first header line is skipped; every following non-blank line
        # contributes its last whitespace-separated token as a signal name.
        header_lines = str(BeautifulSoup(header_response.text, 'html.parser')).splitlines()[1:]
        signalList = [line.split()[-1] for line in header_lines if line.strip()]

        if not all(a in signalList for a in signal_matches):
            continue

        filePath = record_path + record
        listOfGoodBoys.append(filePath)

        # use --no-parent otherwise you will download all directory files.
        # The command is passed as an argument list so that names coming from
        # the server are never interpreted by a shell.
        download_cmd = ["wget", "-r", "--no-parent"]

        # For use with Windows (with WSL): run the command through wsl.
        if os.name != 'posix':
            download_cmd = ["wsl"] + download_cmd

        # Check whether each file has already been downloaded, otherwise download it.
        for extension in (".dat", ".hea"):
            if not os.path.isfile(folder_root + filePath + extension):
                print("DOWNLOADING -- " + filePath)
                subprocess.run(download_cmd + [ROOTURL + "/" + filePath + extension])
            else:
                print("ALREADY DOWNLOADED " + filePath + extension)
        print("DOWNLOAD COMPLETE -- " + filePath)


if __name__ == '__main__':
    # Number of worker threads. Every loop and bound below is derived from this
    # value, so changing it here is enough to change the degree of parallelism.
    numThreads = 8
    threadList = []

    # Size of the contiguous chunk of record paths handed to each thread
    # (floor division; the remainder goes to the last thread).
    numRecords = len(all_record_paths)//numThreads

    for x in range(numThreads):
        chunkStart = x*numRecords
        # The last thread takes everything up to the end of the list so that
        # records left over by the floor division are not dropped.
        chunkStop = (x+1)*numRecords if x < numThreads - 1 else None
        threadRunner = threading.Thread(target=scrape_mimic_list, args=(all_record_paths[chunkStart:chunkStop],))
        threadRunner.start()
        threadList.append(threadRunner)

    # Wait for every worker to finish before the cell returns.
    for threadRunner in threadList:
        print("joining threads....")
        threadRunner.join()

36/3600224/36/3600241/36/3600270/


36/3600271/
36/3600276/
36/3600293/
36/3600305/
36/3600314/joining threads....

DOWNLOADING -- 36/3600314/3600314_0001
DOWNLOADING -- 36/3600293/3600293_0001
DOWNLOADING -- 36/3600314/3600314_0001
DOWNLOADING -- 36/3600293/3600293_0001
DOWNLOAD COMPLETE -- 36/3600314/3600314_0001
DOWNLOAD COMPLETE -- 36/3600293/3600293_0001
DOWNLOADING -- 36/3600293/3600293_0002
DOWNLOADING -- 36/3600293/3600293_0002
DOWNLOADING -- 36/3600271/3600271_0006
DOWNLOAD COMPLETE -- 36/3600293/3600293_0002
DOWNLOADING -- 36/3600293/3600293_0004
DOWNLOADING -- 36/3600271/3600271_0006
DOWNLOADING -- 36/3600293/3600293_0004
DOWNLOAD COMPLETE -- 36/3600271/3600271_0006
DOWNLOAD COMPLETE -- 36/3600293/3600293_0004
DOWNLOADING -- 36/3600271/3600271_0007
DOWNLOADING -- 36/3600271/3600271_0007
DOWNLOAD COMPLETE -- 36/3600271/3600271_0007
DOWNLOADING -- 36/3600271/3600271_0008
DOWNLOADING -- 36/3600271/3600271_0008
DOWNLOAD COMPLETE -- 36/3600271/3600271_0008
joining threads....
joi

KeyboardInterrupt: 

In [8]:
# Convert every sufficiently large downloaded record to text with rdsamp and
# move the resulting .txt file into physionet.org/textdata/.
folderRoot = "physionet.org/files/mimic3wdb/1.0/"
# Minimum .dat size (bytes, as reported by os.path.getsize) for a record to be converted.
fileThreshold = 17000
# Destination of the converted files; created up front so the mv below has a target.
textDataFolder = "physionet.org/textdata/"
os.makedirs(textDataFolder, exist_ok=True)
for record in listOfGoodBoys:
    filePath = folderRoot+record
    # A record can be listed without its .dat on disk (e.g. the download was
    # interrupted); skip it instead of letting getsize abort the whole loop.
    if not os.path.isfile(filePath+'.dat'):
        print('SKIPPING (no .dat file) -- ' + filePath)
        continue
    fileSize =  os.path.getsize(filePath+'.dat')
    if fileSize > fileThreshold:
        # Convert to TXT
        #rdsamp: to text is -p > newName.txt
        # -s : signal list is ABP, PLETH in that order => output cols will be TIME-ABP-PLETH
        # -S : search for first valid time for ABP
        rdsamp_cmd = "rdsamp -r " + filePath +" -p > " + filePath + ".txt -s ABP PLETH -S ABP"
        # For use with Windows (with WSL): run the command through wsl.
        if os.name!='posix': 
            rdsamp_cmd = "wsl " + rdsamp_cmd
        print('CONVERTING TO TXT -- ' + filePath)
        print(rdsamp_cmd)
        os.system(rdsamp_cmd)
        print('CONVERTING TO TXT COMPLETE -- ' + filePath)

        # move converted files to new folder
        move_cmd = "mv " + filePath +".txt " + textDataFolder
        if os.name != 'posix':
            move_cmd = "wsl "+ move_cmd
        print(move_cmd)
        os.system( move_cmd )
        
        # Alternative conversion to MAT, currently disabled:
        # Convert to MAT
        # wfdb2mat_cmd = "wfdb2mat -r " + filePath
        # print('CONVERTING TO MAT -- ' + filePath)
        # if os.name!='posix':
        #     wfdb2mat_cmd = "wsl "+wfdb2mat_cmd
        # os.system(wfdb2mat_cmd)
        # folder = record.split('/')[0] + '/' + record.split('/')[1] + '/'
        # fileName = record.split('/')[2]
        # mv_cmd = "mv " + fileName + "m.mat " + fileName + "m.hea " + folderRoot + folder 
        # if os.name!='posix': 
        #     mv_cmd = "wsl "+ mv_cmd
        # os.system(mv_cmd)
        # print('CONVERTING TO MAT COMPLETE -- ' + filePath)
        

CONVERTING TO TXT -- physionet.org/files/mimic3wdb/1.0/36/3600314/3600314_0001
rdsamp -r physionet.org/files/mimic3wdb/1.0/36/3600314/3600314_0001 -p > physionet.org/files/mimic3wdb/1.0/36/3600314/3600314_0001.txt -s ABP PLETH -S ABP
CONVERTING TO TXT COMPLETE -- physionet.org/files/mimic3wdb/1.0/36/3600314/3600314_0001
mv physionet.org/files/mimic3wdb/1.0/36/3600314/3600314_0001.txt physionet.org/textdata/
CONVERTING TO TXT -- physionet.org/files/mimic3wdb/1.0/36/3600293/3600293_0002
rdsamp -r physionet.org/files/mimic3wdb/1.0/36/3600293/3600293_0002 -p > physionet.org/files/mimic3wdb/1.0/36/3600293/3600293_0002.txt -s ABP PLETH -S ABP
CONVERTING TO TXT COMPLETE -- physionet.org/files/mimic3wdb/1.0/36/3600293/3600293_0002
mv physionet.org/files/mimic3wdb/1.0/36/3600293/3600293_0002.txt physionet.org/textdata/
CONVERTING TO TXT -- physionet.org/files/mimic3wdb/1.0/36/3600271/3600271_0006
rdsamp -r physionet.org/files/mimic3wdb/1.0/36/3600271/3600271_0006 -p > physionet.org/files/mimic3

In [22]:
# Generate wabp annotation files and extract them to text:
# every .txt file in physionet.org/textdata/ names one record to annotate.
txtFileList = os.listdir('physionet.org/textdata/')
# Strip the ".txt" suffix; entries without that suffix are ignored because
# slicing four characters off them would produce bogus record names.
fileNames = [f[:-4] for f in txtFileList if f.endswith('.txt')]

ROOTFOLDER = "physionet.org/files/mimic3wdb/1.0"
EXTRACTFOLDER = "physionet.org/abp_ann"
os.makedirs(EXTRACTFOLDER, exist_ok=True)


for f in fileNames:
    # The record's folder is rebuilt from the file name: the first 2 characters
    # give the top-level folder and the first 7 the record folder.
    recordPath = ROOTFOLDER + "/" + f[:2] + "/" + f[:7] + "/" +  f

    # Run wabp on the record.
    wabp_cmd = "wabp -r " + recordPath
    # For use with Windows (with WSL): run the command through wsl.
    if os.name != 'posix':
        wabp_cmd = "wsl "+ wabp_cmd
    print(wabp_cmd)
    os.system(wabp_cmd)

    # Dump the "wabp" annotations of the record to <EXTRACTFOLDER>/<name>_abp.txt.
    rdann_cmd = "rdann -r " + recordPath + " -a wabp > " + EXTRACTFOLDER + "/" + f+"_abp.txt"
    if os.name != 'posix':
        rdann_cmd = "wsl "+ rdann_cmd
    print(rdann_cmd)
    os.system(rdann_cmd)

wabp -r physionet.org/files/mimic3wdb/1.0/30/3000063/3000063_0013
rdann -r physionet.org/files/mimic3wdb/1.0/30/3000063/3000063_0013 -a wabp > physionet.org/abp_ann/3000063_0013_abp.txt
wabp -r physionet.org/files/mimic3wdb/1.0/30/3000063/3000063_0007
rdann -r physionet.org/files/mimic3wdb/1.0/30/3000063/3000063_0007 -a wabp > physionet.org/abp_ann/3000063_0007_abp.txt
wabp -r physionet.org/files/mimic3wdb/1.0/30/3000063/3000063_0006
rdann -r physionet.org/files/mimic3wdb/1.0/30/3000063/3000063_0006 -a wabp > physionet.org/abp_ann/3000063_0006_abp.txt
wabp -r physionet.org/files/mimic3wdb/1.0/30/3000063/3000063_0012
rdann -r physionet.org/files/mimic3wdb/1.0/30/3000063/3000063_0012 -a wabp > physionet.org/abp_ann/3000063_0012_abp.txt
wabp -r physionet.org/files/mimic3wdb/1.0/36/3600293/3600293_0004
rdann -r physionet.org/files/mimic3wdb/1.0/36/3600293/3600293_0004 -a wabp > physionet.org/abp_ann/3600293_0004_abp.txt
wabp -r physionet.org/files/mimic3wdb/1.0/36/3600314/3600314_0001
rdan

rdann -r physionet.org/files/mimic3wdb/1.0/30/3000393/3000393_0008 -a wabp > physionet.org/abp_ann/3000393_0008_abp.txt
wabp -r physionet.org/files/mimic3wdb/1.0/30/3000393/3000393_0034
rdann -r physionet.org/files/mimic3wdb/1.0/30/3000393/3000393_0034 -a wabp > physionet.org/abp_ann/3000393_0034_abp.txt
wabp -r physionet.org/files/mimic3wdb/1.0/30/3000393/3000393_0020
rdann -r physionet.org/files/mimic3wdb/1.0/30/3000393/3000393_0020 -a wabp > physionet.org/abp_ann/3000393_0020_abp.txt
wabp -r physionet.org/files/mimic3wdb/1.0/30/3000714/3000714_0020
rdann -r physionet.org/files/mimic3wdb/1.0/30/3000714/3000714_0020 -a wabp > physionet.org/abp_ann/3000714_0020_abp.txt
wabp -r physionet.org/files/mimic3wdb/1.0/30/3000714/3000714_0021
rdann -r physionet.org/files/mimic3wdb/1.0/30/3000714/3000714_0021 -a wabp > physionet.org/abp_ann/3000714_0021_abp.txt
wabp -r physionet.org/files/mimic3wdb/1.0/30/3000714/3000714_0035
rdann -r physionet.org/files/mimic3wdb/1.0/30/3000714/3000714_0035 -a 

In [19]:
# NOTE(review): nothing visible in this notebook uses this directory — confirm it is still needed.
# makedirs with exist_ok keeps the cell re-runnable: os.mkdir raises
# FileExistsError on a second run and FileNotFoundError if physionet.org/ is missing.
os.makedirs("physionet.org/f", exist_ok=True)