In [1]:
import os
import shutil
import subprocess
import threading

import requests
from bs4 import BeautifulSoup

In [2]:
# Scrape the root record listing of the PhysioNet MIMIC-III waveform database
# (waveform records only).
ROOTURL = "https://physionet.org/files/mimic3wdb/1.0"

# Only the first MAX_RECORD_PATHS entries of the listing are processed.
MAX_RECORD_PATHS = 40

# A timeout keeps the cell from hanging forever on a stalled connection.
recordsRequest = requests.get(ROOTURL + "/RECORDS-waveforms", timeout=60)
# Fail loudly on an HTTP error: otherwise the error page would be split into
# lines and treated as record paths by the cells below.
recordsRequest.raise_for_status()

# Convert the response into a Soup object and then into a list of record-path
# strings, dropping blank lines so they are never handed to the scraper.
soup = BeautifulSoup(recordsRequest.text, 'html.parser')
all_record_paths = [
    line for line in str(soup).splitlines() if line.strip()
][:MAX_RECORD_PATHS]

In [3]:
# Module-level state shared by the scraping functions below.

# Segment paths (relative to ROOTURL) whose header lists both ABP and PLETH;
# filled by scrape_mimic and consumed by the conversion cell.
listOfGoodBoys = list()

# Local directory prefix under which already-downloaded files are looked up.
folder_root = "physionet.org/files/mimic3wdb/1.0/"

# Counter initialised to zero; not referenced by the functions in this cell.
idx = 0

def scrape_mimic_list(record_paths):
    """Call ``scrape_mimic`` once for every entry of ``record_paths``, in order.

    Used as the thread target so that each worker processes its own slice of
    the record listing sequentially.
    """
    for current_path in record_paths:
        scrape_mimic(record_path=current_path)

def _fetch_text_lines(url):
    """Return the body of ``url`` split into lines, or ``None`` if the request fails."""
    try:
        response = requests.get(url, timeout=60)
        response.raise_for_status()
    except requests.RequestException as error:
        print("REQUEST FAILED -- " + url + " (" + str(error) + ")")
        return None
    return response.text.splitlines()


def scrape_mimic(record_path):
    """Download every segment of one record that has both ABP and PLETH signals.

    Reads the record directory's ``RECORDS`` file to enumerate its segments,
    then inspects each segment's ``.hea`` header. A segment whose header lists
    both an ``ABP`` and a ``PLETH`` signal is appended to the module-level
    ``listOfGoodBoys`` (before any download is attempted) and its ``.dat`` and
    ``.hea`` files are fetched with ``wget`` unless they already exist under
    ``folder_root``.

    Args:
        record_path: Record directory relative to ``ROOTURL``, e.g.
            ``"30/3000063/"``. Leading/trailing slashes are normalised, so the
            trailing slash is optional.

    Returns:
        None. Failed HTTP requests and failed downloads are reported on stdout
        and skipped.
    """
    print(record_path)

    # Normalise to exactly one trailing slash so that neither the RECORDS URL
    # gets a double slash nor the segment URL depends on the caller's format.
    record_dir = record_path.strip().strip("/") + "/"
    record_url = ROOTURL + "/" + record_dir

    # Each record directory carries a RECORDS file naming its segments.
    all_records_in_folder = _fetch_text_lines(record_url + "RECORDS")
    if all_records_in_folder is None:
        return

    signal_matches = ("ABP", "PLETH")

    # Argument list instead of a shell string: segment names come from the
    # remote server and must not be interpreted by a shell.
    # --no-parent keeps the recursive wget from pulling the whole directory tree.
    download_cmd = ["wget", "-r", "--no-parent"]
    # For use with Windows (with WSL): run wget through wsl.
    if os.name != 'posix':
        download_cmd = ["wsl"] + download_cmd

    for record in all_records_in_folder:
        record = record.strip()
        if not record:
            continue

        header_lines = _fetch_text_lines(record_url + record + ".hea")
        if header_lines is None:
            continue

        # Skip the first header line, then take the last whitespace-separated
        # token of every remaining line as that line's signal name. Blank and
        # '#' comment lines are ignored so they cannot raise IndexError.
        signalList = [
            line.split()[-1]
            for line in header_lines[1:]
            if line.strip() and not line.lstrip().startswith("#")
        ]
        if not all(name in signalList for name in signal_matches):
            continue

        filePath = record_dir + record
        listOfGoodBoys.append(filePath)

        # Check whether each file has already been downloaded, otherwise fetch it.
        all_ok = True
        for extension in (".dat", ".hea"):
            target = filePath + extension
            if os.path.isfile(folder_root + target):
                print("ALREADY DOWNLOADED " + target)
                continue
            print("DOWNLOADING -- " + filePath)
            result = subprocess.run(download_cmd + [ROOTURL + "/" + target])
            if result.returncode != 0:
                all_ok = False
                print("DOWNLOAD FAILED -- " + target)

        # Only claim completion when no download reported an error.
        if all_ok:
            print("DOWNLOAD COMPLETE -- " + filePath)


if __name__ == '__main__':
    # Number of worker threads; every loop below derives from this value so it
    # can be changed in one place.
    numThreads = 8
    threadList = []

    # Size of each contiguous slice handed to a worker (integer division);
    # the last worker additionally takes whatever remainder is left over.
    numRecords = len(all_record_paths)//numThreads

    for x in range(numThreads):
        sliceStart = x*numRecords
        if x < numThreads - 1:
            sliceEnd = (x+1)*numRecords
        else:
            sliceEnd = len(all_record_paths)
        threadRunner = threading.Thread(
            target=scrape_mimic_list,
            args=(all_record_paths[sliceStart:sliceEnd],),
        )
        threadRunner.start()
        threadList.append(threadRunner)

    # Wait for every worker that was actually started.
    for threadRunner in threadList:
        print("joining threads....")
        threadRunner.join()


30/3000003/
30/3000051/30/3000060/
30/3000063/
30/3000065/

30/3000086/
30/3000100/
30/3000103/
joining threads....
30/3000154/
30/3000125/
30/3000203/
ALREADY DOWNLOADED 30/3000063/3000063_0006.dat
ALREADY DOWNLOADED 30/3000063/3000063_0006.hea
DOWNLOAD COMPLETE -- 30/3000063/3000063_0006
30/3000190/
ALREADY DOWNLOADED 30/3000063/3000063_0007.dat30/3000189/

ALREADY DOWNLOADED 30/3000063/3000063_0007.hea
DOWNLOAD COMPLETE -- 30/3000063/3000063_0007
ALREADY DOWNLOADED 30/3000063/3000063_0009.dat
ALREADY DOWNLOADED 30/3000063/3000063_0009.hea
DOWNLOAD COMPLETE -- 30/3000063/3000063_0009
ALREADY DOWNLOADED 30/3000063/3000063_0010.dat
ALREADY DOWNLOADED 30/3000063/3000063_0010.hea
DOWNLOAD COMPLETE -- 30/3000063/3000063_0010
ALREADY DOWNLOADED 30/3000063/3000063_0011.dat
ALREADY DOWNLOADED 30/3000063/3000063_0011.hea
DOWNLOAD COMPLETE -- 30/3000063/3000063_0011
ALREADY DOWNLOADED 30/3000063/3000063_0012.dat
ALREADY DOWNLOADED 30/3000063/3000063_0012.hea
DOWNLOAD COMPLETE -- 30/3000063/300

30/3000611/
30/3000598/
DOWNLOADING -- 30/3000393/3000393_0020
DOWNLOAD COMPLETE -- 30/3000393/3000393_0020
30/3000713/
DOWNLOADING -- 30/3000393/3000393_0021
DOWNLOADING -- 30/3000393/3000393_0021
DOWNLOAD COMPLETE -- 30/3000393/3000393_0021
DOWNLOADING -- 30/3000393/3000393_0022
DOWNLOADING -- 30/3000393/3000393_0022
DOWNLOAD COMPLETE -- 30/3000393/3000393_0022
DOWNLOADING -- 30/3000393/3000393_0023
DOWNLOADING -- 30/3000393/3000393_0023
DOWNLOAD COMPLETE -- 30/3000393/3000393_0023
30/3000714/
DOWNLOADING -- 30/3000393/3000393_0025
ALREADY DOWNLOADED 30/3000714/3000714_0001.dat
ALREADY DOWNLOADED 30/3000714/3000714_0001.hea
DOWNLOAD COMPLETE -- 30/3000714/3000714_0001
ALREADY DOWNLOADED 30/3000714/3000714_0002.dat
ALREADY DOWNLOADED 30/3000714/3000714_0002.hea
DOWNLOAD COMPLETE -- 30/3000714/3000714_0002
ALREADY DOWNLOADED 30/3000714/3000714_0003.dat
ALREADY DOWNLOADED 30/3000714/3000714_0003.hea
DOWNLOAD COMPLETE -- 30/3000714/3000714_0003
ALREADY DOWNLOADED 30/3000714/3000714_0004.

In [13]:
# Convert every sufficiently large downloaded segment in listOfGoodBoys to a
# text file (rdsamp) and to MATLAB format (wfdb2mat).
folderRoot = "physionet.org/files/mimic3wdb/1.0/"
textDataDir = "physionet.org/textdata"
# Segments whose .dat file is not larger than this many bytes are skipped.
fileThreshold = 17000

# On non-POSIX systems (Windows) the WFDB tools are run through WSL.
toolPrefix = [] if os.name == 'posix' else ["wsl"]

# Create the text output folder up front. Without it, moving a file "into"
# physionet.org/textdata would instead rename the file to that path, and each
# conversion would overwrite the previous one.
os.makedirs(textDataDir, exist_ok=True)

for record in listOfGoodBoys:
    filePath = folderRoot + record
    datPath = filePath + '.dat'

    # A failed download leaves no .dat file; skip instead of crashing the loop.
    if not os.path.isfile(datPath):
        print('SKIPPING (no .dat file) -- ' + filePath)
        continue

    fileSize = os.path.getsize(datPath)
    if fileSize <= fileThreshold:
        continue

    # record looks like "<group>/<record>/<segment>".
    recordParts = record.split('/')
    folder = recordParts[0] + '/' + recordParts[1] + '/'
    fileName = recordParts[2]

    # Convert to TXT
    # rdsamp: -p prints the samples as text
    # -s : signal list is ABP, PLETH in that order => output cols will be TIME-ABP-PLETH
    # -S : search for first valid time for ABP
    # The output is written straight into the text folder, so no separate
    # move step (and no shell redirection) is needed.
    print('CONVERTING TO TXT -- ' + filePath)
    with open(os.path.join(textDataDir, fileName + ".txt"), "wb") as txtFile:
        rdsampResult = subprocess.run(
            toolPrefix + ["rdsamp", "-r", filePath, "-p",
                          "-s", "ABP", "PLETH", "-S", "ABP"],
            stdout=txtFile,
        )
    if rdsampResult.returncode != 0:
        print('CONVERTING TO TXT FAILED -- ' + filePath)
    else:
        print('CONVERTING TO TXT COMPLETE -- ' + filePath)

    # Convert to MAT
    print('CONVERTING TO MAT -- ' + filePath)
    wfdb2matResult = subprocess.run(toolPrefix + ["wfdb2mat", "-r", filePath])

    # The converted files are expected in the working directory as
    # <segment>m.mat and <segment>m.hea; file them next to the source segment.
    matOk = wfdb2matResult.returncode == 0
    for suffix in ("m.mat", "m.hea"):
        producedName = fileName + suffix
        if os.path.isfile(producedName):
            # A full destination file path makes the move replace any
            # previous conversion result.
            shutil.move(producedName, os.path.join(folderRoot + folder, producedName))
        else:
            matOk = False
            print('MISSING OUTPUT -- ' + producedName)

    if matOk:
        print('CONVERTING TO MAT COMPLETE -- ' + filePath)
    else:
        print('CONVERTING TO MAT FAILED -- ' + filePath)
        

CONVERTING TO TXT -- physionet.org/files/mimic3wdb/1.0/30/3000063/3000063_0006
CONVERTING TO TXT COMPLETE -- physionet.org/files/mimic3wdb/1.0/30/3000063/3000063_0006
CONVERTING TO MAT -- physionet.org/files/mimic3wdb/1.0/30/3000063/3000063_0006
CONVERTING TO MAT COMPLETE -- physionet.org/files/mimic3wdb/1.0/30/3000063/3000063_0006
CONVERTING TO TXT -- physionet.org/files/mimic3wdb/1.0/30/3000063/3000063_0007
CONVERTING TO TXT COMPLETE -- physionet.org/files/mimic3wdb/1.0/30/3000063/3000063_0007
CONVERTING TO MAT -- physionet.org/files/mimic3wdb/1.0/30/3000063/3000063_0007
CONVERTING TO MAT COMPLETE -- physionet.org/files/mimic3wdb/1.0/30/3000063/3000063_0007
CONVERTING TO TXT -- physionet.org/files/mimic3wdb/1.0/30/3000063/3000063_0009
CONVERTING TO TXT COMPLETE -- physionet.org/files/mimic3wdb/1.0/30/3000063/3000063_0009
CONVERTING TO MAT -- physionet.org/files/mimic3wdb/1.0/30/3000063/3000063_0009
CONVERTING TO MAT COMPLETE -- physionet.org/files/mimic3wdb/1.0/30/3000063/3000063_000

CONVERTING TO MAT COMPLETE -- physionet.org/files/mimic3wdb/1.0/30/3000393/3000393_0002
CONVERTING TO TXT -- physionet.org/files/mimic3wdb/1.0/30/3000397/3000397_0012
CONVERTING TO TXT COMPLETE -- physionet.org/files/mimic3wdb/1.0/30/3000397/3000397_0012
CONVERTING TO MAT -- physionet.org/files/mimic3wdb/1.0/30/3000397/3000397_0012
CONVERTING TO MAT COMPLETE -- physionet.org/files/mimic3wdb/1.0/30/3000397/3000397_0012
CONVERTING TO TXT -- physionet.org/files/mimic3wdb/1.0/30/3000393/3000393_0005
CONVERTING TO TXT COMPLETE -- physionet.org/files/mimic3wdb/1.0/30/3000393/3000393_0005
CONVERTING TO MAT -- physionet.org/files/mimic3wdb/1.0/30/3000393/3000393_0005
CONVERTING TO MAT COMPLETE -- physionet.org/files/mimic3wdb/1.0/30/3000393/3000393_0005
CONVERTING TO TXT -- physionet.org/files/mimic3wdb/1.0/30/3000393/3000393_0008
CONVERTING TO TXT COMPLETE -- physionet.org/files/mimic3wdb/1.0/30/3000393/3000393_0008
CONVERTING TO MAT -- physionet.org/files/mimic3wdb/1.0/30/3000393/3000393_000

CONVERTING TO MAT COMPLETE -- physionet.org/files/mimic3wdb/1.0/30/3000714/3000714_0012
CONVERTING TO TXT -- physionet.org/files/mimic3wdb/1.0/30/3000714/3000714_0013
CONVERTING TO TXT COMPLETE -- physionet.org/files/mimic3wdb/1.0/30/3000714/3000714_0013
CONVERTING TO MAT -- physionet.org/files/mimic3wdb/1.0/30/3000714/3000714_0013
CONVERTING TO MAT COMPLETE -- physionet.org/files/mimic3wdb/1.0/30/3000714/3000714_0013
CONVERTING TO TXT -- physionet.org/files/mimic3wdb/1.0/30/3000393/3000393_0028
CONVERTING TO TXT COMPLETE -- physionet.org/files/mimic3wdb/1.0/30/3000393/3000393_0028
CONVERTING TO MAT -- physionet.org/files/mimic3wdb/1.0/30/3000393/3000393_0028
CONVERTING TO MAT COMPLETE -- physionet.org/files/mimic3wdb/1.0/30/3000393/3000393_0028
CONVERTING TO TXT -- physionet.org/files/mimic3wdb/1.0/30/3000714/3000714_0014
CONVERTING TO TXT COMPLETE -- physionet.org/files/mimic3wdb/1.0/30/3000714/3000714_0014
CONVERTING TO MAT -- physionet.org/files/mimic3wdb/1.0/30/3000714/3000714_001

In [3]:
# Show the process id of the currently running Python process.
current_pid = os.getpid()
print(current_pid)

39307


True