In [1]:
import os
import math
import csv
import numpy as np
from matplotlib import pyplot as plt
from PIL import Image
from pdf2image import convert_from_path
from pdf2image.exceptions import (
    PDFInfoNotInstalledError,
    PDFPageCountError,
    PDFSyntaxError
)

In [4]:
SAMPLING_RATE = 200
# Step, in datapoints, between the starts of consecutive ECG windows written by csv_converter.
# e.g. a value of 200 means one 6000-datapoint window is written for every 200 datapoints.

In [2]:
# Positions of the grid lines on a rendered PDF page. See markers.ipynb for info on how markers are located.
# NOTE(review): values are presumably pixel coordinates of the rendered page - confirm against markers.ipynb.
# hlines contains two adjacent entries (1137 and 1138) between the upper and lower pairs of sections.
hlines = [193, 429, 665, 901, 1137, 1138, 1374, 1610, 1846, 2082]
vlines = [62, 259, 456, 653, 850, 1046, 1243, 1440, 1637]

# The four corner markers are the intersections of the outermost grid lines.
topleft = (vlines[0], hlines[0])
topright = (vlines[-1], hlines[0])
bottomleft = (vlines[0], hlines[-1])
bottomright = (vlines[-1], hlines[-1])

# The four "sections" of the graph, each given as a (first, last) pair of indices into hlines.
# A graph section is the vertical band in which one ecg trace is contained.
graph_sections = ((0, 2), (2, 4), (5, 7), (7, 9))

In [3]:
# Folder that holds the ECG recording PDFs (and receives the per-page PNG dumps).
# Set the ECG_RECORDINGS_DIR environment variable to point somewhere else; when it
# is not set, the original author's local folder is used so existing runs are unchanged.
path = os.environ.get(
    "ECG_RECORDINGS_DIR",
    "C:\\Users\\alber\\Documents\\coding_things\\science_fair\\ECG_recordings",
)

In [5]:
def scanner(img_bool, hlines, vlines, sections=None, pixels_per_unit=118, max_missing=500):
    """Trace the ECG line through thresholded page images and return it as one flat list.

    Pages are read in order; within a page each graph section is read in order;
    within a section every column strictly between the first and last vertical
    grid line is read. One value is appended per column.

    Args:
        img_bool: Sequence of pages indexed as ``img_bool[page][column][row]``;
            a truthy entry marks a dark pixel.
        hlines: Row positions of the horizontal grid lines.
        vlines: Column positions of the vertical grid lines.
        sections: Iterable of ``(first, last)`` index pairs into ``hlines``
            delimiting each graph section. Defaults to the notebook-level
            ``graph_sections``.
        pixels_per_unit: Divisor that converts a pixel offset from the section's
            reference line into an output value (118 in the original notebook).
        max_missing: Number of consecutive non-grid columns without any line
            pixel after which scanning stops early.

    Returns:
        A list of numbers. Each value is
        ``(hlines[first + 1] - row) / pixels_per_unit`` for the first dark row
        found in the column, so rows with a smaller index than the reference
        line give positive values. Columns that sit on a vertical grid line,
        or in which no dark pixel is found, repeat the previous value
        (initially 0). If ``max_missing`` columns in a row have no dark pixel,
        the data gathered so far is returned immediately.
    """
    if sections is None:
        # Fall back to the notebook-level layout so existing calls keep working.
        sections = graph_sections

    grid_columns = set(vlines)

    data = []
    last_known_value = 0

    # Consecutive columns in which no line pixel was found.
    not_found_counter = 0

    for page in img_bool:
        for first, last in sections:
            # The grid line after the section's first line is the zero reference;
            # it is skipped while searching because it is always dark.
            reference_row = hlines[first + 1]

            for i in range(vlines[0] + 1, vlines[-1]):
                # Vertical grid lines hide the trace, so carry the previous value forward.
                if i in grid_columns:
                    data.append(last_known_value)
                    continue

                column = page[i]
                found = False
                # The section's own boundary rows are excluded by the range bounds.
                for j in range(hlines[first] + 1, hlines[last]):
                    if j != reference_row and column[j]:
                        last_known_value = (reference_row - j) / pixels_per_unit
                        data.append(last_known_value)
                        found = True
                        not_found_counter = 0
                        break

                if not found:
                    data.append(last_known_value)
                    not_found_counter += 1
                    # A long run of empty columns means the recording has ended.
                    if not_found_counter >= max_missing:
                        return data

    return data
        

In [6]:
def drowsyness_value(time, additional):
    """Map a clock time to a drowsiness score using a sine curve.

    Note: only meant for use with times near or between 6 p.m. and 2 a.m.

    Args:
        time: Clock time encoded as a number in HHMM form (e.g. 1830).
        additional: Offset added to the time as ``additional / 200 / 1000 / 0.6``
            hours. NOTE(review): callers pass a datapoint index here - confirm
            the intended scaling.

    Returns:
        ``0.4 * sin(pi * (t + 2) / 8) + 0.5`` where ``t`` is the time in decimal
        hours (times before 4 are shifted by +24), or 0 when ``t`` is below 16.
    """
    mins = time % 100
    hhmm_hours = time - mins

    # Decimal hours, including the offset contributed by `additional`.
    clock = hhmm_hours / 100 + mins / 60 + additional / 200 / 1000 / 0.6

    # Early-morning times are treated as a continuation of the previous evening.
    if clock < 4:
        clock += 24

    in_range = clock >= 16
    return math.sin((math.pi * (clock + 2)) / 8) * 0.4 + 0.5 if in_range else 0

In [7]:
def csv_converter(name, time, data, mode='w'):
    """Write sliding windows of ``data`` to ``datasets/{name}.csv``, one window per row.

    Each row holds the drowsiness label for the window followed by 6000
    consecutive datapoints. Window start indices run from 800 up to (but not
    including) ``len(data) - 800 - 6000`` in steps of ``SAMPLING_RATE``, so the
    first and last 800 datapoints never start a window.

    Args:
        name: Dataset name; may include a sub-folder (e.g. ``"test/full"``).
        time: Start time of the recording in HHMM form; converted with ``float``.
        data: List of datapoints (must support slicing and list concatenation).
        mode: File mode passed to ``open``. The default ``'w'`` replaces any
            existing file, so calling this once per recording with the same
            ``name`` keeps only the last recording; pass ``'a'`` to append
            several recordings to one dataset.
    """
    window = 6000
    margin = 800
    out_path = f"datasets/{name}.csv"

    # `name` can point into a sub-folder, so make sure the target folder exists.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    start_time = float(time)

    with open(out_path, mode=mode, newline='') as file:
        writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)

        # Step through the data and write one labelled window per row
        for i in range(margin, len(data) - margin - window, SAMPLING_RATE):
            writer.writerow([drowsyness_value(start_time, i)] + data[i:i + window])

In [None]:
# Ask for the dataset type, then collect PDF names until the sentinel 0000 is entered.
type = input("type of data:")
pdfs = []

# iter(callable, sentinel) keeps prompting until the user types "0000",
# which ends the loop without being stored.
for input_pdf in iter(lambda: input("PDF name in HHMM format: "), "0000"):
    pdfs.append(input_pdf)

In [8]:
# PREDEFINED PDFS FOR DATASETS USED. DOES NOT WORK DUE TO THE ABSENCE OF PDFS IN THE GITHUB DOWNLOAD

# pdfs = ['1759', '1900', '1954', '2054', '2152', '2256', '2356', '0101', '0156']
# type = "test/full"
# SAMPLING_RATE = 200

# pdfs = ['1757', '1857', '1958', '2004', '2059', '2157', '2254', '0001', '0100', '0103', '0151', '0154', '0159']
# type = "train"
# SAMPLING_RATE = 100

['1759', '1900', '1954', '2054', '2152', '2256', '2356', '0101', '0156']
9


In [None]:
# Show the selected recordings, then how many there are
print(pdfs, len(pdfs), sep="\n")

In [9]:
# For every pdf defined in the list: render it, threshold each page, trace the ECG and write the CSV.
# NOTE(review): csv_converter opens datasets/{type}.csv with mode='w', so every iteration
# replaces the rows written for the previous PDF - confirm this is intended.
for pdf in pdfs:
    # Convert pdf into a list of images (one per page)
    images = convert_from_path(f"{path}\\{pdf}.pdf", poppler_path=r'C:\Program Files\poppler-0.68.0\bin')

    # Folder that receives the rendered pages
    try:
        os.mkdir(f"{path}\\{pdf}")
    except FileExistsError:
        print("WARN: file has already been parsed")

    # All pages are stored using the size of the first page
    width, height = images[0].size

    # Indexed as img_bool[page][x][y]; True marks a dark pixel
    img_bool = np.zeros((len(images), width, height), dtype=bool)

    for n, image in enumerate(images):
        # np.asarray yields (row, column, channel); a pixel counts as dark when
        # its first three channels are all <= 64.
        pixels = np.asarray(image)
        dark = (pixels[:, :, :3] <= 64).all(axis=2)

        # Transpose to (x, y) and keep only the area covered by the first page's size
        img_bool[n] = dark.T[:width, :height]

        # Save image into directory
        image.save(f"{path}\\{pdf}\\{n}.png")

    # Scanner and CSV converter functions
    data = scanner(img_bool, hlines, vlines)
    csv_converter(type, int(pdf), data)

WARN: file has already been parsed
WARN: file has already been parsed
WARN: file has already been parsed
WARN: file has already been parsed
WARN: file has already been parsed
WARN: file has already been parsed
WARN: file has already been parsed
WARN: file has already been parsed
WARN: file has already been parsed
