# SRTR Program-Specific Reports Data Collection

The goal of this notebook is to develop code for reading SRTR Program-Specific Reports (PSRs) in PDF format, extracting the tabular data, and writing it to CSV files. Archived PSR files in PDF format for liver transplant are assumed to have been downloaded and extracted into a folder named 'reports_10-2018'. The transplant center code is assumed to be encoded in the filename as its first 4 letters.


In [1]:
# import all packages and set plots to be embedded inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import re
import datetime
import camelot
import json
import zipfile
import os

from matplotlib.ticker import FormatStrFormatter

%matplotlib inline

In [4]:
# Batch-extract the tables from every PSR PDF found in `directory`.
# For each PDF, two sibling files are written next to it:
#   <name>.log - one JSON line per extracted table (the table's parsing_report)
#   <name>.csv - export target handed to camelot with compress=True
#                (presumably yields a zip archive of per-table CSVs - TODO confirm)
# The center code of every processed PDF is collected in `center_list`.

# Build the folder path with os.path.join instead of hard-coded '\\' so the
# cell also runs on non-Windows systems. The trailing '' keeps the trailing
# separator, so on Windows the value is identical to the previous literal and
# any `directory + filename` concatenation still works.
directory = os.path.join('reports_07-2019', 'socal', '')

# Pages of each PDF that camelot should scan for tables.
psr_pages = '1,6,7,8,9,10,11,12,13,14,15,16,17,19,21,23,25,27,28,29,30,31,32,33,34,35'

center_list = []

# os.listdir returns entries in arbitrary order; sorting makes the processing
# order (and therefore center_list) reproducible across runs and machines.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith(".pdf"):
        continue

    # The first 4 characters of the filename are taken as the transplant center code.
    center_code = filename[0:4]
    path_filename = os.path.join(directory, filename)
    tables = camelot.read_pdf(path_filename, flavor='stream', pages=psr_pages)
    print(path_filename,':',tables.n, 'tables')

    # Strip the extension via splitext rather than a magic [:-4] slice.
    base_name = os.path.splitext(path_filename)[0]

    # Log the parsing_report of each table, one JSON document per line.
    # The file is only written, never read back, so plain 'w' is sufficient.
    log_name = base_name + '.log'
    with open(log_name, 'w', encoding='utf-8', newline='') as file:
        for i in range(tables.n):
            file.write(json.dumps(tables[i].parsing_report))
            file.write('\n')

    csv_name = base_name + '.csv'
    tables.export(csv_name, f='csv', compress=True)
    center_list.append(center_code)

reports_07-2019\socal\AZMCTX1LI201905PNEW.pdf : 26 tables
reports_07-2019\socal\CACSTX1LI201905PNEW.pdf : 31 tables
reports_07-2019\socal\CAGHTX1LI201905PNEW.pdf : 29 tables
reports_07-2019\socal\CALLTX1LI201905PNEW.pdf : 28 tables
reports_07-2019\socal\CASDTX1LI201905PNEW.pdf : 29 tables
reports_07-2019\socal\CAUCTX1LI201905PNEW.pdf : 27 tables
reports_07-2019\socal\CAUHTX1LI201905PNEW.pdf : 27 tables


In [5]:
# Show the center codes collected while processing the PDFs above.
print(center_list)

['AZMC', 'CACS', 'CAGH', 'CALL', 'CASD', 'CAUC', 'CAUH']
