# Parse the VG_Form.pdf for OAI variable categories and sources

OAI has a number of PDFs that list all variables collected, their nature, and where they came from. The handiest is `General Information.zip/General/VG_Form.pdf`, as we can use it to get the categories and subcategories and assemble them into a readable table.

With over 9,000 variables collected, this is critical just to start to make sense of the variables.

For now, this only extracts a wiki table for categories/subcategories of the variables collected each visit, and another for tracking what sources were used for which visit. Clearly, more can be done with the data.

## Imports and constants

In [None]:
import re
import pickle
import sys
from tqdm import tqdm
import numpy as np
import pandas
from IPython.display import display
from string import ascii_letters
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer

In [None]:
# Constants

# Tags to look for when parsing.
# Each tag is a literal prefix matched with str.startswith() against the text extracted from the PDF.
hdr_tag = "Variable Guide"   # page header text boxes (skipped)
vn_tag = "Variable Name"
var_tag = "Variable"
src_tag = "Source"
page_tag = "Page"            # also used as a dataframe column name and to split "source ... Page N"
lbl_tag = "Label"
sas_ds_tag = "SAS Dataset"
rc_tag = "Release Comments"
cat_tag = "Category"
sub_tag = "Subcategory"
stats1_tag = "N "            # either stats tag ends the category list of a variable description
stats2_tag = "Value N"
end_tag = "_______________"  # rule line that closes each variable description
ftr_tag = "Release Version"  # page footer text boxes (skipped)

# This list was created by first parsing and collecting all listed categories into a set
# NOTE: order matters. The parser takes the FIRST entry that a line starts with, so an entry
# that extends another one (e.g. "Medical history, general Comorbidity" vs
# "Medical history, general") must stay ahead of the shorter entry.
known_categories = ["Bookkeeping", "Demographics", "Study eligibility", "Knee symptoms",
                    "Medical history, arthritis", "Medications", "Physical activity",
                    "Medical history, general Comorbidity", "Knee pain/OA status",
                    "Knee function/QOL Knee", "Other joint symptoms", "Back pain",
                    "Anthropometry", "Hand and/or foot exam", "Medical history, general", 
                    "Image Assessments: X-ray", "Joint imaging", "Health care access",
                    "Global function/disability/QOL", "Nutrition", "WOMAC/KOOS", 
                    "Knee function/QOL", "Blood pressure & pulse", "Performance measures", 
                    "Strength measures", "Knee exam", "Biospecimens collection",
                    "Image Assessments: MRI", "Accelerometry", "Hip function/QOL", 
                    "Hip symptoms", "Outcomes"]

# Maps the three-character prefix of a variable name to the column heading used in the wiki tables.
# The dict order is the column order of those tables.
# NOTE(review): the headings look like visit names (e.g. 12m = 12-month visit) - confirm against OAI docs.
visits = {'P02':'IEI', 'P01':'SV', 'V00':'EV', 'V01':'12m', 'V02':'18m', 'V03':'24m', 'V04':'30m', 'V05':'36m', 'V06':'48m', 'V07':'60m', 'V08':'72m', 'V09':'84m', 'V10':'96m', 'V11':'108m', 'V99':"Outcomes"}

# Column names of the per-variable dataframe; the order must match the tuples appended to `variables`
col_names = [var_tag, lbl_tag, src_tag, page_tag, sas_ds_tag, rc_tag]

## Original file ingestion

Parse the PDF once, and convert into a smaller serialized format for faster re-processing in the future (approx 10 min to read the whole PDF). Do this once, and afterwards only use the binary reader in the next cell.

In [None]:
# Parse PDF into a list of pages
# Each page is only a list of the PDF textboxes in the page,
# each stored as a plain (text, x0, y0, x1, y1) tuple

pages = []
for page_layout in tqdm(extract_pages(r"..\..\Downloads\General Information\General\VG_Form.pdf")):
    text_boxes = [
        (element.get_text(), element.x0, element.y0, element.x1, element.y1)
        for element in page_layout
        if isinstance(element, LTTextContainer)
    ]
    pages.append(text_boxes)

# Use a context manager so the file is flushed and closed before a later cell reads it back
with open("vg_form_pdf_elements.pkl", "wb") as pkl_file:
    pickle.dump(pages, pkl_file)

## Fast file ingestion
Read in the binary serialized data (approx 1 sec to read the binary form) created in prior section.

In [None]:
# NOTE: unpickling can execute arbitrary code, so only load a file this notebook wrote itself.
# The context manager closes the file handle as soon as the data is read.
with open("vg_form_pdf_elements.pkl", "rb") as pkl_file:
    pages = pickle.load(pkl_file)

In [None]:
# Every variable description is closed by one end-marker text box, so counting those
# boxes gives the number of variables in the document (kept for a later sanity check)
total_var_cnt = sum(
    1
    for page in pages
    for element in page
    if element[0].strip().startswith(end_tag)
)
print(total_var_cnt)

## Ensure the PDF textboxes are in the same order they are rendered on a page 

In [None]:
# Flatten the PDF text boxes into text lines that follow the order they render on each page

# Text boxes starting with any of these are page headers or footers
skip_prefixes = (hdr_tag, page_tag, ftr_tag)

lines = []
for page in tqdm(pages):
    text_boxes = []
    for raw_text, x0, y0, _x1, y1 in page:
        box_text = raw_text.strip()

        # Headers and footers carry no variable data, ignore and move on
        if box_text.startswith(skip_prefixes):
            continue

        pieces = box_text.split('\n')
        if len(pieces) == 1:
            text_boxes.append((pieces[0].strip(), y0, y1, x0))  # txt, y0, y1, x0
            continue

        # Delete dups: this PDF repeats some text inside a box even though it renders only once.
        # NOTE(review): the membership test uses the unstripped piece while the stored entries are
        # stripped, so repeats that differ only in surrounding whitespace are kept - confirm intent.
        unique_pieces = []
        for piece in pieces:
            if piece not in unique_pieces:
                unique_pieces.append(piece.strip())

        # Break the multi-line box into single-line boxes of equal height, top line first
        n_pieces = len(unique_pieces)
        piece_height = (y1 - y0) / n_pieces
        for idx, piece in enumerate(unique_pieces):
            bottom = y0 + (piece_height * (n_pieces - (1 + idx)))
            top = y1 - (piece_height * idx)
            text_boxes.append((piece, bottom, top, x0))

    # Top of the page first (largest y1), then left to right (smallest x0);
    # the sort is stable, so boxes equal on both keys keep their extraction order
    text_boxes.sort(key=lambda box: (-box[2], box[3]))

    # Join boxes that sit on the same text line (less than one unit apart in y0);
    # they show up in the PDF as "Label: value"
    prev_y0 = sys.maxsize
    for text, box_y0, _box_y1, _box_x0 in text_boxes:
        if (prev_y0 - box_y0) < 1:
            lines[-1] = lines[-1] + " " + text.strip()
        else:
            lines.append(text.strip())
        prev_y0 = box_y0

In [None]:
# Optional memory cleanup: drop the references to the raw page data so it can be reclaimed
text_boxes = pages = None

In [None]:
# Kludge: a few text boxes place the text "None" one line before "Release Comments",
# although the PDF renders it on the same line as "Release Comments: None".
# Hunt those down and merge them into a single line before parsing.
idx = 0
# Stop one short of the end: the check always looks at the following line as well
while idx < len(lines) - 1:
    if lines[idx] == "None" and lines[idx + 1].startswith(rc_tag):
        lines[idx] = lines[idx + 1] + " " + lines[idx]
        lines.pop(idx + 1)
    idx += 1

## Parse text of the document into data about OAI variables
First parse into lists, then into pandas dataframes.
One dataframe includes each variable, label, source, source page, dataset file name, and release comment.
Since each variable can have more than one category/subcategory associated with it, a separate dataframe is used to hold these associations.

In [None]:
# Parse the text into data about the OAI variables

def get_var(lines, l, label, next_label=None):
    """Read one labelled field that starts at ``lines[l]``.

    The text after ``label`` (and the one separator character that follows
    it) is the start of the value. When ``next_label`` is given, following
    lines are appended to the value until a line starting with
    ``next_label`` is found or the input runs out.

    Args:
        lines: List of text lines being parsed.
        l: Index of the line that starts with ``label``.
        label: Prefix the line at ``l`` must start with.
        next_label: Prefix of the line that follows this field. If None or
            empty, the value is taken from a single line only.

    Returns:
        Tuple ``(value, l)`` with the stripped value and the index of the
        first line that was not consumed.

    Raises:
        AssertionError: If ``lines[l]`` does not start with ``label``.
    """
    assert lines[l].startswith(label), (
        f"expected line {l} to start with {label!r}, got {lines[l]!r}"
    )
    # Skip the label plus the single separator character right after it
    value = lines[l][len(label) + 1:].strip()
    l += 1
    # Some values run beyond one line; the bounds check keeps a missing
    # next_label from running past the end of the input
    while next_label and l < len(lines) and not lines[l].startswith(next_label):
        value = value + " " + lines[l].strip()
        l += 1
    return value, l
    

# Walk the ordered text lines and collect one record per variable description.
# Each description is laid out as: source line, variable name, label, SAS dataset,
# release comments, category lines, optional summary stats, end-marker line.

# Separator between a source name and its page number, e.g. "... Page 12" or "..., p12"
page_split_re = re.compile(page_tag + "|" + page_tag.lower() + "| p |, p")
# Fallback for a page number glued to the letter p, e.g. p50Q
# (raw string: "\d" in a plain string literal is an invalid escape sequence)
page_glued_re = re.compile(r"p\d+")
# Any of these prefixes ends the category list of a variable description
category_end_tags = (stats1_tag, stats2_tag, end_tag)

variables = []
variable_cats = []
l = 0
while l < len(lines):
    # The first line describes where the variable came from
    src_page = None
    src = lines[l].strip()
    # If applicable, split out src and src_page
    idx = page_split_re.search(lines[l])
    if idx:
        src = lines[l][:idx.start()].strip()
        src_page = lines[l][idx.end():].strip()
    else:
        idx = page_glued_re.search(lines[l])  # e.g. p50Q
        if idx:
            src = lines[l][:idx.start()].strip()
            # +1 skips the letter p and keeps the digits and anything after them
            src_page = lines[l][idx.start()+1:].strip()

    # Clean up source names that are different just because of spelling
    src = src.replace("Follow-Up", "Follow-up")
    src = re.sub("Quest$", "Questionnaire", src)
    src = src.replace("Checklist", "").strip()

    # If src_page # exists, reduce it to a single integer page number
    if src_page:
        if '-' in src_page:  # e.g. 25-26, keep the first page
            [src_page, _] = src_page.split('-')
            if not src_page.isdigit():  # e.g. 6/108-Mo piv
                [src_page, _] = src_page.split('/')
        elif '(' in src_page:  # e.g. 8(a)
            [src_page, _] = src_page.split('(')
        elif src_page.isalpha():  # e.g. ii
            src_page = 0
        elif not src_page.isdigit():  # e.g. 27Qf
            src_page = src_page.rstrip(ascii_letters)

        src_page = int(src_page)

    l += 1

    # Next is the variable name, never larger than a single line
    var_name, l = get_var(lines, l, vn_tag)

    # Next is the variable label
    label, l = get_var(lines, l, lbl_tag, sas_ds_tag)

    # Get the name of the SAS Dataset the variable is stored in
    dataset, l = get_var(lines, l, sas_ds_tag)

    # Get the release comments
    rel_cmnts, l = get_var(lines, l, rc_tag, cat_tag)

    # Get the categories/subcategories
    assert lines[l].startswith(cat_tag)
    l += 1  # desired values are the line below the text "Category:"
    # The bounds check stops the scan at the end of the input if no closing tag follows
    while l < len(lines) and not lines[l].startswith(category_end_tags):
        # First match wins, so the order of known_categories matters
        for cat in known_categories:
            if lines[l].startswith(cat):
                # Store as name, category, subcategory (which is always on the same line as the category)
                variable_cats.append((var_name, cat, lines[l][len(cat):].strip()))
                break
        l += 1

    # All remaining text is the summary stats section (ignored), and the line marks the end of a variable description
    while l < len(lines) and not lines[l].startswith(end_tag):
        l += 1
    l += 1

    variables.append((var_name, label, src, src_page, dataset, rel_cmnts))

In [None]:
# Optional memory cleanup
# The parsed results now live in `variables` and `variable_cats`, so the raw text lines can be released
lines = None

In [None]:
# Put data into Pandas dataframes, and optimize the storage (reduces to 1/3 size)

# One row per variable description
vars_df = pandas.DataFrame(variables, columns=col_names)
# Setting types reduces memory size by 50%
vars_df[var_tag] = vars_df[var_tag].astype('string')
vars_df[lbl_tag] = vars_df[lbl_tag].astype('string')
vars_df[src_tag] = vars_df[src_tag].astype('category')
# Nullable unsigned type: variables without a source page keep a missing value
vars_df[page_tag] = vars_df[page_tag].astype('UInt8')
vars_df[sas_ds_tag] = vars_df[sas_ds_tag].astype('category')
# Treat the literal text "None" as a missing release comment.
# Assign the result back instead of calling replace(inplace=True) on the selected column:
# the in-place form is a chained assignment and does not update vars_df under pandas Copy-on-Write.
vars_df[rc_tag] = vars_df[rc_tag].replace('None', np.nan)
vars_df[rc_tag] = vars_df[rc_tag].astype('category')

# One row per (variable, category, subcategory) association
vars_cat_df = pandas.DataFrame(variable_cats, columns=[var_tag, cat_tag, sub_tag])
vars_cat_df[var_tag] = vars_cat_df[var_tag].astype('string')
vars_cat_df[cat_tag] = vars_cat_df[cat_tag].astype('category')
vars_cat_df[sub_tag] = vars_cat_df[sub_tag].astype('category')

In [None]:
# Optional cleanup: the dataframes hold the data now, so release the intermediate lists
variables = variable_cats = None

## Review results

In [None]:
# A quick look at the variable parse results

# print(str(vars_df.memory_usage(index=True).sum()) + " bytes")
vars_df.describe(exclude=["UInt8"])  # Exclude the page column because otherwise it is the only column rendered

There seem to be 27 variables that are repeated. Each repeated variable has a source listed as "Follow-up Visit Interview/Workbook" and a twin with a source "96-Month Close-Out/108-Mo Invw" (or something similar).

Labels are repeated because labels don't encode the visit data, and the same questions get asked at different visits.

In [None]:
# Show every row whose variable name occurs more than once, with the repeats next to each other
is_repeated = vars_df['Variable'].duplicated(keep=False)
vars_df.loc[is_repeated].sort_values(by=['Variable'])

In [None]:
# A quick look at the category/subcategory parse results
# (count, number of unique values, most frequent value and its frequency per column)

# print(str(vars_cat_df.memory_usage(index=True).sum()) + " bytes")
vars_cat_df.describe()

Many variables have more than one category/subcategory assigned to them, so the count being greater than 9250/9277 is expected.

In [None]:
# Sanity checks

# Every end-marker counted in the raw document must have produced exactly one row
row_count = vars_df.shape[0]
print(f"Variables in dataframe: {row_count}")
assert row_count == total_var_cnt

# Missing values are only expected in the page and release-comment columns
na_allowed = [page_tag, rc_tag]
for cn in col_names:
    if cn in na_allowed:
        continue
    if vars_df[cn].isna().sum() > 0:
        print("\n!!!Unexpected NA values in column: " + cn)

# Are the source page numbers sane?
max_page = vars_df.Page.max()
if max_page > 80:
    print("\n!!!Unexpectedly large page number: " + str(max_page))

# Are the variable names sane?
longest_name = vars_df.Variable.map(len).max()
if longest_name > 11:
    print("\n!!!Unexpectedly long variable name")


In [None]:
# View the distinct values of each categorical column for sanity

# Check unique data sources
sources = vars_df[src_tag].unique()
print("\nSources(" + str(len(sources)) + "):")
for name in sources:
    print(name)

# Check unique dataset filenames (ignoring the two-digit visit suffix)
visit_suffixes = ('00', '01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '99')
sas_df = sorted({
    ds_name[:-2] if ds_name.endswith(visit_suffixes) else ds_name
    for ds_name in vars_df[sas_ds_tag].unique()
})
print("\nDataset files(" + str(len(sas_df)) + "):")
for ds_name in sas_df:
    print(ds_name)
if len(sas_df) > 22:
    print("\n!!!Unexpected number of dataset file names")

# Check all categories
cats = sorted(vars_cat_df[cat_tag].unique())
print("\nCategories(" + str(len(cats)) + "):")
for c in cats:
    print(c)
if len(cats) > 32:
    print("\n!!!Unexpected number of category types")

# Check the distinct release comments.
# dropna() removes the missing marker up front; list.remove(np.nan) raises ValueError
# when there is no missing value or when the NaN in the list is a different object.
rcs = sorted(vars_df[rc_tag].dropna().unique())
print("\n" + rc_tag + "(" + str(len(rcs)) + "):")
for rc in rcs:
    print(rc)
if len(rcs) > 4:
    print("\n!!!Unexpected number of release comment types")


## Grouping variables by category/subcategory and visit

In [None]:
# Count variables per category/subcategory and visit, just to get a sense of things.
# The visit is taken from the first three characters of the variable name.
tmp_df = vars_cat_df.copy()
tmp_df["Visit"] = tmp_df[var_tag].str[:3]

# One boolean flag column per visit; summing the flags below yields the counts
for visit in visits:
    tmp_df[visit] = np.where(tmp_df["Visit"] == visit, True, False)

tmp_df = (
    tmp_df
    .drop(columns=[var_tag, "Visit"])
    .groupby([cat_tag, sub_tag], observed=True)
    .sum()
)

# Show every row instead of a truncated view
pandas.set_option('display.max_rows', None)
display(tmp_df)

### Dump variable count per cat/subcat and visit into a wiki table

In [None]:
# Render the per-category counts as a wiki table.
# One row per category/subcategory pair, one count column per visit; the category cell
# is written once per category and spans all of its subcategory rows.

table_parts = ['{| class="wikitable"\n ! Category !! Subcategory !! ' + ' !! '.join(visits.values())]
last_cat = ""
for group, sub in tmp_df.index:
    row = "\n|-\n"
    if group != last_cat:
        # First row of a new category: emit the spanning category cell
        row += "|rowspan=" + str(len(tmp_df.loc[group].index)) + " | " + group + '\n'
    row += "| " + sub
    row += "".join(" || " + str(tmp_df.loc[group].loc[sub][col]) for col in visits)
    table_parts.append(row)
    last_cat = group
table_parts.append("\n|}")
table_str = "".join(table_parts)

print(table_str)

## Grouping variables by variable source and visit

In [None]:
# Count variables per source and visit.
# The visit is taken from the first three characters of the variable name.
tmp_df = vars_df.copy()
tmp_df["Visit"] = tmp_df[var_tag].str[:3]
for visit in visits:
    tmp_df[visit] = np.where(tmp_df['Visit'] == visit, True, False)
tmp_df = tmp_df.drop(columns=[var_tag, 'Visit'])
# Sum only the per-visit flag columns: the remaining string/category columns cannot be
# summed (pandas 2.x raises TypeError for them) and a summed page number has no meaning
tmp_df = tmp_df.groupby([src_tag], observed=True)[list(visits)].sum()

pandas.set_option('display.max_rows', None)
# 'Page' is no longer part of the result; difference() is kept because it also
# yields the columns in sorted order for display
display(tmp_df[tmp_df.columns.difference(['Page'])])

### Dump variable counts per source and visit into a wiki table

In [None]:
# Render the per-source counts as a wiki table:
# one row per source, one count column per visit.

table_parts = ['{| class="wikitable"\n! Source !! ' + ' !! '.join(visits.values())]
# The index holds each source exactly once (it comes from a groupby), so every row
# starts with its source cell; no "same as previous row" tracking is needed
for src in tmp_df.index:
    counts = tmp_df.loc[src]
    cells = "".join(" || " + str(counts.loc[col]) for col in visits)
    table_parts.append("\n|-\n| " + src + cells)
table_parts.append("\n|}")
table_str = "".join(table_parts)

print(table_str)