# Parse the VG_Form.pdf for OAI variable categories and sources

OAI has a number of PDFs that list all variables collected, their nature, and where they came from. The handiest is `General Information.zip/General/VG_Form.pdf`, as we can use it to get the categories and subcategories.

With over 9,000 variables collected, getting the categories can be critical just to start to make sense of the variables.

## Imports and constants

In [1]:
import re
import pickle
import sys
from tqdm import tqdm
import numpy as np
import pandas as pd
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer



In [2]:
# Constants

# Tags (literal text prefixes) to look for when parsing
hdr_tag = "Variable Guide"  # page header text; boxes starting with it are skipped
vn_tag = "Variable Name"
var_tag = "Variable"
src_tag = "Source"
page_tag = "Page"  # starts page-number boxes; also separates a source name from its page
lbl_tag = "Label"
sas_ds_tag = "SAS Dataset"
rc_tag = "Release Comments"
cat_tag = "Category"
sub_tag = "Subcategory"
stats1_tag = "N "  # either stats tag ends the category lines of a variable description
stats2_tag = "Value N"
end_tag = "_______________"  # rule that closes each variable description
ftr_tag = "Release Version"  # page footer text; boxes starting with it are skipped

# This list was created by first parsing and collecting all listed categories into a set
# Category lines are matched against these names by prefix; whatever follows the
# name on the same line is stored as the subcategory
known_categories = ["Bookkeeping", "Demographics", "Study eligibility", "Knee symptoms",
                    "Medical history, arthritis", "Medications", "Physical activity",
                    "Knee pain/OA status", "Other joint symptoms", "Back pain",
                    "Anthropometry", "Hand and/or foot exam", "Medical history, general", 
                    "Image Assessments: X-ray", "Joint imaging", "Health care access",
                    "Global function/disability/QOL", "Nutrition", "WOMAC/KOOS", 
                    "Knee function/QOL", "Blood pressure & pulse", "Performance measures", 
                    "Strength measures", "Knee exam", "Biospecimens collection",
                    "Image Assessments: MRI", "Accelerometry", "Hip function/QOL", 
                    "Hip symptoms", "Outcomes"]

# Visit codes mapped to short visit names. Not referenced by the cells shown here;
# presumably the codes are the visit prefixes of variable names - TODO confirm
visits = {'P02':'IEI', 'P01':'SV', 'V00':'EV', 'V01':'12m', 'V02':'18m', 'V03':'24m', 'V04':'30m', 'V05':'36m', 'V06':'48m', 'V07':'60m', 'V08':'72m', 'V09':'84m', 'V10':'96m', 'V11':'108m', 'V99':"Outcomes"}

# Column names of the variables dataframe, in the order of the tuples built by the parser
col_names = [var_tag, lbl_tag, src_tag, page_tag, sas_ds_tag, rc_tag]

## Original file ingestion

Parse the PDF once, and convert into a smaller serialized format for faster re-processing in the future (approx 10 min to read the whole PDF). Do this once, and afterwards only use the binary reader in the next cell.

In [None]:
# Parse PDF into a list of pages
# Each page is only a list of the PDF textboxes in the page,
# stored as (text, x0, y0, x1, y1) tuples

pages = []
for page_layout in tqdm(extract_pages(r"../data/pdfs/General/VG_Form.pdf")):
    # Keep only the text containers; every other layout element is dropped
    text_boxes = [
        (element.get_text(), element.x0, element.y0, element.x1, element.y1)
        for element in page_layout
        if isinstance(element, LTTextContainer)
    ]
    pages.append(text_boxes)

# Cache the parsed pages. The context manager closes (and flushes) the file
# even if pickling fails, instead of leaving the handle to the garbage collector.
with open("pkl/vg_form_pdf_elements.pkl", "wb") as pkl_file:
    pickle.dump(pages, pkl_file)

## Fast file ingestion
Read in the binary serialized data created in the prior section (approx 1 sec to read the binary form).

In [3]:
# Load the cached page list; the context manager closes the file handle
# as soon as the data has been read.
with open("pkl/vg_form_pdf_elements.pkl", "rb") as pkl_file:
    pages = pickle.load(pkl_file)

In [None]:
# Count the text boxes that start with the end-of-description rule: one per
# variable described in the document (kept for a later sanity check)
total_var_cnt = sum(
    1
    for page in pages
    for element in page
    if element[0].strip().startswith(end_tag)
)
print(total_var_cnt)

## Ensure the PDF textboxes are in the same order they are rendered on a page 

In [None]:
# Parse PDF elements into a list of text lines that match their rendering order

lines = []
for page in tqdm(pages):
    text_boxes = []  # (text, y0, y1, x0) for every single-line box on this page
    for element in page:
        # element is (text, x0, y0, x1, y1)
        txt = element[0].strip()

        # found text box containing a header or footer, ignore and move on
        if txt.startswith((hdr_tag, page_tag, ftr_tag)):
            continue

        # Check for multiple lines in a single text_box
        txt = txt.split('\n')
        if len(txt) == 1:
            text_boxes.append((txt[0].strip(), element[2], element[4], element[1]))  # txt, y0, y1, x0
        else:
            # Delete dups: This PDF has random text repeated in the PDF even though it only renders them once.
            # The stripped text is compared, so copies that differ only in surrounding
            # whitespace are removed too; dict.fromkeys keeps the first-seen order.
            txt = list(dict.fromkeys(x.strip() for x in txt))

            # Break multiline boxes into single line boxes of equal height
            line_cnt = len(txt)
            height = (element[4] - element[2]) / line_cnt  # (y1 - y0) / number of lines
            for i in range(line_cnt):
                text_boxes.append((txt[i], element[2]+(height*(line_cnt-(1+i))), element[4]-(height*i), element[1]))

    # Order text_boxes by vertical position (top edge y1, top to bottom),
    # then by horizontal position (left edge x0, left to right)
    text_boxes = sorted(text_boxes, key=lambda box: (-box[2], box[3]))

    # Concatenate text_boxes that render at the same vertical position
    # (shows up in the pdf as Label: value). The sentinel makes the first box of
    # every page start a new line.
    last_tb = ["", sys.maxsize, sys.maxsize, sys.maxsize]
    for tb in text_boxes:
        if (last_tb[1] - tb[1]) < 1:  # bottom edges less than 1 unit apart: same line of text
            lines[-1] = lines[-1] + " " + tb[0].strip()
        else:  # new line of text
            lines.append(tb[0].strip())
        last_tb = tb

In [None]:
# Optional memory cleanup: rebind the names to None so these variables no
# longer reference the per-page text boxes and the unpickled page list
text_boxes = None
pages = None

In [None]:
# Kludge: a few text boxes place the text "None" on the line before
# "Release Comments", yet the PDF renders them on one line as "Release Comments: None".
# This hunts those down and merges them into a single line before parsing.
l = 0
# Stop one short of the end: a pair needs a following line, so a trailing
# "None" can no longer index past the end of the list
while l + 1 < len(lines):
    if lines[l] == "None" and lines[l + 1].startswith(rc_tag):
        lines[l] = lines[l + 1] + " " + lines[l]
        lines.pop(l + 1)
    l += 1

## Parse text of the document into data about OAI variables
First parse into lists, then into pandas dataframes.
One dataframe includes each variable, label, source, source page, dataset file name, and release comment.
Since each variable can have more than one category/subcategory associated with it, a separate dataframe is used to hold these associations.

In [None]:
# Parse the text into data about the OAI variables

def get_var(lines, l, label, next_label=None):
    """Read one "<label>: <value>" field that starts at ``lines[l]``.

    Args:
        lines: List of text lines of the document.
        l: Index of the line that must start with ``label``.
        label: Field label expected at the start of ``lines[l]``.
        next_label: Label of the field that follows. When given, every line up
            to (not including) the first one starting with ``next_label`` is
            treated as a continuation of the value and appended after a single
            space. When omitted, the value is taken from ``lines[l]`` only.

    Returns:
        Tuple ``(value, l)``: the stripped value and the index of the first
        line after the field.

    Raises:
        AssertionError: If ``lines[l]`` does not start with ``label``.
        IndexError: If ``l`` is past the end of ``lines``, or ``next_label``
            is never found before the end of ``lines``.
    """
    if l >= len(lines):
        raise IndexError(
            f"expected '{label}' at line {l}, but there are only {len(lines)} lines"
        )
    # Raised explicitly (not via `assert`) so the check survives `python -O`
    if not lines[l].startswith(label):
        raise AssertionError(
            f"line {l}: expected a line starting with '{label}', got '{lines[l]}'"
        )

    # Skip the label plus the one-character separator that follows it
    parts = [lines[l][len(label) + 1:].strip()]
    l += 1

    # Some comments run beyond one line
    if next_label:
        while True:
            if l >= len(lines):
                raise IndexError(
                    f"reached the end of the document while reading '{label}': "
                    f"no line starting with '{next_label}' was found"
                )
            if lines[l].startswith(next_label):
                break
            parts.append(lines[l].strip())
            l += 1
    return " ".join(parts), l
    

variables = []      # (name, label, source, source page, dataset, release comments) per variable
variable_cats = []  # (name, category, subcategory); a variable may appear several times

# Patterns are compiled once, outside the loop.
# Separators between a source name and its page reference
page_sep_re = re.compile(page_tag + "|" + page_tag.lower() + "| p |, p")
# Page reference glued to the source name, e.g. p50Q (raw string: "\d" is a regex escape)
page_num_re = re.compile(r"p\d+")
# A line starting with any of these ends the category lines of a variable description
cat_stop_tags = (stats1_tag, stats2_tag, end_tag)

l = 0
while l < len(lines):
    # The first line describes where the variable came from
    src_page = None
    src = lines[l].strip()
    # If applicable, split out src and src_page
    idx = page_sep_re.search(lines[l])
    if idx:
        src = lines[l][:idx.start()].strip()
        src_page = lines[l][idx.end():].strip()
    else:
        idx = page_num_re.search(lines[l])
        if idx:
            src = lines[l][:idx.start()].strip()
            src_page = lines[l][idx.start()+1:].strip()  # +1 drops the leading "p"

    # Clean up source names that are different just because of spelling
    src = src.replace("Follow-Up", "Follow-up")
    src = re.sub("Quest$", "Questionnaire", src)
    src = src.replace("Checklist", "").strip()

    l += 1

    # Next is the variable name, never larger than a single line
    var_name, l = get_var(lines, l, vn_tag)

    # Next is the variable label
    label, l = get_var(lines, l, lbl_tag, sas_ds_tag)

    # Get the name of the SAS Dataset the variable is stored in
    dataset, l = get_var(lines, l, sas_ds_tag)

    # Get the release comments
    rel_cmnts, l = get_var(lines, l, rc_tag, cat_tag)

    # Get the categories/subcategories
    assert(lines[l].startswith(cat_tag))
    l += 1  # desired values are the line below the text "Category:"
    # Bounded by the end of the document, like the skip loop below, so a
    # truncated last entry cannot index past the end of `lines`
    while l < len(lines) and not lines[l].startswith(cat_stop_tags):
        for cat in known_categories:
            if lines[l].startswith(cat):
                # Store as name, category, subcategory (which is always on the same line as the category)
                variable_cats.append((var_name, cat, lines[l][len(cat):].strip()))
                break
        l += 1

    # All remaining text is the summary stats section (ignored), and the rule line marks the end of a variable description
    while l < len(lines) and not lines[l].startswith(end_tag):
        l += 1
    l += 1

    variables.append((var_name, label, src, src_page, dataset, rel_cmnts))

In [None]:
# Optional memory cleanup: rebind the name to None so this variable no longer
# references the list of text lines
lines = None

## Clean and Save Variable Data In Dataframe

In [None]:
# Put variable data into Pandas dataframe, and optimize the storage (reduces to 1/3 size)
vars_df = pd.DataFrame(variables, columns=col_names)

# A release comment of 'None' carries no information: store it as a missing value.
# The result is assigned back to the column; an inplace replace on the selected
# column is a chained update that does not reach the dataframe under pandas
# Copy-on-Write.
vars_df[rc_tag] = vars_df[rc_tag].replace('None', np.nan)

# Setting types, reduces memory size by 50%
vars_df = vars_df.astype({
    var_tag: 'string',
    lbl_tag: 'string',
    src_tag: 'category',
    page_tag: 'category',
    sas_ds_tag: 'category',
    rc_tag: 'category',
})

In [None]:
# Sanity checks
print('Variables in dataframe: ' + str(vars_df.shape[0]))
assert vars_df.shape[0] == (total_var_cnt)

# Are NA values only in the expected columns?
for cn in col_names:
    na_cnt = vars_df[cn].isna().sum()
    if na_cnt > 0 and cn not in [page_tag, rc_tag]:
        print('\n!!!Unexpected NA values in column: ' + cn)

# Are the source page numbers sane? e.g. 4 (extension)
# Missing pages are dropped and the rest converted to plain str before measuring,
# so len() is never applied to a missing value or to a categorical.
page_lens = vars_df.Page.dropna().astype(str).map(len)
if page_lens.max() > 13:
    # Show the longest page value (an unordered categorical has no max())
    print('\n!!!Unexpectedly large page number: ' + str(vars_df.Page.loc[page_lens.idxmax()]))

# Are the variable names sane?
if vars_df.Variable.map(len).max() > 11:
    print('\n!!!Unexpectedly long variable name')

# View values for sanity
# Check unique data sources
sources = vars_df.Source.unique()
print('\nSources(' + str(len(sources)) + '):')
for name in sources:
    print(name)

# Check unique dataset filenames (ignoring visit suffix)
visit_suffixes = ('00', '01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '99')
sas_df = sorted({
    ds_name[:-2] if ds_name.endswith(visit_suffixes) else ds_name
    for ds_name in vars_df[sas_ds_tag].unique()
})
print('\nDataset files(' + str(len(sas_df)) + '):')
for ds_name in sas_df:
    print(ds_name)
if len(sas_df) > 22:
    print('\n!!!Unexpected number of dataset file names')

# Check unique categories in release comments
# dropna() removes the missing entry whether or not one is present
rcs = sorted(vars_df[rc_tag].dropna().unique())
print('\n' + rc_tag + '(' + str(len(rcs)) + '):')
for rc in rcs:
    print(rc)
if len(rcs) > 4:
    print('\n!!!Unexpected number of release comment types')

In [None]:
# Optional cleanup: rebind the name to None so this variable no longer
# references the list of parsed tuples (the data now lives in vars_df)
variables = None

### Review/cleanup results

In [None]:
# A quick look at the variable parse results: summary statistics per column

# Uncomment to print the memory footprint of the dataframe
# print(str(vars_df.memory_usage(index=True).sum()) + " bytes")
vars_df.describe()

From this we see that 27 variables are listed twice.

Labels are repeated because most labels don't encode the visit data, and the same questions get asked at different visits.

### Remove repeated variables

In [None]:
# Look at repeated variables: every row whose Variable value occurs more than
# once (keep=False marks all occurrences), sorted so the copies sit together
vars_df.loc[vars_df.Variable.duplicated(keep=False)].sort_values(by=['Variable'])

There seem to be 27 variable names that are repeated. Each repeated variable has a source listed as "Follow-up Visit Interview/Workbook" and a twin with a source "96-Month Close-Out/108-Mo Invw" (or something similar).

I haven't gone through all the data yet, but I'm not convinced that the source documents are recorded very accurately. I can't see any reason why a small handful of questions are marked '96-Month Close-Out/108-Mo Invw' or '96-Month Close-Out Follow-up Intvw'. In fact, only one 96m variable is marked with '96-Month Close-Out/108-Mo Invw': the interview date. This seems like more human inconsistency in bookkeeping. For now, let's fix this by reducing these sources to the common label "Follow-up Visit Interview". Note that the source pages for the duplicates are lost in this.

Feel free to fix that if you need it. Ideally, sources would be replaced with the names of the PDF files in the CRFs Workbooks.zip archive.

In [None]:
# Remove the repeated variables: of each duplicated name, drop the copy that
# came from one of the two close-out sources.
close_out_sources = ('96-Month Close-Out/108-Mo Invw', '96-Month Close-Out Follow-up Intvw')
for stale_src in close_out_sources:
    repeated = vars_df.Variable.duplicated(keep=False)
    from_stale_src = vars_df.Source == stale_src
    vars_df = vars_df.drop(vars_df.loc[repeated & from_stale_src].index)
# Fold the close-out sources that remain into the common source name
# (this does mix up the source pages)
for stale_src in close_out_sources:
    vars_df.Source = vars_df.Source.str.replace(stale_src, 'Follow-up Visit Interview')
vars_df.shape[0]

### Clean labels

We already track the visit information in the variable name, and the variable source in its own column, there is no need for either to be appended to the label. Doing so prevents us from knowing how many unique variables there are. 

In [None]:
# Label prefixes, written as regular-expression fragments (parentheses escaped).
# Raw strings keep the backslashes without relying on Python passing unknown
# escape sequences such as "\(" through unchanged.
label_prefixes = ['SV', 'SV/EV XR', 'IEI', 'EV', 'EV WBK', 'EV XR', 'EV MRI', 'SAQ',
                  r'BL kXR reading \(JD\)', 'BL/FU kXR reading', r'BL/FU kXR reading \(JD\)', r'BL/FU kXR reading \(BU\)',
                  r'BL/FU kMRI reading \(BI\)', r'BL/FU kMRIreading \(BI\)', r'BL/FU kMRI reading \(FE\)', r'BL/FU kMRI reading \(VS\)',
                  r'FU flXR reading \(DC\)', r'FU flXR reading \(JD\)',
                  'FU INT', 'FU WKB', 'FU WBK', 'FU SAQ', 'FU MRI', 'FU XR',
                  'Accelerometry', 'MISSED', 'Enr Expn', 'Outcomes']

# Look at how often each prefix is used (a label counts when it starts with "<prefix>:")
total = 0
for pre in label_prefixes:
    count = vars_df.Label.str.count(r'^' + pre + ':').sum()
    total += count
    print(pre + '\t' + str(count))
print(total)

Clearly, existing labels have typos and the prefixes aren't consistent.

Some simple examples:
* SV/EV XR - no data with this label is part of the SV datasets (all variables start with V00)
* EV vs EV WBK - 499 variables with the former, 1 with the latter prefix
* FU WKB vs FU WBK - 15 with the former label, 1600 with the latter prefix
* BL/FU kMRIreading (BI) - clearly  a typo shared by 5 variables
* SAQ - mostly items from the V00 visit, but 3 variables with this prefix are from later visits

Step one: Remove these prefixes, they don't seem to add any information that isn't stored elsewhere

In [None]:
# Strip each "<prefix>:" from the start of the labels.
# The patterns are regular expressions (anchored with ^, escaped parentheses), so
# regex matching is requested explicitly: since pandas 2.0 str.replace treats the
# pattern as a literal string by default and would leave every label untouched.
for pre in label_prefixes:
    vars_df.Label = vars_df.Label.str.replace(r'^' + pre + ':', '', regex=True)

Depending on the visit, the same question may have a different question number. Knowing the question number may be handy, but put it in its own data field. We are trying to track similar questions across visits and these question number prefixes obscure that.

Step two: remove the question number prefixes

In [None]:
# I haven't found if the asterisk in a label has any meaning, remove for now (2000+ variables)
vars_df.Label = vars_df.Label.str.lstrip('*')

# Leading question number: optional Q, optional D, digits, then letters/digits/parentheses,
# ended by a period or a space (note that 2 variables don't start the question label with Q, grr)
question_re = r'(^[Q]?[D]?\d+[a-zA-Z\(\)0-9]*[\. ])'

# Pull question label into its own column
vars_df['Question'] = vars_df.Label.str.extract(question_re)[0].str.rstrip('.')
vars_df.Question = vars_df.Question.str.strip()
# regex=True is required: since pandas 2.0 the pattern is a literal string by default
vars_df.Label = vars_df.Label.str.replace(question_re, '', regex=True)
vars_df.Label = vars_df.Label.str.strip()
vars_df.Question = vars_df.Question.astype('category')


In [None]:
# Are the question numbers sane?
# Labels without a question number leave Question missing. Those are dropped and
# the rest converted to plain str before measuring, so len() is never applied to
# a missing value or to a categorical.
if vars_df.Question.dropna().astype(str).map(len).max() > 7:
    print('\n!!!Unexpectedly long question number')

Step three: correct some obvious label typos.

In [None]:
# Correct typos in labels (found while writing other scripts)
# The first two are plain text substitutions; the last one is a regular
# expression matching an empty label, so the regex mode is stated explicitly
# (since pandas 2.0 the default treats the pattern as a literal string).
vars_df.Label = vars_df.Label.str.replace('Isometric Strength', 'Isometric strength', regex=False)
vars_df.Label = vars_df.Label.str.replace('RA Symptoms', 'RA symptoms', regex=False)
vars_df.Label = vars_df.Label.str.replace(r'^$', 'Left knee difficulty: in car/out of car, last 7 days', regex=True)  # The label was blank on V01-V11DILKN7

In [None]:
# Number of distinct label texts in the dataframe
vars_df.Label.nunique()

Only roughly 2,500 variables are unique. That is easier than 9,250.

### Save the variable data
Saving as a Pandas dataframe for quick loading in other notebooks.

In [None]:
# Save the variable dataframe; the context manager closes (and flushes) the
# file even if pickling fails.
with open('pkl/oai_vars_labels_sources.pkl', 'wb') as pkl_file:
    pickle.dump(vars_df, pkl_file)

## Clean and Save Category Data In Dataframe

In [None]:
# Build the variable -> category/subcategory dataframe with compact dtypes
# (reduces to 1/3 size), then drop the rows duplicated by the parse
vars_cat_df = (
    pd.DataFrame(variable_cats, columns=[var_tag, cat_tag, sub_tag])
    .astype({var_tag: 'string', cat_tag: 'category', sub_tag: 'category'})
    .drop_duplicates()
)

In [None]:
# A quick look at the category/subcat parse results: summary statistics per column
# Uncomment to print the memory footprint of the dataframe
# print(str(vars_cat_df.memory_usage(index=True).sum()) + " bytes")
vars_cat_df.describe()

Many variables have more than one category/subcategory assigned to them, so the count being greater than 9250 is expected.

In [None]:
# View categories for sanity

# List every distinct category in alphabetical order and warn when there are
# more of them than expected
cats = sorted(vars_cat_df.Category.unique())
print('\nCategories(' + str(len(cats)) + '):')
for c in cats:
    print(c)
if len(cats) > 32:
    print('\n!!!Unexpected number of category types')

In [None]:
# Optional cleanup: rebind the name to None so this variable no longer
# references the list of parsed tuples (the data now lives in vars_cat_df)
variable_cats = None

### Save the category data
Saving as a Pandas dataframe for quick loading in other notebooks.

In [None]:
# Save the category dataframe; the context manager closes (and flushes) the
# file even if pickling fails.
with open('pkl/oai_vars_categories_subcategories.pkl', 'wb') as pkl_file:
    pickle.dump(vars_cat_df, pkl_file)