# Variable groups

Aside from the categories, the majority of labels start with a prefix that groups related questions together. This notebook breaks those prefixes out and counts the variables in each group.

In [1]:
import pickle
import pandas as pd

In [2]:
# Constants
# Maps each visit code to its display name, in the order the codes are reported below.
visits = {
    'P02': 'IEI',
    'P01': 'SV',
    'V00': 'EV',
    'V01': '12m',
    'V02': '18m',
    'V03': '24m',
    'V04': '30m',
    'V05': '36m',
    'V06': '48m',
    'V07': '60m',
    'V08': '72m',
    'V09': '84m',
    'V10': '96m',
    'V11': '108m',
    'V99': "Outcomes",
}

In [3]:
# Created in 'Parse the VG_Form.pdf for OAI variable categories and sources' notebook
# NOTE: unpickling can execute arbitrary code, so only load a pickle you produced yourself.
# The context manager closes the file handle once the frame has been read.
with open('pkl/oai_vars_labels_sources.pkl', 'rb') as pkl_file:
    vars_df = pickle.load(pkl_file)

## Variables collected per visit

In [4]:
# For every visit code, report how many variable names begin with that code
for visit, visit_name in visits.items():
    from_this_visit = vars_df[vars_df.Variable.str.startswith(visit)]
    print(visit_name + ':\t\t' + str(len(from_this_visit)))

IEI:		50
SV:		233
EV:		1415
12m:		1004
18m:		244
24m:		1167
30m:		244
36m:		910
48m:		1209
60m:		255
72m:		796
84m:		257
96m:		1187
108m:		193
Outcomes:		86


## Variables per label prefix
First, let's (approximately) see how many labels have prefixes.

In [5]:
# Approximation: treat every label that contains a ':' as having a prefix
labels_with_colon = vars_df['Label'].str.contains(":")
print('Variables w/ a prefix: ' + str(labels_with_colon.sum()))

Variables w/ a prefix: 5932


In [6]:
# Provide a count of the number of variables with a given group prefix
def count_groups(df, groups, startswith=True):
    """Print how many labels in ``df`` match each ``group + ':'`` and the grand total.

    Args:
        df: DataFrame with a string ``Label`` column.
        groups: iterable of prefix strings; a ':' is appended to each before matching.
        startswith: when truthy, the label must begin with ``group + ':'``;
            otherwise ``group + ':'`` may appear anywhere in the label and is
            handed to ``Series.str.contains`` (so it is matched with that
            method's pattern rules).

    Returns:
        None. One ``group:<TAB>count`` line is printed per group, followed by a
        ``Total:`` line and a blank line. A label matching several groups is
        counted once per group, so the total can exceed the number of rows.
    """
    running_total = 0
    for prefix in groups:
        needle = prefix + ':'
        label_text = df.Label.str
        hits = label_text.startswith(needle) if startswith else label_text.contains(needle)
        n_matches = hits.sum()
        running_total += n_matches
        print(prefix + ':\t' + str(n_matches))
    print('Total:\t' + str(running_total) + '\n')

# Drop these prefixes to expose the next ones
def trim_groups(df, groups):
    """Remove every ``group + ':'`` occurrence from ``df.Label``, modifying ``df`` in place.

    Each group is used as a regular-expression pattern, except that literal
    parentheses in the name are escaped first so names such as
    'Height (standing)' match themselves.

    Args:
        df: DataFrame with a string ``Label`` column; the column is overwritten.
        groups: iterable of prefix strings (without the trailing ':').

    Returns:
        None.
    """
    for group in groups:
        # Escape parens so they are matched literally rather than opening a regex group.
        # Raw strings avoid the invalid '\(' escape-sequence warning.
        pattern = group.replace('(', r'\(').replace(')', r'\)') + ':'
        # regex=True is explicit because the pandas default changed to literal
        # matching in 2.0, which would make the escaped pattern never match.
        df['Label'] = df['Label'].str.replace(pattern, '', regex=True)

In [7]:
# Count against a copy of the loaded frame
tmp_df = vars_df.copy()

# Prefixes denoting a named question set / scoring system
question_groups = [
    'CAM',
    'ICOAP',
    'TMJ',
    'SF-12',
    'CSQ',
    'Charlson Comorbidity',
    'CES-D',
    'Block Brief 2000',
    'IADL',
    'ADL',
    'LLDI',
    'WORMS',
    'BLOKS',
    'MOAKS',
    'MIF',
]

count_groups(tmp_df, question_groups)

CAM:	152
ICOAP:	204
TMJ:	48
SF-12:	108
CSQ:	105
Charlson Comorbidity:	136
CES-D:	189
Block Brief 2000:	271
IADL:	45
ADL:	48
LLDI:	114
WORMS:	74
BLOKS:	34
MOAKS:	580
MIF:	306
Total:	2414



In [None]:
# Some of these are also sub-prefixes to the prior groups; dropping those
# prefixes first (by enabling the call below) will change the counts
#trim_groups(tmp_df, question_groups)

# Prefixes for the remaining (non-questionnaire) question groups
other_groups = [
    'Isometric strength',
    'Phlebotomy',
    'Exercise',
    'Urine collection',
    'Leisure activities',
    'Household activities',
    'Occupational activities',
    'Quality of life',
    'Laboratory processing',
    'Blood pressure',
    'Radial pulse',
    'Weight loss',
    'Repeated chair stands',
    '400-meter walk eligibility',
    'Left knee',
    'Right knee',
    'Right knee symptoms',
    'Left knee symptoms',
    'Left hip',
    'Right hip',
    'Doctor said you broke or fractured bone(s), since last visit about 12 months ago',
    'RA symptoms',
    'How heard about OAI',
    'Abdominal circumference',
    'Height (standing)',
    'Baseline knee x-ray',
    'Right knee baseline x-ray',
    'Left knee baseline x-ray',
    'Cancer type',
    'Cancer removed by surgery',
    'Doctor said cancer spread to other parts of body',
]

count_groups(tmp_df, other_groups)

In [None]:
# These prefixes are part of phrase sets (e.g. Left knee pain, Right knee pain, Left hip pain, etc.),
# so they are counted wherever they occur in the label rather than only at its start.
# Entries are passed to str.contains, hence the character classes such as '[Kk]'.
descriptive_groups = [
    'pain',
    'pain location',
    'knee difficulty',
    'stiffness',
    'stiffness location',
    '-meter walk',
    '[Kk]nee exam',
    '[Hh]and exam',
    '[Ww]eight',
    'attempted, unable to complete',
    'chair stand',
    'allux valgus',
]

count_groups(tmp_df, descriptive_groups, startswith=False)

#tmp[tmp['Label'].str.contains(':')]['Label'].unique()

In [None]:
# Start again from the full set of variables
tmp = vars_df.copy()

# Discard rows whose label begins with one of the question/other group prefixes
for group in question_groups + other_groups:
    has_prefix = tmp.Label.str.startswith(group + ':')
    tmp = tmp.drop(tmp[has_prefix].index)
# Discard rows whose label contains one of the descriptive prefixes anywhere
for group in descriptive_groups:
    has_prefix = tmp.Label.str.contains(group + ':')
    tmp = tmp.drop(tmp[has_prefix].index)

# Report what remains: number of rows and number of distinct labels
print('Variables w/out prefix\tUnique variables w/out prefix')
remaining_rows = len(tmp)
remaining_labels = tmp.Label.nunique()
print(str(remaining_rows) + '\t\t\t\t' + str(remaining_labels))

In [None]:
# Number of distinct values in the Variable column
len(set(vars_df['Variable']))