In [144]:
import pandas as pd
import numpy as np
from itertools import combinations

## Load and Prepare Data
Data downloaded from: https://www.cdc.gov/healthyyouth/data/yrbs/data.htm
Column widths and locations found here: https://www.cdc.gov/healthyyouth/data/yrbs/pdf/2015/2015_yrbs-data-users_guide_smy_combined.pdf


In [268]:
# Hand code column locations and question titles
col_widths = [16, 1, 1, 1, 1, 7, 5, 5] + [1]*92 + [55] + [1]*90
col_names = ['BLANK'] +['Q{}'.format(x) for x in range(1, 100)] + ['BLANK'] + ['QN{}'.format(x) for x in range(8, 100)]
col_names.remove('QN67') # Does not have QN equivalent
col_names.remove('QN68') # Does not have QN equivalent

print(col_names)

['BLANK', 'Q1', 'Q2', 'Q3', 'Q4', 'Q5', 'Q6', 'Q7', 'Q8', 'Q9', 'Q10', 'Q11', 'Q12', 'Q13', 'Q14', 'Q15', 'Q16', 'Q17', 'Q18', 'Q19', 'Q20', 'Q21', 'Q22', 'Q23', 'Q24', 'Q25', 'Q26', 'Q27', 'Q28', 'Q29', 'Q30', 'Q31', 'Q32', 'Q33', 'Q34', 'Q35', 'Q36', 'Q37', 'Q38', 'Q39', 'Q40', 'Q41', 'Q42', 'Q43', 'Q44', 'Q45', 'Q46', 'Q47', 'Q48', 'Q49', 'Q50', 'Q51', 'Q52', 'Q53', 'Q54', 'Q55', 'Q56', 'Q57', 'Q58', 'Q59', 'Q60', 'Q61', 'Q62', 'Q63', 'Q64', 'Q65', 'Q66', 'Q67', 'Q68', 'Q69', 'Q70', 'Q71', 'Q72', 'Q73', 'Q74', 'Q75', 'Q76', 'Q77', 'Q78', 'Q79', 'Q80', 'Q81', 'Q82', 'Q83', 'Q84', 'Q85', 'Q86', 'Q87', 'Q88', 'Q89', 'Q90', 'Q91', 'Q92', 'Q93', 'Q94', 'Q95', 'Q96', 'Q97', 'Q98', 'Q99', 'BLANK', 'QN8', 'QN9', 'QN10', 'QN11', 'QN12', 'QN13', 'QN14', 'QN15', 'QN16', 'QN17', 'QN18', 'QN19', 'QN20', 'QN21', 'QN22', 'QN23', 'QN24', 'QN25', 'QN26', 'QN27', 'QN28', 'QN29', 'QN30', 'QN31', 'QN32', 'QN33', 'QN34', 'QN35', 'QN36', 'QN37', 'QN38', 'QN39', 'QN40', 'QN41', 'QN42', 'QN43', 'QN44', 'QN

In [269]:
#df = pd.read_csv('sadc_2015_national.dat', sep='\t', skiprows=0)
r = pd.read_fwf('yrbs2015.dat', widths=col_widths,skiprows=0, header=None)
r.columns = [col_names]


In [270]:
r.head()

Unnamed: 0,BLANK,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,...,QN90,QN91,QN92,QN93,QN94,QN95,QN96,QN97,QN98,QN99
0,XX,5.0,2.0,3.0,2.0,C,1.73,54.4,3.0,2.0,...,2.0,2.0,1.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0
1,XX,5.0,1.0,3.0,1.0,D,1.5,51.2,6.0,2.0,...,1.0,1.0,,2.0,1.0,2.0,2.0,1.0,2.0,2.0
2,XX,6.0,2.0,4.0,2.0,E,1.9,66.6,8.0,1.0,...,1.0,1.0,,2.0,1.0,2.0,2.0,2.0,2.0,1.0
3,XX,6.0,2.0,4.0,2.0,E,,,,2.0,...,2.0,2.0,,1.0,2.0,2.0,2.0,2.0,2.0,2.0
4,XX,5.0,1.0,3.0,2.0,E,1.63,68.4,9.0,1.0,...,2.0,1.0,,1.0,2.0,2.0,2.0,2.0,2.0,1.0


In [271]:
# Select subset of QN questions where answers have been coded as binary yes/no questions
qn = r.loc[:, 'QN8':'QN99']
qn.head()

Unnamed: 0,QN8,QN9,QN10,QN11,QN12,QN13,QN14,QN15,QN16,QN17,...,QN90,QN91,QN92,QN93,QN94,QN95,QN96,QN97,QN98,QN99
0,1.0,2.0,1.0,,,2.0,2.0,2.0,2.0,1.0,...,2.0,2.0,1.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0
1,1.0,2.0,2.0,,,2.0,2.0,2.0,2.0,2.0,...,1.0,1.0,,2.0,1.0,2.0,2.0,1.0,2.0,2.0
2,,2.0,2.0,,,2.0,2.0,2.0,1.0,,...,1.0,1.0,,2.0,1.0,2.0,2.0,2.0,2.0,1.0
3,1.0,2.0,2.0,2.0,1.0,1.0,2.0,2.0,2.0,2.0,...,2.0,2.0,,1.0,2.0,2.0,2.0,2.0,2.0,2.0
4,,2.0,1.0,,,2.0,2.0,2.0,2.0,2.0,...,2.0,1.0,,1.0,2.0,2.0,2.0,2.0,2.0,1.0


In [272]:
# In the case of the QN questions, 1 is a positive 'yes' answer
# eg "Percentage of students who rarely or never wore a seat belt"
qn.fillna(0, inplace=True)
qn.replace(to_replace=2, value=0, inplace=True)


In [134]:
qn.head()

Unnamed: 0,QN8,QN9,QN10,QN11,QN12,QN13,QN14,QN15,QN16,QN17,...,QN90,QN91,QN92,QN93,QN94,QN95,QN96,QN97,QN98,QN99
0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
3,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0


In [273]:
# Let's get the text of the questions too
with open('question_data.txt') as f:
    content = f.readlines()
content = [x.strip() for x in content] 
q_text = [x for x in content if 'Percentage of' in x]
q_text = [x.split('Percentage of ')[1] for x in q_text]


In [274]:
len(q_text)

90

### Now we're ready to start looking at lift and leverage!

In [275]:
# First, make a table that has p(A) for all questions
pA = qn.mean(axis=0)
pA = pd.DataFrame(pA)
pA['Text'] = q_text

In [276]:
pA

Unnamed: 0,0,Text
QN8,0.474462,students who rarely or never wore a bicycle he...
QN9,0.056068,students who rarely or never wore a seat belt ...
QN10,0.218062,students who rode with a driver who had been d...
QN11,0.048259,students who drove when drinking alcohol (one ...
QN12,0.220878,students who texted or e-mailed while driving ...
QN13,0.161674,"students who carried a weapon (such as a gun, ..."
QN14,0.045635,students who carried a gun (on at least 1 day ...
QN15,0.045059,students who carried a weapon on school proper...
QN16,0.063172,students who did not go to school because they...
QN17,0.060804,students who were threatened or injured with a...


In [277]:
qs = qn.columns
n = len(qs)
n = 80
num_pairs = (n * (n-1)) / 2.
print(n, num_pairs)

80 3160.0


In [278]:
# Every possible combination of questions
qs = qn.columns
pairs = combinations(qs, 2)
pairs = [pair for pair in pairs]


In [282]:
cols = ['pair', 'Question A', 'Question B', 'pApB', 'pAB']
df = pd.DataFrame(columns=cols)

In [280]:
ls = []
for pair in pairs:
    a = pair[0]
    b = pair[1]
    col = qn.apply(lambda x: x[a] == x[b] == 1, axis=1)
    pAB = col.mean(axis=0)
    pApB = pA.loc[a, 0] * pA.loc[b, 0]
    ls.append([pair, pA.loc[a, 'Text'], pA.loc[b, 'Text'], pApB, pAB])
#print(ls)

In [283]:

d = pd.DataFrame(ls, columns=cols)

In [284]:
d.head()

Unnamed: 0,pair,Question A,Question B,pApB,pAB
0,"(QN8, QN9)",students who rarely or never wore a bicycle he...,students who rarely or never wore a seat belt ...,0.026602,0.03085
1,"(QN8, QN10)",students who rarely or never wore a bicycle he...,students who rode with a driver who had been d...,0.103462,0.115719
2,"(QN8, QN11)",students who rarely or never wore a bicycle he...,students who drove when drinking alcohol (one ...,0.022897,0.027138
3,"(QN8, QN12)",students who rarely or never wore a bicycle he...,students who texted or e-mailed while driving ...,0.104798,0.123528
4,"(QN8, QN13)",students who rarely or never wore a bicycle he...,"students who carried a weapon (such as a gun, ...",0.076708,0.08967


In [285]:
d['Lift'] = d.apply(lambda x: x['pAB'] / x['pApB'], axis=1)
d['Leverage'] = d.apply(lambda x: x['pAB'] - x['pApB'], axis=1)


In [287]:
d.sort_values(by='Leverage', ascending=False)

Unnamed: 0,pair,Question A,Question B,pApB,pAB,Lift,Leverage
3308,"(QN60, QN69)",students who ever had sexual intercourse,students who described themselves as slightly ...,0.194207,0.376664,1.939496,0.182457
3304,"(QN60, QN63)",students who ever had sexual intercourse,students who were currently sexually active (s...,0.105048,0.274194,2.610171,0.169145
3413,"(QN63, QN69)",students who were currently sexually active (s...,students who described themselves as slightly ...,0.140293,0.273361,1.948499,0.133068
2294,"(QN39, QN40)",students who ever used electronic vapor produc...,students who currently used electronic vapor p...,0.113292,0.244496,2.158104,0.131204
2731,"(QN47, QN49)",students who ever used marijuana (one or more ...,students who currently used marijuana (one or ...,0.083150,0.213006,2.561696,0.129855
2301,"(QN39, QN47)",students who ever used electronic vapor produc...,students who ever used marijuana (one or more ...,0.175793,0.297619,1.693011,0.121826
2520,"(QN43, QN44)",students who currently drank alcohol (at least...,students who drank five or more drinks of alco...,0.051970,0.171211,3.294395,0.119241
3599,"(QN71, QN72)",students who did not drink fruit juice (100% f...,students who did not eat fruit (one or more ti...,0.137070,0.251664,1.836024,0.114594
2410,"(QN41, QN43)",students who ever drank alcohol (at least one ...,students who currently drank alcohol (at least...,0.184215,0.295699,1.605182,0.111484
3411,"(QN63, QN65)",students who were currently sexually active (s...,students who used a condom (during last sexual...,0.041169,0.149450,3.630112,0.108280


In [288]:
d.to_csv('final_lift_leverage.csv', index=False)


In [250]:
print(q_text)


['students who rarely or never wore a seat belt (when riding in a car driven by someone else)', 'students who rode with a driver who had been drinking alcohol (in a car or other vehicle one or more times during the 30 days before the survey)', 'students who drove when drinking alcohol (one or more times during the 30 days before the survey, among students who had driven a car or other vehicle during the 30 days before the survey)', 'students who texted or e-mailed while driving a car or other vehicle (on at least 1 day during the 30 days before the survey, among students who had driven a car or other vehicle during the 30 days before the survey)', 'students who carried a weapon (such as a gun, knife, or club on at least 1 day during the 30 days before the survey)', 'students who carried a gun (on at least 1 day during the 30 days before the survey)', 'students who carried a weapon on school property (such as a gun, knife, or club on at least 1 day during the 30  days before the survey)

In [248]:
print(len(q_text))

92
