Make sure you have plotly installed (the colour scales below additionally need colorlover), for example with:
`conda install plotly`

In [24]:
import pandas as pd
import sqlalchemy as sq # database connection
from IPython.display import display
from IPython.display import HTML
import plotly.plotly as py # interactive graphing
from plotly.graph_objs import Bar, Scatter, Marker, Layout 
import numpy as np

Connect to database:

In [2]:
# Two SQLite connections: `engine` serves the processed tables read in this
# notebook (question_responses, test_responses, numerical_attributes,
# questions_key); `engine2` serves the user_info table.
engine, engine2 = (
    sq.create_engine(url)
    for url in ('sqlite:///OKC_Processed_DB.db', 'sqlite:///OKCdatabase.db')
)

Get tables:

In [3]:
# Question responses; columns are later selected by their 'q<ID>_' name prefix.
df_rsp = pd.read_sql_table(table_name='question_responses', con=engine)

In [484]:
# Test responses; only the q18698_ columns are used further down.
df_tst = pd.read_sql_table(table_name='test_responses', con=engine)

In [4]:
df_u = pd.read_sql_table(table_name='user_info', con=engine2)
# Per-user demographic columns, kept as separate Series for the groupings below.
user_state = df_u['d_country']   # matched against US state codes later on
gender = df_u['d_gender']
religion = df_u['d_religion_type']
job = df_u['d_job']

In [103]:
df_num = pd.read_sql_table(table_name='numerical_attributes', con=engine)
# CA is mapped further down as the "Cognitive Ability Score".
# Presumably it is stored as text, hence the numeric conversion -- TODO confirm.
CA = pd.to_numeric(df_num['CA'])

In [105]:
# Preview the first five rows of the numerical attributes table.
df_num.head(n=5)

Unnamed: 0,d_income,d_age,lf_max_age,lf_min_age,p_sprit,p_sexdrive,p_pure,p_oldfash,p_adven,p_kinky,...,p_honest,p_submissive,p_laidback,p_cool,p_passion,p_greed,p_drug,p_capi,p_progress,CA
0,,25,36,25,,,,16.0,,-60.0,...,,,,17.0,90.0,,,,,0.571282953049484
1,,20,26,18,,,,,,,...,,,,,,,,,,
2,,22,40,26,40.0,-35.0,,73.0,,-44.0,...,9.0,,,,32.0,,,,,1.01226429848596
3,,29,38,28,-22.0,,,-34.0,,,...,,,,,,,,,,
4,,30,46,22,-18.0,,-3.0,-56.0,72.0,,...,,,,,,,,,,0.418363901130365


# Define Functions, Initialize global parameters

In [92]:
def get_columns_multi(df,keylist):
    """Select the columns of ``df`` whose names start with any of the given prefixes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to select from; its column names are expected to be strings.
    keylist : iterable of str
        Column-name prefixes, e.g. ``['q80_']``.

    Returns
    -------
    pandas.DataFrame
        The matching columns, grouped by prefix in ``keylist`` order and, within
        each prefix, in their original column order. A column that matches
        several prefixes appears once per matching prefix.
    """
    # The column list is the same for every prefix, so build it only once.
    all_names = list(df.columns)
    selected = []
    for key in keylist:
        selected.extend(name for name in all_names if name.startswith(key))
    return df.loc[:, selected]
import colorlover as cl
# Two-letter codes of the 50 US states plus DC (51 entries); used to keep only
# US rows when users are aggregated by state.
US_states = (
    "AL AK AZ AR CA CO CT DC DE FL GA "
    "HI ID IL IN IA KS KY LA ME MD "
    "MA MI MN MS MO MT NE NV NH NJ "
    "NM NY NC ND OH OK OR PA RI SC "
    "SD TN TX UT VT VA WA WV WI WY"
).split()

# Interactive Map

### Male-Female Ratio

In [526]:
# Encode gender as a percentage so that a per-state mean reads as "% female":
# 'Woman' -> 100, 'Man' -> 0; every other value stays NaN and is dropped below.
female_pct = CA*np.nan   # CA only serves as an all-NaN template with the user index
female_pct[gender == 'Woman'] = 100
female_pct[gender == 'Man'] = 0

# NOTE(review): concat aligns on the row index, so this assumes user_state and CA
# (read from two different databases) list the users in the same order -- confirm.
by_user = pd.concat([user_state, female_pct], axis=1)
by_user.columns = ['state', 'class']
state_mean = by_user.dropna(subset=['class']).groupby('state').mean()

# Keep only the groups whose label is a US state code.
us_rows = [[code, pct]
           for code, pct in zip(state_mean.index, state_mean.iloc[:, 0])
           if code in US_states]
df_gender = pd.DataFrame.from_records(us_rows)
df_gender.columns = ['state', '%female']

# Female % map object (the only map trace that starts out visible).
colors = cl.scales['6']['div']['RdYlGn']
# Pair every colour with an evenly spaced position in [0, 1].
scl = [[pos, color] for pos, color in zip(np.linspace(0, 1, len(colors)), colors)]
data0 = {
    'type': 'choropleth',
    'colorscale': scl,
    'autocolorscale': False,
    'locations': df_gender['state'],
    'z': df_gender['%female'].astype(float),
    'locationmode': 'USA-states',
    'marker': {'line': {'color': 'rgb(255,255,255)', 'width': 2}},
    'colorbar': {'title': "% Female Users"},
    'visible': True,
}

### Get drug-use dataset

In [521]:
import warnings

# Presumably one 0/1 indicator column per answer option of question q80 -- the
# four columns are renamed to the answer they are taken to stand for.
y_drugs = get_columns_multi(df_rsp, ['q80_'])
y_drugs.columns = ['occasional', 'regular', 'never', 'past']
warnings.filterwarnings('ignore')  # hide warnings

# 100 for any kind of use (occasional / regular / past), 0 for never;
# users without a marked answer stay NaN and are dropped before averaging.
used_pct = y_drugs.iloc[:, [0]]*np.nan
for answer in ('occasional', 'regular', 'past'):
    used_pct[y_drugs[answer] == 1] = 100
used_pct[y_drugs['never'] == 1] = 0

by_user = pd.concat([user_state, used_pct], axis=1)
by_user.columns = ['state', 'class']
state_mean = by_user.dropna(subset=['class']).groupby('state').mean()

# Keep only the groups whose label is a US state code.
us_rows = [[code, pct]
           for code, pct in zip(state_mean.index, state_mean.iloc[:, 0])
           if code in US_states]
df_drugs = pd.DataFrame.from_records(us_rows)
df_drugs.columns = ['state', '%UsedHardDrugs']

# Drug-use map object (hidden until picked from the dropdown).
colors = cl.scales['6']['div']['PRGn']
scl = [[pos, color] for pos, color in zip(np.linspace(0, 1, len(colors)), colors)]
data1 = {
    'type': 'choropleth',
    'colorscale': scl,
    'autocolorscale': False,
    'locations': df_drugs['state'],
    'z': df_drugs['%UsedHardDrugs'].astype(float),
    'locationmode': 'USA-states',
    'marker': {'line': {'color': 'rgb(255,255,255)', 'width': 2}},
    'colorbar': {'title': "% had experience with hard drugs"},
    'visible': False,
}

### Get open-relationship dataset

In [370]:
# The two q325_ columns are treated as 0/1 indicators for the 'no' and 'yes' answers.
y_openre = get_columns_multi(df_rsp, ['q325_'])
y_openre.columns = ['no', 'yes']

# 100 for 'yes', 0 for 'no'; users without a marked answer stay NaN.
open_pct = y_openre.iloc[:, [0]]*np.nan
open_pct[y_openre['yes'] == 1] = 100
open_pct[y_openre['no'] == 1] = 0

by_user = pd.concat([user_state, open_pct], axis=1)
by_user.columns = ['state', 'class']
state_mean = by_user.dropna(subset=['class']).groupby('state').mean()

# Keep only the groups whose label is a US state code.
us_rows = [[code, pct]
           for code, pct in zip(state_mean.index, state_mean.iloc[:, 0])
           if code in US_states]
df_openre = pd.DataFrame.from_records(us_rows)
df_openre.columns = ['state', '%OpenTo']

# Open-relationship map object (hidden until picked from the dropdown).
colors = cl.scales['6']['div']['RdGy']
scl = [[pos, color] for pos, color in zip(np.linspace(0, 1, len(colors)), colors)]
data2 = {
    'type': 'choropleth',
    'colorscale': scl,
    'autocolorscale': False,
    'locations': df_openre['state'],
    'z': df_openre['%OpenTo'].astype(float),
    'locationmode': 'USA-states',
    'marker': {'line': {'color': 'rgb(255,255,255)', 'width': 2}},
    'colorbar': {'title': "% OK with an open-relationship"},
    'visible': False,
}

### Get googling dataset

In [127]:
# Lower bound applied to every state's percentage before plotting.
# NOTE(review): presumably this keeps a low outlier from stretching the colour
# scale -- confirm where 18.901 comes from.
GOOGLE_PCT_FLOOR = 18.901

# The two q170849_ columns are treated as 0/1 indicators for the 'no' and 'yes' answers.
# (The stray mid-cell `y_google.head()` was removed: its result was never shown or used.)
y_google = get_columns_multi(df_rsp, ['q170849_'])
y_google.columns = ['no', 'yes']

# 100 for 'yes', 0 for 'no'; users without a marked answer stay NaN.
y = y_google.iloc[:, [0]]*np.nan
y[y_google.yes == 1] = 100
y[y_google.no == 1] = 0

df = pd.concat([user_state, y], axis=1)
df.columns = ['state', 'class']
df = df.dropna(subset=['class']).groupby('state').mean()

# Keep only US states and clamp each score to the floor defined above.
datalist = []
for i in range(len(df)):
    state = df.iloc[i, :].name
    score = max(df.iloc[i, 0], GOOGLE_PCT_FLOOR)
    if state in US_states:
        datalist.append([state, score])
df = pd.DataFrame.from_records(datalist)
df.columns = ['state', '%Google']
df_google = df

# Googling map object (hidden until picked from the dropdown).
colors = cl.scales['6']['div']['BrBG']
# Pair every colour with an evenly spaced position in [0, 1].
scl = [list(a) for a in zip(np.linspace(0, 1, len(colors)), colors)]
data3 = dict(
        type='choropleth',
        colorscale = scl,
        autocolorscale = False,
        locations = df_google['state'],
        z = df_google['%Google'].astype(float),
        locationmode = 'USA-states',
        marker = dict(
            line = dict (
                color = 'rgb(255,255,255)',
                width = 2
            ) ),
        colorbar = dict(
            title = "% would google their first dates"),
        visible = False
        )

### Get cognitive ability dataset

In [308]:
# Mean CA value per state. Users without a CA value are skipped by mean();
# a state with no CA values at all keeps a NaN row.
by_user = pd.concat([user_state, CA], axis=1)
by_user.columns = ['state', 'CA']
state_mean = by_user.groupby('state').mean()

# Keep only the groups whose label is a US state code.
us_rows = [[code, value]
           for code, value in zip(state_mean.index, state_mean.iloc[:, 0])
           if code in US_states]
df_CA = pd.DataFrame.from_records(us_rows)
df_CA.columns = ['state', 'CA']

# Cognitive-ability map object (hidden until picked from the dropdown).
colors = cl.flipper()['div']['6']['RdYlGn']
scl = [[pos, color] for pos, color in zip(np.linspace(0, 1, len(colors)), colors)]
data4 = {
    'type': 'choropleth',
    'colorscale': scl,
    'autocolorscale': False,
    'locations': df_CA['state'],
    'z': df_CA['CA'].astype(float),
    'locationmode': 'USA-states',
    'marker': {'line': {'color': 'rgb(255,255,255)', 'width': 2}},
    'colorbar': {'title': "Cognitive Ability Score"},
    'visible': False,
}

### Get Sex-Drive Score

In [304]:
# Mean p_sexdrive value per state (missing values are skipped by mean()).
SD = pd.to_numeric(df_num['p_sexdrive'])
by_user = pd.concat([user_state, SD], axis=1)
by_user.columns = ['state', 'SD']
state_mean = by_user.groupby('state').mean()

# Keep only the groups whose label is a US state code.
us_rows = [[code, value]
           for code, value in zip(state_mean.index, state_mean.iloc[:, 0])
           if code in US_states]
df_SD = pd.DataFrame.from_records(us_rows)
df_SD.columns = ['state', 'SD']

# Sex-drive score map object (hidden until picked from the dropdown).
colors = cl.flipper()['seq']['6']['RdPu']
scl = [[pos, color] for pos, color in zip(np.linspace(0, 1, len(colors)), colors)]
data5 = {
    'type': 'choropleth',
    'colorscale': scl,
    'autocolorscale': False,
    'locations': df_SD['state'],
    'z': df_SD['SD'].astype(float),
    'locationmode': 'USA-states',
    'marker': {'line': {'color': 'rgb(255,255,255)', 'width': 2}},
    'colorbar': {'title': "OK Cupid Sex-Drive Score"},
    'visible': False,
}

### Get Politics Score

In [214]:
# Mean p_polit value per state (missing values are skipped by mean()).
polit = pd.to_numeric(df_num['p_polit'])
by_user = pd.concat([user_state, polit], axis=1)
by_user.columns = ['state', 'polit']
state_mean = by_user.groupby('state').mean()

# Keep only the groups whose label is a US state code.
us_rows = [[code, value]
           for code, value in zip(state_mean.index, state_mean.iloc[:, 0])
           if code in US_states]
df_polit = pd.DataFrame.from_records(us_rows)
df_polit.columns = ['state', 'polit']

# Politics score map object (hidden until picked from the dropdown).
colors = cl.scales['6']['div']['RdBu']
scl = [[pos, color] for pos, color in zip(np.linspace(0, 1, len(colors)), colors)]
# Move the four inner colour stops away from their even spacing, which
# squeezes them towards the middle of the scale.
for stop_index, position in zip((1, 2, 3, 4), (0.3, 0.4, 0.45, 0.55)):
    scl[stop_index][0] = position

data6 = {
    'type': 'choropleth',
    'colorscale': scl,
    'autocolorscale': False,
    'locations': df_polit['state'],
    'z': df_polit['polit'].astype(float),
    'locationmode': 'USA-states',
    'marker': {'line': {'color': 'rgb(255,255,255)', 'width': 2}},
    'colorbar': {'title': "OK Cupid Politics Score"},
    'visible': False,
}

### Get Spirituality Score

In [301]:
# Mean p_sprit value per state (missing values are skipped by mean()).
sprit = pd.to_numeric(df_num['p_sprit'])
by_user = pd.concat([user_state, sprit], axis=1)
by_user.columns = ['state', 'sprit']
state_mean = by_user.groupby('state').mean()

# Keep only the groups whose label is a US state code.
us_rows = [[code, value]
           for code, value in zip(state_mean.index, state_mean.iloc[:, 0])
           if code in US_states]
df_sprit = pd.DataFrame.from_records(us_rows)
df_sprit.columns = ['state', 'sprit']

# Spirituality score map object (hidden until picked from the dropdown).
colors = cl.scales['7']['seq']['Greys']
# Positions run 1 -> 0 along the palette and the list is then reversed, so the
# stops end up ascending from 0 with the palette order flipped.
scl = [[pos, color] for pos, color in zip(np.linspace(1, 0, len(colors)), colors)][::-1]
# Re-position three of the upper stops away from their even spacing.
for stop_index, position in zip((3, 4, 5), (0.4, 0.5, 0.6)):
    scl[stop_index][0] = position

data7 = {
    'type': 'choropleth',
    'colorscale': scl,
    'autocolorscale': False,
    'locations': df_sprit['state'],
    'z': df_sprit['sprit'].astype(float),
    'locationmode': 'USA-states',
    'marker': {'line': {'color': 'rgb(255,255,255)', 'width': 2}},
    'colorbar': {'title': "OK Cupid Spirituality Score"},
    'visible': False,
}

### Make Interactive Plot

In [525]:
# Explicit import instead of `from plotly.graph_objs import *`: Data and Figure
# are the only graph_objs names this notebook uses that its import cell does
# not already bring in.
from plotly.graph_objs import Data, Figure

datasetlist = [data0, data1, data2, data3, data4, data5, data6, data7]

# Dropdown labels, in the same order as the traces in datasetlist.
menu_labels = ['%Female Users', 'Drug-Use', 'Open-Relationship', 'Date-Googling',
               'Cognitive Ability', 'Sex Drive', 'Politics', 'Spirituality']
assert len(menu_labels) == len(datasetlist), "need exactly one dropdown label per trace"

# switchlist[i] is the 'visible' mask that shows trace i and hides all others.
switchlist = [[j == i for j in range(len(datasetlist))]
              for i in range(len(datasetlist))]

data = Data(datasetlist)
layout = dict(
        updatemenus=[dict(
            x=-0.05, y=1, yanchor='top',
            # One button per trace; each one restyles 'visible' with its own mask.
            buttons=[dict(args=['visible', mask], label=label, method='restyle')
                     for label, mask in zip(menu_labels, switchlist)],
        )],
        title = 'Mapping Answers of OK Cupid Users',
        geo = dict(
            scope='usa',
            projection=dict( type='albers usa' ),
            showlakes = True,
            lakecolor = 'rgb(255, 255, 255)'),
             )

fig = dict( data=data, layout=layout )
py.iplot(fig, filename='McNultyMap')

# Interactive Scatter Plot

### Get Number of Answers per State (used as marker size)

In [325]:
# Number of non-null values in the first q80_ column per state; the scatter
# plot derives its marker sizes from it.
answers_by_user = pd.concat([user_state, y_drugs.iloc[:, 0]], axis=1)
answers_by_user.columns = ['state', 'class']
state_counts = answers_by_user.groupby('state').count()

# Keep only the groups whose label is a US state code.
us_rows = [[code, n_answers]
           for code, n_answers in zip(state_counts.index, state_counts.iloc[:, 0])
           if code in US_states]

# `df` is (re)bound on purpose: the scatter-plot cell reads the state labels from it.
df = pd.DataFrame.from_records(us_rows)
df.columns = ['state', 'N']
df_N = df

### Generate Scatter Plot

In [529]:
# Marker size grows sub-linearly with the per-state answer count.
msize = df_N['N']**0.8*4


def _sex_drive_scatter(x, name):
    """Build one bubble trace plotting ``x`` (one value per state) against the
    per-state sex-drive score, labelled ``name`` in the legend.

    State labels for the hover text come from ``df_N`` explicitly rather than
    from whatever frame the name ``df`` happens to be bound to when this runs.
    NOTE(review): x, df_SD, df_N and msize are matched by row position, so all
    per-state frames must list the same states in the same order -- confirm.
    """
    return Scatter(
        x=x,
        y=df_SD['SD'],
        text=df_N['state'],
        mode='markers',
        marker=dict(
            size=msize,
            sizeref=1,        # plotly's default, now stated on every trace
            sizemode='area',
        ),
        name=name,
    )


trace = _sex_drive_scatter(df_gender['%female'], '% Female Users')
trace0 = _sex_drive_scatter(df_drugs['%UsedHardDrugs'], 'Drug-Use')
trace1 = _sex_drive_scatter(df_openre['%OpenTo'], 'Open-Relationship?')
trace2 = _sex_drive_scatter(df_google['%Google'], 'Googling')
# The three score variables are shifted/scaled before plotting, presumably so
# they fit the shared 0-50 x-axis range set in the layout -- TODO confirm.
trace3 = _sex_drive_scatter((df_CA['CA']-np.min(df_CA['CA']))*50, 'Cognitive Ability')
trace4 = _sex_drive_scatter(df_sprit['sprit']+20, 'Spirituality Score')
trace5 = _sex_drive_scatter((df_polit['polit']-np.min(df_polit['polit']))*0.8, 'Politics Score')

data = [trace, trace0, trace1, trace2, trace3, trace4, trace5]

layout = Layout(
    title='Correlations of OK Cupid Answers to Sex-Drive Score',
    xaxis=dict(
        title='Independent Variable (click on legend to choose what to display...)',
        gridcolor='rgb(255, 255, 255)',
        range=[0, 50],
        zerolinewidth=1,
        ticklen=5,
        gridwidth=2,
    ),
    yaxis=dict(
        title='OK Cupid Sex-Drive Score',
        gridcolor='rgb(255, 255, 255)',
        range=[0, 30],
        zerolinewidth=1,
        ticklen=5,
        gridwidth=2,
    ),
    paper_bgcolor='rgb(243, 243, 243)',
    plot_bgcolor='rgb(243, 243, 243)',
)

fig = Figure(data=data, layout=layout)

py.iplot(fig, filename='McNultyScatter')

# Interactive Stacked Bar Plot

### Drugs

In [536]:
#Dataset
y_drugs = get_columns_multi(df_rsp,['q80_']);
y_drugs.columns = ['occasional','regular','never','past']
y = y_drugs.iloc[:,[0]]*np.nan
y[y_drugs.occasional==1] = 100
y[y_drugs.regular==1] = 100
y[y_drugs.past==1] = 100
y[y_drugs.never==1] = 0
df = pd.concat([job,y],axis=1)
df.columns = ['job','class']
df = df.groupby('job').mean()
datalist = []
for i in range(len(df)):
    r = df.iloc[i,:].name
    score = df.iloc[i,0]
    if r in set(job) and r != 'Other' and r!= 'Rather not say':
        datalist.append([r, score])
df = pd.DataFrame.from_records(datalist)
df.columns = ['job','%UsedHardDrugs']
df_drugs = df.sort_values(['%UsedHardDrugs'])
#Bar Plot Object
t_drugs1 = Bar(
    y=df_drugs['job'],
    x=df_drugs['%UsedHardDrugs'],
    name='Used hard drugs',
    orientation = 'h',
    marker = dict(
        color = 'rgba(20, 186, 25, 0.8)',
        line = dict(
            color = 'rgba(20, 186, 25, 1.0)',
            width = 3)
    ),
    visible = False
)
t_drugs2 = Bar(
    y=df_drugs['job'],
    x=100-df_drugs['%UsedHardDrugs'],
    name='Never used hard drugs',
    orientation = 'h',
    marker = dict(
        color = 'rgba(58, 71, 80, 0.6)',
        line = dict(
            color = 'rgba(58, 71, 80, 1.0)',
            width = 3)
    ),
    visible = False
)

### Open Relationship

In [468]:
#Dataset
y_openre = get_columns_multi(df_rsp,['q325_'])
y_openre.columns =  ['no','yes']
y = y_openre.iloc[:,[0]]*np.nan
y[y_openre.yes==1] = 100
y[y_openre.no==1] = 0
df = pd.concat([job,y],axis=1)
df.columns = ['job','class']
df = df.dropna(subset=['class']).groupby('job').mean();
datalist = []
for i in range(len(df)):
    r = df.iloc[i,:].name
    score = df.iloc[i,0]
    if r in set(job) and r != 'Other' and r!= 'Rather not say':
        datalist.append([r, score])
df = pd.DataFrame.from_records(datalist)
df.columns = ['job','%OpenTo']
df_openre = df.sort_values(['%OpenTo'])
#Bar Plot Object
t_openre1 = Bar(
    y=df_openre['job'],
    x=df_openre['%OpenTo'],
    name='Okay with open relationship',
    orientation = 'h',
    marker = dict(
        color = 'rgba(246, 78, 139, 0.6)',
        line = dict(
            color = 'rgba(246, 78, 139, 1.0)',
            width = 3)
    ),
    visible = False
)
t_openre2 = Bar(
    y=df_openre['job'],
    x=100-df_openre['%OpenTo'],
    name='Not okay with open relationship',
    orientation = 'h',
    marker = dict(
        color = 'rgba(58, 71, 80, 0.6)',
        line = dict(
            color = 'rgba(58, 71, 80, 1.0)',
            width = 3)
    ),
    visible = False
)

### Date Googling

In [534]:
#Dataset
y_google = get_columns_multi(df_rsp,['q170849_'])
y_google.columns =  ['no','yes']
y = y_google.iloc[:,[0]]*np.nan
y[y_google.yes==1] = 100
y[y_google.no==1] = 0
df = pd.concat([job,y],axis=1)
df.columns = ['job','class']
df = df.dropna(subset=['class']).groupby('job').mean();
datalist = []
for i in range(len(df)):
    r = df.iloc[i,:].name
    score = df.iloc[i,0]
    if r in set(job) and r != 'Other' and r!= 'Rather not say':
        datalist.append([r, score])
df = pd.DataFrame.from_records(datalist)
df.columns = ['job','%Google']
df_google = df.sort_values(['%Google'])
#Bar Plot Object
t_google1 = Bar(
    y=df_google['job'],
    x=df_google['%Google'],
    name='Google their first dates',
    orientation = 'h',
    marker = dict(
        color = 'rgba(45, 45, 246, 0.6)',
        line = dict(
            color = 'rgba(45, 45, 246, 1.0)',
            width = 3)
    ),
    visible = False
)
t_google2 = Bar(
    y=df_google['job'],
    x=100-df_google['%Google'],
    name='Does not want to spoil the mystery!',
    orientation = 'h',
    marker = dict(
        color = 'rgba(58, 71, 80, 0.6)',
        line = dict(
            color = 'rgba(58, 71, 80, 1.0)',
            width = 3)
    ),
    visible = False
)

### Gender data

In [537]:
# Dataset: 'Woman' -> 100, 'Man' -> 0, so the per-job mean reads as "% female".
female_pct = CA*np.nan   # CA only serves as an all-NaN template with the user index
female_pct[gender == 'Woman'] = 100
female_pct[gender == 'Man'] = 0

by_user = pd.concat([job, female_pct], axis=1)
by_user.columns = ['job', 'class']
job_mean = by_user.dropna(subset=['class']).groupby('job').mean()

# Drop the catch-all answers; the rest must be a value occurring in `job`.
known_jobs = set(job)
job_rows = [[title, pct]
            for title, pct in zip(job_mean.index, job_mean.iloc[:, 0])
            if title in known_jobs and title not in ('Other', 'Rather not say')]
per_job = pd.DataFrame.from_records(job_rows)
per_job.columns = ['job', '%Female']
df_gender = per_job.sort_values(['%Female'])

# Bar plot objects. No 'visible' flag is set on this pair, unlike the other
# job bar traces, which are created with visible=False.
t_gender1 = Bar(
    y=df_gender['job'],
    x=df_gender['%Female'],
    name='Female',
    orientation='h',
    marker={
        'color': 'rgba(246, 45, 45, 0.6)',
        'line': {'color': 'rgba(246, 45, 45, 1.0)', 'width': 3},
    },
)
t_gender2 = Bar(
    y=df_gender['job'],
    x=100-df_gender['%Female'],
    name='Male',
    orientation='h',
    marker={
        'color': 'rgba(45, 45, 246, 0.6)',
        'line': {'color': 'rgba(45, 45, 246, 1.0)', 'width': 3},
    },
)

### Plotting the Bar Plot

In [641]:
# Traces come in adjacent pairs: the two halves of one stacked bar chart.
datasetlist = [t_gender1, t_gender2, t_drugs1, t_drugs2, t_openre1, t_openre2, t_google1, t_google2]

# Dropdown entry k shows traces 2k and 2k+1 and hides every other trace.
menu_labels = ['%Female', 'Drug-Use', 'Open-Relationship', 'Googling']
menu_buttons = [
    {'args': ['visible', [pos // 2 == k for pos in range(len(datasetlist))]],
     'label': label,
     'method': 'restyle'}
    for k, label in enumerate(menu_labels)
]

data = Data(datasetlist)
layout = Layout(
    updatemenus=[{'x': 0.95, 'y': 1.1, 'yanchor': 'top', 'buttons': menu_buttons}],
    # Bare axes: tick labels only, no grid, axis line or zero line.
    xaxis={
        'showgrid': False,
        'showline': False,
        'showticklabels': True,
        'zeroline': False,
        'domain': [0.15, 1],
    },
    yaxis={
        'showgrid': False,
        'showline': False,
        'showticklabels': True,
        'zeroline': False,
    },
    barmode='stack',
)

fig = Figure(data=data, layout=layout)
py.iplot(fig, filename='McNultyBarPlotJobs')

# Features Bar Plot

In [642]:
# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# the old sklearn.cross_validation module was removed in 0.20. Try the current
# location first and fall back for very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

df_key0 = pd.read_sql_table('questions_key',engine)
# Keep only the questions whose N value is at least 31548.
# NOTE(review): presumably N is the number of responses per question and the
# threshold was picked by hand -- confirm and consider naming the constant.
df_key = df_key0[df_key0.loc[:,'N']>=31548]

In [643]:
# Predictor questions. The prefix list is written once and reused for both the
# column selection and q_list, which additionally starts with the test question.
rsp_keys = ['q1062_','q1128_','q123_','q136_','q15280_','q1597_','q19874_','q274_','q35_','q358084_',
            'q37693_','q393_','q4018_','q43261_','q60100_','q66506_','q77_','q79_','q81_']
q_list = ['q18698_'] + rsp_keys

tst = get_columns_multi(df_tst,['q18698_'])
rsp = get_columns_multi(df_rsp,rsp_keys)
X = pd.concat([tst, rsp],axis=1)

# Fetch the answer tables up front so this cell does not rely on a `y_drugs`
# left behind by an earlier cell (it is used as the label template below).
y_drugs = get_columns_multi(df_rsp,['q80_'])
y_drugs.columns = ['occasional','regular','never','past']
y_openre = get_columns_multi(df_rsp,['q325_'])
y_openre.columns = ['no','yes']
y_google = get_columns_multi(df_rsp,['q170849_'])
y_google.columns = ['no','yes']


def _labelled_dataset(template, labelling):
    """Return ``X`` with a leading label column 'y', keeping only labelled rows.

    ``template`` supplies the row index for the label column. ``labelling`` is
    a sequence of ``(boolean mask, label)`` pairs applied in order, so a later
    pair overrides an earlier one on rows where both masks are true. Rows that
    receive no label are dropped.
    """
    labels = template.iloc[:,[0]]*np.nan
    labels.columns = ['y']
    for mask, label in labelling:
        labels[mask] = label
    # dropna returns a new frame; nothing is modified in place.
    return pd.concat([labels, X],axis=1).dropna(subset=['y'])


# Target 0: gender.
dataset0 = _labelled_dataset(y_drugs, [(gender == 'Man', 'Man'),
                                       (gender == 'Woman', 'Woman')])

# Target 1: any drug use versus never.
dataset1 = _labelled_dataset(y_drugs, [(y_drugs.occasional==1, 'not never'),
                                       (y_drugs.regular==1, 'not never'),
                                       (y_drugs.past==1, 'not never'),
                                       (y_drugs.never==1, 'never')])

# Target 2: open relationship yes/no.
dataset2 = _labelled_dataset(y_openre, [(y_openre.yes==1, 'yes'),
                                        (y_openre.no==1, 'no')])

# Target 3: googling a first date yes/no.
dataset3 = _labelled_dataset(y_google, [(y_google.yes==1, 'yes'),
                                        (y_google.no==1, 'no')])

In [644]:
def _rank_feature_strength(dataset, random_state=None):
    """Fit a logistic regression on half of ``dataset`` and rank the questions.

    ``dataset`` holds the label in its first column and the predictor columns
    after it. The strength of a question is the largest absolute coefficient
    among its columns (columns are grouped by the text before the first '_').

    Parameters
    ----------
    dataset : pandas.DataFrame
        Label column followed by predictor columns.
    random_state : int or None
        Forwarded to ``train_test_split``. The default ``None`` keeps the split
        random on every run; pass an integer for a reproducible ranking.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``q_list``: column 0 is the strength, column 1 the
        question text, sorted ascending.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        dataset.iloc[:,1:], dataset.iloc[:,0], test_size=0.5, random_state=random_state)
    # Fit once; the original cell fitted the same model twice on the same data.
    fit = LogisticRegression().fit(X_train, y_train)

    features = {}
    for header, weight in zip(dataset.columns[1:], np.abs(fit.coef_[0])):
        qID = header.split('_')[0]+'_'
        features[qID] = max(weight, features.get(qID, weight))

    coef = []
    for q in q_list:
        if q == 'q18698_':
            # This prefix comes from the test-response table and gets a fixed
            # text instead of a lookup in df_key.
            question = "What is the Ideal Gas Law?"
        else:
            # NOTE(review): get_question is not defined anywhere in the visible
            # notebook; it has to be defined before this cell is run.
            question = get_question(df_key,q[:-1]).text.item()
        coef.append([features[q],question])
    return pd.DataFrame.from_records(sorted(coef))


# One ranking per prediction target: gender, drug use, open relationship, googling.
coef0 = _rank_feature_strength(dataset0)
coef1 = _rank_feature_strength(dataset1)
coef2 = _rank_feature_strength(dataset2)
coef3 = _rank_feature_strength(dataset3)

In [650]:
def _strength_bar(ranking, fill, outline, visible):
    """Horizontal bar trace of one ranking: column 1 is shown on the y axis
    (question text), column 0 on the x axis (feature strength)."""
    return Bar(
        y=ranking[1],
        x=ranking[0],
        name='Feature Strength',
        orientation='h',
        marker={'color': fill, 'line': {'color': outline, 'width': 3}},
        visible=visible,
    )


# One trace per prediction target; only the first one starts out visible.
data0 = _strength_bar(coef0, 'rgba(58, 71, 80, 0.6)', 'rgba(58, 71, 80, 1.0)', True)
data1 = _strength_bar(coef1, 'rgba(20, 186, 25, 0.6)', 'rgba(20, 186, 25, 1.0)', False)
data2 = _strength_bar(coef2, 'rgba(246, 78, 139, 0.6)', 'rgba(246, 78, 139, 1.0)', False)
data3 = _strength_bar(coef3, 'rgba(58, 71, 246, 0.6)', 'rgba(58, 71, 246, 1.0)', False)

datasetlist = [data0, data1, data2, data3]

# Dropdown entry k shows trace k and hides the other three.
menu_labels = ['Male/Female', 'Drug-Use', 'Open-Relationship', 'Googling']
menu_buttons = [
    {'args': ['visible', [pos == k for pos in range(len(datasetlist))]],
     'label': label,
     'method': 'restyle'}
    for k, label in enumerate(menu_labels)
]

data = Data(datasetlist)
layout = Layout(
    updatemenus=[{'x': 0, 'y': 1.15, 'yanchor': 'top', 'buttons': menu_buttons}],
    # The x axis only occupies the right-hand 35% of the figure width.
    xaxis={
        'showgrid': False,
        'showline': False,
        'showticklabels': True,
        'zeroline': False,
        'domain': [0.65, 1],
    },
    yaxis={
        'showgrid': False,
        'showline': False,
        'showticklabels': True,
        'zeroline': False,
    },
    barmode='stack',
    title='Ranking feature strength for each predictor',
)

fig = Figure(data=data, layout=layout)
py.iplot(fig, filename='McNultyBarPlot')