In [105]:
import pandas as pd
import sqlalchemy as sq
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
plt.style.use('fivethirtyeight')

%matplotlib inline

# Connect to database

In [106]:
engine = sq.create_engine('sqlite:///OKC_Processed_DB.db') #create connection to the database

# Get table

In [107]:
df_rsp = pd.read_sql_table('question_responses',engine)

# Function to get the columns we need

In [108]:
def get_columns_multi(df,keylist):
    headerlist = []
    for key in keylist:
        headers_all = list(df.columns)
        headers = [name for name in headers_all if name[0:len(key)]==key]
        headerlist = headerlist+headers
    return df.loc[:,headerlist]

# Questions list

In [110]:
q_drugs = ['q79_','q77_','q1062_','qq66506_','q43261_',
            'q1052_','q21411_','q45428_','q35355_','q31877_']
q_openre = ['q34113_','q1597_','q35_','q1128_','q17_',
            'q24375_','q393_','q35203_','q136_','q9688_']
q_google = ['q19874_','q4018_','q60100_','q308_','q358084_',
            'q358077_','q12964_','q501_','q154_','q20930_']

a_drugs = ['q80_']
a_openre = ['q325_']
a_google = ['q170849_']

In [111]:
q_all = q_drugs + q_openre + q_google
a_all = a_drugs + a_openre + a_google

In [112]:
X = get_columns_multi(df_rsp,q_all);np.shape(X)
X.fillna(value=0,inplace=True)

# Modelling Drug Experience

In [450]:
y_drugs = get_columns_multi(df_rsp,a_drugs);y_drugs.head()

Unnamed: 0,q80_I do drugs occasionally.,q80_I do drugs regularly.,q80_I never do drugs.,"q80_I've done drugs in the past, but no longer."
0,0,0,1,0
1,0,0,0,0
2,0,0,0,0
3,0,0,0,0
4,0,0,0,0


In [451]:
y_drugs.columns =  ['occasional','regular','never','past']
np.sum(y_drugs,0)

occasional     4169
regular         192
never         35650
past          10096
dtype: int64

In [331]:
import warnings
warnings.filterwarnings('ignore')#hide warnings

y = y_drugs.iloc[:,[0]]
y.columns = ['class']
y[y_drugs.occasional==1] = 'Not Never'
y[y_drugs.regular==1] = 'Not Never'
y[y_drugs.past==1] = 'Not Never'
y[y_drugs.never==1] = 'Never'
y[y==0] = np.nan #remove non-respondents after other classes have been assigned

In [332]:
dfdrugs = pd.concat([y,X],axis=1)
dfdrugs.dropna(subset=['class'],inplace=True)

In [442]:
from sklearn.calibration import CalibratedClassifierCV
X_train, X_test, y_train, y_test = train_test_split(dfdrugs.iloc[:,1:],dfdrugs.iloc[:,0],test_size=0.5)
LR = LogisticRegression()
model = CalibratedClassifierCV(LR, cv=10, method='sigmoid')
model.fit(X_train, y_train)
fit = model.fit(X_train,y_train)
y_pred = fit.predict(X_test)
accuracy_score(y_pred,y_test)

0.80989063622575241

In [446]:
bachelorID = np.random.choice(range(len(X)))
RandomBachelor = X.iloc[bachelorID,:].values.reshape(1,-1)
y_pred = fit.predict_proba(RandomBachelor)
print('Probability that Bachelor #'+str(bachelorID)+
      ' has tried drugs harder than marijuana:',
      str(int(y_pred[0][1]*100))+'%')

Probability that Bachelor #29481 has tried drugs harder than marijuana: 60%
