## FIT5120: IE Studio Project

### {EPIC7} Addiction risk prediction model

#### Author: Mandeep Singh

In [1]:
import pandas as pd
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder


dict_regr = dict()

df_X = pd.read_csv('df_X.csv')
df_Y = pd.read_csv('df_cause_effect.csv')


In [2]:
df_X.columns


Index(['Unnamed: 0', 'adwrdiet', 'talkprob', 'auoptyr', 'yetlknon', 'yowrslep',
       'pnrlwd3sx', 'pnrrsotrs2', 'aualtyr', 'ciginctl', 'cigcrave', 'iralcfy',
       'irmjfy', 'cig30use'],
      dtype='object')

In [3]:
df_X.adwrdiet.unique()

array([99, 98,  2,  1, 94, 97])

In [4]:
set_unique_X = set()
for col in df_X.columns:
    if col not in {'Unnamed: 0', 'cig30use', 'iralcfy', 'irmjfy'}:
        set_unique_X = set_unique_X | set(df_X[col].unique())

set_unique_X

{0.0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 10,
 11,
 13,
 14,
 16,
 18,
 19,
 26,
 34,
 71,
 72,
 83,
 85,
 89,
 91,
 93,
 94,
 97,
 98,
 99,
 983,
 985,
 991,
 993,
 994,
 997,
 998,
 999}

In [5]:
df_Y.columns

Index(['Unnamed: 0', 'alcserpb', 'mrjlawtr', 'halulawtr', 'herserpb',
       'alclawtr', 'pnrlphctd', 'trqlserpb', 'alcemctd', 'methphctd',
       'coclawtr', 'haluemctd', 'stimemctd', 'pnrlpdang', 'mrjfmctd',
       'heremctd', 'methemctd', 'inhllsact', 'sedvserpb', 'trqlphlpb',
       'stimlawtr', 'inhlpdang', 'sedvemctd', 'alcphctd', 'alclsact',
       'halulsact', 'inhlemctd', 'herphlpb', 'cocserpb', 'inhllawtr',
       'sedvpdang', 'cocfmctd', 'mrjphlpb', 'trqllawtr', 'halupdang',
       'mrjpdang', 'trqlemctd', 'stimpdang', 'haluphctd', 'cocemctd',
       'stimlsact', 'sedvlsact', 'methfmctd', 'trqllsact', 'trqlpdang',
       'herpdang', 'herfmctd', 'stimphctd', 'pnrlphlpb', 'mrjemctd',
       'inhlserpb', 'sedvphlpb', 'cocpdang', 'haluserpb', 'mrjlsact',
       'trqlphctd', 'sedvfmctd', 'sedvphctd', 'methserpb', 'pnrllsact',
       'alcpdang', 'stimfmctd', 'haluphlpb', 'methpdang', 'sedvlawtr',
       'alcfmctd', 'inhlphctd', 'herphctd', 'mrjserpb', 'coclsact',
       'methphlpb

In [6]:
df_X['Unnamed: 0']

0              0
1              1
2              2
3              3
4              4
           ...  
282763    282763
282764    282764
282765    282765
282766    282766
282767    282767
Name: Unnamed: 0, Length: 282768, dtype: int64

In [7]:
df_X.drop(columns='Unnamed: 0', inplace=True)

In [8]:
df_Y.drop(columns='Unnamed: 0', inplace=True)

In [9]:
from sklearn.compose import ColumnTransformer

preprocess = ColumnTransformer(transformers=
    [
        ('num', StandardScaler(), ['cig30use', 'iralcfy', 'irmjfy']),
        ('cat', OneHotEncoder(), list(set(df_X.columns) - {'cig30use', 'iralcfy', 'irmjfy'}))
    ]
)

In [10]:
df_X.shape

(282768, 13)

In [11]:
df_Y.shape

(282768, 97)

In [12]:
import pickle
from sklearn.neural_network import MLPClassifier

df_X_ = preprocess.fit_transform(df_X)
preprocess_y = ColumnTransformer(
    [
        ('cat', OneHotEncoder(), df_Y.columns)
    ]
)

df_Y_ = preprocess_y.fit_transform(df_Y)

print(f"df_X_.shape: {df_X_.shape}, df_Y_.shape: {df_Y_.shape}")

X_train, X_test, Y_train, Y_test = train_test_split(df_X_, df_Y_,#.to_numpy(),
                                                    random_state=1)

print(f"X_train.shape: {X_train.shape}, Y_train: {Y_train.shape}")


df_X_.shape: (282768, 104), df_Y_.shape: (282768, 854)
X_train.shape: (212076, 104), Y_train: (212076, 854)


In [16]:
import os

if not os.path.isdir('./mlp_clf'):
    os.mkdir('mlp_clf')

dict_clf = {}
print("MLPClassifier:")
for i, col in enumerate(df_Y.columns):
    if col != 'Unnamed: 0':
        curr_clf = MLPClassifier(hidden_layer_sizes=(64,))

        # print(X_train.shape, Y_train[:,i].shape)
        curr_clf.fit(X_train, Y_train[:,i].toarray().ravel())
        dict_clf[col] = {'clf': curr_clf,
                          'score': curr_clf.score(X_test, Y_test[:,i].toarray().ravel())}

        print(f"trained using Y[{col}] w/ score={dict_clf[col]['score']}")

        with open('./mlp_clf/mlp_clf_' + col + '.pkl', 'wb') as file:
            pickle.dump(curr_clf, file)


MLPClassifier:
trained using Y[alcserpb] w/ score=0.9854863350874215
trained using Y[mrjlawtr] w/ score=0.9715667968086912
trained using Y[halulawtr] w/ score=0.9979347026537656
trained using Y[herserpb] w/ score=0.9999292706388276
trained using Y[alclawtr] w/ score=0.9327080857805692




trained using Y[pnrlphctd] w/ score=0.9197504668137837
trained using Y[trqlserpb] w/ score=0.9994200192383862
trained using Y[alcemctd] w/ score=0.9993351440049794
trained using Y[methphctd] w/ score=0.9938182538335314
trained using Y[coclawtr] w/ score=0.9970718044474622
trained using Y[haluemctd] w/ score=0.9891076783794489
trained using Y[stimemctd] w/ score=0.9992361228993379
trained using Y[pnrlpdang] w/ score=1.0
trained using Y[mrjfmctd] w/ score=0.998387370565269




trained using Y[heremctd] w/ score=0.9934646070276694
trained using Y[methemctd] w/ score=0.9998443954054207
trained using Y[inhllsact] w/ score=0.9992361228993379
trained using Y[sedvserpb] w/ score=0.9963079273468002
trained using Y[trqlphlpb] w/ score=0.999561477960731
trained using Y[stimlawtr] w/ score=0.972938946415436
trained using Y[inhlpdang] w/ score=0.9999858541277655
trained using Y[sedvemctd] w/ score=0.8810190686357721
trained using Y[alcphctd] w/ score=0.8922225994454818
trained using Y[alclsact] w/ score=0.9998726871498896
trained using Y[halulsact] w/ score=0.999915124766593
trained using Y[inhlemctd] w/ score=0.9927290216714763
trained using Y[herphlpb] w/ score=0.9982317659706897
trained using Y[cocserpb] w/ score=0.9973405760199174
trained using Y[inhllawtr] w/ score=0.9817801165619872
trained using Y[sedvpdang] w/ score=0.9870140892887456
trained using Y[cocfmctd] w/ score=0.9999858541277655
trained using Y[mrjphlpb] w/ score=1.0
trained using Y[trqllawtr] w/ score



trained using Y[cocemctd] w/ score=0.9224664742828043
trained using Y[stimlsact] w/ score=0.9995190403440276
trained using Y[sedvlsact] w/ score=0.9994341651106208
trained using Y[methfmctd] w/ score=0.9937899620890625
trained using Y[trqllsact] w/ score=0.9996322073219035
trained using Y[trqlpdang] w/ score=0.9992927063882759
trained using Y[herpdang] w/ score=1.0
trained using Y[herfmctd] w/ score=0.9999858541277655
trained using Y[stimphctd] w/ score=1.0
trained using Y[pnrlphlpb] w/ score=1.0
trained using Y[mrjemctd] w/ score=1.0
trained using Y[inhlserpb] w/ score=0.9997595201720139
trained using Y[sedvphlpb] w/ score=0.9997312284275448
trained using Y[cocpdang] w/ score=0.9985288292876139
trained using Y[haluserpb] w/ score=0.9981185989928139
trained using Y[mrjlsact] w/ score=0.9751315566117806
trained using Y[trqlphctd] w/ score=0.9956713630962485
trained using Y[sedvfmctd] w/ score=0.9998868330221241
trained using Y[sedvphctd] w/ score=0.948608046172127
trained using Y[methse



trained using Y[herphctd] w/ score=0.919142194307701
trained using Y[mrjserpb] w/ score=0.999971708255531
trained using Y[coclsact] w/ score=0.9993209981327449
trained using Y[methphlpb] w/ score=0.9929695014994625
trained using Y[mrjphctd] w/ score=0.9446189102020031
trained using Y[inhlphlpb] w/ score=0.9998161036609517
trained using Y[pnrlserpb] w/ score=0.9997736660442483
trained using Y[herlsact] w/ score=1.0
trained using Y[trqlfmctd] w/ score=0.9534176427318508
trained using Y[pnrlfmctd] w/ score=0.961961749561478
trained using Y[inhlfmctd] w/ score=0.9999434165110621
trained using Y[cocphlpb] w/ score=0.9983024953318622
trained using Y[herlawtr] w/ score=0.9929695014994625
trained using Y[methlsact] w/ score=0.9990663724325242
trained using Y[cocphctd] w/ score=0.9774231879137668
trained using Y[stimphlpb] w/ score=1.0
trained using Y[alcphlpb] w/ score=0.9000877044078538
trained using Y[pnrlemctd] w/ score=0.9099473773552877
trained using Y[methlawtr] w/ score=0.99995756238329

In [17]:
with open('dict_mlp.pkl', 'wb') as file:
    pickle.dump(dict_clf, file)

In [18]:
with open('X_column_transformer.pkl', 'wb') as file:
    pickle.dump(preprocess.get_params(), file)

preprocess_Y = ColumnTransformer(
    [
        ('cat', OneHotEncoder(), df_Y.columns)
    ]
)

df_Y__ = preprocess_Y.fit_transform(df_Y)

with open('Y_column_transformer.pkl', 'wb') as file:
    pickle.dump(preprocess_Y.get_params(), file)


In [19]:
with open('X_column_transformer.pkl', 'wb') as file:
    pickle.dump(preprocess, file)


with open('Y_column_transformer.pkl', 'wb') as file:
    pickle.dump(preprocess_y, file)



In [None]:
df_Y__.shape

In [None]:
import json
with open('X_Y_columns.json', 'w') as file:
    json.dump({
      'X_col': list(df_X.columns),
      'Y_col': list(df_Y.columns)
    }, file)

### Inference

In [None]:
list(df_X.columns)

In [None]:
import numpy as np

# sample instance
x = np.array([1.,  1.,  1.,  1.,  2.,  1., 18.,  1.,  5.,  1., 12.,  5.,  1.])#.reshape(1,len(x))
dict_x = {}
for i, col in enumerate(df_X.columns):
    dict_x[col] = x[i]
df_x = pd.DataFrame(dict_x, index=[0])
print(df_x.shape)


In [None]:
import pickle

preprocess = ColumnTransformer(transformers=
    [
        ('num', StandardScaler(), ['cig30use', 'iralcfy', 'irmjfy']),
        ('cat', OneHotEncoder(), list(set(df_X.columns) - {'cig30use', 'iralcfy', 'irmjfy'}))
    ]
)

preprocess.fit_transform(df_X)




In [None]:
with open("X_column_transformer.pkl", 'wb') as file:
    pickle.dump(preprocess, file)

In [None]:
with open("Y_column_transformer.pkl", 'wb') as file:
    pickle.dump(preprocess_y, file)

In [None]:
import heapq

# load the pre-processor
with open('X_column_transformer.pkl', "rb") as file:
    preprocess1 = pickle.load(file)

x_transformed = preprocess1.transform(df_x)

# causal effect variable options: 1=Yes, 2=No, 91=never used/misused, 93=didn't misuse in past 12 months
list_y_pred_prob = []

# heap of top5 prediction probability for each classes
heap_y_pred_prob_top5 = []
heapq.heappush(heap_y_pred_prob_top5, (0, -1))

# joint probabil
joint_y_pred_prob = None
for i, col in enumerate(df_Y.columns):
    list_y_pred_prob.append(dict_clf[col]['clf'].predict_proba(x_transformed))
    # update queue of top5
    if max(list_y_pred_prob[-1]) > heap_y_pred_prob_top5[0]:
        if len(heap_y_pred_prob_top5) == 5:
            heapq.heappop(heap_y_pred_prob_top5)
        heapq.heappush(heap_y_pred_prob_top5, (max(list_y_pred_prob[-1]), i))
        heapq.heapify(heap_y_pred_prob_top5)

    if joint_y_pred_prob is None:
        joint_y_pred_prob = list_y_pred_prob[-1]
    else:
        joint_y_pred_prob = joint_y_pred_prob + list_y_pred_prob[-1]

joint_y_pred_prob = joint_y_pred_prob/len(list_y_pred_prob)

np_y_pred_prob = np.asarray(list_y_pred_prob)
np_y_pred_prob.shape


In [None]:
df_Y.adwrdiet[0]

Clustering Algorithm