In [1]:
import pandas as pd
from datetime import datetime
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import math
import scipy.spatial.distance as ssd
from sklearn import metrics
from sklearn.metrics import classification_report
import scipy.interpolate
import scipy.cluster.hierarchy as shc
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression


# Import environment tools
import re
import itertools
import warnings
import matplotlib.pyplot as plt
import tensorflow as tf
import seaborn as sns
import pandas as pd
import numpy as np
import keras

# Import keras tools
from keras import regularizers
from keras.callbacks import History 
from keras.layers import Dense, Input, Dropout
from keras.models import Sequential
from keras.utils import np_utils
from keras.utils.np_utils import to_categorical
from keras.wrappers.scikit_learn import KerasClassifier

# Import other tools
from __future__ import print_function
from pandas import read_excel
from IPython.display import Image
from collections import Counter
from itertools import cycle
from scipy import stats, integrate, interp
from subprocess import check_output




def clean_cols(df):
    import re
    
    cols=list(df.columns.values)
    
    # Lowercase everything
    cols=list(map(lambda x: x.lower(), cols))
    
    # Remove special characters 
    cols = [re.sub(r'[^a-zA-Z0-9]','_',string) for string in cols]
    
    # Rename colums
    df.columns = cols

    return df

In [88]:
# Import relevant machine learning models

import sklearn
# Gradient Boosters
import xgboost as xgb # Accuracy
import lightgbm as lgb # Speed

from sklearn import decomposition, preprocessing, svm
# Dimensionality Reduction
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis
# Ensemble
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier, ExtraTreesClassifier
# Guassian
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
# Regression
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.multiclass import OneVsRestClassifier
# Bayesian
from sklearn.naive_bayes import GaussianNB
# Instance Based
from sklearn.neighbors import KNeighborsClassifier
# Nueral Network
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
# Decision Tree
from sklearn.tree import DecisionTreeClassifier

# Import relevant machine learning analyis tools
from sklearn import metrics
#from sklearn.cross_validation import KFold, train_test_split
# Imputation
from sklearn.impute import SimpleImputer 
from sklearn.metrics import mean_absolute_error,roc_curve,accuracy_score,auc,roc_auc_score,confusion_matrix,precision_score,recall_score,f1_score, classification_report
from sklearn.metrics.cluster import fowlkes_mallows_score
from sklearn.model_selection import BaseCrossValidator, GridSearchCV, train_test_split,cross_val_score,cross_validate,cross_val_predict, KFold, StratifiedKFold, learning_curve
from sklearn.pipeline import Pipeline
# Standardization
from sklearn.preprocessing import LabelEncoder, StandardScaler, label_binarize

# Create random state
random_state=50

# Data Prep

In [None]:
# Read in data
df=pd.read_csv(r"C:\Users\A4023862\OneDrive - Astellas Pharma Inc\data\emr\test\df.csv")

cols=['interaction_id', 'id', 'rec_date', 'month_id', 'flag', 'value', 'dob', 'ethnicity', 'race', 'state', 'year', 'age', 'row', 'max','age_grp']
df.columns = cols
df.head()

## Pivot Table

In [None]:
values='value'
index=['interaction_id', 'id', 'rec_date', 'age', 'age_grp', 'race', 'state']
columns='flag'
aggfunc=max

df_pivot = data_pivot(df,values,index,columns,aggfunc)
df_pivot.head()

In [None]:
q=df_pivot[df_pivot['last_period']==1]
q=q[['id','rec_date']]
q=q.groupby(['id']).agg(['min']).reset_index()
q.columns = ['id', 'min_last_period_date']

w = pd.merge(df_pivot, q, how="left", on="id")
w['post_meno']=np.where(w['rec_date']>= w['min_last_period_date'],1,0)
w.head()

## Aggregate to Quarter

In [None]:
# Aggregate to quarter
w['quarter'] = pd.PeriodIndex(w.rec_date, freq='Q').to_timestamp()
# Write to csv
w.to_csv(r"C:\Users\A4023862\OneDrive - Astellas Pharma Inc\data\emr\test\df_quarter.csv")

# Max to the quarter
q=w.groupby(['id', 'quarter']).max().reset_index()
q.to_csv(r"C:\Users\A4023862\OneDrive - Astellas Pharma Inc\data\emr\test\df_quarter_agg.csv")

## Make there is certain number of interactions and BMI records

In [None]:
w=pd.read_csv(r"C:\Users\A4023862\OneDrive - Astellas Pharma Inc\data\emr\test\df_quarter_agg.csv")
w['max']=w['id'].groupby(w['id']).transform('count')

# Get list of ids that meet threshold of interactions
w.groupby(['max'])['id'].nunique()
pts_int_thresh=w[w['max']>10]
pts_int_thresh=pts_int_thresh[pts_int_thresh['max']<20]
pts_int_thresh_list=set(pts_int_thresh['id'])
pts_int_thresh.groupby(['max'])['id'].nunique()
print(pts_int_thresh.shape)
print(len(pts_int_thresh_list))

# Make there is certain number of interactions
w.columns
e=w[w['id'].isin(pts_int_thresh_list)]
print('Pts before filter ', w['id'].nunique())
print('Pts that meet interaction threshold ', len(pts_int_thresh_list))
print('Pts after filter ', e['id'].nunique())

e.to_csv(r"C:\Users\A4023862\OneDrive - Astellas Pharma Inc\data\emr\test\df_quarter_agg2.csv")


## BMI Cleanup

In [None]:
# Remove BMI Outliers
w=pd.read_csv(r"C:\Users\A4023862\OneDrive - Astellas Pharma Inc\data\emr\test\df_quarter_agg2.csv")

#Replace any BMI values below 15 and above 50 with 0
w1=w[(w['bmi']<15) | (w['bmi']>50)]
w2=w[~w['interaction_id'].isin(w1['interaction_id'])]
w1['bmi']=0

# Add bmi data back
w3 = pd.concat([w1, w2])

print(w.shape[0], ' interactions in original df')
print(w3.shape[0], ' interactions in cleansed df')
print('BMI Range (original): ', max(w['bmi']),' -', min(w['bmi']))
print('BMI Range (cleansed): ', max(w3['bmi']),' -', min(w3['bmi']))

w3.to_csv(r"C:\Users\A4023862\OneDrive - Astellas Pharma Inc\data\emr\test\df_quarter_agg2_clean.csv")

In [None]:
# Make sure there are atleast 2 BMI records
w=pd.read_csv(r"C:\Users\A4023862\OneDrive - Astellas Pharma Inc\data\emr\test\df_quarter_agg2_clean.csv")

#Replace any BMI values below 15 with 0


# Get count of records for each id with bmi values that aren't 0
yes_bmi=w[w['bmi']>0].groupby('id')['bmi'].count().reset_index()

# Get ids that have mort than 2 records
yes_bmi=yes_bmi[yes_bmi['bmi']>2]
yes_bmi.to_csv(r"C:\Users\A4023862\OneDrive - Astellas Pharma Inc\data\emr\test\crap.csv")

# Filter the dataset to only include these ids
q=w[w['id'].isin(yes_bmi['id'])]
print('# of patients with more than 1 good BMI record',len(yes_bmi['id']))
print('# of patients to keep',q['id'].nunique())

q.to_csv(r"C:\Users\A4023862\OneDrive - Astellas Pharma Inc\data\emr\test\df_quarter_agg3.csv")


## Interpolate BMI

In [None]:
#https://kanoki.org/2020/04/14/resample-and-interpolate-time-series-data/
q=pd.read_csv(r"C:\Users\A4023862\OneDrive - Astellas Pharma Inc\data\emr\test\df_quarter_agg3.csv")

q[ 'quarter' ] = pd.to_datetime(q['quarter'])

# Replace 0 with NAN
q.replace(0, np.nan, inplace=True)

def interpolate_bmi(df):
    # Interpolate BMI
    q1=df[['quarter', 'bmi']].set_index('quarter').resample('Q', label='left').mean().interpolate('spline', order=1)
    
    # Replace BMI with nearest when unable to interpolate
    mask = np.isnan(q1['bmi'])
    q1['bmi'][mask] = np.interp(np.flatnonzero(mask), np.flatnonzero(~mask), q1['bmi'][~mask])

    # Reset Index
    q1=q1.reset_index()
    
    # Fix date
    q1['quarter']=pd.DatetimeIndex(q1['quarter']) + pd.DateOffset(1)
    
    # Merge back with dataset
    del(df['bmi'])
    q2=df.merge(q1, how='left', on='quarter')
    return q2

q1=q.groupby('id').apply(lambda x: interpolate_bmi(x))

q1.to_csv(r"C:\Users\A4023862\OneDrive - Astellas Pharma Inc\data\emr\test\df_quarter_agg3_bmi.csv")

## Convert Race, Remove Ineligible patients

In [138]:
e=pd.read_csv(r"C:\Users\A4023862\OneDrive - Astellas Pharma Inc\data\emr\test\df_quarter_agg3_bmi.csv")
e=e.fillna(0)

# Convert race to categorical
q=e[['race']]
q['race']=q['race'].str.lower()
q=pd.get_dummies(q)
q['interaction_id']=e['interaction_id']

# Add back new race columns
e=e.merge(q, on='interaction_id')

# Filter patients with surgical menopause, cancer
w1=e[e['bi_oophorectomy']==1]
w2=e[e['hysterectomy']==1]
w3=e[e['uni_oophorectomy']==1]
w4=e[(e['cancer']==1) | (e['breast_cancer']==1)]
w5 = pd.concat([w1,w2,w3, w4],ignore_index=True)

e=e[~e.id.isin(w5['id'])]

# Combine fatigue and sleep_distrubance
e['fatigue_sleep_disturbances']=e['fatigue']+e['sleep_disturbance']

# Change dry_skin to skin_changes
e['skin_changes']=e['dry_skin']

symp_list=list(e.columns)
unwanted = {'interaction_id','min_last_period_date', 'rec_date', 
            'Unnamed: 0', 'Unnamed: 1', 'Unnamed: 0', 'Unnamed: 0.1', 
            'Unnamed: 0.1.1', 'Unnamed: 0.1.1.1', 'id.1',
            'age_grp', 'state', 'last_period', 'max', 
            'dry_skin','fatigue', 'sleep_disturbance'}
cofactor_list=['race_african american','race_asian','race_caucasian',
               'race_hispanic','race_other','race_unknown', 
               'breast_cancer','cancer','alcohol_consumption', 
               'bc_implant', 'bc_injection', 'bc_oral', 'bc_other','bc_patch', 
               'bi_oophorectomy', 'birth_control', 'endometrial ablation',
               'hrt', 'hyst_oophorectomy', 'hysterectomy',
               'menopause', 'smoker','uni_oophorectomy', 'bmi']

symp_list = [e for e in symp_list if e not in unwanted]
symp_list = [e for e in symp_list if e not in cofactor_list]

train=e[symp_list]
cofactors=e[cofactor_list+['id', 'age','quarter']]

# Reduce sparsity in data
# train['sym_count']=train.drop(['id','age'],axis=1).sum(1)
# train = train[train['sym_count']>3]

# Copy dataset, so changes won't happen to train df
df1=train.copy()

# Change column names and move to front
df1.insert(0, 'age', df1.pop('age'))
df1.insert(0, 'id', df1.pop('id'))
cofactors.insert(0, 'age', cofactors.pop('age'))
cofactors.insert(0, 'id', cofactors.pop('id'))

# Make df distinct by id, age, and quarter
df1=df1.groupby(['id', 'age', 'quarter']).first().reset_index()
cofactors=cofactors.groupby(['id', 'age', 'quarter']).first().reset_index()

# Keep certain features
features=list(df1.columns)
df1 = df1[features]

# Merge symtoms and cofactors
risk_pred=df1.merge(cofactors, on=['id', 'age', 'quarter'])

print(df1.shape)
print(cofactors.shape)
print(risk_pred.shape)
risk_pred.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  q['race']=q['race'].str.lower()


(124092, 42)
(124092, 27)
(124092, 66)


Unnamed: 0,id,age,quarter,race,amenorrhea,anxiety,bloating,dec_libido,depression,dizziness,...,bi_oophorectomy,birth_control,endometrial ablation,hrt,hyst_oophorectomy,hysterectomy,menopause,smoker,uni_oophorectomy,bmi
0,612281,51,2011-01-01,CAUCASIAN,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,17.92
1,612281,51,2011-04-01,CAUCASIAN,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,18.84
2,612281,51,2011-07-01,CAUCASIAN,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.19
3,612281,51,2011-10-01,CAUCASIAN,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.02
4,612281,52,2012-01-01,CAUCASIAN,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,18.278615


# Arguments For Training the Model

In [139]:
# What variable are you assessing risk for?
risk_param='hot_flash'

# Risk at what time period (quarters)
risk_time=1

# What variables will be inputs to the model?
cofactor_list=[
# COFACTORS
'race_african american',
 'race_asian',
 'race_caucasian',
 'race_hispanic',
 'race_other',
 'race_unknown',
 # 'breast_cancer',
 # 'cancer',
 'alcohol_consumption',
 'bc_implant',
 'bc_injection',
 'bc_oral',
 'bc_other',
 'bc_patch',
 'birth_control',
 'bi_oophorectomy',
 # 'endometrial ablation',
 # 'hrt',
 # 'hyst_oophorectomy',
 'hysterectomy',
 'menopause',
 'smoker',
 # 'uni_oophorectomy',
 # 'bmi'
# SYMPTOMS
 # 'amenorrhea',
 'anxiety',
 # 'bloating',
 'dec_libido',
 'depression',
 # 'dizziness',
 'dyspareunia',
 'fatigue_sleep_disturbances',
 'hair_loss',
 'headache_migraine',
 # 'headache_migraine_freq',
 # 'headache_migraine_rx',
 'hot_flash',
 # 'hot_flash_freq',
 # 'hot_flash_rx',
 # 'hot_flash_sev',
 # 'incontinence',
 'irritability',
 'memory_lapse',
 # 'menstrual_changes',
 'night_sweats',
 # 'night_sweats_freq',
 # 'night_sweats_rx',
 # 'night_sweats_sev',
 'oab_incontinence',
 # 'oligomenorrhea',
 # 'osteoporosis',
 # 'sexual_dysfunction',
'skin_changes',
 # 'sleep_disturbance',
 # 'stress_incontinence',
 'urge_incontinence',
 'uti',
 'vaginal_dryness',
 # 'vaginal_dryness_freq',
 # 'vaginal_dryness_rx',
 # 'vaginal_dryness_sev',
 'weight_gain',
 # 'post_meno'
]

# Test Model for Selection and Deployment to API

## Create dependant variable

In [140]:
# Create dependant variable
grouped = risk_pred.groupby('id')
L = []
for id, id_df in grouped:
    id_df['risk']=id_df[risk_param].shift(risk_time)
    L.append(id_df)

risk_pred=pd.concat(L)

## Address Class Imbalance

In [141]:
# class count
class_count_0, class_count_1 = risk_pred['risk'].value_counts()

# Separate class
class_0 = risk_pred[risk_pred['risk'] == 0]
class_1 = risk_pred[risk_pred['risk'] == 1]

# print the shape of the class
print('class 0:', class_0.shape)
print('class 1:', class_1.shape)

class_1_over = class_1.sample(class_count_0, replace=True)

test_over = pd.concat([class_1_over, class_0], axis=0)

print("total class of 1 and 0:",test_over[risk_param].value_counts())

class 0: (113872, 67)
class 1: (947, 67)
total class of 1 and 0: 0.0    203082
1.0     24662
Name: hot_flash, dtype: int64


## Create Train and Test Data

In [142]:
X = test_over[cofactor_list]
y = test_over['risk']

print("Cofactors:", X.shape)
print("Risk:",y.shape)

# Create Train and Test Data
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=random_state, test_size=.3)


Cofactors: (227744, 34)
Risk: (227744,)


## Test Multiple Algorithms

In [143]:
names = [
        "Random Forest"
        # ,"k-Nearest Neighbors"         
         # ,"Support Vector Machine"
         # ,"Linear SVM"
         # ,"RBF SVM"
         # ,"Gaussian Process"
         # ,"Decision Tree"
         # ,"Extra Trees"
         # ,"Extra Forest"
         # ,"AdaBoost"
         # ,"Gaussian Naive Bayes"
         # ,"LDA"
         # ,"QDA"
         # ,"Logistic Regression"
         # ,"SGD Classifier"
         # ,"Multilayer Perceptron"
         # ,"Voting Classifier"
        ]

algorithms = [
                RandomForestClassifier(random_state=random_state)
                # ,KNeighborsClassifier(n_neighbors=3)
               # ,SVC(random_state=random_state)
               # ,SVC(kernel="linear",random_state=random_state)
               # ,SVC(kernel="rbf",random_state=random_state)
               # ,GaussianProcessClassifier()
               # ,DecisionTreeClassifier(random_state=random_state)
               # ,ExtraTreesClassifier(random_state=random_state)
               # ,GradientBoostingClassifier(random_state=random_state)
               # ,AdaBoostClassifier(DecisionTreeClassifier(random_state=random_state),n_estimators=10,learning_rate=0.1,random_state=random_state)
               # ,GaussianNB()
               # ,LinearDiscriminantAnalysis()
               # ,QuadraticDiscriminantAnalysis()
               # ,LogisticRegression(random_state=random_state)
               # ,SGDClassifier()
               # ,MLPClassifier(hidden_layer_sizes=(100,),momentum=0.9,solver='sgd',random_state=random_state)
               # ,VotingClassifier(estimators=[('log', LogisticRegression()), ('SVM',SVC(C=1000)), ('MLP', MLPClassifier(hidden_layer_sizes=(100,)))], voting='hard')
              ]
#algorithms.append(SVC(random_state=random_state))

classifiers = {
                     "Random Forest" : RandomForestClassifier(random_state=random_state)
                    # ,"k-Nearest Neighbors" : KNeighborsClassifier(n_neighbors=3)
                 # ,"Support Vector Machine" :  SVC(random_state=random_state)
                 # ,"Linear SVM" :  SVC(kernel="linear",random_state=random_state)
              #    ,"RBF SVM" :  SVC(kernel="rbf",random_state=random_state)
              #    ,"Gaussian Process" : GaussianProcessClassifier()
              #    ,"Decision Tree" : DecisionTreeClassifier(random_state=random_state)
              #    ,"Extra Trees" : ExtraTreesClassifier(random_state=random_state)
                 # ,"Extra Forest" : GradientBoostingClassifier(random_state=random_state)
                 # ,"AdaBoost" : AdaBoostClassifier(DecisionTreeClassifier(random_state=random_state),n_estimators=10,random_state=random_state,learning_rate=0.1)
              #    ,"Gaussian Naive Bayes" : GaussianNB()
              #    ,"LDA" : LinearDiscriminantAnalysis()
              #    ,"QDA" :  QuadraticDiscriminantAnalysis()
              #    ,"Logistic Regression" : LogisticRegression(random_state=random_state)
              #    ,"SGD Classifier" : SGDClassifier()
              #    ,"Multilayer Perceptron" :  MLPClassifier(hidden_layer_sizes=(100,),momentum=0.9,solver='sgd',random_state=random_state)
              #    ,"Voting Classifier" : VotingClassifier(estimators=[('log', LogisticRegression()), ('SVM',SVC(C=1000)), ('MLP', MLPClassifier(hidden_layer_sizes=(100,)))], voting='hard')
              }


# Test different algorithms
data_copy=[]
for model in algorithms:
    model.fit(X_train,y_train)
    pred_test = model.predict(X_test)
    data_copy.append(metrics.f1_score(pred_test, y_test))
    
models_df = pd.DataFrame(data_copy, index=names)   
models_df.columns=['F1 Score']
models_df

Unnamed: 0,F1 Score
Random Forest,0.628345


## Select Model Winner

In [144]:
# Select Model Winner
print('The winner is: ')

winner=models_df[models_df['F1 Score']==max(models_df['F1 Score'])].reset_index()
winner

The winner is: 


Unnamed: 0,index,F1 Score
0,Random Forest,0.628345


# Train the winner and output into pickle

In [145]:
winner_name=winner['index'][0]
index = names.index(winner_name)

srp = algorithms[index]

# fit the predictor and target
srp.fit(X_train, y_train)

# predict
srp_predict = srp.predict(X_test)# check performance
print('ROCAUC score:',roc_auc_score(y_test, srp_predict))
print('Accuracy score:',accuracy_score(y_test, srp_predict))
print('F1 score:',f1_score(y_test, srp_predict))

ROCAUC score: 0.6754748716695069
Accuracy score: 0.6747555763714068
F1 score: 0.6283449290875033


# Simulate User Input into Model

## Test Survey Input

In [148]:
# Dataframe of survey inputs
survey_inputs=[{'race_african american':0,
 'race_asian':0,
 'race_caucasian':1, 
 'race_hispanic':0, 
 'race_other':0, 
 'race_unknown':0, 
 'alcohol_consumption':0, 
 'bc_implant':0,
 'bc_injection':0, 
 'bc_oral':0, 
 'bc_other':0, 
 'bc_patch':0, 
 'bi_oophorectomy':0, 
 'birth_control':0, 
 'hysterectomy':0, 
 'menopause':0, 
 'smoker':1, 
 'anxiety':0, 
 'dec_libido':0, 
 'depression':0,  
 'dyspareunia':0, 
 'fatigue_sleep_disturbances':0, 
 'hair_loss':0, 
 'headache_migraine':0, 
 'hot_flash':1, 
'irritability':0,
 'memory_lapse':0, 
 'night_sweats':0, 
 'oab_incontinence':0,
'skin_changes':0,
 'urge_incontinence':0, 
 'uti':0, 
 'vaginal_dryness':0, 
 'weight_gain':0}]
survey_inputs = pd.DataFrame(survey_inputs)

# Test Input
print("Your probability of", risk_param, 'in the next', 
      risk_time, 'quarter, is:', 
      round(float(srp.predict_proba(survey_inputs)[:,1]),3))

Your probability of hot_flash in the next 1 quarter, is: 0.995


Feature names must be in the same order as they were in fit.

