In [1]:
# !pip install awswrangler

In [2]:
# !pip install numpy

In [1]:
import sys
sys.path.append('../support_files/')

import constants as params

In [2]:
import pandas as pd

import boto3
import numpy as np
import io
import awswrangler as wr
import sagemaker
from sagemaker import get_execution_role
from sagemaker.session import s3_input, Session
import pickle
from sagemaker.amazon.amazon_estimator import get_image_uri

from sklearn.preprocessing import OneHotEncoder
# Save the encoded categorical model
import tempfile
import boto3
import joblib

import s3fs
from sagemaker.predictor import csv_serializer
import tempfile
import joblib
from sagemaker.predictor import csv_serializer
import constants as params

import warnings
warnings.filterwarnings('ignore')

In [3]:
# !pip install numpy

In [4]:
def create_columns_types(df_cols, y_name):
    
    ind_cols = list(df_cols[df_cols.str.contains('_ind')])
    pmpm_cols =  list(df_cols[df_cols.str.contains('_pmpm_')])
    score_cols = list(df_cols[df_cols.str.contains('_score')])

    categorical_cols = ind_cols + [y_name]
    numerical_cols = pmpm_cols + score_cols
    ordinal_cols = []
    return (categorical_cols, numerical_cols, ordinal_cols)

In [5]:
def data_type_conversion(df, categorical_cols, numerical_cols):
    for col in numerical_cols:
        df[col] = df[col].apply(pd.to_numeric)
    df[categorical_cols] = df[categorical_cols].astype("category")
    return df

#### Creating summation features

In [6]:
def sum_feature_generation(df, categories, categorical_cols, numerical_cols):
    for category in categories:
        new_ind_name = category + "_ind_sum"
        subset_cols_criteria = "_" + category + "_"
        subset_cols = [x for x in df.columns if subset_cols_criteria in x and '_ind' in x]
        df[new_ind_name] = df[subset_cols].apply(pd.to_numeric, errors='coerce').sum(axis=1)
        df[subset_cols] = df[subset_cols].astype("category")
        df[new_ind_name] = df[new_ind_name].astype("category")
        numerical_cols.append(new_ind_name)
        

        new_pmpm_name = category + "_pmpm_sum"
        subset_cols_criteria = "_" + category + "_"
        subset_cols = [x for x in df.columns if subset_cols_criteria in x and '_pmpm' in x]

        df[new_pmpm_name] = df[subset_cols].apply(pd.to_numeric, errors='coerce').sum(axis=1)
        numerical_cols.append(new_pmpm_name)
#         df = add_columns(df=df, col_name=new_ind_name, columns= subset_cols)
    return (df, categorical_cols, numerical_cols)

#### creating a final column to have the count of number of services used by each person using ind values


In [7]:
def total_sum_features(df, numerical_cols, categorical_cols):
    df['total_ind'] = df[df.columns[df.columns.str.contains('_ind_sum')]].sum(axis=1)
    df['total_ind'] = df['total_ind'].apply(pd.to_numeric, errors = 'coerce')
    numerical_cols.append('total_ind')
    
    df['total_pmpm'] = df[df.columns[df.columns.str.contains('_pmpm_sum')]].sum(axis=1)
    df['total_pmpm'] = df['total_pmpm'].apply(pd.to_numeric, errors = 'coerce')
    numerical_cols.append('total_pmpm')

    df['service_bool'] = np.where(df['total_ind'] == 0, 0, 1)
    df['service_bool'] = df['service_bool'].astype("category")
    categorical_cols.append('service_bool')
    return df, numerical_cols, categorical_cols

In [8]:
# # To create a new bucket
# bucket_name = 'bucket-name55422' # new bucket_name
# my_region = boto3.session.Session().region_name

# s3 = boto3.resource('s3')
# try:
#     if my_region == "us-east-1":
#         s3.create_bucket(Bucket = bucket_name)
#         print('S3 bucket created successfully')
# except Exception as e:
#     print('S3 error')

In [82]:
sess = sagemaker.Session()
role = get_execution_role()

bucket = "humana-data"
prefix_rawdata = 'rawdata/original_pq_files'
prefix_fe_data = "intermediate/data"
prefix_metadata = "intermediate/metadata"
prefix_model = "intermediate/condition/models"


conn = boto3.client('s3')
# contents = conn.list_objects(Bucket=bucket, Prefix=prefix)['Contents']

In [10]:
condition_file = "Condition.pq"
dependent_file = "dependent.pq"

condition_df = wr.s3.read_parquet(path = f's3://{bucket}/{prefix_rawdata}/{condition_file}')
condition_df = condition_df.set_index(['person_id_syn'])
condition_df.columns = condition_df.columns.str.lower()

dependent_df = wr.s3.read_parquet(path = f's3://{bucket}/{prefix_rawdata}/{dependent_file}')
dependent_df = dependent_df.set_index(['person_id_syn'])
dependent_df['transportation_issues'] = dependent_df['transportation_issues'].astype("category")
dependent_df.columns = dependent_df.columns.str.lower()

df_subset = dependent_df.merge(condition_df, how='left', left_index=True, right_index=True)

categorical_cols, numerical_cols, ordinal_cols = create_columns_types(df_cols = df_subset.columns,
                                                                      y_name = params.dependent_variable)

df_subset[categorical_cols] = np.where(df_subset[categorical_cols] != 0, 1,0)

df_subset, categorical_cols, numerical_cols = sum_feature_generation(df_subset, params.condition_categories, 
                                              categorical_cols =categorical_cols,
                                             numerical_cols = numerical_cols)

df_subset, numerical_cols, categorical_cols = total_sum_features(df = df_subset, 
                                                                 numerical_cols=numerical_cols, 
                                                                 categorical_cols=categorical_cols)


df_subset = data_type_conversion(df=df_subset, 
                                 categorical_cols=categorical_cols, 
                                 numerical_cols=numerical_cols)

In [11]:
train_df = df_subset.sample(frac=0.7, random_state=543)
valid_df = df_subset[~(df_subset.index.isin(train_df.index))]

In [32]:
# filename_train ='train_fe.pq'
# filename_valid ='valid_fe.pq'

# wr.s3.to_parquet(train_df, path = f's3://{bucket}/{prefix_data}/{filename_train}', compression='gzip', index=True)
# wr.s3.to_parquet(valid_df, path = f's3://{bucket}/{prefix_data}/{filename_valid}', compression='gzip', index=True)

{'paths': ['s3://humana-data/intermediate/condition/data/valid_fe.pq'],
 'partitions_values': {}}

#### Local training

In [33]:
# filename_train ='train_fe.pq'
# filename_valid ='valid_fe.pq'

# train_df = wr.s3.read_parquet(path = f's3://{bucket}/{prefix_data}/{filename_train}')
# valid_df = wr.s3.read_parquet(path = f's3://{bucket}/{prefix_data}/{filename_valid}')

In [34]:
# prefix_model = "intermediate/condition/models"
# filename_encoder ='enc.pkl'
# path = prefix_model + '/' + filename_encoder

# # READ
# with tempfile.TemporaryFile() as fp:
#     conn.download_fileobj(Fileobj=fp, Bucket=bucket, Key=path)
#     fp.seek(0)
#     enc = joblib.load(fp)

In [79]:
# # # to read the file
# prefix_metadata = "intermediate/metadata"
# filename_metadata = prefix_metadata + '/' + 'metadata'

# metadata_obj = conn.get_object(Bucket=bucket, Key=filename_metadata)
# serializedObject = metadata_obj['Body'].read()
# metadata_dict = pickle.loads(serializedObject)

# categorical_cols = metadata_dict['categorical_cols']
# numerical_cols = metadata_dict['numerical_cols']

In [None]:
# one hot encoding
enc = OneHotEncoder(sparse=False, handle_unknown="ignore" )
one_hot_encode_cols = [x for x in categorical_cols if x != params.dependent_variable]
train_df_encoded = pd.DataFrame(enc.fit_transform(train_df[one_hot_encode_cols]), index=train_df.index)
train_df_encoded.columns = enc.get_feature_names_out()

train_model_df = pd.concat([train_df.drop(one_hot_encode_cols, axis=1), train_df_encoded], axis=1, ignore_index=False)

In [36]:
valid_df_encoded = pd.DataFrame(enc.transform(np.asarray(valid_df[one_hot_encode_cols], dtype=object)),
                                index=valid_df.index)
valid_df_encoded.columns = enc.get_feature_names_out()
valid_model_df = pd.concat([valid_df.drop(one_hot_encode_cols, axis=1), valid_df_encoded], axis=1, ignore_index=False)

In [37]:
X_train = train_model_df.sample(10000).drop(columns= ['transportation_issues'])
y_train = train_model_df[train_model_df.index.isin(X_train.index)]['transportation_issues'].astype(int)

X_valid = valid_model_df.drop(columns= ['transportation_issues'])
y_valid = valid_model_df[valid_model_df.index.isin(X_valid.index)]['transportation_issues'].astype(int)

In [38]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

logistic_regression= LogisticRegression()
logistic_regression.fit(X_train,y_train)
y_pred_valid=logistic_regression.predict(X_valid)
y_pred_train=logistic_regression.predict(X_train)

print('Accuracy: ',metrics.accuracy_score(y_valid, y_pred_valid))
print('Accuracy: ',metrics.accuracy_score(y_train, y_pred_train))

Accuracy:  0.8525297048677655
Accuracy:  0.8528


In [42]:
# !pip install xgboost

In [43]:
# Hyperparameter tuning grid
params = {'min_child_weight': [5],
        'gamma': [1.5],
        'subsample': [0.8],
        'colsample_bytree': [0.8],
        'max_depth': [4]}

from sklearn.model_selection import GridSearchCV
from xgboost.sklearn import XGBClassifier
# Initialize XGBoost model, use growth tree algorithm similar to lightgbm
bst = XGBClassifier(n_jobs=-1,grow_policy='lossguide',tree_method ='hist',n_estimators=150)
# Gridsearch on parameter grid, use AUC to determine the best model
cv = GridSearchCV(bst, params)
# Fit model on the training data, track logloss on training and validing set
cv.fit(X_train, y_train,eval_metric='logloss',eval_set=[(X_train,y_train),(X_valid,y_valid)],verbose=False)



KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

import cf_matrix
from cf_matrix import make_confusion_matrix
from sklearn.metrics import confusion_matrix,accuracy_score

y_xg_pred_valid = cv.best_estimator_.predict(X_valid)
y_xg_pred_train = cv.best_estimator_.predict(X_train)

print(metrics.accuracy_score(y_valid, y_xg_pred_valid))
print(metrics.accuracy_score(y_train, y_xg_pred_train))


cm = confusion_matrix(y_valid, y_xg_pred_valid)
plt.figure(figsize=(20,15))
sns.set(font_scale=1.4) # for label size
make_confusion_matrix(cm, group_names=['no- issue','issue'],categories=['no-issue','issue'], cmap='binary')
b, t = plt.ylim() # discover the values for bottom and top
b += 0.5 # Add 0.5 to the bottom
t -= 0.5 # Subtract 0.5 from the top
plt.ylim(b, t) 
plt.show()

import joblib

# Print out the best parameters
cv.best_params_

#save model
joblib.dump(cv.best_estimator_, 'xgb.model')


#load saved model
xgb = joblib.load('xgb.model')


import sklearn.metrics as metrics
# calculate the fpr and tpr for all thresholds of the classification
probs = xgb.predict_proba(X_valid)
preds = probs[:,1]
# preds = [1]*17119
fpr, tpr, threshold = metrics.roc_curve(y_valid, preds)
roc_auc = metrics.auc(fpr, tpr)
# method I: plt
import matplotlib.pyplot as plt
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()


# retrieve performance metrics
results = xgb.evals_result()
epochs = len(results['validation_0']['logloss'])
x_axis = range(0, epochs)
# plot log loss
fig, ax = plt.subplots()
ax.plot(x_axis, results['validation_0']['logloss'], label='Train')
ax.plot(x_axis, results['validation_1']['logloss'], label='Test')
ax.legend()
plt.ylabel('Log Loss')
plt.title('XGBoost Log Loss');

# Feature importance plot
feature_importance = pd.DataFrame({'feature':xgb.get_booster().feature_names, 'importance':xgb.feature_importances_}).sort_values('importance',ascending=False).reset_index().drop(columns='index')
fig, ax = plt.subplots()
fig.set_size_inches(8.27,15)
plt.title('Feature Importance Plot')
sns.barplot(x='importance',y='feature',ax=ax,data=feature_importance[:50])


In [87]:
# top_featured_columns = list(feature_importance[feature_importance.importance>0.006].feature)

top_features = ['submcc_ner_deg_ind_1',
 'cmsd2_sns_general_ind_0',
 'submcc_ben_othr_ind_0',
 'submcc_dig_ugi_ind_0',
 'submcc_cir_hbp_ind_0',
 'submcc_ner_migr_pmpm_ct',
 'cmsd2_can_unc_neo/plycyth/myelo_ind_0',
 'ner_ind_sum',
 'submcc_dia_eye_pmpm_ct',
 'skn_ind_sum',
 'submcc_cir_hbp_ind_1',
 'submcc_dig_lgi_ind_0',
 'submcc_ner_deg_pmpm_ct',
 'submcc_can_skn_pmpm_ct',
 'submcc_men_depr_pmpm_ct',
 'submcc_inf_sep_pmpm_ct',
 'submcc_rsk_chol_ind_0',
 'cmsd2_mus_polyarthropath_ind_0',
 'submcc_can_othr_pmpm_ct',
 'submcc_can_skn_ind_1',
 'submcc_end_thy_ind_0',
 'submcc_rsk_chol_pmpm_ct',
 'rar_pmpm_sum',
 'submcc_bld_anem_ind_0',
 'inf_ind_sum',
 'submcc_end_gld_pmpm_ct',
 'sns_pmpm_sum',
 'submcc_cad_ang_pmpm_ct',
 'submcc_mus_arth_pmpm_ct',
 'submcc_cir_othr_pmpm_ct',
 'skn_pmpm_sum',
 'inf_pmpm_sum',
 'submcc_cad_ashd_ind_1']

In [88]:
cat_top_features = (pd.Series([x for x in top_features if '_ind' in x and '_sum' not in x]).str.split('_ind_').str[0] + "_ind").tolist()
num_sum_features = (pd.Series([x for x in top_features if '_ind' in x and '_sum' in x]).str.split('_ind_').str[0] + "_ind_sum").tolist()
num_top_features = [x for x in top_features if '_ind' not in x]

In [104]:
credit_fe_df = df_subset[set(cat_top_features + num_sum_features + num_top_features)]

file_condition_fe ='condition_fe.parquet.gzip'
credit_fe_df.to_parquet(path = f's3://{bucket}/{prefix_fe_data}/{file_condition_fe}', compression='gzip', index=True)

# conn.put_object(Bucket=bucket, Key=filename_metadata, Body=pickle.dumps(metadata))


In [118]:
prefix_metadata = "intermediate/metadata"
filename_metadata = prefix_metadata + "/" + "metadata"

# # to read the file
metadata_obj = conn.get_object(Bucket=bucket, Key=filename_metadata)
serializedObject = metadata_obj['Body'].read()
metadata = pickle.loads(serializedObject)


metadata['condition']['category_cols'] = cat_top_features.copy()
metadata['condition']['numeric_cols'] = num_sum_features + num_top_features

conn.put_object(Bucket=bucket, Key=filename_metadata, Body=pickle.dumps(metadata))


{'ResponseMetadata': {'RequestId': 'KQDVCADCMVFRY5ZB',
  'HostId': 'VQ+HfENGV6+2jMWxVqLvdqfDGC2J9jqGNnZTRVHrybGK5G8DCMHWs/AkJrJWLgh2Hc6fl0tTV4c=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'VQ+HfENGV6+2jMWxVqLvdqfDGC2J9jqGNnZTRVHrybGK5G8DCMHWs/AkJrJWLgh2Hc6fl0tTV4c=',
   'x-amz-request-id': 'KQDVCADCMVFRY5ZB',
   'date': 'Mon, 14 Nov 2022 21:21:52 GMT',
   'x-amz-version-id': 'Faq5_ct0sJByEqNr97Lnb0FNmsG4OVlg',
   'etag': '"e7aa5dd72047e6fee69dc77c407a5a33"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"e7aa5dd72047e6fee69dc77c407a5a33"',
 'VersionId': 'Faq5_ct0sJByEqNr97Lnb0FNmsG4OVlg'}

In [121]:
metadata['credit']['numeric']

['credit_bal_1stmtgcredit_60dpd',
 'credit_bal_agencyfirstmtg_60dpd',
 'credit_bal_heloc_60dpd',
 'credit_bal_nonagnfirstmtg_60dpd',
 'credit_bal_nonmtgcredit_60dpd',
 'credit_bal_studentloan_60dpd',
 'credit_bal_totalallcredit_60dpd',
 'credit_bal_autobank',
 'credit_bal_autofinance',
 'credit_bal_consumerfinance',
 'credit_minmob_mtgcredit']

#### Build XgBoost model - Sagemaker

In [12]:
# filename_train ='train_fe.pq'
# filename_valid ='valid_fe.pq'

# train_df = wr.s3.read_parquet(path = f's3://{bucket}/{prefix_data}/{filename_train}')
# valid_df = wr.s3.read_parquet(path = f's3://{bucket}/{prefix_data}/{filename_valid}')

In [13]:


# one hot encoding
enc = OneHotEncoder(sparse=False, handle_unknown="ignore" )
one_hot_encode_cols = [x for x in categorical_cols if x != params.dependent_variable]
train_df_encoded = pd.DataFrame(enc.fit_transform(train_df[one_hot_encode_cols]), index=train_df.index)
train_df_encoded.columns = enc.get_feature_names_out()

train_model_df = pd.concat([train_df.drop(one_hot_encode_cols, axis=1), train_df_encoded], axis=1, ignore_index=False)

In [14]:


filename_encoder ='enc.pkl'
path = prefix_model + '/' + filename_encoder

# WRITE
with tempfile.TemporaryFile() as fp:
    joblib.dump(enc, fp)
    fp.seek(0)
    conn.put_object(Body=fp.read(), Bucket=bucket, Key=path)

# READ
# with tempfile.TemporaryFile() as fp:
#     conn.download_fileobj(Fileobj=fp, Bucket=bucket, Key=path)
#     fp.seek(0)
#     enc2 = joblib.load(fp)

# # DELETE
# conn.delete_object(Bucket=bucket, Key=path)

In [15]:
valid_df_encoded = pd.DataFrame(enc.transform(np.asarray(valid_df[one_hot_encode_cols], dtype=object)),
                                index=valid_df.index)
valid_df_encoded.columns = enc.get_feature_names_out()
valid_model_df = pd.concat([valid_df.drop(one_hot_encode_cols, axis=1), valid_df_encoded], axis=1, ignore_index=False)

In [16]:
# # Write the metadata of index
# train_id_index = train_model_df.reset_index()["person_id_syn"]
# train_id_index = train_id_index.reset_index().rename(columns={'index':'index_num'})
# filename_train_id_index = "train_index_id.pq"
# wr.s3.to_parquet(train_id_index, path = 's3://{}/{}/{}'.format(bucket, prefix_metadata, filename_train_id_index),
#                  compression='gzip', index=False)


# valid_id_index = valid_model_df.reset_index()["person_id_syn"]
# valid_id_index = valid_id_index.reset_index().rename(columns={'index':'index_num'})
# filename_valid_id_index = "valid_index_id.pq"
# wr.s3.to_parquet(valid_id_index, path = 's3://{}/{}/{}'.format(bucket, prefix_metadata, filename_valid_id_index),
#                  compression='gzip', index=False)


# # Write the metadata of columns
# train_model_columns = train_model_df.columns
# test_model_columns = [x for x in train_model_columns if x != params.dependent_variable]

In [31]:
prefix_metadata = "intermediate/metadata"
filename_metadata = prefix_metadata + "/" + "metadata"

# # to read the file
metadata_obj = conn.get_object(Bucket=bucket, Key=filename_metadata)
serializedObject = metadata_obj['Body'].read()
metadata = pickle.loads(serializedObject)
# metadata = {}
metadata['condition'] = {}
metadata['condition']['categorical_cols'] = [x for x in categorical_cols if x != params.dependent_variable]
metadata['condition']['numerical_cols'] = numerical_cols.copy()

conn.put_object(Bucket=bucket, Key=filename_metadata, Body=pickle.dumps(metadata))

{'ResponseMetadata': {'RequestId': 'XMY07M4F3VZC96SQ',
  'HostId': 'B3+WHCvhmA4Dzq0LvpadealKkM2lz/G+kJ1uWHEyivV8/Mk1G1C9yf/bKFXgAvyG78jxyShDIFjXEWIVLm1mPQ==',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'B3+WHCvhmA4Dzq0LvpadealKkM2lz/G+kJ1uWHEyivV8/Mk1G1C9yf/bKFXgAvyG78jxyShDIFjXEWIVLm1mPQ==',
   'x-amz-request-id': 'XMY07M4F3VZC96SQ',
   'date': 'Mon, 14 Nov 2022 20:33:55 GMT',
   'x-amz-version-id': 'ycOcy6lHIfhlUpTCy9e1SO3xiBqwG6Ex',
   'etag': '"fe74f5149607eec215fd6f99b91d0a34"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"fe74f5149607eec215fd6f99b91d0a34"',
 'VersionId': 'ycOcy6lHIfhlUpTCy9e1SO3xiBqwG6Ex'}

In [28]:
# # Write the metadata of columns, index of train and valid and test
# prefix_metadata = "intermediate/condition/models/metadata"
# filename_metadata = prefix_metadata + '/' + 'metadata_dict'

# metadata_dict = {}
# metadata_dict['condition'] = {}
# # metadata_dict['condition']['train_index'] = dict(zip(list(train_model_df.reset_index()['person_id_syn'].index), train_model_df.reset_index()['person_id_syn']))
# # metadata_dict['condition']['valid_index'] = dict(zip(list(valid_model_df.reset_index()['person_id_syn'].index), valid_model_df.reset_index()['person_id_syn']))
# # metadata_dict['condition']['train_columns'] = list(train_model_df.columns)
# # metadata_dict['condition']['test_columns'] = [x for x in train_model_df.columns if x != params.dependent_variable]
# metadata_dict['condition']['categorical_cols'] = [x for x in categorical_cols if x != params.dependent_variable]
# metadata_dict['condition']['numerical_cols'] = numerical_cols.copy()


# # conn.put_object(Bucket=bucket, Key=filename_metadata, Body=pickle.dumps(metadata_dict))


# # # to read the file
# # metadata_dict = conn.get_object(Bucket=bucket, Key=filename_metadata)
# # serializedObject = metadata_dict['Body'].read()
# # metadata_dict = pickle.loads(serializedObject)
# # metadata_dict

In [18]:


# prefix_data = "intermediate/condition/data"
# filename_train_model ='train_model.csv'
# filename_valid_model ='valid_model.csv'

# bytes_to_write = train_model_df.to_csv(index=False, header=False).encode()
# fs = s3fs.S3FileSystem()
# with fs.open('s3://{}/{}/{}'.format(bucket, prefix_data, filename_train_model), 'wb') as f:
#     f.write(bytes_to_write)

# bytes_to_write = valid_model_df.to_csv(index=False, header=False).encode()
# fs = s3fs.S3FileSystem()
# with fs.open('s3://{}/{}/{}'.format(bucket, prefix_data, filename_valid_model), 'wb') as f:
#     f.write(bytes_to_write)

In [19]:
# prefix_data = "intermediate/condition/data"
# filename_train_model ='train_model.csv'
# filename_valid_model ='valid_model.csv'

# ## Specifies the path to training and validation in S3
# s3_input_train = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}/{}'.format(bucket, prefix_data, filename_train_model), content_type='csv')
# s3_input_valid = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}/{}'.format(bucket, prefix_data, filename_valid_model), content_type='csv')


In [22]:
prefix_model_xgboost = "intermediate/condition/models/xgboost-condition-data"
output_path  = 's3://{}/{}/output'.format(bucket, prefix_model_xgboost)
output_path

's3://humana-data/intermediate/condition/models/xgboost-condition-data/output'

In [23]:
region_name = boto3.Session().region_name

In [24]:
container = get_image_uri(region_name, 'xgboost', 'latest')

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [28]:
hyperparameters = {
    "num_round":100,
    "max_depth":8,
    "eta":0.2,
    "gamma": 3,
    "objective":"binary:logistic",
}

In [27]:
estimator = sagemaker.estimator.Estimator(image_uri = container,
                                         hyperparameters = hyperparameters,
                                         role = sagemaker.get_execution_role(),
                                         instance_count = 1,
                                         instance_type = 'ml.m5.large',
                                         volume_size = 5,
                                         output_path = output_path,
                                         use_spot_instances = True,
                                          max_run = 300,
                                          max_wait = 600)


In [29]:
estimator.fit({'train':s3_input_train, 'validation':s3_input_valid})

2022-11-12 00:32:06 Starting - Starting the training job...
2022-11-12 00:32:31 Starting - Preparing the instances for trainingProfilerReport-1668213125: InProgress
.........
2022-11-12 00:33:56 Downloading - Downloading input data...
2022-11-12 00:34:32 Training - Downloading the training image...
2022-11-12 00:34:57 Training - Training image download completed. Training in progress..[34mArguments: train[0m
[34m[2022-11-12:00:35:03:INFO] Running standalone xgboost training.[0m
[34m[2022-11-12:00:35:03:INFO] File size need to be processed in the node: 153.89mb. Available memory size in the node: 381.85mb[0m
[34m[2022-11-12:00:35:03:INFO] Determined delimiter of CSV input is ','[0m
[34m[00:35:03] S3DistributionType set as FullyReplicated[0m
[34m[00:35:04] 48700x544 matrix with 26492800 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2022-11-12:00:35:04:INFO] Determined delimiter of CSV input is ','[0m
[34m[00:35:04] S3Distributio

[34m[00:35:50] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 66 extra nodes, 28 pruned nodes, max_depth=8[0m
[34m[41]#011train-error:0.113018#011validation-error:0.147566[0m
[34m[00:35:51] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 44 extra nodes, 16 pruned nodes, max_depth=8[0m
[34m[42]#011train-error:0.112772#011validation-error:0.14747[0m
[34m[00:35:52] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 70 extra nodes, 26 pruned nodes, max_depth=8[0m
[34m[43]#011train-error:0.112361#011validation-error:0.147614[0m
[34m[00:35:53] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 50 extra nodes, 64 pruned nodes, max_depth=8[0m
[34m[44]#011train-error:0.112218#011validation-error:0.147901[0m
[34m[00:35:54] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 152 extra nodes, 56 pruned nodes, max_depth=8[0m
[34m[45]#011train-error:0.111745#011validation-error:0.148189[0m
[34m[00:35:55] src/tree/updater_prune.cc:74: tree prun

[34m[00:36:40] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 68 extra nodes, 94 pruned nodes, max_depth=8[0m
[34m[89]#011train-error:0.09694#011validation-error:0.148524[0m
[34m[00:36:41] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 106 extra nodes, 42 pruned nodes, max_depth=8[0m
[34m[90]#011train-error:0.096407#011validation-error:0.148908[0m
[34m[00:36:42] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 70 extra nodes, 40 pruned nodes, max_depth=8[0m
[34m[91]#011train-error:0.09614#011validation-error:0.148476[0m
[34m[00:36:43] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 160 extra nodes, 122 pruned nodes, max_depth=8[0m
[34m[92]#011train-error:0.095216#011validation-error:0.148716[0m
[34m[00:36:44] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 18 extra nodes, 42 pruned nodes, max_depth=7[0m
[34m[93]#011train-error:0.095236#011validation-error:0.148764[0m
[34m[00:36:45] src/tree/updater_prune.cc:74: tree pru

### Deploy ML model

In [47]:
xgb_condition_predictor = estimator.deploy(initial_instance_count = 1, instance_type='ml.m5.large',
                                         serializer=sagemaker.serializers.CSVSerializer() )

------!

In [39]:
# sagemaker.Session().delete_endpoint(xgb_condition_predictor.endpoint)

In [41]:
# region_name

In [42]:
# import boto3

# # Specify your AWS Region
# aws_region=region_name

# # Specify the name of your endpoint
# endpoint_name='xgboost-2022-11-11-23-55-05-168'

# # Create a low-level SageMaker service client.
# sagemaker_client = boto3.client('sagemaker', region_name=aws_region)

# # Delete endpoint
# sagemaker_client.delete_endpoint(EndpointName=endpoint_name)

### Predicts on Test Data

In [49]:
def predict(data, predictor):
    predictions = []
    confidences = []
    for row in data:
        response = np.fromstring(predictor.predict(row).decode("utf-8")[1:], sep=",")
        pred = response.argmax()
        confidence = max(response)
        predictions.extend([pred])
        confidences.extend([confidence])

    return predictions, confidences

In [51]:
xgb_condition_predictor.predict(train_model_df.head(1000).drop(columns = 'transportation_issues').to_numpy()).decode("utf-8")

'0.15203361213207245,0.53171306848526,0.024640358984470367,0.12627477943897247,0.15970207750797272,0.03704105317592621,0.683900773525238,0.11185868084430695,0.19280855357646942,0.20396365225315094,0.03894664719700813,0.18964733183383942,0.038110386580228806,0.08595790714025497,0.09573307633399963,0.05436033010482788,0.174422487616539,0.07120764255523682,0.07873384654521942,0.11581337451934814,0.07643716782331467,0.1364915817975998,0.08681432902812958,0.07370735704898834,0.0857640951871872,0.09271731227636337,0.2565670907497406,0.08997442573308945,0.1992834508419037,0.12607493996620178,0.03594943881034851,0.2160201519727707,0.09177536517381668,0.06457207351922989,0.1349855363368988,0.07308997958898544,0.04260220006108284,0.038506001234054565,0.32696712017059326,0.11865590512752533,0.598294198513031,0.06526705622673035,0.47588804364204407,0.06787604093551636,0.10071051865816116,0.2733558714389801,0.09273490309715271,0.024320820346474648,0.10856478661298752,0.050042521208524704,0.07446022

### Test Data



In [None]:
def sum_feature_generation_test(df, categories):
    for category in categories:
        new_ind_name = category + "_ind_sum"
        subset_cols_criteria = "_" + category + "_"
        subset_cols = [x for x in df.columns if subset_cols_criteria in x and '_ind' in x]
        df[new_ind_name] = df[subset_cols].apply(pd.to_numeric, errors='coerce').sum(axis=1)
        df[subset_cols] = df[subset_cols].astype("category")
        df[new_ind_name] = df[new_ind_name].astype("category")

        

        new_pmpm_name = category + "_pmpm_sum"
        subset_cols_criteria = "_" + category + "_"
        subset_cols = [x for x in df.columns if subset_cols_criteria in x and '_pmpm' in x]

        df[new_pmpm_name] = df[subset_cols].apply(pd.to_numeric, errors='coerce').sum(axis=1)
        numerical_cols.append(new_pmpm_name)
#         df = add_columns(df=df, col_name=new_ind_name, columns= subset_cols)
    return (df)

In [10]:
def total_sum_features_test(df):
    df['total_ind'] = df[df.columns[df.columns.str.contains('_ind_sum')]].sum(axis=1)
    df['total_ind'] = df['total_ind'].apply(pd.to_numeric, errors = 'coerce')
    
    df['total_pmpm'] = df[df.columns[df.columns.str.contains('_pmpm_sum')]].sum(axis=1)
    df['total_pmpm'] = df['total_pmpm'].apply(pd.to_numeric, errors = 'coerce')

    df['service_bool'] = np.where(df['total_ind'] == 0, 0, 1)
    df['service_bool'] = df['service_bool'].astype("category")
    return df

In [11]:
def data_type_conversion_test(df, categorical_cols, numerical_cols):
    for col in numerical_cols:
        test_df[col] = test_df[col].apply(pd.to_numeric)
    df[categorical_cols] = df[categorical_cols].astype("category")
    return df

In [12]:

sess = sagemaker.Session()
role = get_execution_role()

conn = boto3.client('s3')

bucket = "humana-data"
prefix_testdata = 'rawdata/original_raw_files/test'
filename_test = "2020_Competition_Holdout.csv"
key_test = prefix_testdata + '/' + filename_test
location_test = 's3://{}/{}'.format(bucket, key_test)

test_df = pd.read_csv(location_test)
test_df = test_df.set_index(['person_id_syn'])

test_df_backup = test_df.copy()

  exec(code_obj, self.user_global_ns, self.user_ns)


In [15]:
# # to read the file
prefix_metadata = "intermediate/condition/models/metadata"
filename_metadata = prefix_metadata + '/' + 'metadata_dict'

metadata_dict = conn.get_object(Bucket=bucket, Key=filename_metadata)
serializedObject = metadata_dict['Body'].read()
metadata_dict = pickle.loads(serializedObject)
categorical_cols = metadata_dict['categorical_cols']
numerical_cols = metadata_dict['numerical_cols']

In [18]:
test_df = sum_feature_generation_test(df=test_df, categories=params.categories)
test_df = total_sum_features_test(df = test_df)

  df[new_pmpm_name] = df[subset_cols].apply(pd.to_numeric, errors='coerce').sum(axis=1)
  df[new_ind_name] = df[subset_cols].apply(pd.to_numeric, errors='coerce').sum(axis=1)
  df['total_ind'] = df[df.columns[df.columns.str.contains('_ind_sum')]].sum(axis=1)
  df['total_pmpm'] = df[df.columns[df.columns.str.contains('_pmpm_sum')]].sum(axis=1)
  df['service_bool'] = np.where(df['total_ind'] == 0, 0, 1)


In [20]:
test_df = test_df[categorical_cols + numerical_cols]

In [21]:
test_df[categorical_cols] = np.where(test_df[categorical_cols] != 0, 1,0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[iloc] = igetitem(value, i)


In [22]:
test_df = data_type_conversion_test(df= test_df, 
                                 categorical_cols=categorical_cols, 
                                 numerical_cols=numerical_cols)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df[col] = test_df[col].apply(pd.to_numeric)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [23]:

prefix_model = "intermediate/condition/models"
filename_encoder ='enc.pkl'
path = prefix_model + '/' + filename_encoder

# READ
with tempfile.TemporaryFile() as fp:
    conn.download_fileobj(Fileobj=fp, Bucket=bucket, Key=path)
    fp.seek(0)
    enc = joblib.load(fp)

In [24]:
one_hot_encode_cols = [x for x in categorical_cols if x != params.dependent_variable]

In [76]:
test_df_encoded = pd.DataFrame(enc.transform(np.asarray(test_df[one_hot_encode_cols], dtype=object)), index=test_df.index)
test_df_encoded.columns = enc.get_feature_names_out()

test_model_df = pd.concat([test_df.drop(one_hot_encode_cols, axis=1), test_df_encoded], axis=1, ignore_index=False)



In [83]:
test_model_np = test_model_df.to_numpy()

In [79]:
def predict(data, predictor):
    predictions = []
    confidences = []
    for row in data:
        response = np.fromstring(predictor.predict(row).decode("utf-8")[1:], sep=",")
        pred = response.argmax()
        confidence = max(response)
        predictions.extend([pred])
        confidences.extend([confidence])

    return predictions, confidences

In [None]:
xgb_condition_predictor.predict(train_model_df.head(1000).drop(columns = 'transportation_issues').to_numpy()).decode("utf-8")