# Clinical, Genomic, and Imaging data - Training and Testing 

In [32]:
import boto3
import sagemaker
from sagemaker.session import Session
from sagemaker import get_execution_role
from sagemaker.serializers import CSVSerializer
from sagemaker.image_uris import retrieve
from sagemaker.session import TrainingInput

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.io as sio
import io, os
import sys
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import matplotlib.gridspec as gridspec
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [None]:
!pip install autogluon

In [None]:
from autogluon.tabular import TabularPredictor
import autogluon as ag

## Get data type to train model

In [33]:
supported_data_type = ('genomic', 'genomic-clinical', 'genomic-clinical-imaging', 'clinical', 'imaging')
#data_type = 'genomic'
#data_type = 'genomic-clinical'
data_type = 'genomic-clinical-imaging'
# data_type = 'clinical'
#data_type = 'imaging'

## Set up S3 buckets and session

In [34]:
sm_session = sagemaker.Session()
bucket = sm_session.default_bucket()
region = boto3.Session().region_name
role = get_execution_role()

boto_session = boto3.Session(region_name=region)
sagemaker_client = boto_session.client(service_name='sagemaker', region_name=region)
featurestore_runtime = boto_session.client(service_name='sagemaker-featurestore-runtime', region_name=region)

feature_store_session = Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_client,
    sagemaker_featurestore_runtime_client=featurestore_runtime
)

s3_client = boto3.client('s3', region_name=region)

default_s3_bucket_name = sm_session.default_bucket()
prefix = 'multi-model-health-ml'


## Get features from SageMaker FeatureStore based on data type

In [35]:
from sagemaker.feature_store.feature_group import FeatureGroup

genomic_feature_group_name = 'genomic-feature-group'
clinical_feature_group_name = 'clinical-feature-group'
imaging_feature_group_name = 'imaging-feature-group'

genomic_feature_group = FeatureGroup(name=genomic_feature_group_name, sagemaker_session=feature_store_session)
clinical_feature_group = FeatureGroup(name=clinical_feature_group_name, sagemaker_session=feature_store_session)
imaging_feature_group = FeatureGroup(name=imaging_feature_group_name, sagemaker_session=feature_store_session)

In [36]:
genomic_query = genomic_feature_group.athena_query()
clinical_query = clinical_feature_group.athena_query()
imaging_query = imaging_feature_group.athena_query()

genomic_table = genomic_query.table_name
clinical_table = clinical_query.table_name
imaging_table = imaging_query.table_name

print('Table names')
print(genomic_table)
print(clinical_table)
print(imaging_table)


Table names
genomic_feature_group_1688073848
clinical_feature_group_1688073736
imaging_feature_group_1688074260


In [37]:
def get_features(data_type, output_location):  
    if (data_type == 'clinical'):
        query_string = f'SELECT * FROM "{clinical_table}" ORDER BY patientid'
        print(query_string)

        clinical_query.run(query_string=query_string, output_location=output_location)
        clinical_query.wait()
        dataset = clinical_query.as_dataframe()
        
    elif (data_type == 'imaging'):
        query_string = f'SELECT * FROM "{imaging_table}" ORDER BY patientid'
        print(query_string)

        imaging_query.run(query_string=query_string, output_location=output_location)
        imaging_query.wait()
        dataset = imaging_query.as_dataframe()
    
    elif (data_type == 'genomic'):
        query_string = f'SELECT * FROM "{genomic_table}" ORDER BY patientid'
        print(query_string)
        genomic_query.run(query_string=query_string, output_location=output_location)
        genomic_query.wait()
        dataset = genomic_query.as_dataframe()
        
    elif (data_type == 'genomic-clinical'):
        query_string = f'''SELECT * FROM "{clinical_table}", "{genomic_table}"
                           WHERE "{clinical_table}".patientid = "{genomic_table}".patientid
                           ORDER BY "{clinical_table}".patientid ASC'''
        print(query_string)

        genomic_query.run(query_string=query_string, output_location=output_location)
        genomic_query.wait()
        dataset = genomic_query.as_dataframe()
        
    elif (data_type == 'genomic-clinical-imaging'):
        query_string = f'''SELECT * FROM "{genomic_table}", "{clinical_table}", "{imaging_table}"
                           WHERE "{genomic_table}".patientid = "{clinical_table}".patientid
                           AND "{genomic_table}".patientid = "{imaging_table}".patientid
                           ORDER BY "{clinical_table}".patientid ASC'''                   
        print(query_string)
        
        genomic_query.run(query_string=query_string, output_location=output_location)
        genomic_query.wait()
        dataset = genomic_query.as_dataframe()
        
    elif data_type not in supported_data_type:
        raise KeyError(f'data_type {data_type} is not supported for this analysis.')
        
    return dataset

In [44]:
fs_output_location = f's3://{default_s3_bucket_name}/{prefix}/feature-store-queries'
dataset = get_features(data_type, fs_output_location)
dataset = dataset.astype(str).replace({"{":"", "}":""}, regex=True)

# Write to csv in S3 without headers and index column.
filename=f'{data_type}-dataset.csv'
dataset_uri_prefix = f's3://{default_s3_bucket_name}/{prefix}/training_input/';

dataset.to_csv(filename)
s3_client.upload_file(filename, default_s3_bucket_name, f'{prefix}/training_input/{filename}')

SELECT * FROM "genomic_feature_group_1688073848", "clinical_feature_group_1688073736", "imaging_feature_group_1688074260"
                           WHERE "genomic_feature_group_1688073848".patientid = "clinical_feature_group_1688073736".patientid
                           AND "genomic_feature_group_1688073848".patientid = "imaging_feature_group_1688074260".patientid
                           ORDER BY "clinical_feature_group_1688073736".patientid ASC


In [39]:
dataset.head(10)

Unnamed: 0,patientid,gene_info,clinical_significance,contigname,start,referenceallele,alternatealleles,phased,calls,alzheimers_prediction,...,original_ngtdm_strength,imagesetid,alzheimers_prediction.2,coronary_heart_disease_prediction.2,stroke_prediction.2,hypertension_prediction.2,eventtime.2,write_time.2,api_invocation_time.2,is_deleted.2
0,0074596f-5fd0-7965-db0f-cce71c81567d,"'SOD3', None, 'TNF', 'LPL', 'F5', 'PON1', 'PPA...",'Conflicting_interpretations_of_pathogenicity'...,"'14', '9', '6', '16', '17', '8', '10', '11', '...","53767041, 44919688, 31667848, 12351625, 449086...","'T', 'G', 'A', 'C'","'[A]', '[C]', '[G]', '[T]'",False,"'[1, 1]', '[0, 1]'",0,...,8.547276310306838,'d7298fe7dde4537b8343b5a702979aea',0,0,0,0,1688074260.0,2023-06-29 21:37:00.939,2023-06-29 21:31:33.000,False
1,0618424e-ed51-3100-ea5c-e46492bfd65b,"'SOD3', None, 'TNF', 'LPL', 'F5', 'PON1', 'PPA...",'Conflicting_interpretations_of_pathogenicity'...,"'14', '9', '6', '16', '17', '8', '10', '11', '...","53767041, 44919688, 12351625, 31667848, 449086...","'T', 'G', 'A', 'C'","'[A]', '[C]', '[G]', '[T]'",False,"'[1, 1]', '[0, 1]'",0,...,9.241749116927542,'e5e2ccf0487eed5523395431675fa708',0,0,0,0,1688074260.0,2023-06-29 21:36:51.705,2023-06-29 21:31:33.000,False
2,06cc033a-f09a-0fb2-4a1a-4c4d99d88839,"'SOD3', None, 'TNF', 'LPL', 'F5', 'PON1', 'PPA...",'Conflicting_interpretations_of_pathogenicity'...,"'14', '9', '6', '16', '17', '8', '10', '11', '...","53767041, 44919688, 31667848, 12351625, 449086...","'T', 'G', 'A', 'C'","'[A]', '[C]', '[G]', '[T]'",False,"'[1, 1]', '[0, 1]'",1,...,9.02168678253902,'1a3c6b0044e2b58a13c106896da369ef',1,0,1,1,1688074260.0,2023-06-29 21:37:00.886,2023-06-29 21:31:33.000,False
3,06d43bcb-8322-3c2b-40f1-189b2852b5ab,"'SOD3', None, 'TNF', 'LPL', 'F5', 'PON1', 'PPA...",'Conflicting_interpretations_of_pathogenicity'...,"'14', '9', '6', '16', '17', '8', '10', '11', '...","53767041, 44919688, 12351625, 31667848, 449086...","'T', 'G', 'A', 'C'","'[A]', '[C]', '[G]', '[T]'",False,"'[1, 1]', '[0, 1]'",0,...,8.776968231552623,'41b14124cdd7f0ea3d459bcfe6120e7f',0,0,0,0,1688074260.0,2023-06-29 21:36:51.674,2023-06-29 21:31:33.000,False
4,08acefb1-271e-01d2-1bb6-aae9f65dbb42,"'SOD3', None, 'TNF', 'LPL', 'F5', 'PON1', 'PPA...",'Conflicting_interpretations_of_pathogenicity'...,"'14', '9', '6', '16', '17', '8', '10', '11', '...","53767041, 44919688, 12351625, 31667848, 449086...","'T', 'G', 'A', 'C'","'[A]', '[C]', '[G]', '[T]'",False,"'[1, 1]', '[0, 1]'",0,...,8.601098896888011,'dd9a6771bf229a91f03f608c336141de',0,0,1,0,1688074260.0,2023-06-29 21:36:59.693,2023-06-29 21:31:33.000,False
5,0b73e9c9-18c0-5c0e-c5d5-95d7ca56c050,"'SOD3', None, 'TNF', 'LPL', 'F5', 'PON1', 'PPA...",'Conflicting_interpretations_of_pathogenicity'...,"'14', '9', '6', '16', '17', '8', '10', '11', '...","53767041, 44919688, 31667848, 12351625, 449086...","'T', 'G', 'A', 'C'","'[A]', '[C]', '[G]', '[T]'",False,"'[1, 1]', '[0, 1]'",1,...,9.106481386718237,'1526fea901654ad5e45f7d5f22377f81',1,0,1,1,1688074260.0,2023-06-29 21:36:51.701,2023-06-29 21:31:33.000,False
6,0bb4b1f7-4128-89ef-a023-b81379c30f4f,"'SOD3', None, 'TNF', 'LPL', 'F5', 'PON1', 'PPA...",'Conflicting_interpretations_of_pathogenicity'...,"'14', '9', '6', '16', '17', '8', '10', '11', '...","53767041, 44919688, 31667848, 12351625, 449086...","'T', 'G', 'A', 'C'","'[A]', '[C]', '[G]', '[T]'",False,"'[1, 1]', '[0, 1]'",0,...,9.38894545484775,'b94c9e7c782da3660ea6a905e2a91b34',0,0,0,0,1688074260.0,2023-06-29 21:37:00.006,2023-06-29 21:31:33.000,False
7,0c8ad7c0-b403-98d1-d6c9-ff1dbbce7d7c,"'SOD3', None, 'TNF', 'LPL', 'F5', 'PON1', 'PPA...",'Conflicting_interpretations_of_pathogenicity'...,"'14', '9', '6', '16', '17', '8', '10', '11', '...","53767041, 44919688, 31667848, 12351625, 449086...","'T', 'G', 'A', 'C'","'[A]', '[C]', '[G]', '[T]'",False,"'[1, 1]', '[0, 1]'",0,...,9.202929567908852,'bccc8f07e6b97098dbb92cbafef2949c',0,0,1,1,1688074260.0,2023-06-29 21:36:59.699,2023-06-29 21:31:33.000,False
8,0f1c1042-f103-e85b-1a7a-03442861947d,"'SOD3', None, 'TNF', 'LPL', 'F5', 'PON1', 'PPA...",'Conflicting_interpretations_of_pathogenicity'...,"'14', '9', '6', '16', '17', '8', '10', '11', '...","53767041, 44919688, 12351625, 31667848, 449086...","'T', 'G', 'A', 'C'","'[A]', '[C]', '[G]', '[T]'",False,"'[1, 1]', '[0, 1]'",0,...,11.269159030234578,'df83b4160e70a45bee0af99f9785d1eb',0,0,0,0,1688074260.0,2023-06-29 21:36:59.693,2023-06-29 21:31:33.000,False
9,14e884a7-dc93-7909-9585-2857c6eb29e2,"'SOD3', None, 'TNF', 'LPL', 'F5', 'PON1', 'PPA...",'Conflicting_interpretations_of_pathogenicity'...,"'14', '9', '6', '16', '17', '8', '10', '11', '...","53767041, 44919688, 12351625, 31667848, 449086...","'T', 'G', 'A', 'C'","'[A]', '[C]', '[G]', '[T]'",False,"'[1, 1]', '[0, 1]'",0,...,8.875513496551257,'20e77ca11e2b601b8ff9b9743efc1014',0,0,1,1,1688074260.0,2023-06-29 21:37:00.854,2023-06-29 21:31:33.000,False


In [15]:
ag.core.utils.random.seed(25)

NameError: name 'ag' is not defined

## Alzheimers Prediction
Splitting data for training and testing

In [45]:
#Alzheimers Prediction
#Splitting data into training and testing 80:20
dataset = dataset.loc[:, ~dataset.columns.str.startswith('diagnostics')]
dataset = dataset.drop(columns = ['eventtime', 'write_time', 'api_invocation_time', 'is_deleted', 'eventtime.1', 'write_time.1', 'api_invocation_time.1', 'is_deleted.1', 'alzheimers_prediction.1',
                                    'coronary_heart_disease_prediction.1', 'stroke_prediction.1', 'hypertension_prediction.1', 'patientid.1', 'eventtime.2', 'write_time.2', 'api_invocation_time.2', 'is_deleted.2', 
                                   'alzheimers_prediction.2', 'coronary_heart_disease_prediction.2', 'stroke_prediction.2', 'hypertension_prediction.2', 'patientid.2'])
training= dataset.sample(frac=0.8, random_state=21)
training = training.drop(columns = ['patientid', 'coronary_heart_disease_prediction', 'stroke_prediction', 'hypertension_prediction'])
testing = dataset.drop(training.index)
testing = testing.drop(columns = ['patientid', 'coronary_heart_disease_prediction', 'stroke_prediction', 'hypertension_prediction'])
X_test = testing.drop(columns = ['alzheimers_prediction'])
print("Training size = ", len(training))
print("Out of sample testing size = ", len(testing))

# Splitting data into training and testing for deploying to an endpoint
training_target = training.pop("alzheimers_prediction")
training.insert(0, 'alzheimers_prediction', training_target)
testing_target = testing.pop("alzheimers_prediction")
testing.insert(0, 'alzheimers_prediction', testing_target)
training.to_csv('s3://multimodal-dataset-clinical-genomic-imaging/multimodal_alzheimers_training.csv', index=False)
testing.to_csv('s3://multimodal-dataset-clinical-genomic-imaging/multimodal_alzheimers_testing.csv', index=False)

Training size =  121
Out of sample testing size =  30


### Alzheimers prediction on clinical, genomic, and imaging data using Autogluon

In [65]:
import time
start_time = time.time()
buckt = sm_session.default_bucket()
prefix= "genomic-clinical-imaging-alzheimers-prediction"
save_file = 's3://{}/{}'.format(buckt, prefix)
predictor = TabularPredictor(label= 'alzheimers_prediction', problem_type= 'binary', path=save_file).fit(train_data=training, holdout_frac=0.1, excluded_model_types=['CAT', 'XGB'])
print("--- Training time= %s seconds ---" % (time.time() - start_time))

save object to s3://sagemaker-us-east-1-659535263284/genomic-clinical-imaging-alzheimers-prediction/learner.pkl
save object to s3://sagemaker-us-east-1-659535263284/genomic-clinical-imaging-alzheimers-prediction/predictor.pkl
Beginning AutoGluon training ...
AutoGluon will save models to "s3://sagemaker-us-east-1-659535263284/genomic-clinical-imaging-alzheimers-prediction/"
AutoGluon Version:  0.6.2
Python Version:     3.7.10
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Thu Mar 23 09:54:12 UTC 2023
Train Data Rows:    121
Train Data Columns: 144
Label Column: alzheimers_prediction
Preprocessing data ...
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    2780.96 MB
	Train Data (Original)  Memory Usage: 1.42 MB (0.1% of available memory)
	Inferring data type of each feature based on column values. Set feature_meta

--- Training time= 21.408601760864258 seconds ---


In [66]:
predictor.evaluate_predictions(y_true=testing['alzheimers_prediction'], y_pred=predictor.predict(X_test), auxiliary_metrics=True, detailed_report=True)

Evaluation: accuracy on test data: 0.8
Evaluations on test data:
{
    "accuracy": 0.8,
    "balanced_accuracy": 0.625,
    "mcc": 0.44320263021395917,
    "f1": 0.4,
    "precision": 1.0,
    "recall": 0.25
}
Detailed (per-class) classification report:
{
    "0": {
        "precision": 0.7857142857142857,
        "recall": 1.0,
        "f1-score": 0.88,
        "support": 22
    },
    "1": {
        "precision": 1.0,
        "recall": 0.25,
        "f1-score": 0.4,
        "support": 8
    },
    "accuracy": 0.8,
    "macro avg": {
        "precision": 0.8928571428571428,
        "recall": 0.625,
        "f1-score": 0.64,
        "support": 30
    },
    "weighted avg": {
        "precision": 0.8428571428571429,
        "recall": 0.8,
        "f1-score": 0.752,
        "support": 30
    }
}


{'accuracy': 0.8,
 'balanced_accuracy': 0.625,
 'mcc': 0.44320263021395917,
 'f1': 0.4,
 'precision': 1.0,
 'recall': 0.25,
 'confusion_matrix':     0  1
 0  22  0
 1   6  2,
 'classification_report': {'0': {'precision': 0.7857142857142857,
   'recall': 1.0,
   'f1-score': 0.88,
   'support': 22},
  '1': {'precision': 1.0, 'recall': 0.25, 'f1-score': 0.4, 'support': 8},
  'accuracy': 0.8,
  'macro avg': {'precision': 0.8928571428571428,
   'recall': 0.625,
   'f1-score': 0.64,
   'support': 30},
  'weighted avg': {'precision': 0.8428571428571429,
   'recall': 0.8,
   'f1-score': 0.752,
   'support': 30}}}

## Coronary heart disease Prediction
Splitting data for training and testing

In [28]:
#coronary_heart_disease_prediction
#Splitting data into training and testing 80:20
training = dataset.sample(frac=0.8, random_state=25)
training =  training.drop(columns = ['patientid', 'alzheimers_prediction', 'stroke_prediction', 'hypertension_prediction'])
testing = dataset.drop(training.index)
testing = testing.drop(columns = ['patientid', 'alzheimers_prediction', 'stroke_prediction', 'hypertension_prediction'])
X_test = testing.drop(columns = ['coronary_heart_disease_prediction'])
print("Training size = ", len(training))
print("Out of sample testing size = ", len(testing))

# Splitting data into training and testing for deploying to an endpoint
training_target = training.pop("coronary_heart_disease_prediction")
training.insert(0, 'coronary_heart_disease_prediction', training_target)
testing_target = testing.pop("coronary_heart_disease_prediction")
testing.insert(0, 'coronary_heart_disease_prediction', testing_target)
training.to_csv('s3://multimodal-dataset-clinical-genomic-imaging/multimodal_coronary_heart_disease_training.csv', index=False)
testing.to_csv('s3://multimodal-dataset-clinical-genomic-imaging/multimodal_coronary_heart_disease_testing.csv', index=False)

Training size =  121
Out of sample testing size =  30


In [68]:
### Coronary heart disease prediction on clinical,  genomic, and imaging data using Autogluon

In [69]:
import time
start_time = time.time()
buckt = sm_session.default_bucket()
prefix= "genomic-clinical-imaging-coronary-heart-disease-prediction"
save_file = 's3://{}/{}'.format(buckt, prefix)
predictor = TabularPredictor(label= 'coronary_heart_disease_prediction', problem_type= 'binary', path=save_file).fit(train_data=training, holdout_frac=0.1, excluded_model_types=['CAT', 'XGB'])
print("--- Training time= %s seconds ---" % (time.time() - start_time))

save object to s3://sagemaker-us-east-1-659535263284/genomic-clinical-imaging-coronary-heart-disease-prediction/learner.pkl
save object to s3://sagemaker-us-east-1-659535263284/genomic-clinical-imaging-coronary-heart-disease-prediction/predictor.pkl
Beginning AutoGluon training ...
AutoGluon will save models to "s3://sagemaker-us-east-1-659535263284/genomic-clinical-imaging-coronary-heart-disease-prediction/"
AutoGluon Version:  0.6.2
Python Version:     3.7.10
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Thu Mar 23 09:54:12 UTC 2023
Train Data Rows:    121
Train Data Columns: 144
Label Column: coronary_heart_disease_prediction
Preprocessing data ...
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    2779.88 MB
	Train Data (Original)  Memory Usage: 1.52 MB (0.1% of available memory)
	Inferring data type of each 

--- Training time= 21.052737712860107 seconds ---


In [70]:
predictor.evaluate_predictions(y_true=testing['coronary_heart_disease_prediction'], y_pred=predictor.predict(X_test), auxiliary_metrics=True, detailed_report=True)

Evaluation: accuracy on test data: 0.8333333333333334
Evaluations on test data:
{
    "accuracy": 0.8333333333333334,
    "balanced_accuracy": 0.6923076923076923,
    "mcc": 0.35082320772281167,
    "f1": 0.4444444444444445,
    "precision": 0.4,
    "recall": 0.5
}
Detailed (per-class) classification report:
{
    "0": {
        "precision": 0.92,
        "recall": 0.8846153846153846,
        "f1-score": 0.9019607843137256,
        "support": 26
    },
    "1": {
        "precision": 0.4,
        "recall": 0.5,
        "f1-score": 0.4444444444444445,
        "support": 4
    },
    "accuracy": 0.8333333333333334,
    "macro avg": {
        "precision": 0.66,
        "recall": 0.6923076923076923,
        "f1-score": 0.673202614379085,
        "support": 30
    },
    "weighted avg": {
        "precision": 0.8506666666666668,
        "recall": 0.8333333333333334,
        "f1-score": 0.8409586056644881,
        "support": 30
    }
}


{'accuracy': 0.8333333333333334,
 'balanced_accuracy': 0.6923076923076923,
 'mcc': 0.35082320772281167,
 'f1': 0.4444444444444445,
 'precision': 0.4,
 'recall': 0.5,
 'confusion_matrix':     0  1
 0  23  3
 1   2  2,
 'classification_report': {'0': {'precision': 0.92,
   'recall': 0.8846153846153846,
   'f1-score': 0.9019607843137256,
   'support': 26},
  '1': {'precision': 0.4,
   'recall': 0.5,
   'f1-score': 0.4444444444444445,
   'support': 4},
  'accuracy': 0.8333333333333334,
  'macro avg': {'precision': 0.66,
   'recall': 0.6923076923076923,
   'f1-score': 0.673202614379085,
   'support': 30},
  'weighted avg': {'precision': 0.8506666666666668,
   'recall': 0.8333333333333334,
   'f1-score': 0.8409586056644881,
   'support': 30}}}

## Stroke Prediction
Splitting data for training and testing

In [29]:
#stroke_prediction
#Splitting data into training and testing 80:20
training = dataset.sample(frac=0.8, random_state=30)
training =  training.drop(columns = ['patientid', 'alzheimers_prediction', 'coronary_heart_disease_prediction', 'hypertension_prediction'])
testing = dataset.drop(training.index)
testing = testing.drop(columns = ['patientid', 'alzheimers_prediction', 'coronary_heart_disease_prediction', 'hypertension_prediction'])
X_test = testing.drop(columns = ['stroke_prediction'])
print("Training size = ", len(training))
print("Out of sample testing size = ", len(testing))

# Splitting data into training and testing for deploying to an endpoint
training_target = training.pop("stroke_prediction")
training.insert(0, 'stroke_prediction', training_target)
testing_target = testing.pop("stroke_prediction")
testing.insert(0, 'stroke_prediction', testing_target)
training.to_csv('s3://multimodal-dataset-clinical-genomic-imaging/multimodal_stroke_training.csv', index=False)
testing.to_csv('s3://multimodal-dataset-clinical-genomic-imaging/multimodal_stroke_testing.csv', index=False)

Training size =  121
Out of sample testing size =  30


### Stroke prediction on clinical, genomic, and imaging data using Autogluon

In [72]:
import time
start_time = time.time()
buckt = sm_session.default_bucket()
prefix= "genomic-clinical-imaging-stroke_prediction"
save_file = 's3://{}/{}'.format(buckt, prefix)
predictor = TabularPredictor(label= 'stroke_prediction', problem_type= 'binary', path=save_file).fit(train_data=training, holdout_frac=0.1, excluded_model_types=['CAT', 'XGB'])
print("--- Training time= %s seconds ---" % (time.time() - start_time))

save object to s3://sagemaker-us-east-1-659535263284/genomic-clinical-imaging-stroke_prediction/learner.pkl
save object to s3://sagemaker-us-east-1-659535263284/genomic-clinical-imaging-stroke_prediction/predictor.pkl
Beginning AutoGluon training ...
AutoGluon will save models to "s3://sagemaker-us-east-1-659535263284/genomic-clinical-imaging-stroke_prediction/"
AutoGluon Version:  0.6.2
Python Version:     3.7.10
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Thu Mar 23 09:54:12 UTC 2023
Train Data Rows:    121
Train Data Columns: 144
Label Column: stroke_prediction
Preprocessing data ...
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    2778.9 MB
	Train Data (Original)  Memory Usage: 1.38 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manual

--- Training time= 19.940380811691284 seconds ---


In [73]:
predictor.evaluate_predictions(y_true=testing['stroke_prediction'], y_pred=predictor.predict(X_test), auxiliary_metrics=True, detailed_report=True)

Evaluation: accuracy on test data: 0.9666666666666667
Evaluations on test data:
{
    "accuracy": 0.9666666666666667,
    "balanced_accuracy": 0.9642857142857143,
    "mcc": 0.9348527048856053,
    "f1": 0.9696969696969697,
    "precision": 0.9411764705882353,
    "recall": 1.0
}
Detailed (per-class) classification report:
{
    "0": {
        "precision": 1.0,
        "recall": 0.9285714285714286,
        "f1-score": 0.962962962962963,
        "support": 14
    },
    "1": {
        "precision": 0.9411764705882353,
        "recall": 1.0,
        "f1-score": 0.9696969696969697,
        "support": 16
    },
    "accuracy": 0.9666666666666667,
    "macro avg": {
        "precision": 0.9705882352941176,
        "recall": 0.9642857142857143,
        "f1-score": 0.9663299663299664,
        "support": 30
    },
    "weighted avg": {
        "precision": 0.9686274509803922,
        "recall": 0.9666666666666667,
        "f1-score": 0.9665544332210999,
        "support": 30
    }
}


{'accuracy': 0.9666666666666667,
 'balanced_accuracy': 0.9642857142857143,
 'mcc': 0.9348527048856053,
 'f1': 0.9696969696969697,
 'precision': 0.9411764705882353,
 'recall': 1.0,
 'confusion_matrix':     0   1
 0  13   1
 1   0  16,
 'classification_report': {'0': {'precision': 1.0,
   'recall': 0.9285714285714286,
   'f1-score': 0.962962962962963,
   'support': 14},
  '1': {'precision': 0.9411764705882353,
   'recall': 1.0,
   'f1-score': 0.9696969696969697,
   'support': 16},
  'accuracy': 0.9666666666666667,
  'macro avg': {'precision': 0.9705882352941176,
   'recall': 0.9642857142857143,
   'f1-score': 0.9663299663299664,
   'support': 30},
  'weighted avg': {'precision': 0.9686274509803922,
   'recall': 0.9666666666666667,
   'f1-score': 0.9665544332210999,
   'support': 30}}}

## Hypertension Prediction
Splitting data for training and testing

In [30]:
#hypertension_prediction
#Splitting data into training and testing 80:20
training = dataset.sample(frac=0.8, random_state=25)
training = training.drop(columns = ['patientid', 'alzheimers_prediction', 'coronary_heart_disease_prediction', 'stroke_prediction'])
testing = dataset.drop(training.index)
testing = testing.drop(columns = ['patientid', 'alzheimers_prediction', 'coronary_heart_disease_prediction', 'stroke_prediction'])
X_test = testing.drop(columns = ['hypertension_prediction'])
print("Training size = ", len(training))
print("Out of sample testing size = ", len(testing))

# Splitting data into training and testing for deploying to an endpoint
training_target = training.pop("hypertension_prediction")
training.insert(0, 'hypertension_prediction', training_target)
testing_target = testing.pop("hypertension_prediction")
testing.insert(0, 'hypertension_prediction', testing_target)
training.to_csv('s3://multimodal-dataset-clinical-genomic-imaging/multimodal_hypertension_training.csv', index=False)
testing.to_csv('s3://multimodal-dataset-clinical-genomic-imaging/multimodal_hypertension_testing.csv', index=False)

Training size =  121
Out of sample testing size =  30


### Hypertension prediction on clinical, genomic, and imaging data using Autogluon

In [75]:
import time
start_time = time.time()
buckt = sm_session.default_bucket()
prefix= "genomic-clinical-imaging-hypertension-prediction"
save_file = 's3://{}/{}'.format(buckt, prefix)
predictor = TabularPredictor(label= 'hypertension_prediction', problem_type= 'binary', path=save_file).fit(train_data=training, holdout_frac=0.1, excluded_model_types=['CAT', 'XGB'])
print("--- Training time= %s seconds ---" % (time.time() - start_time))

save object to s3://sagemaker-us-east-1-659535263284/genomic-clinical-imaging-hypertension-prediction/learner.pkl
save object to s3://sagemaker-us-east-1-659535263284/genomic-clinical-imaging-hypertension-prediction/predictor.pkl
Beginning AutoGluon training ...
AutoGluon will save models to "s3://sagemaker-us-east-1-659535263284/genomic-clinical-imaging-hypertension-prediction/"
AutoGluon Version:  0.6.2
Python Version:     3.7.10
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Thu Mar 23 09:54:12 UTC 2023
Train Data Rows:    121
Train Data Columns: 144
Label Column: hypertension_prediction
Preprocessing data ...
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    2779.76 MB
	Train Data (Original)  Memory Usage: 1.52 MB (0.1% of available memory)
	Inferring data type of each feature based on column values. Set feat

--- Training time= 21.77889585494995 seconds ---


In [76]:
predictor.evaluate_predictions(y_true=testing['hypertension_prediction'], y_pred=predictor.predict(X_test), auxiliary_metrics=True, detailed_report=True)

Evaluation: accuracy on test data: 0.8666666666666667
Evaluations on test data:
{
    "accuracy": 0.8666666666666667,
    "balanced_accuracy": 0.9047619047619048,
    "mcc": 0.7486251134176306,
    "f1": 0.8181818181818181,
    "precision": 0.6923076923076923,
    "recall": 1.0
}
Detailed (per-class) classification report:
{
    "0": {
        "precision": 1.0,
        "recall": 0.8095238095238095,
        "f1-score": 0.8947368421052632,
        "support": 21
    },
    "1": {
        "precision": 0.6923076923076923,
        "recall": 1.0,
        "f1-score": 0.8181818181818181,
        "support": 9
    },
    "accuracy": 0.8666666666666667,
    "macro avg": {
        "precision": 0.8461538461538461,
        "recall": 0.9047619047619048,
        "f1-score": 0.8564593301435406,
        "support": 30
    },
    "weighted avg": {
        "precision": 0.9076923076923077,
        "recall": 0.8666666666666667,
        "f1-score": 0.8717703349282296,
        "support": 30
    }
}


{'accuracy': 0.8666666666666667,
 'balanced_accuracy': 0.9047619047619048,
 'mcc': 0.7486251134176306,
 'f1': 0.8181818181818181,
 'precision': 0.6923076923076923,
 'recall': 1.0,
 'confusion_matrix':     0  1
 0  17  4
 1   0  9,
 'classification_report': {'0': {'precision': 1.0,
   'recall': 0.8095238095238095,
   'f1-score': 0.8947368421052632,
   'support': 21},
  '1': {'precision': 0.6923076923076923,
   'recall': 1.0,
   'f1-score': 0.8181818181818181,
   'support': 9},
  'accuracy': 0.8666666666666667,
  'macro avg': {'precision': 0.8461538461538461,
   'recall': 0.9047619047619048,
   'f1-score': 0.8564593301435406,
   'support': 30},
  'weighted avg': {'precision': 0.9076923076923077,
   'recall': 0.8666666666666667,
   'f1-score': 0.8717703349282296,
   'support': 30}}}