### Disease Prediction using ML in SageMaker

In [1]:
import sagemaker
from sklearn.model_selection import train_test_split
import boto3
import pandas as pd

# Low-level SageMaker client plus the SDK session used by the cells below.
sm_boto3 = boto3.client("sagemaker")
sess = sagemaker.Session()
region = sess.boto_session.region_name

# Mention the created S3 bucket name here
bucket = 'disease-sagemaker'
print(f"Using bucket {bucket}")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
Using bucket disease-sagemaker


In [4]:
import warnings
import pandas as pd

# Silence only the known-noisy s3fs version warning, and only while reading.
# catch_warnings() restores the previous filter state when the block exits;
# warnings.resetwarnings() would instead discard every filter, including
# ones installed elsewhere.
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", message="Your installed version of s3fs is very old and known to cause severe performance issues,")

    # Read the CSV file from S3 (bucket name comes from the setup cell).
    df = pd.read_csv(f"s3://{bucket}/Disease_Training.csv")



In [5]:
# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [6]:
# (rows, columns) of the raw frame.
df.shape

(4920, 133)

In [8]:
# List the column labels of the raw frame.
df.columns

Index(['itching', 'skin_rash', 'nodal_skin_eruptions', 'continuous_sneezing',
       'shivering', 'chills', 'joint_pain', 'stomach_pain', 'acidity',
       'ulcers_on_tongue',
       ...
       'blackheads', 'scurring', 'skin_peeling', 'silver_like_dusting',
       'small_dents_in_nails', 'inflammatory_nails', 'blister',
       'red_sore_around_nose', 'yellow_crust_ooze', 'prognosis'],
      dtype='object', length=133)

In [9]:
# NOTE(review): repeats the shape check from an earlier cell; df has not been modified in between.
df.shape

(4920, 133)

In [11]:
# Collect every column name; at this point the list still includes the label column.
features = list(df.columns)
features

['itching',
 'skin_rash',
 'nodal_skin_eruptions',
 'continuous_sneezing',
 'shivering',
 'chills',
 'joint_pain',
 'stomach_pain',
 'acidity',
 'ulcers_on_tongue',
 'muscle_wasting',
 'vomiting',
 'burning_micturition',
 'spotting_ urination',
 'fatigue',
 'weight_gain',
 'anxiety',
 'cold_hands_and_feets',
 'mood_swings',
 'weight_loss',
 'restlessness',
 'lethargy',
 'patches_in_throat',
 'irregular_sugar_level',
 'cough',
 'high_fever',
 'sunken_eyes',
 'breathlessness',
 'sweating',
 'dehydration',
 'indigestion',
 'headache',
 'yellowish_skin',
 'dark_urine',
 'nausea',
 'loss_of_appetite',
 'pain_behind_the_eyes',
 'back_pain',
 'constipation',
 'abdominal_pain',
 'diarrhoea',
 'mild_fever',
 'yellow_urine',
 'yellowing_of_eyes',
 'acute_liver_failure',
 'fluid_overload',
 'swelling_of_stomach',
 'swelled_lymph_nodes',
 'malaise',
 'blurred_and_distorted_vision',
 'phlegm',
 'throat_irritation',
 'redness_of_eyes',
 'sinus_pressure',
 'runny_nose',
 'congestion',
 'chest_pain',


In [12]:
# Derive both names straight from df so this cell is idempotent: the last
# column is the label and everything before it is a feature. Popping from the
# existing `features` list would remove one more real feature on each re-run.
features = list(df.columns[:-1])
label = df.columns[-1]
label

'prognosis'

In [13]:
# Separate the feature columns (x) from the label column (y).
x = df[features]
y = df[label]

In [14]:
# Preview the feature matrix.
x.head()

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,pus_filled_pimples,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# (rows, feature columns) of the feature matrix.
x.shape

(4920, 132)

In [17]:
# Count rows per label value to check class balance.
y.value_counts()

1     120
2     120
3     120
4     120
5     120
6     120
7     120
8     120
9     120
10    120
11    120
12    120
13    120
14    120
15    120
16    120
17    120
18    120
19    120
20    120
21    120
22    120
23    120
24    120
25    120
26    120
27    120
28    120
29    120
30    120
31    120
32    120
33    120
34    120
35    120
36    120
37    120
38    120
39    120
40    120
41    120
Name: prognosis, dtype: int64

In [18]:
# Hold out 15% of the rows for testing; random_state=0 fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.15, random_state=0)

In [19]:
# Report the shape of each split, features first and then labels.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(4182, 132)
(738, 132)
(4182,)
(738,)


In [20]:
# Build the frames that get written to CSV: features plus the label as the
# last column. Work on explicit copies: pd.DataFrame(X_train) does not copy by
# default, so adding a column to it can write into (or warn about) data that
# is still shared with X_train / X_test.
trainX = X_train.copy()
trainX[label] = y_train

testX = X_test.copy()
testX[label] = y_test

In [21]:
# Shapes of the combined (features + label) frames, one per line.
print(trainX.shape, testX.shape, sep="\n")

(4182, 133)
(738, 133)


In [22]:
# Preview the combined training frame (features followed by the label column).
trainX.head()

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis
1328,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,10
3099,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,25
4663,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,31
799,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,39
195,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,20


In [23]:
# Count missing values per column in the training frame.
trainX.isnull().sum()

itching                 0
skin_rash               0
nodal_skin_eruptions    0
continuous_sneezing     0
shivering               0
                       ..
inflammatory_nails      0
blister                 0
red_sore_around_nose    0
yellow_crust_ooze       0
prognosis               0
Length: 133, dtype: int64

In [24]:
# Count missing values per column in the test frame.
testX.isnull().sum()

itching                 0
skin_rash               0
nodal_skin_eruptions    0
continuous_sneezing     0
shivering               0
                       ..
inflammatory_nails      0
blister                 0
red_sore_around_nose    0
yellow_crust_ooze       0
prognosis               0
Length: 133, dtype: int64

In [25]:
# Write both frames to local CSV files without the index column.
for frame, csv_name in ((trainX, "train-V-1.csv"), (testX, "test-V-1.csv")):
    frame.to_csv(csv_name, index=False)

In [26]:
# Echo the bucket name that the upload below will target.
bucket

'disease-sagemaker'

In [33]:
# Upload both CSVs under one key prefix; SageMaker reads its training data from S3.
sk_prefix = "disease"
trainpath, testpath = (
    sess.upload_data(path=local_csv, bucket=bucket, key_prefix=sk_prefix)
    for local_csv in ("train-V-1.csv", "test-V-1.csv")
)
print(trainpath)
print(testpath)

s3://disease-sagemaker/disease/train-V-1.csv
s3://disease-sagemaker/disease/test-V-1.csv


In [37]:
%%writefile script.py

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score, roc_curve, auc
import sklearn
import joblib
import boto3
import pathlib
from io import StringIO 
import argparse
import joblib
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
    
def model_fn(model_dir):
    """Load and return the classifier stored as ``model.joblib`` in ``model_dir``."""
    model_file = os.path.join(model_dir, "model.joblib")
    return joblib.load(model_file)
    
def parse_max_depth(value):
    """Convert the ``--max_depth`` command-line value to ``int`` or ``None``.

    ``RandomForestClassifier`` accepts ``max_depth=None`` (no limit), but a
    command-line argument always arrives as text. The strings ``"None"``,
    ``"null"`` and the empty string (case-insensitive) are therefore mapped
    to ``None``; anything else must parse as an integer.

    Raises:
        ValueError: if the value is neither a "none" marker nor an integer.
    """
    if value is None:
        return None
    text = str(value).strip()
    if text.lower() in ("", "none", "null"):
        return None
    return int(text)


def macro_specificity(cm):
    """Return the macro-averaged one-vs-rest specificity of a confusion matrix.

    Args:
        cm: square array-like with true classes on rows and predicted classes
            on columns (the layout produced by ``sklearn.metrics.confusion_matrix``).

    Returns:
        Mean over classes of ``TN / (TN + FP)``, treating each class in turn
        as the positive class. Classes with no negative samples are skipped;
        ``nan`` is returned if no class has any.
    """
    cm = np.asarray(cm, dtype=float)
    true_pos = np.diag(cm)
    false_pos = cm.sum(axis=0) - true_pos
    false_neg = cm.sum(axis=1) - true_pos
    true_neg = cm.sum() - true_pos - false_pos - false_neg
    negatives = true_neg + false_pos
    valid = negatives > 0
    if not valid.any():
        return float("nan")
    return float(np.mean(true_neg[valid] / negatives[valid]))


if __name__ == "__main__":
    # Only needed for training-time evaluation, so imported locally.
    from sklearn.preprocessing import label_binarize

    print("[INFO] Extracting arguments")
    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script.
    # Defaults mirror RandomForestClassifier's own defaults, so omitting a flag changes nothing.
    parser.add_argument("--n_estimators", type=int, default=100)
    parser.add_argument("--max_depth", type=parse_max_depth, default=None)
    parser.add_argument("--min_samples_split", type=int, default=2)
    parser.add_argument("--min_samples_leaf", type=int, default=1)
    parser.add_argument("--random_state", type=int, default=0)

    # Data, model, and output directories
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    parser.add_argument("--train-file", type=str, default="train-V-1.csv")
    parser.add_argument("--test-file", type=str, default="test-V-1.csv")

    # parse_known_args tolerates any extra flags the launcher adds.
    args, _ = parser.parse_known_args()

    print("SKLearn Version: ", sklearn.__version__)
    print("Joblib Version: ", joblib.__version__)

    print("[INFO] Reading data")
    print()
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))

    # The label is the last column of the training CSV; everything before it is a feature.
    features = list(train_df.columns[:-1])
    label = train_df.columns[-1]

    print("Building training and testing datasets")
    print()
    X_train = train_df[features]
    X_test = test_df[features]
    y_train = train_df[label]
    y_test = test_df[label]

    print('Column order: ')
    print(features)
    print()

    print("Label column is: ",label)
    print()

    print("Data Shape: ")
    print()
    print("---- SHAPE OF TRAINING DATA (85%) ----")
    print(X_train.shape)
    print(y_train.shape)
    print()
    print("---- SHAPE OF TESTING DATA (15%) ----")
    print(X_test.shape)
    print(y_test.shape)
    print()

    print("Training RandomForest Model.....")
    print()
    model = RandomForestClassifier(
        n_estimators=args.n_estimators,
        max_depth=args.max_depth,
        min_samples_split=args.min_samples_split,
        min_samples_leaf=args.min_samples_leaf,
        random_state=args.random_state,
        verbose=3,
        n_jobs=-1,
    )
    model.fit(X_train, y_train)
    print()

    # Persist the model before evaluation so a metrics failure cannot lose it.
    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model,model_path)
    print("Model persisted at " + model_path)
    print()

    y_pred_test = model.predict(X_test)
    test_acc = accuracy_score(y_test,y_pred_test)
    test_rep = classification_report(y_test,y_pred_test)

    print()
    print("---- METRICS RESULTS FOR TESTING DATA ----")
    print()
    print("Total Rows are: ", X_test.shape[0])
    print('[TESTING] Model Accuracy is: ', test_acc)
    print('[TESTING] Testing Report: ')
    print(test_rep)

    # The default average='binary' raises on a multiclass target, so the
    # scores below are macro-averaged (unweighted mean over classes).

    # Calculate F1 score
    f1 = f1_score(y_test, y_pred_test, average="macro")
    print('[TESTING] F1 Score: ', f1)

    # Calculate Sensitivity (Recall)
    sensitivity = recall_score(y_test, y_pred_test, average="macro")
    print('[TESTING] Sensitivity (Recall): ', sensitivity)

    # Calculate Specificity: one-vs-rest per class, then macro-averaged.
    # Unpacking tn/fp/fn/tp from ravel() only works for a 2x2 matrix.
    cm = confusion_matrix(y_test, y_pred_test, labels=model.classes_)
    specificity = macro_specificity(cm)
    print('[TESTING] Specificity: ', specificity)

    # Plot ROC curve: micro-averaged over classes by flattening the one-hot
    # labels and the per-class probabilities into one binary problem.
    y_score = model.predict_proba(X_test)
    y_true_bin = label_binarize(y_test, classes=model.classes_)
    if y_true_bin.shape[1] == 1 and y_score.shape[1] == 2:
        # label_binarize emits a single column for two classes; expand it so
        # it lines up with predict_proba's two columns.
        y_true_bin = np.hstack([1 - y_true_bin, y_true_bin])
    fpr, tpr, _ = roc_curve(y_true_bin.ravel(), y_score.ravel())
    roc_auc = auc(fpr, tpr)

    plt.figure()
    lw = 2
    plt.plot(fpr, tpr, color='darkorange',
             lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC)')
    plt.legend(loc="lower right")

    # Save the figure instead of calling plt.show(), which leaves nothing
    # behind when the script runs as a batch job.
    plot_dir = args.output_data_dir or "."
    os.makedirs(plot_dir, exist_ok=True)
    roc_path = os.path.join(plot_dir, "roc_curve.png")
    plt.savefig(roc_path)
    plt.close()
    print("ROC curve saved at " + roc_path)


Overwriting script.py


In [40]:
from sagemaker.sklearn.estimator import SKLearn

FRAMEWORK_VERSION = "0.23-1"

# Resolve the execution role of the environment running this notebook rather
# than hardcoding an account-specific ARN.
role = sagemaker.get_execution_role()

sklearn_estimator = SKLearn(
    entry_point="script.py",
    role=role,
    instance_type="ml.m4.xlarge",
    instance_count=1,
    framework_version=FRAMEWORK_VERSION,
    base_job_name="RF-custom-sklearn",
    # Hyperparameters are handed to script.py as command-line arguments;
    # script.py only honours the ones its argparse parser declares.
    hyperparameters={
        "n_estimators": 100,
        "max_depth": None,  # None = no depth limit
        "min_samples_split": 2,  # Minimum number of samples required to split a node
        "min_samples_leaf": 1,  # Minimum number of samples required at each leaf node
        "random_state": 42,  # Fixing random state for reproducibility
    },
    use_spot_instances=True,
    max_wait=7200,  # seconds; must be >= max_run when using spot instances
    max_run=3600
)


In [41]:
# Launch the training job and block until it finishes (wait=True makes this a synchronous call).
sklearn_estimator.fit({"train": trainpath, "test": testpath}, wait=True)

INFO:sagemaker:Creating training-job with name: RF-custom-sklearn-2024-03-12-16-39-30-447


2024-03-12 16:39:30 Starting - Starting the training job......
2024-03-12 16:40:04 Starting - Preparing the instances for training......
2024-03-12 16:41:13 Downloading - Downloading input data...
2024-03-12 16:41:39 Downloading - Downloading the training image...
2024-03-12 16:42:19 Training - Training image download completed. Training in progress..[34m2024-03-12 16:42:28,121 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2024-03-12 16:42:28,125 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024-03-12 16:42:28,180 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2024-03-12 16:42:28,360 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024-03-12 16:42:28,374 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024-03-12 16:42:28,388 sagemaker-training-toolkit INFO     No GP

In [42]:
# Wait for the job to finish (without streaming logs), then look up where
# its model artifact was written.
latest_job = sklearn_estimator.latest_training_job
latest_job.wait(logs="None")

job_description = sm_boto3.describe_training_job(TrainingJobName=latest_job.name)
artifact = job_description["ModelArtifacts"]["S3ModelArtifacts"]

print("Model artifact persisted at " + artifact)


2024-03-12 16:42:45 Starting - Preparing the instances for training
2024-03-12 16:42:45 Downloading - Downloading the training image
2024-03-12 16:42:45 Training - Training image download completed. Training in progress.
2024-03-12 16:42:45 Uploading - Uploading generated training model
2024-03-12 16:42:45 Completed - Training job completed
Model artifact persisted at s3://sagemaker-ca-central-1-730335379918/RF-custom-sklearn-2024-03-12-16-39-30-447/output/model.tar.gz


In [43]:
# Echo the S3 URI of the trained model artifact.
artifact

's3://sagemaker-ca-central-1-730335379918/RF-custom-sklearn-2024-03-12-16-39-30-447/output/model.tar.gz'

In [44]:
from sagemaker.sklearn.model import SKLearnModel
from time import gmtime, strftime

# Timestamp suffix keeps the model name unique across runs.
model_name = "Disease-Prediction-Model-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())

# Wrap the trained artifact for hosting. script.py is the entry point again
# because it defines model_fn, which loads model.joblib.
model = SKLearnModel(
    name=model_name,
    model_data=artifact,
    # Resolve the role at runtime instead of hardcoding an account-specific ARN.
    role=sagemaker.get_execution_role(),
    entry_point="script.py",
    framework_version=FRAMEWORK_VERSION,
)

In [45]:
# Echo the generated model name.
model_name

'Custom-sklearn-model-2024-03-12-16-44-50'

In [47]:
## Endpoint deployment
# A timestamp suffix keeps the endpoint name unique across runs.
deploy_stamp = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
endpoint_name = "Disease-Prediction-Model-" + deploy_stamp
print(f"EndpointName={endpoint_name}")

predictor = model.deploy(
    endpoint_name=endpoint_name,
    instance_type="ml.m4.xlarge",
    initial_instance_count=1,
)

INFO:sagemaker:Creating model with name: Custom-sklearn-model-2024-03-12-16-44-50


EndpointName=Custom-sklearn-model-2024-03-12-16-45-26


INFO:sagemaker:Creating endpoint-config with name Custom-sklearn-model-2024-03-12-16-45-26
INFO:sagemaker:Creating endpoint with name Custom-sklearn-model-2024-03-12-16-45-26


-----!

In [48]:
# Echo the endpoint name used for the deployment.
endpoint_name

'Custom-sklearn-model-2024-03-12-16-45-26'

In [None]:
# First two test rows, feature columns only, as nested lists.
testX[features][0:2].values.tolist()

In [None]:
# Send the first two test rows to the endpoint and print what it returns.
print(predictor.predict(testX[features][0:2].values.tolist()))

In [None]:
# Score the first two held-out rows against the endpoint.
sample = testX.iloc[0:2]
predictions = predictor.predict(sample[features].values.tolist())

# Ground truth is the label column of the SAME test rows that were predicted.
# The earlier version compared against the first two rows of the training
# frame, all columns included.
actual_values = sample[label].tolist()

# Compare element by element; zip works whether predict() returns a list or an array.
correct_predictions = sum(
    int(predicted == actual) for predicted, actual in zip(predictions, actual_values)
)
total_predictions = len(predictions)
accuracy = correct_predictions / total_predictions

print("Accuracy:", accuracy)

In [None]:
# Tear down the endpoint created above once it is no longer needed.
sm_boto3.delete_endpoint(EndpointName=endpoint_name)