In [48]:
# import libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import boto3
warnings.simplefilter('ignore')

### About data


Aim of the problem is to detect the presence or absence of cardiovascular disease in person based on the given features.
Features available are:


- Age | Objective Feature | age | int (days)
- Height | Objective Feature | height | int (cm) |
- Weight | Objective Feature | weight | float (kg) |
- Gender | Objective Feature | gender | categorical code |
- Systolic blood pressure | Examination Feature | ap_hi | int |
- Diastolic blood pressure | Examination Feature | ap_lo | int |
- Cholesterol | Examination Feature | cholesterol | 1: normal, 2: above normal, 3: well above normal |
- Glucose | Examination Feature | gluc | 1: normal, 2: above normal, 3: well above normal |
- Smoking | Subjective Feature | smoke | binary |
- Alcohol intake | Subjective Feature | alco | binary |
- Physical activity | Subjective Feature | active | binary |
- Presence or absence of cardiovascular disease | Target Variable | cardio | binary |

Note that:
- Objective: factual information;
- Examination: results of medical examination;
- Subjective: information given by the patient.

Data Source:https://www.kaggle.com/sulianova/cardiovascular-disease-dataset

**Downloading data from s3**

In [49]:
# Get the service client
s3 = boto3.client('s3')
bucket_name = 'sagemaker-learner'
folder_path = 'data'
data_uri = "s3://sagemaker-learner/data/cardio_train.csv"

try: 
    s3.download_file("sagemaker-learner", "data/cardio.csv", "cardio.csv")
    print(f"File downloaded")
except:
    print(f"Error in file download")

File downloaded


In [50]:
df = pd.read_csv('cardio.csv',sep=";")
df = df.drop(columns=['id'])
print(df.shape)
df.head(3)

(70000, 12)


Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,18857,1,165,64.0,130,70,3,1,0,0,0,1


In [51]:
# since the age is given in days, we convert it into years

df['age'] = df['age']/365


In [52]:
## checking null values
df.isnull().sum()

age            0
gender         0
height         0
weight         0
ap_hi          0
ap_lo          0
cholesterol    0
gluc           0
smoke          0
alco           0
active         0
cardio         0
dtype: int64

### Train test split

In [53]:
df['cardio'].value_counts()

0    35021
1    34979
Name: cardio, dtype: int64

In [54]:
y = df['cardio']
x = df.drop(columns='cardio')

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(x,y,stratify=y,random_state=42)



# PCA using Sagemaker

In [47]:
import sagemaker
import boto3

sagemaker_session = sagemaker.Session()
# get region
region_name = boto3.Session().region_name
# get execution role
role =  sagemaker.get_execution_role()


In [58]:
# upload data for pca
import io
import sagemaker.amazon.common as smac


bucket_name = 'sagemaker-learner'
data_key = 'cardio_vascular/pca_data'

x = x.to_numpy()
x = x.astype('float32')


s3_client = boto3.client('s3') 
buf = io.BytesIO() # create an in-memory byte array (buf is a buffer I will be writing to)
smac.write_numpy_to_dense_tensor(buf, x)
buf.seek(0) 
s3_client.upload_fileobj(buf, bucket_name, data_key)


In [60]:
from sagemaker.amazon.amazon_estimator import get_image_uri


container = get_image_uri(region_name, 'pca')


# We have pass in the container, the type of instance that we would like to use for training 
# output path and sagemaker session into the Estimator. 
# We can also specify how many instances we would like to use for training
output_model_location = "s3://sagemaker-learner/cardio_vascular/"
s3_train_data = "s3://sagemaker-learner/cardio_vascular/pca_data"


pca = sagemaker.estimator.Estimator(container,
                                    role, 
                                    train_instance_count=1, 
                                    train_instance_type='ml.c4.xlarge',
                                    output_path=output_model_location,
                                    sagemaker_session=sagemaker_session)

# We can tune parameters like the number of features that we are passing in, mode of algorithm, mini batch size and number of pca components
pca.set_hyperparameters(feature_dim=11,
                        num_components=6,
                        subtract_mean=False,
                        algorithm_mode='regular',
                        mini_batch_size=100)


# Pass in the training data from S3 to train the pca model


pca.fit({'train': s3_train_data})

# Let's see the progress using cloudwatch logs

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.
train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


2022-09-06 05:32:21 Starting - Starting the training job...
2022-09-06 05:32:45 Starting - Preparing the instances for trainingProfilerReport-1662442340: InProgress
.........
2022-09-06 05:34:17 Downloading - Downloading input data
2022-09-06 05:34:17 Training - Downloading the training image...........[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[09/06/2022 05:35:59 INFO 140137731159872] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-conf.json: {'algorithm_mode': 'regular', 'subtract_mean': 'true', 'extra_components': '-1', 'force_dense': 'true', 'epochs': 1, '_log_level': 'info', '_kvstore': 'dist_sync', '_num_kv_servers': 'auto', '_num_gpus': 'auto'}[0m
[34m[09/06/2022 05:35:59 INFO 140137731159872] Merging with provided configuration from /opt/ml/input/config/hyperparameters.json: {'algorithm_mode': 'regular', 'feature_dim': '11', 'mini_batch_size': 

**Deploying pca endpoint**

In [61]:
pca_reduction = pca.deploy(initial_instance_count = 1,
                                          instance_type = 'ml.m4.xlarge')


--------!

In [65]:
from sagemaker.predictor import csv_serializer, json_deserializer

pca_reduction.serializer = csv_serializer
pca_reduction.deserializer = json_deserializer

X_test = X_test.to_numpy()
X_test = X_test.astype('float32')
result = pca_reduction.predict(X_test)
predictions = np.array([r['projection'] for r in result['projections']])
predictions[0]

array([  -0.42187485,   -8.00320816,  -11.90813637,   47.10366058,
         46.62126541, -225.4831543 ])

In [66]:
# Delete the end-point

pca_reduction.delete_endpoint()