In [3]:
import pandas as pd
import sagemaker
from sagemaker import get_execution_role

# S3 locations shared by the rest of the notebook
bucket = 'test-bucket-hattabi'        # bucket holding the input data and job outputs
data_key = 'processed/titanic.csv'    # object key of the processed Titanic CSV
prefix = 'train_test'                 # key prefix under which the train/test splits are stored
data_location = f's3://{bucket}/{data_key}'

# Load the processed CSV straight from S3 into a DataFrame
df = pd.read_csv(data_location)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


severe performance issues, see also https://github.com/dask/dask/issues/10276

To fix, you should specify a lower version bound on s3fs, or
update the current installation.



In [14]:
# Display the loaded frame to eyeball its columns and a few rows.
df

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Ticket,Fare,Sex_male,Embarked_Q,Embarked_S,Titre
0,1,0,3,22.0,1,0,A/5 21171,7.2500,1,0,1,2
1,2,1,1,38.0,1,0,PC 17599,71.2833,0,0,0,3
2,3,1,3,26.0,0,0,STON/O2. 3101282,7.9250,0,0,1,1
3,4,1,1,35.0,1,0,113803,53.1000,0,0,1,3
4,5,0,3,35.0,0,0,373450,8.0500,1,0,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,27.0,0,0,211536,13.0000,1,0,1,4
887,888,1,1,19.0,0,0,112053,30.0000,0,0,1,1
888,889,0,3,19.0,1,2,W./C. 6607,23.4500,0,0,1,1
889,890,1,1,26.0,0,0,111369,30.0000,1,0,0,2


In [15]:
# Inspect the dtype of every column to spot non-numeric (object) columns.
df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Sex_male         int64
Embarked_Q       int64
Embarked_S       int64
Titre            int64
dtype: object

In [16]:
# Remove the Ticket column (the only object-dtype column reported by df.dtypes).
# Reassigning instead of using inplace=True, combined with errors='ignore',
# keeps this cell idempotent: re-running it after Ticket is already gone no
# longer raises a KeyError.
df = df.drop(columns=['Ticket'], errors='ignore')

In [17]:
from sklearn.model_selection import train_test_split

# Separate the label (Survived) from the features; PassengerId is left out of the features.
features = df.drop(["Survived", 'PassengerId'], axis=1)
target = df["Survived"]

# Hold out 20% of the rows as the test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)


In [34]:
import pandas as pd
# Assemble the frames to export: the label goes in the first column, followed
# by the feature columns. test_features holds the feature columns only.
train_data = pd.concat([y_train, X_train], axis=1)
test_data = pd.concat([y_test, X_test], axis=1)
test_features = pd.concat([X_test], axis=1)

# Write every frame to a local CSV file without header row or index column.
for frame, csv_name in (
    (train_data, 'train.csv'),
    (test_data, 'test.csv'),
    (test_features, 'test_features.csv'),
):
    frame.to_csv(csv_name, index=False, header=False)

In [35]:
import boto3
import sagemaker
# Open a SageMaker session and an S3 client.
sagemaker_session = sagemaker.Session()
s3 = boto3.client('s3')

# Upload the train set, the test set and the features-only test set.
# Each local file is stored under s3://<bucket>/<prefix>/ with the same file name.
for csv_name in ('train.csv', 'test.csv', 'test_features.csv'):
    s3.upload_file(csv_name, bucket, f'{prefix}/{csv_name}')
print("Upload Successful")

Upload Successful


# Start a Training job

In [21]:
from sagemaker import image_uris
import sagemaker
from sagemaker import get_execution_role
# Look up the built-in XGBoost image URI for the region the SageMaker session
# is running in, instead of a hard-coded 'eu-west-3', so the image region
# always matches the region in which the training job is created.
# NOTE(review): version='latest' is kept unchanged; confirm whether pinning an
# explicit XGBoost version would be preferable.
container = image_uris.retrieve(
    "xgboost",
    region=sagemaker.Session().boto_region_name,
    version='latest',
)

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


In [22]:

# Describe the two CSV input channels stored under s3://<bucket>/<prefix>/:
# train.csv for training and test.csv for validation.
s3_input_train = sagemaker.inputs.TrainingInput(
    s3_data=f"s3://{bucket}/{prefix}/train.csv",
    content_type="csv",
)
s3_input_test = sagemaker.inputs.TrainingInput(
    s3_data=f"s3://{bucket}/{prefix}/test.csv",
    content_type="csv",
)


In [25]:
# Session and IAM role under which the training job runs.
sess = sagemaker.Session()
role = get_execution_role()

# Generic estimator wrapping the XGBoost container: a single ml.m5.large
# instance, with model artifacts written to s3://<bucket>/output.
xgb = sagemaker.estimator.Estimator(
    container,
    role,
    instance_count=1,
    instance_type='ml.m5.large',
    output_path=f"s3://{bucket}/output",
    sagemaker_session=sess,
)

# Baseline hyperparameters: binary classification, 40 boosting rounds,
# with AUC as the evaluation metric so it shows up in the training log.
xgb_hyperparameters = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'num_round': 40,
    'max_depth': 5,
    'eta': 0.2,
    'gamma': 4,
    'min_child_weight': 6,
    'verbosity': 1,
}
xgb.set_hyperparameters(**xgb_hyperparameters)


In [26]:
# Launch the training job: train.csv feeds the 'train' channel and test.csv the 'validation' channel.
xgb.fit({'train':s3_input_train, 'validation': s3_input_test})

INFO:sagemaker:Creating training-job with name: xgboost-2024-01-19-12-41-43-265


2024-01-19 12:41:43 Starting - Starting the training job...
2024-01-19 12:42:11 Starting - Preparing the instances for training.........
2024-01-19 12:43:32 Downloading - Downloading input data......
2024-01-19 12:44:27 Downloading - Downloading the training image..[34mArguments: train[0m
[34m[2024-01-19:12:44:57:INFO] Running standalone xgboost training.[0m
[34m[2024-01-19:12:44:57:INFO] File size need to be processed in the node: 0.02mb. Available memory size in the node: 354.82mb[0m
[34m[2024-01-19:12:44:57:INFO] Determined delimiter of CSV input is ','[0m
[34m[12:44:57] S3DistributionType set as FullyReplicated[0m
[34m[12:44:57] 712x9 matrix with 6408 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2024-01-19:12:44:57:INFO] Determined delimiter of CSV input is ','[0m
[34m[12:44:57] S3DistributionType set as FullyReplicated[0m
[34m[12:44:57] 179x9 matrix with 1611 entries loaded from /opt/ml/input/data/validation?format=csv

# Start a Hyperparameter Tuning job

In [47]:
from sagemaker.tuner import IntegerParameter, ContinuousParameter, HyperparameterTuner

# Search space explored by the tuner, one range per tunable hyperparameter.
hyperparameter_ranges = dict(
    max_depth=IntegerParameter(3, 10),
    eta=ContinuousParameter(0.01, 0.2),
    min_child_weight=IntegerParameter(1, 10),
    subsample=ContinuousParameter(0.5, 1),
    gamma=ContinuousParameter(0, 5),
)

# Metric used to rank the trained models, and the direction to optimise it in.
objective_metric_name = 'validation:auc'
objective_type = 'Maximize'

# Regex that extracts the objective metric value from the training logs.
auc_metric_definition = {
    'Name': objective_metric_name,
    'Regex': 'validation-auc:([0-9\\.]+)',
}

# Tuner built on top of the baseline estimator: 20 training jobs in total,
# at most 3 of them running at the same time.
tuner = HyperparameterTuner(
    estimator=xgb,
    objective_metric_name=objective_metric_name,
    objective_type=objective_type,
    hyperparameter_ranges=hyperparameter_ranges,
    metric_definitions=[auc_metric_definition],
    max_jobs=20,
    max_parallel_jobs=3,
)

# Kick off the tuning job on the same train/validation channels as the baseline run.
tuning_channels = {'train': s3_input_train, 'validation': s3_input_test}
tuner.fit(tuning_channels)


INFO:sagemaker:Creating hyperparameter tuning job with name: xgboost-240119-1322


.............................................................................................................!


# Batch Prediction

In [4]:
# Re-attach to an existing training job by name to obtain an Estimator for it.
# NOTE(review): the job name is hard-coded; presumably it is the best job of
# tuning job 'xgboost-240119-1322' — confirm (e.g. against tuner.best_training_job()).
best_training_job_name = 'xgboost-240119-1322-001-9a4cfdec'
best_estimator = sagemaker.estimator.Estimator.attach(best_training_job_name)


2024-01-19 13:26:31 Starting - Preparing the instances for training
2024-01-19 13:26:31 Downloading - Downloading the training image
2024-01-19 13:26:31 Training - Training image download completed. Training in progress.
2024-01-19 13:26:31 Uploading - Uploading generated training model
2024-01-19 13:26:31 Completed - Resource reused by training job: xgboost-240119-1322-006-a2b1def2


In [5]:
# Show the hyperparameters of the attached training job.
best_estimator.hyperparameters()

{'_tuning_objective_metric': 'validation:auc',
 'eta': '0.13077357310460885',
 'eval_metric': 'auc',
 'gamma': '2.2879899294244583',
 'max_depth': '6',
 'min_child_weight': '3',
 'num_round': '40',
 'objective': 'binary:logistic',
 'subsample': '0.9725941126497504',
 'verbosity': '1'}

In [6]:
# Create a batch transformer from the attached estimator, running on a single
# ml.m5.large instance.
# The output location is built from `bucket` (defined in the first cell)
# instead of repeating the bucket name as a literal, so changing the bucket in
# one place keeps the predictions next to the rest of the artifacts.
best_model_transformer = best_estimator.transformer(
    instance_count=1,
    instance_type='ml.m5.large',
    output_path=f's3://{bucket}/predictions/',
)

In [10]:
# Run batch inference on the features-only test file uploaded earlier.
# The input location is built from `bucket` and `prefix` (defined in the first
# cell) so it always points at the file written by the upload cell, even if
# the bucket or prefix is changed.
best_model_transformer.transform(
    data=f's3://{bucket}/{prefix}/test_features.csv',
    content_type='text/csv',  # the input file is CSV
    split_type='Line',        # split the input file line by line
)
# Block until the transform job has finished.
best_model_transformer.wait()

INFO:sagemaker:Creating transform job with name: xgboost-2024-01-25-08-19-58-117


...........................
[34mArguments: serve[0m
[34m[2024-01-25 08:24:30 +0000] [1] [INFO] Starting gunicorn 19.9.0[0m
[34m[2024-01-25 08:24:30 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)[0m
[34m[2024-01-25 08:24:30 +0000] [1] [INFO] Using worker: gevent[0m
[34m[2024-01-25 08:24:30 +0000] [20] [INFO] Booting worker with pid: 20[0m
[34m[2024-01-25 08:24:30 +0000] [21] [INFO] Booting worker with pid: 21[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2024-01-25:08:24:30:INFO] Model loaded successfully for worker : 20[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2024-01-25:08:24:30:INFO] Model loaded successfully for worker : 21[0m
[34m[2024-01-25:08:24:35:INFO] Sniff delimiter as ','[0m
[34m[2024-01-25:08:24:35:INFO] Determined delimiter of CSV input is ','[0m
[35m[2024-01-25:08:24:35:INFO] Sniff delimiter as ','[0m
[35m[2024-01-25:08:24:35:INFO] Determined delimiter of CSV input is ','[0m
[32m2024-01-25T08:24:35.080:[sagemaker logs]: MaxConcur

# Evaluation

In [25]:
import boto3
import os
# Where the batch-transform output lives in S3, and where to store it locally.
bucket_name = 'test-bucket-hattabi'                 # bucket containing the predictions
s3_file_key = 'predictions/test_features.csv.out'   # object key of the transform output
local_file_name = 'test_features.csv.out'           # destination file in the working directory

# Fetch the predictions file from S3.
s3 = boto3.client('s3')
s3.download_file(Bucket=bucket_name, Key=s3_file_key, Filename=local_file_name)


In [27]:
# Probability cut-off: predictions at or above it are labelled 1, the rest 0.
threshold = 0.5

# Read the downloaded batch-transform output ('test_features.csv.out'), which
# holds one probability per line. Blank lines (e.g. a trailing empty line) are
# skipped so float() is never called on an empty string.
with open('./test_features.csv.out', 'r') as predictions_file:
    probabilities = [line.strip() for line in predictions_file if line.strip()]

# Convert probabilities to class labels.
y_pred = [int(float(prob) >= threshold) for prob in probabilities]

# Load the true labels: test.csv was written without a header and with the
# label in the first column. The path is built from `bucket` and `prefix`
# (defined in the first cell) rather than repeated as a literal.
test = pd.read_csv(f"s3://{bucket}/{prefix}/test.csv", header=None)
y_true = test.iloc[:, 0]

# Fail early with a clear message if predictions and labels are misaligned.
assert len(y_pred) == len(y_true), (
    f"got {len(y_pred)} predictions for {len(y_true)} labelled test rows"
)

In [30]:
from sklearn.metrics import classification_report
# Print the classification report comparing the thresholded predictions with the true labels.
print(classification_report(y_true,y_pred))

              precision    recall  f1-score   support

           0       0.82      0.87      0.84       105
           1       0.79      0.73      0.76        74

    accuracy                           0.81       179
   macro avg       0.81      0.80      0.80       179
weighted avg       0.81      0.81      0.81       179

