# Passo 1: Ajustando o SageMaker Environment

In [None]:
import sagemaker
from sagemaker import get_execution_role
import boto3

# S3 location shared by every later step: data is uploaded to, and the
# training channel is read from, s3://<bucket>/<prefix>.
bucket = 'sagemaker-grupo2'
prefix = 'dengue-prediction'

# Session object used for the uploads and handed to the estimator.
sagemaker_session = sagemaker.Session()

# IAM role the training job will run under.
role = get_execution_role()

# Passo 2: Preparando os Dados

In [None]:
# Local CSV files that will be published to S3.
dataset_path = 'dengue_sintomas_brasilia_limpo.csv'
Xtrain_path, Xtest_path = 'X_train.csv', 'X_test.csv'
ytrain_path, ytest_path = 'y_train.csv', 'y_test.csv'

# Label shown in the confirmation message -> local file to upload.
# Dict order is the upload order and the print order.
files_to_upload = {
    'Dataset': dataset_path,
    'X_train': Xtrain_path,
    'X_test': Xtest_path,
    'y_train': ytrain_path,
    'y_test': ytest_path,
}

# Every file goes under the same bucket/prefix; upload_data returns the
# resulting S3 URI for each one.
uploaded_uris = {
    label: sagemaker_session.upload_data(path=local_file, bucket=bucket, key_prefix=prefix)
    for label, local_file in files_to_upload.items()
}

# Keep the individual variables available for the rest of the notebook.
s3_input_path = uploaded_uris['Dataset']
s3_input_path_Xtrain = uploaded_uris['X_train']
s3_input_path_Xtest = uploaded_uris['X_test']
s3_input_path_ytrain = uploaded_uris['y_train']
s3_input_path_ytest = uploaded_uris['y_test']

for label, s3_uri in uploaded_uris.items():
    print(f'{label} uploaded to: {s3_uri}')

# Passo 3: Definindo o script de treino

In [None]:
%%writefile train.py

import argparse
import os

import joblib
import numpy as np
import pandas as pd
import xgboost as xgb
from imblearn.combine import SMOTEENN
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

def model_fn(model_dir):
    """Return the XGBoost booster stored in ``model_dir``.

    The booster is read from ``model.xgb``, the same file name the
    training entry point in this script writes.

    Args:
        model_dir: Directory that contains ``model.xgb``.

    Returns:
        An ``xgb.Booster`` with the saved model loaded into it.
    """
    booster = xgb.Booster()
    booster_file = os.path.join(model_dir, "model.xgb")
    booster.load_model(booster_file)
    return booster

def parse_args(argv=None):
    """Parse the command-line arguments given to the training script.

    Besides the SageMaker directory arguments, this declares every
    hyperparameter the notebook's estimator sends. In script mode the
    estimator's ``hyperparameters`` arrive as ``--name value`` command-line
    options, so an undeclared one would make a strict ``parse_args()``
    abort the job with "unrecognized arguments".

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        The populated ``argparse.Namespace``.

    Raises:
        SystemExit: If the training-data or model directory is given
            neither on the command line nor through its ``SM_*``
            environment variable.
    """
    parser = argparse.ArgumentParser()

    # SageMaker specific arguments. ``.get`` is used so that a missing
    # environment variable does not raise KeyError when the value is
    # supplied on the command line (e.g. when running the script locally).
    parser.add_argument('--output-data-dir', type=str, default=os.environ.get('SM_OUTPUT_DATA_DIR'))
    parser.add_argument('--model-dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))

    # Hyperparameters; defaults are the previously hard-coded best values.
    # NOTE(review): 'mlogloss' is a multi-class metric while scale_pos_weight
    # only applies to binary objectives - confirm which one the labels need.
    parser.add_argument('--eval_metric', type=str, default='mlogloss')
    parser.add_argument('--learning_rate', type=float, default=0.1)
    parser.add_argument('--max_depth', type=int, default=3)
    parser.add_argument('--n_estimators', type=int, default=300)
    parser.add_argument('--scale_pos_weight', type=float, default=5)

    # Tolerate extra options so an additional hyperparameter on the
    # estimator does not kill the job, but make the fact visible in the log.
    args, unknown = parser.parse_known_args(argv)
    if unknown:
        print(f"Ignoring unrecognized arguments: {unknown}")

    required = (('--train', args.train), ('--model-dir', args.model_dir))
    missing = [flag for flag, value in required if not value]
    if missing:
        parser.error(
            f"missing required value(s) for {', '.join(missing)}; pass them "
            "explicitly or set SM_CHANNEL_TRAIN / SM_MODEL_DIR"
        )
    return args


if __name__ == '__main__':
    args = parse_args()

    # Load the pre-split dataset from the training channel directory
    X_train = pd.read_csv(os.path.join(args.train, 'X_train.csv'))
    X_test = pd.read_csv(os.path.join(args.train, 'X_test.csv'))
    y_train = pd.read_csv(os.path.join(args.train, 'y_train.csv'))
    y_test = pd.read_csv(os.path.join(args.train, 'y_test.csv'))

    # Initialize the model with the hyperparameters received from the job
    model = xgb.XGBClassifier(
        eval_metric=args.eval_metric,
        learning_rate=args.learning_rate,
        max_depth=args.max_depth,
        n_estimators=args.n_estimators,
        scale_pos_weight=args.scale_pos_weight,
        random_state=42
    )

    # Train the model
    model.fit(X_train, y_train)

    # Calculate accuracy on the test set
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Model accuracy on test set: {accuracy:.4f}")

    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    # Save the model in .xgb format; model_fn loads this same file name
    model_path = os.path.join(args.model_dir, "model.xgb")
    model.get_booster().save_model(model_path)

# Passo 4: Criar estimator

In [None]:
from sagemaker.xgboost.estimator import XGBoost

# Hyperparameters sent along with the training job.
best_xgb_params = dict(
    eval_metric='mlogloss',
    learning_rate=0.1,
    max_depth=3,
    scale_pos_weight=5,
    n_estimators=300,
)

# Script-mode XGBoost estimator running train.py on a single ml.m5.large.
# NOTE(review): entry_point is looked up inside source_dir, while the
# %%writefile cell above writes train.py to the working directory - confirm
# that training_code/train.py exists and is the up-to-date script.
# NOTE(review): requirements.txt already lives inside source_dir; listing it
# under dependencies as well looks redundant - confirm it is needed.
xgboost_estimator = XGBoost(
    entry_point='train.py',
    source_dir='training_code',
    dependencies=['training_code/requirements.txt'],
    framework_version='1.3-1',
    hyperparameters=best_xgb_params,
    role=role,
    instance_count=1,
    instance_type='ml.m5.large',
    sagemaker_session=sagemaker_session,
)

# Passo 5: Treinar o modelo

In [None]:
# Start the training job; the 'train' channel points at the S3 prefix that
# holds the uploaded CSV files.
train_channel_uri = f's3://{bucket}/{prefix}'
xgboost_estimator.fit({'train': train_channel_uri})