# Tratamento da base de dados

In [2]:
import pandas as pd
import numpy as np
base_census = pd.read_csv('census.csv')
colunas = []
colunas.append('income')
for i in range(len(base_census.columns[:-1])):
    colunas.append(base_census.columns[i])
base_census = base_census[colunas]
base_census

Unnamed: 0,income,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,inative-country
0,<=50K,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,<=50K,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,<=50K,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,<=50K,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,<=50K,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,<=50K,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States
32557,>50K,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States
32558,<=50K,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States
32559,<=50K,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States


In [3]:
def income(text):
    if text == ' >50K':
        return 1.0
    else:
        return 0.0

In [4]:
base_census['income'] = base_census['income'].apply(income)

In [5]:
base_census

Unnamed: 0,income,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,inative-country
0,0.0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,0.0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,0.0,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,0.0,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,0.0,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0.0,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States
32557,1.0,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States
32558,0.0,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States
32559,0.0,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States


In [6]:
base_census = pd.get_dummies(base_census,prefix=['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'inative-country'], 
                            columns = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'inative-country'])

In [7]:
base_treinamento = base_census.iloc[0:22792,:]
base_treinamento.shape

(22792, 109)

In [8]:
base_teste = base_census.iloc[22792:,:]
base_teste.shape

(9769, 109)

In [9]:
22792 + 9769

32561

In [10]:
X_teste = base_teste.iloc[:,1:len(base_census)].values
X_teste

array([[    30,  75167,     13, ...,      1,      0,      0],
       [    39, 176296,      9, ...,      1,      0,      0],
       [    19,  93518,     10, ...,      1,      0,      0],
       ...,
       [    58, 151910,      9, ...,      1,      0,      0],
       [    22, 201490,      9, ...,      1,      0,      0],
       [    52, 287927,      9, ...,      1,      0,      0]])

In [11]:
X_teste.shape

(9769, 108)

In [12]:
y_teste = base_teste.iloc[:, 0].values
y_teste

array([0., 1., 0., ..., 0., 0., 1.])

In [13]:
base_treinamento.to_csv('census_train_xgboost.csv', header = False, index = False)
base_teste.to_csv('census_test_xgboost.csv', header = False, index = False)

# Configurações SageMaker

In [14]:
import sagemaker
import boto3
from sagemaker import Session
import sagemaker.amazon.common as smac # sagemaker commom library
import io
import os

In [15]:
session = sagemaker.Session()
bucket = 'cursoawssagemaker'
subpasta_modelo = 'modelos/census/xgboost'
subpasta_dataset = 'datasets/census'
key_train = 'census-train-data-xgboost'
key_test = 'census-test-data-xgboost'
role = sagemaker.get_execution_role()
s3_train_data = 's3://{}/{}/train/{}'.format(bucket, subpasta_dataset, key_train)
s3_test_data = 's3://{}/{}/test/{}'.format(bucket, subpasta_dataset, key_test)
output_location = 's3://{}/{}/output'.format(bucket, subpasta_modelo)
print('Role: {}'.format(role))
print('Localização da base de treinamento: {}'.format(s3_train_data))
print('Localização da base de teste: {}'.format(s3_test_data))
print('Modelo final será salvo em: {}'.format(output_location))

Role: arn:aws:iam::936535973187:role/service-role/AmazonSageMaker-ExecutionRole-20220510T125992
Localização da base de treinamento: s3://cursoawssagemaker/datasets/census/train/census-train-data-xgboost
Localização da base de teste: s3://cursoawssagemaker/datasets/census/test/census-test-data-xgboost
Modelo final será salvo em: s3://cursoawssagemaker/modelos/census/xgboost/output


In [16]:
with open('census_train_xgboost.csv', 'rb') as f:
    boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(subpasta_dataset, 'train', key_train)).upload_fileobj(f)

In [17]:
with open('census_test_xgboost.csv', 'rb') as f:
    boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(subpasta_dataset, 'test', key_test)).upload_fileobj(f)

# Treinamento do XGBoost

In [18]:
# https://docs.aws.amazon.com/sagemaker/latest/dg/ecr-sa-east-1.html
from sagemaker import image_uris
container = image_uris.retrieve(framework = 'xgboost', region=boto3.Session().region_name, version='latest')

In [19]:
# https://sagemaker.readthedocs.io/en/stable/api/training/estimators.html
xgboost = sagemaker.estimator.Estimator(image_uri = container,
                                        role = role,
                                        instance_count = 1,
                                        instance_type = 'ml.m5.2xlarge',
                                        output_path = output_location,
                                        sagemaker_session = session)

In [20]:
# https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost_hyperparameters.html
xgboost.set_hyperparameters(num_round = 100, objective = 'reg:logistic')

In [21]:
train_input = sagemaker.inputs.TrainingInput(s3_data = s3_train_data, content_type='csv', s3_data_type = 'S3Prefix')
validation_input = sagemaker.inputs.TrainingInput(s3_data = s3_test_data, content_type='csv', s3_data_type = 'S3Prefix')
data_channels = {'train': train_input, 'validation': validation_input}

In [22]:
xgboost.fit(data_channels)

2022-05-17 15:52:00 Starting - Starting the training job...
2022-05-17 15:52:29 Starting - Preparing the instances for trainingProfilerReport-1652802720: InProgress
.........
2022-05-17 15:53:57 Downloading - Downloading input data...
2022-05-17 15:54:17 Training - Downloading the training image...
2022-05-17 15:54:57 Training - Training image download completed. Training in progress..[34mArguments: train[0m
[34m[2022-05-17:15:54:56:INFO] Running standalone xgboost training.[0m
[34m[2022-05-17:15:54:56:INFO] File size need to be processed in the node: 7.07mb. Available memory size in the node: 23504.35mb[0m
[34m[2022-05-17:15:54:56:INFO] Determined delimiter of CSV input is ','[0m
[34m[15:54:56] S3DistributionType set as FullyReplicated[0m
[34m[15:54:56] 22792x108 matrix with 2461536 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2022-05-17:15:54:56:INFO] Determined delimiter of CSV input is ','[0m
[34m[15:54:56] S3Distribution

# Deploy, previsões e avaliação

In [23]:
xgboost_classifier = xgboost.deploy(initial_instance_count = 1, instance_type = 'ml.m4.xlarge')

-------!

In [24]:
from sagemaker.serializers import CSVSerializer
xgboost_classifier.serializer = CSVSerializer()

In [25]:
X_teste.shape, type(X_teste)

((9769, 108), numpy.ndarray)

In [26]:
previsoes = np.array(xgboost_classifier.predict(X_teste).decode('utf-8').split(',')).astype(np.float32)
previsoes

array([7.1665001e-01, 9.8000973e-01, 8.9295841e-05, ..., 1.3065693e-02,
       2.0877716e-04, 9.9958044e-01], dtype=float32)

In [27]:
previsoes = (previsoes >= 0.5)
print(previsoes)

[ True  True False ... False False  True]


In [28]:
previsoes.shape, y_teste.shape

((9769,), (9769,))

In [29]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [30]:
cm = confusion_matrix(y_teste, previsoes)
cm

array([[6914,  451],
       [ 821, 1583]])

In [31]:
accuracy_score(y_teste, previsoes)

0.8697921998157436

In [32]:
print(classification_report(y_teste, previsoes))

              precision    recall  f1-score   support

         0.0       0.89      0.94      0.92      7365
         1.0       0.78      0.66      0.71      2404

    accuracy                           0.87      9769
   macro avg       0.84      0.80      0.81      9769
weighted avg       0.87      0.87      0.87      9769



# Tuning

In [None]:
# https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost_hyperparameters.html
tuning_job_config = {
    "ParameterRanges": {
      "CategoricalParameterRanges": [],
      "ContinuousParameterRanges": [
        {
          "MaxValue": "1",
          "MinValue": "0",
          "Name": "eta"
        },
        {
          "MaxValue": "2",
          "MinValue": "0",
          "Name": "alpha"
        },
        {
          "MaxValue": "10",
          "MinValue": "1",
          "Name": "min_child_weight"
        }
      ],
      "IntegerParameterRanges": [
        {
          "MaxValue": "10",
          "MinValue": "1",
          "Name": "max_depth"
        }
      ],
      "IntegerParameterRanges": [
        {
          "MaxValue": "300",
          "MinValue": "50",
          "Name": "num_round"
        }
      ]
    },
    "ResourceLimits": {
      "MaxNumberOfTrainingJobs": 9,
      "MaxParallelTrainingJobs": 3
    },
    "Strategy": "Bayesian",
    "HyperParameterTuningJobObjective": {
      "MetricName": "validation:error",
      "Type": "Minimize"
    }
  }

In [None]:
# https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
training_job_definition = {
    "AlgorithmSpecification": {
      "TrainingImage": container,
      "TrainingInputMode": "File"
    },
    "InputDataConfig": [
      {
        "ChannelName": "train",
        "CompressionType": "None",
        "ContentType": "csv",
        "DataSource": {
          "S3DataSource": {
            "S3DataDistributionType": "FullyReplicated",
            "S3DataType": "S3Prefix",
            "S3Uri": s3_train_data
          }
        }
      },
      {
        "ChannelName": "validation",
        "CompressionType": "None",
        "ContentType": "csv",
        "DataSource": {
          "S3DataSource": {
            "S3DataDistributionType": "FullyReplicated",
            "S3DataType": "S3Prefix",
            "S3Uri": s3_test_data
          }
        }
      }
    ],
    "OutputDataConfig": {
      "S3OutputPath": "s3://{}/{}/output".format(bucket,subpasta_modelo)
    },
    "ResourceConfig": {
      "InstanceCount": 2,
      "InstanceType": "ml.c4.2xlarge",
      "VolumeSizeInGB": 10
    },
    "RoleArn": role,
    "StaticHyperParameters": {
      "eval_metric": "error",
      "objective": "binary:logistic",
      "rate_drop": "0.3",
      "tweedie_variance_power": "1.4"
    },
    "StoppingCondition": {
      "MaxRuntimeInSeconds": 43200
    }
}

In [None]:
smclient = boto3.client('sagemaker')
smclient.create_hyper_parameter_tuning_job(HyperParameterTuningJobName = "xgboosttuningcensus",
                                          HyperParameterTuningJobConfig = tuning_job_config,
                                          TrainingJobDefinition = training_job_definition)

# Construção do novo modelo

In [33]:
container = image_uris.retrieve(framework='xgboost',region=boto3.Session().region_name,version='latest')
xgboost_tuning = sagemaker.estimator.Estimator(image_uri = container,
                                        role = role, 
                                        instance_count = 1, 
                                        instance_type = 'ml.m5.2xlarge',
                                        output_path = output_location,
                                        sagemaker_session = session)
xgboost_tuning.set_hyperparameters(num_round = 102, eta = 0.14507612435685635,
                                   min_child_weight = 2.412681801757289,
                                   alpha = 0.3189676727624047, tweedie_variance_power = 1.4,
                                   rate_drop = 0.3)
xgboost_tuning.fit(data_channels)

2022-05-17 16:06:12 Starting - Starting the training job...ProfilerReport-1652803572: InProgress
...
2022-05-17 16:06:52 Starting - Preparing the instances for training......
2022-05-17 16:08:11 Downloading - Downloading input data...
2022-05-17 16:08:29 Training - Downloading the training image.....[34mArguments: train[0m
[34m[2022-05-17:16:09:20:INFO] Running standalone xgboost training.[0m
[34m[2022-05-17:16:09:20:INFO] File size need to be processed in the node: 7.07mb. Available memory size in the node: 23867.78mb[0m
[34m[2022-05-17:16:09:20:INFO] Determined delimiter of CSV input is ','[0m
[34m[16:09:20] S3DistributionType set as FullyReplicated[0m
[34m[16:09:20] 22792x108 matrix with 2461536 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2022-05-17:16:09:20:INFO] Determined delimiter of CSV input is ','[0m
[34m[16:09:20] S3DistributionType set as FullyReplicated[0m
[34m[16:09:20] 9769x108 matrix with 1055052 entries lo

In [34]:
xgboost_classifier_tuning = xgboost_tuning.deploy(initial_instance_count = 1, instance_type = 'ml.m4.xlarge')

------!

In [35]:
from sagemaker.serializers import CSVSerializer
xgboost_classifier_tuning.serializer = CSVSerializer()
previsoes = np.array(xgboost_classifier_tuning.predict(X_teste).decode('utf-8').split(',')).astype(np.float32)

In [36]:
previsoes

array([ 0.690067  ,  0.91854674, -0.00379324, ...,  0.01779461,
       -0.00613862,  0.9809268 ], dtype=float32)

In [37]:
previsoes = (previsoes >= 0.5)
previsoes

array([ True,  True, False, ..., False, False,  True])

In [38]:
y_teste = np.array(y_teste).astype(int)
y_teste

array([0, 1, 0, ..., 0, 0, 1])

In [39]:
previsoes.shape, y_teste.shape

((9769,), (9769,))

In [40]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
cm = confusion_matrix(y_teste, previsoes)
cm

array([[6974,  391],
       [ 855, 1549]])

In [41]:
accuracy_score(y_teste, previsoes)

0.8724536800081891

In [42]:
print(classification_report(y_teste, previsoes))

              precision    recall  f1-score   support

           0       0.89      0.95      0.92      7365
           1       0.80      0.64      0.71      2404

    accuracy                           0.87      9769
   macro avg       0.84      0.80      0.82      9769
weighted avg       0.87      0.87      0.87      9769

