# Tratamento da base de dados

In [71]:
import pandas as pd
import numpy as np
import io
import os
base_census = pd.read_csv('census.csv')
colunas = []
colunas.append('income')
for i in range(len(base_census.columns[:-1])):
    colunas.append(base_census.columns[i])
base_census = base_census[colunas]
base_census

Unnamed: 0,income,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,inative-country
0,<=50K,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,<=50K,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,<=50K,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,<=50K,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,<=50K,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,<=50K,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States
32557,>50K,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States
32558,<=50K,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States
32559,<=50K,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States


In [72]:
def income(text):
    if text == ' >50K':
        return 1.0
    else:
        return 0.0

In [73]:
base_census['income'] = base_census['income'].apply(income)
base_census

Unnamed: 0,income,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,inative-country
0,0.0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,0.0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,0.0,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,0.0,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,0.0,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0.0,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States
32557,1.0,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States
32558,0.0,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States
32559,0.0,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States


In [74]:
base_census = pd.get_dummies(base_census,prefix=['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'inative-country'], 
                            columns = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'inative-country'])
base_census.shape

(32561, 109)

In [75]:
X = base_census.iloc[:, 1:len(base_census.columns)].values
X, X.shape

(array([[    39,  77516,     13, ...,      1,      0,      0],
        [    50,  83311,     13, ...,      1,      0,      0],
        [    38, 215646,      9, ...,      1,      0,      0],
        ...,
        [    58, 151910,      9, ...,      1,      0,      0],
        [    22, 201490,      9, ...,      1,      0,      0],
        [    52, 287927,      9, ...,      1,      0,      0]]),
 (32561, 108))

In [76]:
y = base_census.iloc[:, 0].values
y

array([0., 0., 0., ..., 0., 0., 1.])

In [77]:
X = np.array(X).astype('float32')
y = np.array(y).astype('float32')

# Configurações do SageMaker

In [78]:
import sagemaker
import boto3
from sagemaker import Session

In [79]:
session = sagemaker.Session()
bucket = 'cursoawssagemaker'
subpasta_modelo = 'modelos/census/pca'
subpasta_dataset = 'datasets/census'
key_train = 'census-train-data-pca'
role = sagemaker.get_execution_role()
s3_train_data = 's3://{}/{}/train/{}'.format(bucket, subpasta_dataset, key_train)
output_location = 's3://{}/{}/output'.format(bucket, subpasta_modelo)
print('Role: ', role)
print('Localização da base de dados de treinamento: ', s3_train_data)
print('Modelo final será salvo em: ', output_location)

Role:  arn:aws:iam::936535973187:role/service-role/AmazonSageMaker-ExecutionRole-20220510T125992
Localização da base de dados de treinamento:  s3://cursoawssagemaker/datasets/census/train/census-train-data-pca
Modelo final será salvo em:  s3://cursoawssagemaker/modelos/census/pca/output


In [80]:
import sagemaker.amazon.common as smac
buffer = io.BytesIO()
smac.write_numpy_to_dense_tensor(buffer, X)
buffer.seek(0)

0

In [81]:
boto3.resource('s3').Bucket(bucket).Object(os.path.join(subpasta_dataset, 'train', key_train)).upload_fileobj(buffer)

# Treinamento do PCA

In [82]:
# https://docs.aws.amazon.com/sagemaker/latest/dg/ecr-sa-east-1.html
container = sagemaker.image_uris.retrieve(framework = 'pca', region = boto3.Session().region_name)

# https://sagemaker.readthedocs.io/en/stable/api/training/estimators.html
pca = sagemaker.estimator.Estimator(image_uri = container,
                                    role = role,
                                    instance_count = 1,
                                    instance_type = 'ml.c4.xlarge',
                                    output_path = output_location,
                                    sagemaker_session = session)

# https://docs.aws.amazon.com/sagemaker/latest/dg/PCA-reference.html
pca.set_hyperparameters(feature_dim = 32561,
                        num_components = 80,
                        mini_batch_size = 200)

pca.fit({'train': s3_train_data})

2022-05-25 13:36:30 Starting - Starting the training job...
2022-05-25 13:36:47 Starting - Preparing the instances for trainingProfilerReport-1653485790: InProgress
............
2022-05-25 13:38:43 Downloading - Downloading input data...
2022-05-25 13:39:31 Training - Downloading the training image..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[05/25/2022 13:39:38 INFO 140449382917952] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-conf.json: {'algorithm_mode': 'regular', 'subtract_mean': 'true', 'extra_components': '-1', 'force_dense': 'true', 'epochs': 1, '_log_level': 'info', '_kvstore': 'dist_sync', '_num_kv_servers': 'auto', '_num_gpus': 'auto'}[0m
[34m[05/25/2022 13:39:38 INFO 140449382917952] Merging with provided configuration from /opt/ml/input/config/hyperparameters.json: {'feature_dim': '32561', 'mini_batch_size': '200', 'num_components': '80'

# Redução da dimensionalidade

In [83]:
pca_predictor = pca.deploy(initial_instance_count = 1, instance_type = 'ml.m4.xlarge')

------!

In [84]:
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer
pca_predictor.serializer = CSVSerializer()
pca_predictor.deserializer = JSONDeserializer()

In [85]:
X[0]

array([3.9000e+01, 7.7516e+04, 1.3000e+01, 2.1740e+03, 0.0000e+00,
       4.0000e+01, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       1.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00,
       1.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e

In [86]:
X0_pca = pca_predictor.predict(X[0])
print(X0_pca)

{'projections': [{'projection': [0.0012449510395526886, 0.002343464642763138, -0.0013039950281381607, 0.000418664887547493, 0.0004079388454556465, -0.0004254486411809921, 0.0002772137522697449, -0.0004678121767938137, -0.0008481075055897236, 0.00024294480681419373, 0.0018959343433380127, -0.0004775784909725189, 0.00011449679732322693, -0.0002458728849887848, 0.0030149780213832855, -0.0020423270761966705, -0.004432309418916702, -0.0015896577388048172, 0.0026109106838703156, 0.005269220098853111, -8.162111043930054e-05, 0.00033833831548690796, 0.006753459572792053, -0.0012435587123036385, 0.0007078980561345816, 0.016528287902474403, 0.002406906336545944, -0.00607888400554657, -0.007607053965330124, 0.0011467933654785156, -0.015767782926559448, -0.00635441392660141, 0.028152629733085632, -0.048242077231407166, 0.04655647277832031, 0.0051122382283210754, -0.10033466666936874, -0.07806280255317688, -0.005875088274478912, 0.013174638152122498, -0.012452095746994019, -0.01010817289352417, 0.0

In [87]:
X_pca = pca_predictor.predict(X[0:12000])

In [88]:
X_pca = np.array([r['projection'] for r in X_pca['projections']])

In [89]:
X_pca, X_pca.shape

(array([[ 1.24495104e-03,  2.34346464e-03, -1.30399503e-03, ...,
         -8.97964935e+01, -1.09991724e+03,  1.12262344e+05],
        [ 1.43589824e-03, -6.27957284e-04, -9.94171947e-04, ...,
         -9.33537750e+01,  1.07425757e+03,  1.06467406e+05],
        [-1.22067891e-03, -6.09576702e-04,  6.42577186e-04, ...,
         -8.81579437e+01,  1.07828333e+03, -2.58675938e+04],
        ...,
        [ 1.46968104e-03, -7.50370324e-04, -2.14563683e-03, ...,
         -9.44503784e+01,  1.07340918e+03,  1.34354406e+05],
        [-1.27792172e-03, -1.41525269e-03,  1.01876631e-03, ...,
         -8.62802887e+01, -1.09910840e+03,  2.01783438e+04],
        [ 2.44413875e-03,  1.71445310e-04,  1.50985084e-03, ...,
         -8.40632935e+01,  1.08143372e+03, -1.29492594e+05]]),
 (12000, 80))

# Preparação dos dados para o Linear Learner

In [90]:
from sklearn.model_selection import train_test_split
X_treinamento, X_teste, y_treinamento, y_teste = train_test_split(X_pca, y[0:12000], test_size = 0.3, random_state = 0)

In [91]:
X_treinamento = np.array(X_treinamento).astype('float32')
y_treinamento = np.array(y_treinamento).astype('float32')

In [92]:
X_treinamento.shape, X_teste.shape

((8400, 80), (3600, 80))

In [93]:
y_treinamento.shape, y_teste.shape

((8400,), (3600,))

# Configurações do SageMaker

In [94]:
subpasta_modelo_linear = 'modelos/census/linear-learner_pca'
subpasta_dataset = 'datasets/census'
key = 'census-train-data_pca'
s3_train_data_pca = 's3://{}/{}/train/{}'.format(bucket, subpasta_dataset, key)
output_location_pca = 's3://{}/{}/output'.format(bucket, subpasta_modelo)
print('Localização da base de dados: ', s3_train_data)
print('Modelo final será salvo em: ', output_location)

Localização da base de dados:  s3://cursoawssagemaker/datasets/census/train/census-train-data-pca
Modelo final será salvo em:  s3://cursoawssagemaker/modelos/census/pca/output


In [95]:
buffer = io.BytesIO()
smac.write_numpy_to_dense_tensor(buffer, X_treinamento, y_treinamento)
buffer.seek(0)

0

In [96]:
boto3.resource('s3').Bucket(bucket).Object(os.path.join(subpasta_dataset, 'train', key)).upload_fileobj(buffer)

# Treinamento do Linear Learner

In [97]:
# https://docs.aws.amazon.com/sagemaker/latest/dg/linear-learner.html
# https://docs.aws.amazon.com/sagemaker/latest/dg/ecr-sa-east-1.html
container = sagemaker.image_uris.retrieve(framework = 'linear-learner', region=boto3.Session().region_name)

# https://aws.amazon.com/ec2/instance-types/
# https://docs.aws.amazon.com/pt_br/AWSEC2/latest/UserGuide/instance-types.html
# https://docs.aws.amazon.com/sagemaker/latest/dg/notebooks-available-instance-types.html
# https://sagemaker.readthedocs.io/en/stable/api/training/estimators.html
# https://aws.amazon.com/pt/about-aws/whats-new/2019/08/amazon-sagemaker-launches-managed-spot-training-saving-machine-learning-training-costs/
linear = sagemaker.estimator.Estimator(image_uri = container,
                                       role = role,
                                       instance_count = 1,
                                       instance_type = 'ml.m4.xlarge',
                                       output_path = output_location,
                                       sagemaker_session = session,
                                       use_stop_instances = True)

# https://docs.aws.amazon.com/sagemaker/latest/dg/ll_hyperparameters.html
linear.set_hyperparameters(feature_dim = 80,
                           predictor_type = 'binary_classifier',
                           num_models = 8)

linear.fit({'train': s3_train_data_pca})

2022-05-25 13:47:25 Starting - Starting the training job...
2022-05-25 13:47:52 Starting - Preparing the instances for trainingProfilerReport-1653486445: InProgress
.........
2022-05-25 13:49:21 Downloading - Downloading input data......
2022-05-25 13:50:26 Training - Downloading the training image.[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[05/25/2022 13:50:36 INFO 140002932959040] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-input.json: {'mini_batch_size': '1000', 'epochs': '15', 'feature_dim': 'auto', 'use_bias': 'true', 'binary_classifier_model_selection_criteria': 'accuracy', 'f_beta': '1.0', 'target_recall': '0.8', 'target_precision': '0.8', 'num_models': 'auto', 'num_calibration_samples': '10000000', 'init_method': 'uniform', 'init_scale': '0.07', 'init_sigma': '0.01', 'init_bias': '0.0', 'optimizer': 'auto', 'loss': 'auto', 'margin': '1.0', 'qu

# Deploy, previsões e avaliação

In [98]:
linear_classifier = linear.deploy(initial_instance_count = 1, instance_type = 'ml.m4.xlarge')

------!

In [99]:
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer
linear_classifier.serializer = CSVSerializer()
linear_classifier.deserializer = JSONDeserializer()

In [100]:
X_teste.shape

(3600, 80)

In [None]:
results = linear_classifier.predict(X_teste)
results

In [102]:
previsoes = np.array([r['predicted_label'] for r in results['predictions']])
previsoes

array([0, 1, 0, ..., 0, 0, 1])

In [103]:
y_teste = np.array(y_teste).astype(int)
y_teste

array([0, 1, 0, ..., 0, 0, 0])

In [104]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [105]:
accuracy_score(y_teste, previsoes)

0.8455555555555555

In [106]:
cm = confusion_matrix(y_teste, previsoes)
cm

array([[2516,  189],
       [ 367,  528]])

In [107]:
print(classification_report(y_teste, previsoes))

              precision    recall  f1-score   support

           0       0.87      0.93      0.90      2705
           1       0.74      0.59      0.66       895

    accuracy                           0.85      3600
   macro avg       0.80      0.76      0.78      3600
weighted avg       0.84      0.85      0.84      3600

