In [107]:
import boto3
import botocore
import sagemaker

import pandas as pd
import numpy as np

import os
import json
import requests

In [142]:
!pip install ipynb

Collecting ipynb
  Downloading ipynb-0.5.1-py3-none-any.whl (6.9 kB)
Installing collected packages: ipynb
Successfully installed ipynb-0.5.1
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m


In [99]:
# SageMaker / AWS session setup
bucket = 'klarnadataset'  # S3 bucket used for the datasets and the model output

# boto3 session, used here to look up the configured region
session = boto3.session.Session()
region_name = session.region_name
smclient = boto3.Session().client('sagemaker')  # low-level SageMaker API client

# SageMaker SDK session and the execution role passed to the estimator
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()

#### The model is XGBoost, a widely used tree-based gradient boosting algorithm.
#### Hyperparameter optimization is done with SageMaker hyperparameter tuning.

In [236]:
from sagemaker.tuner import ContinuousParameter,IntegerParameter,HyperparameterTuner
from sagemaker.session import TrainingInput
from sagemaker.serializers import CSVSerializer

# model output path
prefix = 'xgboost_model'
output_path = 's3://{}/{}/{}/output'.format(bucket, prefix, 'xgb')

# URI of the built-in XGBoost training container for the current region.
# NOTE(review): `training_image` was not defined in any cell of this notebook, so this
# cell raised NameError on a fresh kernel. The container version below is an assumption;
# confirm it matches the version the deployed model was trained with.
training_image = sagemaker.image_uris.retrieve(framework='xgboost',
                                               region=region_name,
                                               version='1.2-1')

# construct a SageMaker estimator that calls the xgboost-container
estimator = sagemaker.estimator.Estimator(image_uri=training_image,
                                          role=role,
                                          instance_count=1,
                                          instance_type='ml.m5.2xlarge',
                                          volume_size=5, # 5 GB
                                          output_path=output_path
                                          )

# fixed (non-tuned) hyperparameters
estimator.set_hyperparameters(eval_metric='auc', objective='binary:logistic', num_round=100)

# hyperparameters to optimize
hyperparameter_ranges = {'eta': ContinuousParameter(0, 1), 'min_child_weight': ContinuousParameter(1, 10),
                        'alpha': ContinuousParameter(0, 2), 'max_depth': IntegerParameter(1, 6)}

# the objective function maximizes the AUC of the validation set
tuner = HyperparameterTuner(estimator,
                            objective_metric_name='validation:auc',
                            objective_type='Maximize',
                            hyperparameter_ranges=hyperparameter_ranges,
                            max_jobs=9,
                            max_parallel_jobs=3)


# input training and validation datasets
prefix = 'modeling'
train_input = TrainingInput(
    "s3://{}/{}/{}".format(bucket, prefix, "train.csv"), content_type="csv"
)
validation_input = TrainingInput(
    "s3://{}/{}/{}".format(bucket, prefix, "val.csv"), content_type="csv"
)

# launch the tuning job (up to 9 training jobs, 3 at a time)
tuner.fit({'train': train_input, 'validation': validation_input})

..................................................................................................................................!


In [254]:
# Deploy the best model found by the tuner behind a real-time endpoint;
# requests to the endpoint are serialized as CSV.
endpoint_name = '20210811xgb2'
xgb_model_endpoint = tuner.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
    endpoint_name=endpoint_name,
    serializer=CSVSerializer(),
)


2021-08-11 21:15:43 Starting - Preparing the instances for training
2021-08-11 21:15:43 Downloading - Downloading input data
2021-08-11 21:15:43 Training - Training image download completed. Training in progress.
2021-08-11 21:15:43 Uploading - Uploading generated training model
2021-08-11 21:15:43 Completed - Training job completed
---------------!

In [108]:
# Re-attach to the already deployed endpoint (if the notebook is restarted).
# `RealTimePredictor` was renamed to `Predictor` in sagemaker>=2; the old name only
# survives as a deprecated alias. `endpoint_name` must already be set.
xgb_model_endpoint = sagemaker.predictor.Predictor(endpoint_name=endpoint_name,
                                                   serializer=CSVSerializer())

The class RealTimePredictor has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [104]:
# Load the train / validation / test splits from S3.
prefix = 'modeling'

train_location = 's3://{}/{}/{}'.format(bucket, prefix, 'train.csv')
val_location = 's3://{}/{}/{}'.format(bucket, prefix, 'val.csv')
test_location = 's3://{}/{}/{}'.format(bucket, prefix, 'test.csv')

# train / val: the first CSV column becomes the index (read back later as the labels)
train_set = pd.read_csv(train_location, sep=',', index_col=0)
val_set = pd.read_csv(val_location, sep=',', index_col=0)

# test: arbitrary integer column names 0..44
test_set = pd.read_csv(test_location, sep=',', names=list(range(45)))

In [110]:
def bytes_to_predictions(df_pred, threshold):
    """Convert a raw endpoint response into scores and thresholded 0/1 labels.

    Parameters
    ----------
    df_pred : bytes, bytearray or str
        Comma-separated scores, e.g. ``b'0.12,0.87'``. Newlines are also accepted
        as separators and surrounding whitespace is ignored.
    threshold : float
        Scores strictly below ``threshold`` map to 0, all other scores map to 1.

    Returns
    -------
    tuple
        ``(scores, labels)`` where ``scores`` is a list of the score strings and
        ``labels`` is a NumPy integer array of 0/1 values of the same length.

    Raises
    ------
    ValueError
        If a token cannot be parsed as a float.
    """
    # Decode the payload instead of slicing its repr (``str(b'..')[2:-1]``), which
    # broke on trailing newlines and on inputs that were not bytes.
    if isinstance(df_pred, (bytes, bytearray)):
        text = bytes(df_pred).decode('utf-8')
    else:
        text = str(df_pred)

    tokens = (token.strip() for token in text.replace('\n', ',').split(','))
    df_pred_list = [token for token in tokens if token]

    scores = np.array([float(token) for token in df_pred_list], dtype=float)
    # np.where keeps the original rule: 0 if score < threshold else 1
    df_pred_list_0_1 = np.where(scores < threshold, 0, 1)
    return df_pred_list, df_pred_list_0_1

In [111]:
# Score the train subset, the validation set and the test set through the endpoint,
# then threshold the returned scores at 0.5.

# train: only the first 15000 rows are sent (more examples give error)
train_miniset = train_set.iloc[:15000]
train_miniset_labels = train_miniset.index.values
train_miniset_predictions = xgb_model_endpoint.predict(train_miniset.to_numpy())
train_miniset_predictions_list, train_miniset_prediction_0_1 = bytes_to_predictions(
    train_miniset_predictions, 0.5
)

# validation: the labels are stored in the index
val_labels = val_set.index.values
val_predictions = xgb_model_endpoint.predict(val_set.to_numpy())
val_predictions_list, val_predictions_0_1 = bytes_to_predictions(val_predictions, 0.5)

# test: no labels are read for this set
test_predictions = xgb_model_endpoint.predict(test_set.to_numpy())
test_predictions_list, test_predictions_0_1 = bytes_to_predictions(test_predictions, 0.5)

### performance of the model on the train and val sets

In [114]:
from sklearn.metrics import confusion_matrix, classification_report

### Training set

In [120]:
# confusion matrix on the training subset (true labels first, thresholded predictions second)
confusion_matrix(train_miniset_labels,train_miniset_prediction_0_1)  # optionally add normalize='true'

array([[8597,  760],
       [ 946, 4697]])

In [122]:
# per-class precision / recall / F1 on the training subset
print(classification_report(train_miniset_labels,train_miniset_prediction_0_1))

              precision    recall  f1-score   support

         0.0       0.90      0.92      0.91      9357
         1.0       0.86      0.83      0.85      5643

    accuracy                           0.89     15000
   macro avg       0.88      0.88      0.88     15000
weighted avg       0.89      0.89      0.89     15000



### Validation set

In [125]:
# confusion matrix on the validation set (true labels first, thresholded predictions second)
confusion_matrix(val_labels,val_predictions_0_1)

array([[9810,  825],
       [ 122,  143]])

In [127]:
# per-class precision / recall / F1 on the validation set
print(classification_report(val_labels,val_predictions_0_1))

              precision    recall  f1-score   support

         0.0       0.99      0.92      0.95     10635
         1.0       0.15      0.54      0.23       265

    accuracy                           0.91     10900
   macro avg       0.57      0.73      0.59     10900
weighted avg       0.97      0.91      0.94     10900



#### The model is clearly overfitting: it performs well on the training subset but poorly on the validation set, especially for the positive class.
#### For the sake of time, no further optimization is carried out, although there is plenty of room for improvement.

### Test set

In [131]:
# load the test-set uuids from S3
prefix = 'data_prep'
data_key_df = 'test_uuid.csv'
data_location = 's3://{}/{}/{}'.format(bucket, prefix, data_key_df)

# the uuids are taken from the column labels of the parsed file;
# surrounding single quotes are removed
test_uuid = pd.read_csv(data_location)
test_uuid_list = [label.strip("'") for label in test_uuid.columns]

In [133]:
# one row per test uuid, paired with its thresholded (0/1) prediction
test_preds_with_uuid = (
    pd.DataFrame(np.array(test_uuid_list), columns=['uuid'])
    .assign(predictions=test_predictions_0_1)
)
test_preds_with_uuid.head()

Unnamed: 0,uuid,predictions
0,6f6e6c6a-2081-4e6b-8eb3-4fd89b54b2d7,0
1,f6f6d9f3-ef2b-4329-a388-c6a687f27e70,0
2,e9c39869-1bc5-4375-b627-a2df70b445ea,0
3,6beb88a3-9641-4381-beb6-c9a208664dd0,0
4,bb89b735-72fe-42a4-ba06-d63be0f4ca36,0


In [None]:
# Write the predictions to a local .csv file, then upload that file to S3.
# NOTE: `prefix` is the value assigned in an earlier cell.
predictions_file = 'test_predictions.csv'
test_preds_with_uuid.to_csv(predictions_file)

# S3 object keys always use '/' as the separator; os.path.join is OS-dependent
# (it would emit a backslash on Windows), so the key is built explicitly.
s3_key = '{}/{}'.format(prefix, predictions_file)
boto3.Session().resource('s3').Bucket(bucket).Object(s3_key).upload_file(predictions_file)

## REST API

#### A REST API deployed with Amazon API Gateway calls an AWS Lambda function to predict a single row of data.
#### The data preprocessing pipeline is not integrated into the API yet, so rows must be preprocessed before they are sent.

In [134]:
# Endpoint of the REST API
url = 'https://tkqrt2qg9k.execute-api.us-east-2.amazonaws.com/test/klarna'

# Fake row: comma-separated feature values sent under the "data" key
data = {"data":"0.08752,0.07697999999999999,0.047510000000000004,0.033839999999999995,0.1809,13.49,22.3,86.91,561.0,0.08752,0.07697999999999999,0.047510000000000004,0.033839999999999995,0.1809,13.49,22.3,86.91,561.0,0.08752,0.07697999999999999,0.047510000000000004,0.033839999999999995,0.1809,0.057179999999999995,0.2338,1.3530000000000002,1.735,20.2,0.004455,0.013819999999999999,0.02095,0.01184,0.01641,0.001956,15.15,31.82,99.0,698.8,0.1162,0.1711,0.2282,0.1282,0.2871,0.06917000000000001"}
json_str = json.dumps(data)

# POST the JSON string as the request body
x = requests.post(url, data=json_str)

print(x.text)  # Defaulter or Non defaulter

"Non defaulter"


In [145]:
# call function 'preprocessing_data' from pipeline.ipynb
from ipynb.fs.full.pipeline import preprocessing_data

us-east-2


#### First, the whole dataset is preprocessed. Second, the prediction takes place through the API.

In [149]:
# load the original dataset from S3 (semicolon-separated file at the bucket root)
data_key = 'dataset.csv'
data_location = 's3://{}/{}'.format(bucket, data_key)
original_dataset = pd.read_csv(data_location, sep=';')

In [150]:
# it takes a few minutes to pre-process the whole original dataset
# (preprocessing_data is imported from pipeline.ipynb)
processed_dataset = preprocessing_data(original_dataset)

In [178]:
# number of pre-processed rows, reused when sampling a random row
rows = len(processed_dataset)
processed_dataset.shape

(99976, 45)

In [196]:
# load the column names of the pre-processed training data from S3
# NOTE(review): `prefix` is not set in this cell; it relies on the value assigned in an
# earlier cell ('data_prep' when the notebook is run top to bottom) -- confirm that this
# is where train_columns.csv lives.
columns_location = 's3://{}/{}/{}'.format(bucket, prefix, 'train_columns.csv')
columns = pd.read_csv(columns_location,delimiter=',',index_col=0)
# only the column labels of the parsed file are kept
columns_after_prepross = columns.columns

In [214]:
def predict_random_row(df,rows):
    """Send one randomly chosen row of ``df`` to the REST API and return that row.

    Relies on the module-level ``url`` (API endpoint) and ``columns_after_prepross``
    (column labels for the returned frame).

    Parameters
    ----------
    df : array-like
        Pre-processed data, indexed as ``df[i, :]``.
        NOTE(review): presumably a 2-D NumPy array -- confirm against preprocessing_data.
    rows : int
        Number of rows to sample from; the row index is drawn from ``range(rows)``.

    Returns
    -------
    pandas.DataFrame
        A one-row frame holding the selected row, labelled with ``columns_after_prepross``.
    """
    # choose a random row
    random_row_number = np.random.choice(np.arange(rows), 1)[0]
    # The original body indexed with the undefined name `random_row`, which only worked
    # when a stale global of that name existed in the kernel.
    row = df[random_row_number, :]
    # convert to a comma-separated string, the format expected under the 'data' key
    data = ','.join(list(row.astype('str')))
    data_dict = {'data': data}
    json_str = json.dumps(data_dict)
    x = requests.post(url, json_str)
    print(x.text)
    return pd.DataFrame([row], columns=columns_after_prepross)

In [216]:
# send one randomly chosen pre-processed row to the API and display that row
df_random_row = predict_random_row(processed_dataset,rows)
df_random_row

"Non defaulter"


Unnamed: 0,account_amount_added_12_24m,account_days_in_dc_12_24m,account_days_in_rem_12_24m,account_days_in_term_12_24m,account_status,age,avg_payment_span_0_12m,avg_payment_span_0_3m,has_paid,max_paid_inv_0_12m,...,merchant_group_Clothing & Shoes,merchant_group_Electronics,merchant_group_Entertainment,merchant_group_Erotic Materials,merchant_group_Food & Beverage,merchant_group_Health & Beauty,merchant_group_Home & Garden,merchant_group_Intangible products,merchant_group_Jewelry & Accessories,"merchant_group_Leisure, Sport & Hobby"
0,0.0,0.0,0.0,0.0,0.0,1.05,-0.934908,-0.860465,1.0,-0.645522,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
