In [1]:
!pip install category_encoders



In [2]:
import numpy as np
import pandas as pd
import inflection
import sagemaker
import boto3
import os

from category_encoders       import OneHotEncoder
from xgboost                 import XGBClassifier
from sklearn.preprocessing   import MinMaxScaler
from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedKFold
from sklearn.metrics         import balanced_accuracy_score, precision_score, classification_report
from sklearn.metrics         import recall_score, f1_score, make_scorer, cohen_kappa_score
from sagemaker.debugger      import Rule, ProfilerRule, rule_configs
from sagemaker.session       import TrainingInput
from sagemaker.serializers   import CSVSerializer

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [3]:
def ml_scores(model_name, y_true, y_pred):
    """
    Summarise the performance of a binary classifier as a one-row DataFrame.

    Parameters:
    - model_name (str): Label used as the index of the returned row.
    - y_true (array-like): True labels for the data.
    - y_pred (array-like): Predicted labels for the data.

    Returns:
    - pandas.DataFrame: A single row indexed by ``model_name`` with the columns
      'Balanced Accuracy', 'Precision', 'Recall', 'F1' and 'Kappa',
      each rounded to 3 decimal places.

    Notes:
    - Precision, recall and F1 are computed with the scikit-learn defaults
      (binary averaging), so the labels are expected to be binary.

    Example usage:
    >>> y_true = [0, 1, 1, 0, 1]
    >>> y_pred = [0, 1, 0, 0, 1]
    >>> print(ml_scores('MyClassifier', y_true, y_pred))
                  Balanced Accuracy  Precision  Recall   F1  Kappa
    MyClassifier              0.833        1.0   0.667  0.8  0.615
    """

    # Column label -> metric function; dict order fixes the column order.
    metric_fns = {
        'Balanced Accuracy': balanced_accuracy_score,
        'Precision': precision_score,
        'Recall': recall_score,
        'F1': f1_score,
        'Kappa': cohen_kappa_score,
    }

    row = {label: np.round(metric(y_true, y_pred), 3)
           for label, metric in metric_fns.items()}

    return pd.DataFrame(row, index=[model_name])

In [4]:
def ml_cv_results(model_name, model, x, y, verbose=1, n_splits=5, random_state=None):
    """
    Run shuffled stratified k-fold cross-validation and summarise binary classification metrics.

    For every fold a MinMaxScaler is fit on the training part only and applied to the
    held-out part, the model is (re)fit on the scaled training part and scored on the
    scaled held-out part.

    Parameters:
    - model_name (str): Label used as the index of the returned row.
    - model (object): Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is fit in
      place, so after the call it holds the fit from the last fold.
    - x (pd.DataFrame): Input features (row-indexed positionally via ``.iloc``).
    - y (pd.Series): True labels (row-indexed positionally via ``.iloc``).
    - verbose (int, optional): If greater than 0, print the fold number. Default is 1.
    - n_splits (int, optional): Number of folds. Default is 5.
    - random_state (int or None, optional): Seed for the fold shuffling. The default
      ``None`` keeps the previous, unseeded behaviour; pass an int for reproducible folds.

    Returns:
    - pandas.DataFrame: A single row indexed by ``model_name`` with the columns
      'Balanced Accuracy', 'Precision', 'Recall', 'F1' and 'Kappa'. Each cell is the
      string "{mean} +/- {std}" across folds, always formatted with 3 decimals
      (e.g. "0.948 +/- 0.004").

    Notes:
    - Precision, recall and F1 are computed with the scikit-learn defaults
      (binary averaging), so the labels are expected to be binary.

    Example usage:
    >>> from sklearn.linear_model import LogisticRegression
    >>> from sklearn.datasets import load_breast_cancer
    >>> data = load_breast_cancer()
    >>> x = pd.DataFrame(data.data, columns=data.feature_names)
    >>> y = pd.Series(data.target)
    >>> cv_results = ml_cv_results('LogisticRegression', LogisticRegression(), x, y,
    ...                            verbose=0, random_state=0)
    >>> list(cv_results.columns)
    ['Balanced Accuracy', 'Precision', 'Recall', 'F1', 'Kappa']
    """

    # Column label -> metric function; dict order fixes the column order.
    metric_fns = {
        'Balanced Accuracy': balanced_accuracy_score,
        'Precision': precision_score,
        'Recall': recall_score,
        'F1': f1_score,
        'Kappa': cohen_kappa_score,
    }
    fold_scores = {label: [] for label in metric_fns}

    scaler = MinMaxScaler()
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    for fold, (index_train, index_test) in enumerate(skf.split(x, y), start=1):
        if verbose > 0:
            print('Fold K=%i' % fold)

        ## selecting train and test
        x_train, x_test = x.iloc[index_train], x.iloc[index_test]
        y_train, y_test = y.iloc[index_train], y.iloc[index_test]

        ## scaling parameters come from the training part only (no leakage from the held-out part)
        x_train = scaler.fit_transform(x_train)
        x_test = scaler.transform(x_test)

        ## training the model
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)

        ## saving the metrics
        for label, metric in metric_fns.items():
            fold_scores[label].append(metric(y_test, y_pred))

    # Fixed-width formatting avoids float noise such as "0.9480000000000001".
    summary = {label: "{:.3f} +/- {:.3f}".format(np.mean(scores), np.std(scores))
               for label, scores in fold_scores.items()}

    return pd.DataFrame(summary, index=[model_name])

## Data Load

In [5]:
# Load the dataset from S3
#
# Column reference:
#   step           - represents a unit of time where 1 step equals 1 hour
#   type           - type of online transaction
#   amount         - the amount of the transaction
#   nameOrig       - customer starting the transaction
#   oldbalanceOrg  - balance before the transaction
#   newbalanceOrig - balance after the transaction
#   nameDest       - recipient of the transaction
#   oldbalanceDest - initial balance of recipient before the transaction
#   newbalanceDest - the new balance of recipient after the transaction
#   isFraud        - fraud transaction
s3_data_location = 's3://archana-training-data/onlinefraud.csv'
data = pd.read_csv(s3_data_location)
print(data.head())

   step      type    amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0     1   PAYMENT   9839.64  C1231006815       170136.0       160296.36   
1     1   PAYMENT   1864.28  C1666544295        21249.0        19384.72   
2     1  TRANSFER    181.00  C1305486145          181.0            0.00   
3     1  CASH_OUT    181.00   C840083671          181.0            0.00   
4     1   PAYMENT  11668.14  C2048537720        41554.0        29885.86   

      nameDest  oldbalanceDest  newbalanceDest  isFraud  isFlaggedFraud  
0  M1979787155             0.0             0.0        0               0  
1  M2044282225             0.0             0.0        0               0  
2   C553264065             0.0             0.0        1               0  
3    C38997010         21182.0             0.0        1               0  
4  M1230701703             0.0             0.0        0               0  


## Data Processing

In [6]:
# Columns Rename: convert every column name to snake_case
# (the later cells rely on names such as 'oldbalance_org' and 'is_fraud').
snakecase = inflection.underscore

cols_old = list(data.columns)
cols_new = [snakecase(col) for col in cols_old]

data.columns = cols_new

In [7]:
# Change Data Type: recode the 0/1 flags as 'no'/'yes' labels.
# NOTE: .map() turns every value missing from the mapping into NaN, so running this
# cell a second time on the already recoded columns would blank them out.
flag_labels = {1: 'yes', 0: 'no'}
for flag_col in ('is_fraud', 'is_flagged_fraud'):
    data[flag_col] = data[flag_col].map(flag_labels)

In [8]:
# Feature Engineering
# All derived columns are computed with vectorized operations instead of row-wise
# .apply(lambda ...), which is far slower on a frame of this size.
newData = data.copy()

# step counts hours, so express it in days and in weeks as well
newData['step_days'] = newData['step'] / 24
newData['step_weeks'] = newData['step'] / (24 * 7)

# difference between initial balance before the transaction and new balance after the transaction
newData['diff_new_old_balance'] = newData['newbalance_orig'] - newData['oldbalance_org']

# difference between initial balance recipient before the transaction and new balance recipient after the transaction.
newData['diff_new_old_destiny'] = newData['newbalance_dest'] - newData['oldbalance_dest']

# name orig and name dest: keep only the leading character of each identifier
newData['name_orig'] = newData['name_orig'].str[0]
newData['name_dest'] = newData['name_dest'].str[0]

In [9]:
# Splitting Data into Train, Valid and Test
# A fixed seed makes both splits reproducible across kernel restarts.
SPLIT_SEED = 42

# Features: drop the target, the flag column and the helper columns not used for modelling.
X = newData.drop(columns=['is_fraud', 'is_flagged_fraud', 'name_orig', 'name_dest',
                          'step_weeks', 'step_days'])
# Target: map the 'yes'/'no' labels back to 1/0.
y = newData['is_fraud'].map({'yes': 1, 'no': 0})

# spliting into temp (80%) and test (20%), keeping the class ratio in both parts
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=.2, stratify=y, random_state=SPLIT_SEED)

# spliting temp into train (80% of temp) and valid (20% of temp), again stratified
X_train, X_valid, y_train, y_valid = train_test_split(
    X_temp, y_temp, test_size=.2, stratify=y_temp, random_state=SPLIT_SEED)

In [10]:
# One Hot Encoder
# Expands the categorical 'type' column into indicator columns named after the
# category values (use_cat_names=True), e.g. 'type_TRANSFER'.
ohe = OneHotEncoder(cols=['type'], use_cat_names=True)

# Pair 1: encoder fit on the train split, then applied to the validation split.
X_train = ohe.fit_transform(X_train)
X_valid = ohe.transform(X_valid)

# Pair 2: the same encoder object is re-fit on train+valid (X_temp), then applied to the test split.
# After this cell `ohe` holds the fit from X_temp; the earlier fit on X_train is gone.
X_temp = ohe.fit_transform(X_temp)
X_test = ohe.transform(X_test)

In [11]:
# Rescaling
# Numeric columns that are min-max scaled; all other columns are left as they are.
num_columns = ['amount', 'oldbalance_org', 'newbalance_orig', 'oldbalance_dest', 'newbalance_dest',
               'diff_new_old_balance', 'diff_new_old_destiny']
mm = MinMaxScaler()
# X_params will hold the scaled version of X_temp; X_temp itself stays unscaled.
X_params = X_temp.copy()

# Pair 1: scaler fit on the train split, then applied to the validation split.
X_train[num_columns] = mm.fit_transform(X_train[num_columns])
X_valid[num_columns] = mm.transform(X_valid[num_columns])

# Pair 2: the same scaler object is re-fit on X_temp and applied to the test split.
# NOTE(review): X_train/X_valid are scaled with parameters learned from X_train, while
# X_test is scaled with parameters learned from X_temp. The train/validation/test CSVs
# exported later therefore do not share one scaling; confirm this is intended.
X_params[num_columns] = mm.fit_transform(X_temp[num_columns])
X_test[num_columns] = mm.transform(X_test[num_columns])

In [12]:
# Feature subset used by every model in the rest of the notebook.
final_columns_selected = [
    'step',
    'oldbalance_org',
    'newbalance_orig',
    'newbalance_dest',
    'diff_new_old_balance',
    'diff_new_old_destiny',
    'type_TRANSFER',
]

In [13]:
# Restrict every feature frame to the selected columns ("_cs" = columns selected).
X_train_cs, X_valid_cs, X_temp_cs, X_test_cs, X_params_cs = (
    frame[final_columns_selected]
    for frame in (X_train, X_valid, X_temp, X_test, X_params)
)

In [14]:
# Quick look at the first rows of the training features and labels.
print(X_train_cs.head(), y_train.head(), sep='\n')

         step  oldbalance_org  newbalance_orig  newbalance_dest  \
1035243    94        0.047279         0.058727         0.000784   
1977046   178        0.001274         0.000000         0.011089   
3507208   259        0.000436         0.000362         0.000000   
496331     20        0.000000         0.000000         0.002438   
5110653   355        0.000000         0.000000         0.000000   

         diff_new_old_balance  diff_new_old_destiny  type_TRANSFER  
1035243              0.856759              0.109188              0  
1977046              0.842264              0.110734              0  
3507208              0.848023              0.109987              0  
496331               0.848708              0.111695              1  
5110653              0.848708              0.109987              0  
1035243    0
1977046    0
3507208    0
496331     0
5110653    0
Name: is_fraud, dtype: int64


## XGB Classifier

XGBoost, short for "Extreme Gradient Boosting," is a popular machine learning algorithm known for its exceptional performance in both classification and regression tasks. It belongs to the gradient boosting family of algorithms.

In [15]:
# Baseline: XGBoost with default settings, fit on the train split and
# used to predict the validation split.
xgb = XGBClassifier().fit(X_train_cs, y_train)
y_pred = xgb.predict(X_valid_cs)

In [16]:
# Score the baseline predictions against the validation labels.
xgb_results = ml_scores(model_name='XGBoost', y_true=y_valid, y_pred=y_pred)
xgb_results

Unnamed: 0,Balanced Accuracy,Precision,Recall,F1,Kappa
XGBoost,0.921,0.96,0.842,0.897,0.897


In [17]:
# Classification Report: per-class precision / recall / F1 on the validation split
print(classification_report(y_true=y_valid, y_pred=y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1016706
           1       0.96      0.84      0.90      1314

    accuracy                           1.00   1018020
   macro avg       0.98      0.92      0.95   1018020
weighted avg       1.00      1.00      1.00   1018020



In [18]:
# Cross Validation on train+valid; the features are passed unscaled because
# ml_cv_results applies its own scaler inside every fold.
xgb_cv = ml_cv_results(
    model_name='XGBoost',
    model=XGBClassifier(),
    x=X_temp_cs,
    y=y_temp,
)
xgb_cv

Fold K=1
Fold K=2
Fold K=3
Fold K=4
Fold K=5


Unnamed: 0,Balanced Accuracy,Precision,Recall,F1,Kappa
XGBoost,0.918 +/- 0.003,0.9480000000000001 +/- 0.004,0.837 +/- 0.006,0.889 +/- 0.003,0.889 +/- 0.003


## Hyperparameter Fine Tuning
This code snippet demonstrates hyperparameter fine-tuning using GridSearchCV for an XGBoost classifier (XGBClassifier). It searches through various hyperparameter combinations, including booster type, learning rate (eta), and class weight scaling, optimizing for F1 score as the evaluation metric. The process employs a 3-fold stratified cross-validation scheme to find the best combination of hyperparameters for improved model performance.

In [19]:
# Wrap f1_score as a scorer object so it can be passed to GridSearchCV(scoring=...).
f1 = make_scorer(f1_score)

In [20]:
# Search grid: 2 boosters x 2 learning rates x 3 positive-class weights = 12 candidates.
# NOTE(review): the origin of the weights 508 and 99 is not shown in this notebook.
params = dict(
    booster=['gbtree', 'gblinear'],
    eta=[0.3, 0.01],
    scale_pos_weight=[1, 508, 99],
)

In [21]:
# Exhaustive search over `params`, scoring every candidate by F1 on
# 3 stratified (unshuffled) folds of the scaled train+valid data.
cv_folds = StratifiedKFold(n_splits=3)
gs = GridSearchCV(
    estimator=XGBClassifier(),
    param_grid=params,
    scoring=f1,
    cv=cv_folds,
    verbose=10,
)

gs.fit(X_params_cs, y_temp)

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV 1/3; 1/12] START booster=gbtree, eta=0.3, scale_pos_weight=1................
[CV 1/3; 1/12] END booster=gbtree, eta=0.3, scale_pos_weight=1;, score=0.892 total time=  13.3s
[CV 2/3; 1/12] START booster=gbtree, eta=0.3, scale_pos_weight=1................
[CV 2/3; 1/12] END booster=gbtree, eta=0.3, scale_pos_weight=1;, score=0.893 total time=  13.6s
[CV 3/3; 1/12] START booster=gbtree, eta=0.3, scale_pos_weight=1................
[CV 3/3; 1/12] END booster=gbtree, eta=0.3, scale_pos_weight=1;, score=0.887 total time=  13.5s
[CV 1/3; 2/12] START booster=gbtree, eta=0.3, scale_pos_weight=508..............
[CV 1/3; 2/12] END booster=gbtree, eta=0.3, scale_pos_weight=508;, score=0.596 total time=  14.4s
[CV 2/3; 2/12] START booster=gbtree, eta=0.3, scale_pos_weight=508..............
[CV 2/3; 2/12] END booster=gbtree, eta=0.3, scale_pos_weight=508;, score=0.162 total time=  15.8s
[CV 3/3; 2/12] START booster=gbtree, eta=0.3, scal

In [22]:
# Parameter combination with the highest mean cross-validated F1.
best_params = gs.best_params_
best_params

{'booster': 'gbtree', 'eta': 0.3, 'scale_pos_weight': 1}

In [23]:
# Hard-coded copy of the grid-search result shown above. It overwrites the value read
# from `gs`, so a later change to the grid will NOT be picked up unless this line is updated.
best_params = {'booster': 'gbtree', 'eta': 0.3, 'scale_pos_weight': 1}

In [24]:
# Mean cross-validated F1 of the best parameter combination.
gs.best_score_

0.8902458161596596

## Final Model

In [25]:
# Training the final model with the best hyperparameters.
# Only the three tuned settings are forwarded; everything else stays at the XGBClassifier defaults.
tuned_settings = {name: best_params[name]
                  for name in ('booster', 'eta', 'scale_pos_weight')}
final_model = XGBClassifier(**tuned_settings)

final_model.fit(X_params_cs, y_temp)

In [26]:
# Predict the held-out test split; note this reuses (overwrites) the name `y_pred`
# that previously held the validation predictions.
y_pred = final_model.predict(X_test_cs)

In [27]:
# Score the final model on the held-out test split.
unseen_scores = ml_scores(model_name='unseen', y_true=y_test, y_pred=y_pred)
unseen_scores

Unnamed: 0,Balanced Accuracy,Precision,Recall,F1,Kappa
unseen,0.928,0.97,0.857,0.91,0.91


## Train and Deploy the Model as a SageMaker Endpoint

In [28]:
# Build the train / validation / test tables that are exported as CSV below.
# Each table has the target 'is_fraud' (as int) in the first column, followed by
# the selected feature columns, aligned on the feature frame's index.
def _label_first(labels, features):
    """Return `features` with `labels` prepended as an int column named 'is_fraud'."""
    target = pd.Series(labels, index=features.index, name='is_fraud', dtype=int)
    return pd.concat([target, features], axis=1)


train = _label_first(y_train, X_train_cs)
validation = _label_first(y_valid, X_valid_cs)
test = _label_first(y_test, X_test_cs)

In [29]:
# Save csv: write each table without the index column and without a header row.
for frame, csv_name in ((train, 'train_final.csv'),
                        (validation, 'validation_final.csv'),
                        (test, 'test_final.csv')):
    frame.to_csv(csv_name, index=False, header=False)

In [30]:
# Upload the data into s3 bucket for better access and streaming during training.
bucket = "archana-training-data"
print("Training data is stored in : {}".format(bucket))
prefix = "demo-sagemaker-xgboost-fraud-detection-prediction"

# One bucket handle is reused for all uploads instead of opening a new session per file.
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)

for csv_name in ('train_final.csv', 'validation_final.csv', 'test_final.csv'):
    # S3 keys always use '/', so build them explicitly rather than with os.path.join,
    # which would insert the local OS separator.
    object_key = '{}/data/{}'.format(prefix, csv_name)
    s3_bucket.Object(object_key).upload_file(csv_name)

Training data is stored in : archana-training-data


In [31]:
# List the objects under the data prefix to confirm the three CSV files were uploaded.
! aws s3 ls {bucket}/{prefix}/data --recursive

2023-11-25 03:42:12  112838841 demo-sagemaker-xgboost-fraud-detection-prediction/data/test_final.csv
2023-11-25 03:42:08  360986870 demo-sagemaker-xgboost-fraud-detection-prediction/data/train_final.csv
2023-11-25 03:42:11   90262459 demo-sagemaker-xgboost-fraud-detection-prediction/data/validation_final.csv


In [32]:
# AWS region of the current SageMaker session; used below to look up the XGBoost container image.
region = sagemaker.Session().boto_region_name
print("AWS Region: {}".format(region))

# IAM role of this notebook's execution environment; passed to the training job below.
# NOTE: the printed ARN contains the AWS account id; consider clearing this output before sharing.
role = sagemaker.get_execution_role()
print("RoleArn: {}".format(role))

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
AWS Region: us-east-2
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
RoleArn: arn:aws:iam::058263730813:role/service-role/AmazonSageMakerServiceCatalogProductsUseRole


In [33]:
# Define the model along with the training parameters
# Model artifacts are written below <bucket>/<prefix>/xgboost_model.
s3_output_location = 's3://{}/{}/{}'.format(bucket, prefix, 'xgboost_model')
# Container image of the XGBoost algorithm, version "1.2-1", for this region.
container = sagemaker.image_uris.retrieve("xgboost", region, "1.2-1")

training_session = sagemaker.Session()
# Rules attached to the training job: an XGBoost report and a profiler report.
training_rules = [
    Rule.sagemaker(rule_configs.create_xgboost_report()),
    ProfilerRule.sagemaker(rule_configs.ProfilerReport()),
]

xgb_model = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    volume_size=5,
    output_path=s3_output_location,
    sagemaker_session=training_session,
    rules=training_rules,
)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [34]:
# Set the hyperparameters for the SageMaker training job.
# Only eta = 0.3 corresponds to the grid-search result (best_params); max_depth, gamma,
# min_child_weight and subsample were not part of the searched grid.
# NOTE(review): num_round = 1 trains a single boosting round, whereas the local
# XGBClassifier models above ran with the library's default number of rounds.
# Confirm this is intentional (e.g. a quick demo run) before relying on the endpoint.
xgb_model.set_hyperparameters(
    max_depth = 5,
    eta = 0.3,
    gamma = 4,
    min_child_weight = 6,
    subsample = 0.7,
    objective = "binary:logistic",
    num_round = 1,
    verbosity = 2
)

In [35]:
# Define training input: point the "train" and "validation" channels at the
# CSV files uploaded to S3 earlier.
s3_uri_template = "s3://{}/{}/{}"

train_input = TrainingInput(
    s3_uri_template.format(bucket, prefix, "data/train_final.csv"),
    content_type="csv",
)
validation_input = TrainingInput(
    s3_uri_template.format(bucket, prefix, "data/validation_final.csv"),
    content_type="csv",
)

In [36]:
# Begin training using sagemaker instances
# wait=True blocks this cell until the training job has finished.
xgb_model.fit({"train": train_input, "validation": validation_input}, wait=True)

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2023-11-25-03-42-14-333


2023-11-25 03:42:14 Starting - Starting the training job...
2023-11-25 03:42:40 Starting - Preparing the instances for trainingCreateXgboostReport: InProgress
ProfilerReport: InProgress
.........
2023-11-25 03:44:14 Downloading - Downloading input data...
2023-11-25 03:44:41 Training - Downloading the training image......
2023-11-25 03:45:40 Training - Training image download completed. Training in progress..[34m[2023-11-25 03:45:38.622 ip-10-0-231-161.us-east-2.compute.internal:6 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV i

In [37]:
# Verify the model data and location
# Displays the S3 URI of the model.tar.gz artifact produced by the training job.
xgb_model.model_data

's3://archana-training-data/demo-sagemaker-xgboost-fraud-detection-prediction/xgboost_model/sagemaker-xgboost-2023-11-25-03-42-14-333/output/model.tar.gz'

In [38]:
# Deploy the trained model as a sagemaker endpoint for invocation
# CSVSerializer makes the predictor send request payloads as CSV.
# NOTE(review): no cell in this notebook deletes the endpoint afterwards; presumably it
# keeps running (and billing) until removed manually — confirm and add cleanup if needed.
xgb_predictor = xgb_model.deploy(
    initial_instance_count = 1,
    instance_type = 'ml.t2.medium',
    serializer = CSVSerializer()
)

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2023-11-25-03-46-27-047
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2023-11-25-03-46-27-047
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2023-11-25-03-46-27-047


-------!

In [39]:
# Show the name of the endpoint created by the deploy call.
print("SageMaker Endpoint: {}".format(xgb_predictor.endpoint_name))

SageMaker Endpoint: sagemaker-xgboost-2023-11-25-03-46-27-047
