In [1]:
!pip install category_encoders



In [2]:
import numpy as np
import pandas as pd
import inflection

from category_encoders       import OneHotEncoder
from sklearn.linear_model    import LogisticRegression
from sklearn.preprocessing   import MinMaxScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics         import balanced_accuracy_score, precision_score, classification_report
from sklearn.metrics         import recall_score, f1_score, make_scorer, cohen_kappa_score

In [3]:
def ml_scores(model_name, y_true, y_pred):
    """
    Score a set of predictions and return the metrics as a one-row table.

    Parameters:
    - model_name (str): Label used as the index of the returned row.
    - y_true (array-like): Ground-truth labels.
    - y_pred (array-like): Labels predicted by the model.

    Returns:
    - pandas.DataFrame: A single row, indexed by `model_name`, with the columns
      'Balanced Accuracy', 'Precision', 'Recall', 'F1' and 'Kappa'. Every value
      is rounded to 3 decimal places.

    Every scorer is called with scikit-learn's default arguments; for precision,
    recall and F1 that means binary averaging, so the labels are expected to be
    binary.

    Example usage:
    >>> y_true = [0, 1, 1, 0, 1]
    >>> y_pred = [0, 1, 0, 0, 1]
    >>> print(ml_scores('MyClassifier', y_true, y_pred))
                  Balanced Accuracy  Precision  Recall   F1  Kappa
    MyClassifier              0.833        1.0   0.667  0.8  0.615
    """

    # Column label -> scorer; insertion order fixes the column order of the result.
    scorers = {
        'Balanced Accuracy': balanced_accuracy_score,
        'Precision': precision_score,
        'Recall': recall_score,
        'F1': f1_score,
        'Kappa': cohen_kappa_score,
    }

    row = {label: np.round(scorer(y_true, y_pred), 3)
           for label, scorer in scorers.items()}

    return pd.DataFrame(row, index=[model_name])

In [4]:
def ml_cv_results(model_name, model, x, y, verbose=1, n_splits=5, random_state=None):
    """
    Cross-validate a binary classifier and summarise each metric as "mean +/- std".

    The data is split with a shuffled StratifiedKFold. Inside every fold a
    MinMaxScaler is fitted on the training part only and then applied to the
    held-out part, so no information from the held-out rows reaches the scaler.
    `x` must therefore be passed UNSCALED.

    Parameters:
    - model_name (str): Label used as the index of the returned row.
    - model (object): Estimator exposing `fit(X, y)` and `predict(X)`. It is
      re-fitted in place on every fold; after the call it holds the fit from
      the last fold.
    - x (pd.DataFrame): Input features (rows are selected with `.iloc`).
    - y (pd.Series): True labels. Precision, recall and F1 are computed with
      scikit-learn's default binary averaging, so the labels must be binary.
    - verbose (int, optional): If greater than 0, prints "Fold K=<n>" for each
      fold. Default is 1.
    - n_splits (int, optional): Number of folds. Default is 5.
    - random_state (int or None, optional): Seed for the fold shuffling. Pass an
      integer to get the same folds on every call. Default is None (a new
      shuffle on every call).

    Returns:
    - pandas.DataFrame: A single row, indexed by `model_name`, with the columns
      'Balanced Accuracy', 'Precision', 'Recall', 'F1' and 'Kappa'. Each cell is
      the string "{mean:.3f} +/- {std:.3f}" computed over the folds (population
      standard deviation, i.e. numpy's default).

    Example usage:
    >>> from sklearn.linear_model import LogisticRegression
    >>> from sklearn.datasets import load_breast_cancer
    >>> x, y = load_breast_cancer(return_X_y=True, as_frame=True)
    >>> cv_results = ml_cv_results('LogisticRegression', LogisticRegression(),
    ...                            x, y, verbose=0, random_state=0)
    >>> list(cv_results.columns)
    ['Balanced Accuracy', 'Precision', 'Recall', 'F1', 'Kappa']
    """

    # Column label -> scorer; insertion order fixes the column order of the result.
    scorers = {
        "Balanced Accuracy": balanced_accuracy_score,
        "Precision": precision_score,
        "Recall": recall_score,
        "F1": f1_score,
        "Kappa": cohen_kappa_score,
    }
    fold_scores = {label: [] for label in scorers}

    mm = MinMaxScaler()
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # split() only needs the number of rows and the labels, so the pandas
    # objects are passed as they are instead of materialising numpy copies.
    for fold, (index_train, index_test) in enumerate(skf.split(x, y), start=1):
        if verbose > 0:
            print('Fold K=%i' % fold)

        ## selecting train and test
        x_train, x_test = x.iloc[index_train], x.iloc[index_test]
        y_train, y_test = y.iloc[index_train], y.iloc[index_test]

        ## applying the scale (fitted on the training part of the fold only)
        x_train = mm.fit_transform(x_train)
        x_test = mm.transform(x_test)

        ## training the model
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)

        ## saving the metrics
        for label, scorer in scorers.items():
            fold_scores[label].append(scorer(y_test, y_pred))

    # Fixed-width formatting instead of np.round + "{}": the latter prints float
    # artefacts such as "0.013000000000000001" and drops trailing zeros.
    summary = {label: "{:.3f} +/- {:.3f}".format(np.mean(values), np.std(values))
               for label, values in fold_scores.items()}

    return pd.DataFrame(summary, index=[model_name])

## Data Load

In [5]:
# Load the dataset from S3
#
# Column reference:
#   step            - represents a unit of time where 1 step equals 1 hour
#   type            - type of online transaction
#   amount          - the amount of the transaction
#   nameOrig        - customer starting the transaction
#   oldbalanceOrg   - balance before the transaction
#   newbalanceOrig  - balance after the transaction
#   nameDest        - recipient of the transaction
#   oldbalanceDest  - initial balance of recipient before the transaction
#   newbalanceDest  - the new balance of recipient after the transaction
#   isFraud         - fraud transaction
#   isFlaggedFraud  - also present in the file; meaning not documented here
#                     (presumably a flag raised by the source system - TODO confirm)
s3_data_location = 's3://archana-training-data/onlinefraud.csv'
data = pd.read_csv(s3_data_location)
print(data.head())

   step      type    amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0     1   PAYMENT   9839.64  C1231006815       170136.0       160296.36   
1     1   PAYMENT   1864.28  C1666544295        21249.0        19384.72   
2     1  TRANSFER    181.00  C1305486145          181.0            0.00   
3     1  CASH_OUT    181.00   C840083671          181.0            0.00   
4     1   PAYMENT  11668.14  C2048537720        41554.0        29885.86   

      nameDest  oldbalanceDest  newbalanceDest  isFraud  isFlaggedFraud  
0  M1979787155             0.0             0.0        0               0  
1  M2044282225             0.0             0.0        0               0  
2   C553264065             0.0             0.0        1               0  
3    C38997010         21182.0             0.0        1               0  
4  M1230701703             0.0             0.0        0               0  


## Data Processing

In [6]:
# Columns Rename
# The CSV headers are camelCase (e.g. oldbalanceOrg); convert them to
# snake_case (oldbalance_org), which is the spelling used by the cells below.
cols_old = list(data.columns)

snakecase = inflection.underscore
cols_new = [snakecase(name) for name in cols_old]

data.columns = cols_new

In [7]:
# Change Data Type
# Relabel the 0/1 indicators as 'no'/'yes'.
# The dtype guard keeps the cell safe to re-run: mapping the already relabelled
# strings through the same dictionary would turn every value into NaN.
flag_labels = {1: 'yes', 0: 'no'}
for flag_col in ('is_fraud', 'is_flagged_fraud'):
    if pd.api.types.is_numeric_dtype(data[flag_col]):
        data[flag_col] = data[flag_col].map(flag_labels)

In [8]:
# Feature Engineering
# Work on a copy so that `data` keeps the loaded columns and this cell can be re-run.
newData = data.copy()

# step: 1 step equals 1 hour, so express it in days and in weeks as well.
# Vectorised division gives the same values as a row-wise apply without
# calling a Python lambda once per row.
newData['step_days'] = newData['step'] / 24
newData['step_weeks'] = newData['step'] / (24 * 7)

# difference between initial balance before the transaction and new balance after the transaction
newData['diff_new_old_balance'] = newData['newbalance_orig'] - newData['oldbalance_org']

# difference between initial balance recipient before the transaction and new balance recipient after the transaction.
newData['diff_new_old_destiny'] = newData['newbalance_dest'] - newData['oldbalance_dest']

# name orig and name dest: keep only the first character of each identifier
# (the sample rows show 'C' and 'M' prefixes). The .str accessor yields NaN for
# a missing or empty value instead of raising.
newData['name_orig'] = newData['name_orig'].str[0]
newData['name_dest'] = newData['name_dest'].str[0]

In [9]:
# Splitting Data into Train, Valid and Test
# Fixed seed: without it every run draws different partitions, so the scores
# reported further down cannot be reproduced.
SPLIT_SEED = 42

# `columns=` already selects the axis, so no `axis` argument is needed.
X = newData.drop(columns=['is_fraud', 'is_flagged_fraud', 'name_orig', 'name_dest',
                          'step_weeks', 'step_days'])
y = newData['is_fraud'].map({'yes': 1, 'no': 0})

# splitting into temp (80%) and test (20%), stratified on the target
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=.2, stratify=y,
                                                  random_state=SPLIT_SEED)

# splitting temp into train (80% of temp) and valid (20% of temp), stratified on the target
X_train, X_valid, y_train, y_valid = train_test_split(X_temp, y_temp, test_size=.2, stratify=y_temp,
                                                      random_state=SPLIT_SEED)

In [10]:
# One Hot Encoder
# Encode the categorical 'type' column. With use_cat_names=True the generated
# columns are named after the category value (a later cell selects 'type_TRANSFER').
ohe = OneHotEncoder(cols=['type'], use_cat_names=True)

# Pair 1: fit on train, apply the same fit to valid.
X_train = ohe.fit_transform(X_train)
X_valid = ohe.transform(X_valid)

# Pair 2: the SAME encoder object is refitted on temp (train + valid) and that
# fit is applied to test. After this cell `ohe` holds the fit from X_temp.
# NOTE(review): the cell overwrites its own inputs, so it is presumably not safe
# to re-run without re-running the split cell first - confirm.
X_temp = ohe.fit_transform(X_temp)
X_test = ohe.transform(X_test)

In [11]:
# Rescaling
# Columns that are min-max scaled; 'step' and the one-hot columns are not in the list.
num_columns = ['amount', 'oldbalance_org', 'newbalance_orig', 'oldbalance_dest', 'newbalance_dest',
               'diff_new_old_balance', 'diff_new_old_destiny']
mm = MinMaxScaler()
# X_params receives the scaled version of X_temp; X_temp itself stays unscaled
# (it is later handed to ml_cv_results, which scales inside every fold).
X_params = X_temp.copy()

# Pair 1: scaler fitted on train, same fit applied to valid.
X_train[num_columns] = mm.fit_transform(X_train[num_columns])
X_valid[num_columns] = mm.transform(X_valid[num_columns])

# Pair 2: the SAME scaler object is refitted on temp (train + valid) and that
# fit is applied to test. After this cell `mm` holds the fit from X_temp.
# NOTE(review): X_train, X_valid and X_test are overwritten in place, so
# re-running this cell alone rescales already scaled data.
X_params[num_columns] = mm.fit_transform(X_temp[num_columns])
X_test[num_columns] = mm.transform(X_test[num_columns])

In [12]:
# Subset of feature columns that the model cells below are trained and evaluated on.
final_columns_selected = [
    'step',
    'oldbalance_org',
    'newbalance_orig',
    'newbalance_dest',
    'diff_new_old_balance',
    'diff_new_old_destiny',
    'type_TRANSFER',
]

In [13]:
# Restrict every partition to the selected feature columns.
# X_temp_cs comes from the unscaled X_temp; X_params_cs is its scaled counterpart.
X_train_cs, X_valid_cs, X_temp_cs, X_test_cs, X_params_cs = (
    frame[final_columns_selected]
    for frame in (X_train, X_valid, X_temp, X_test, X_params)
)

## Logistic Regression

Logistic Regression is a statistical and machine learning model used for binary classification problems, where the goal is to predict one of two possible outcomes (e.g., yes/no, true/false, 1/0) based on input features. It's named "logistic" because it uses the logistic function to model the probability that a given input belongs to a particular class.

In [14]:
# Fit on the training partition, then predict the validation partition.
# fit() returns the estimator itself, so the two calls can be chained.
lg = LogisticRegression(max_iter=200)
y_pred = lg.fit(X_train_cs, y_train).predict(X_valid_cs)

In [15]:
# Validation-set metrics for the fitted model, as a one-row table.
lg_results = ml_scores(model_name='Logistic Regression', y_true=y_valid, y_pred=y_pred)
lg_results

Unnamed: 0,Balanced Accuracy,Precision,Recall,F1,Kappa
Logistic Regression,0.655,0.947,0.31,0.467,0.466


In [16]:
# Classification Report
# Per-class precision / recall / F1 on the validation partition.
print(classification_report(y_true=y_valid, y_pred=y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1016706
           1       0.95      0.31      0.47      1314

    accuracy                           1.00   1018020
   macro avg       0.97      0.65      0.73   1018020
weighted avg       1.00      1.00      1.00   1018020



In [17]:
# Cross Validation
# Use the same max_iter as the model fitted on the train/validation split above,
# so both sets of scores describe the same estimator configuration.
# X_temp_cs is unscaled on purpose: ml_cv_results fits the scaler inside each fold.
lg_cv = ml_cv_results('Logistic Regression',
                      LogisticRegression(max_iter=200),
                      X_temp_cs, y_temp)
lg_cv

Fold K=1
Fold K=2
Fold K=3
Fold K=4
Fold K=5


Unnamed: 0,Balanced Accuracy,Precision,Recall,F1,Kappa
Logistic Regression,0.658 +/- 0.013000000000000001,0.9540000000000001 +/- 0.013000000000000001,0.316 +/- 0.026000000000000002,0.47400000000000003 +/- 0.03,0.47300000000000003 +/- 0.03
