# Credit Card Fraud Detection
Author: Brenda De Leon

## Modeling
#### Libraries

In [None]:
# importing libraries

In [None]:
# importing data

## Evaluation Function

In [None]:
# defining an evaluation classification function for automation and evaluating subsequent models
def evaluate_classification(model, X_train, X_test, y_train, y_test, classes=None, 
                            normalize='true', cmap='cividis', label=''):
    """Print classification reports and plot diagnostics for train and test data.

    For the training split and then the test split, this prints a
    classification report and shows one figure with three panels:
        - Confusion Matrix
        - ROC Curve (with a diagonal "No Skill" reference line)
        - Precision-Recall Curve

    Args:
        model: fitted classifier (ex: LogisticRegression = log_reg); it must
            support ``predict`` and be usable by the scikit-learn
            ``from_estimator`` display helpers.
        X_train: training features (train_test_split & preprocessed).
        X_test: test features (train_test_split).
        y_train: training target (train_test_split & preprocessed).
        y_test: test target (train_test_split).
        classes: optional display names for the classes; used as
            ``target_names`` in the classification report and as the tick
            labels of the confusion matrix.
        normalize: normalization mode forwarded to the confusion matrix.
        cmap: matplotlib colormap forwarded to the confusion matrix.
        label: optional label for the type of classifier; prefixes the
            report header and names the plotted curves.

    Returns:
        None. Output is printed and shown with matplotlib.

    Modified Function, Citation:
    https://github.com/hpatel530/Chicago-Car-Crash-Data/blob/master/Chicage_Car_Crash.ipynb
    """
    # predict both splits up front so a prediction failure surfaces before any output
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    _report_split(model, X_train, y_train, y_pred_train, classes, normalize, cmap, label,
                  header_word='TRAINING', cm_word='Training', curve_word='Training',
                  figsize=(14, 4))
    _report_split(model, X_test, y_test, y_pred_test, classes, normalize, cmap, label,
                  header_word='TESTING', cm_word='Testing', curve_word='Test',
                  figsize=(15, 4))


def _report_split(model, X, y, y_pred, classes, normalize, cmap, label,
                  header_word, cm_word, curve_word, figsize):
    """Print the classification report and draw the three diagnostic plots for one split."""
    # imported here so the helper does not depend on the notebook's import cell;
    # these displays replace the removed plot_confusion_matrix / plot_roc_curve helpers
    from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay

    # print classification report
    dashes = "---" * 20
    header = label + " CLASSIFICATION REPORT " + header_word + " "
    print(dashes, header, dashes, sep='\n')
    print(classification_report(y, y_pred, target_names=classes))

    # display figures
    _, axes = plt.subplots(nrows=1, ncols=3, figsize=figsize)

    # plot confusion matrix; `classes` holds display names (as in the report above),
    # so it goes to display_labels rather than labels, which expects actual target values
    ConfusionMatrixDisplay.from_estimator(model, X, y, display_labels=classes,
                                          normalize=normalize, cmap=cmap, ax=axes[0])
    axes[0].set(title='Confusion Matrix ' + cm_word)

    # plot ROC curve
    RocCurveDisplay.from_estimator(model, X, y, ax=axes[1], name=label)
    roc = axes[1]
    roc.plot([0, 1], [0, 1], ls=':', label='No Skill')
    # rebuild the legend after adding the reference line so 'No Skill' is listed
    roc.legend(loc='lower right')
    roc.grid()
    roc.set_title('Receiver Operating Characteristic ' + curve_word)

    # plot Precision-Recall curve
    PrecisionRecallDisplay.from_estimator(model, X, y, ax=axes[2], name=label)
    # y axis is Precision
    axes[2].set_ylabel('Precision')
    # x axis is Recall
    axes[2].set_xlabel('Recall')
    axes[2].set_title('Precision-Recall AUC ' + curve_word)
    plt.show()

## Preprocessing

In [None]:
# categorical mini-pipeline for the ColumnTransformer. X, X_train, X_test have no numeric dtypes.
# OneHotEncoder keeps its default sparse output: each column corresponds to one possible value of one feature.
# the deprecated `sparse=True` keyword is omitted because it only restated that default.
# handle_unknown='ignore' keeps categories unseen during fit from raising at transform time.
subpipe_cat = Pipeline(steps=[
    ('ohe', OneHotEncoder(handle_unknown='ignore'))
])


## Column Transformer

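Pipelines will include the ColumnTransformer (`CT`), which handles preprocessing: it one-hot encodes the categorical columns and passes the remaining columns through unchanged, so they stay in the dataset alongside the encoded ones.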

In [None]:
# build the ColumnTransformer that the pipelines refer to as 'CT'
# every column whose dtype is not np.number is routed through subpipe_cat (one hot encoding)
# whatever is left over is passed through untransformed rather than dropped
CT = ColumnTransformer(
    [
        (
            'subpipe_cat',
            subpipe_cat,
            make_column_selector(dtype_exclude=np.number),
        ),
    ],
    remainder='passthrough',
)




Robust Scaling 
next step?

Oversampling, SMOTE, scaling, removing outliers, or class weighting? Which is the best method for dealing with severe class imbalance (and why)?

m