In [32]:
from imblearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, matthews_corrcoef, confusion_matrix
import time

class CustomFunctionTransformer(FunctionTransformer):
    """``FunctionTransformer`` subclass whose output feature names are its input feature names.

    NOTE(review): presumably exists so that pipelines containing this step can
    answer ``get_feature_names_out()``; confirm whether
    ``FunctionTransformer(..., feature_names_out='one-to-one')`` would cover it.
    """
    def get_feature_names_out(self, input_features=None):
        """Return ``input_features`` unchanged (``None`` when no names are passed in)."""
        return input_features

# Define the preprocessing steps as functions
def replace_inf(df):
    """Return a copy of ``df`` with infinities capped and two lag columns back-filled.

    ``+inf`` becomes ``1e6`` and ``-inf`` becomes ``-1e6`` in every column.
    Missing values are then set to 0, but only in ``prev_age_delta`` and
    ``prev_USD_amount`` (when those columns exist); NaNs in any other column
    are left as they are. The input frame is not modified.
    """
    # Cap positive infinity first, then negative infinity; each call returns a new frame.
    capped = df.replace([np.inf], 1e6)
    capped = capped.replace([-np.inf], -1e6)

    # Only these two columns get their gaps filled with zero.
    for column in ('prev_age_delta', 'prev_USD_amount'):
        if column in capped.columns:
            capped[column] = capped[column].fillna(0)

    return capped

def pipeline_init_(model, numerical_columns, categorical_columns, random_state=None):
    """Assemble the preprocessing -> SMOTE -> classifier pipeline (unfitted).

    Parameters
    ----------
    model : estimator
        Classifier installed as the final ``'classifier'`` step.
    numerical_columns : list
        Columns sent through ``replace_inf`` and then ``MinMaxScaler``.
    categorical_columns : list
        Columns one-hot encoded with the first level dropped and unknown
        categories ignored at transform time.
    random_state : int, RandomState instance or None, default=None
        Seed forwarded to ``SMOTE``. ``None`` (the default) keeps the previous
        unseeded behaviour; pass an int to make the oversampling reproducible.

    Returns
    -------
    Pipeline
        An imblearn ``Pipeline`` with the steps ``'preprocessing'``,
        ``'smote'`` and ``'classifier'``, in that order.
    """
    # Numerical branch: cap infinities / fill the lag columns, then scale.
    numeric_branch = Pipeline([
        ('replace_inf', CustomFunctionTransformer(replace_inf)),
        ('scaler', MinMaxScaler()),
    ])

    # Categorical branch: dense one-hot encoding.
    categorical_branch = Pipeline([
        ('ohe', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')),
    ])

    preprocessing = ColumnTransformer([
        ('num_pipeline', numeric_branch, numerical_columns),
        ('cat_pipeline', categorical_branch, categorical_columns),
    ])

    pipeline = Pipeline([
        ('preprocessing', preprocessing),
        # Oversampling sits after preprocessing so it sees the encoded matrix.
        ('smote', SMOTE(random_state=random_state)),
        ('classifier', model),
    ])

    return pipeline

def get_features(feature_selection_df, n=20):
    """Select model features and split them into categorical and numerical lists.

    The ``n`` highest-``MI_Score`` features that are not typology features are
    taken from ``feature_selection_df``, and the fixed typology features are
    then appended.

    Parameters
    ----------
    feature_selection_df : pandas.DataFrame
        Must contain a ``'Feature'`` column (feature names) and an
        ``'MI_Score'`` column (ranking score).
    n : int, default=20
        Number of non-typology features to keep.

    Returns
    -------
    features : list of str
        Top-``n`` features (highest score first) followed by the typology features.
    categorical_cols : list of str
        Members of ``features`` that are known categorical columns.
    numerical_cols : list of str
        Members of ``features`` that are neither categorical nor time columns.

    Notes
    -----
    Both column lists follow the order of ``features`` so that the resulting
    transformer column order is the same on every run. Time columns
    (``txn_time_hr``, ``txn_time_mm``) may appear in ``features`` but are
    placed in neither column list.
    """
    typological_features = [
        'is_crossborder',
        'under_threshold_14d_count',
        'under_threshold_14d_sum',
        'under_threshold_30d_count',
        'under_threshold_30d_sum',
        'under_threshold_7d_count',
        'under_threshold_7d_sum'
        ]
    def_time_columns = ['txn_time_hr', 'txn_time_mm']
    def_categorical_columns = ['std_txn_type', 'std_txn_method', 'prev_std_txn_type', 'prev_std_txn_method']

    # Rank only the non-typology rows; typology features are always appended below.
    candidates = feature_selection_df[~feature_selection_df['Feature'].isin(typological_features)]
    # Read names from the 'Feature' column by name (the same column used for the
    # filter above) rather than by position, so an extra leading column in the
    # frame cannot change what is returned.
    top_features = candidates.nlargest(n, 'MI_Score')['Feature'].tolist()
    features = top_features + typological_features

    # De-duplicate while keeping first-seen order, so the split below is deterministic.
    ordered_unique = list(dict.fromkeys(features))

    categorical_lookup = set(def_categorical_columns)
    categorical_cols = [name for name in ordered_unique if name in categorical_lookup]

    not_numerical = set(categorical_cols) | set(def_time_columns)
    numerical_cols = [name for name in ordered_unique if name not in not_numerical]

    return features, categorical_cols, numerical_cols

def calculate_informedness_markedness(y_true, y_pred):
    """Compute informedness and markedness from the confusion matrix.

    informedness = TPR + TNR - 1
    markedness   = PPV + NPV - 1

    The confusion matrix is flattened and unpacked into exactly four counts,
    so this only works when ``confusion_matrix`` yields a 2x2 matrix.

    Returns
    -------
    tuple
        ``(informedness, markedness)``.
    """
    counts = confusion_matrix(y_true, y_pred).ravel()
    true_neg, false_pos, false_neg, true_pos = counts

    # Rates conditioned on the actual class.
    true_positive_rate = true_pos / (true_pos + false_neg)
    true_negative_rate = true_neg / (true_neg + false_pos)

    # Rates conditioned on the predicted class.
    positive_predictive_value = true_pos / (true_pos + false_pos)
    negative_predictive_value = true_neg / (true_neg + false_neg)

    informedness = true_positive_rate + true_negative_rate - 1
    markedness = positive_predictive_value + negative_predictive_value - 1

    return informedness, markedness

In [37]:
# Columns removed from both splits. Kept in one list so the train and test
# frames are always reduced to the same schema.
DROP_COLUMNS = [
    'Time_step', 'Transaction_Id', 'Transaction_Type', 'party_Id',
    'party_Account', 'party_Country', 'cparty_Id', 'cparty_Account',
    'cparty_Country',
]

df_train = pd.read_parquet('../data/split/resplit/ds1_train.parquet').drop(columns=DROP_COLUMNS)
df_test = pd.read_parquet('../data/split/resplit/ds1_test.parquet').drop(columns=DROP_COLUMNS)

# Separate the target column ('Label') from the predictors in each split.
X_train = df_train.drop(columns=['Label'])
y_train = df_train['Label']

X_test = df_test.drop(columns=['Label'])
y_test = df_test['Label']

In [38]:
# Choose the features from the saved MI ranking and build the MultinomialNB pipeline.
selected_features_, categorical_cols, numerical_cols = get_features(pd.read_csv('../data/feature_selection/ds1_mi_to_target.csv'))
pipeline = pipeline_init_(MultinomialNB(), numerical_cols, categorical_cols)

# Time
start_time = time.time()

pipeline.fit(X_train, y_train)

# Time
end_time = time.time()

# Output feature names exist only after the preprocessing step has been fitted;
# requesting them before fit() raises NotFittedError. Read them here, outside
# the timed region, so the training time is unaffected.
actual_features = pipeline['preprocessing'].get_feature_names_out()

# Hard predictions and the probability of the second class column for the test split.
y_pred = pipeline.predict(X_test)
y_pred_proba = pipeline.predict_proba(X_test)[:,1]

roc_auc = roc_auc_score(y_test, y_pred_proba)
mcc = matthews_corrcoef(y_test, y_pred)
informedness, markedness = calculate_informedness_markedness(y_test, y_pred)

print(f'ROC AUC: {roc_auc}')
print(f'MCC: {mcc}')
print(f'Informedness: {informedness}')
print(f'Markedness: {markedness}')

print(classification_report(y_test, y_pred))
print(f'Time to train: {end_time - start_time}')

ROC AUC: 0.9962149097370446
MCC: 0.8468797461001761
Informedness: 0.8628492798902188
Markedness: 0.8312057749482615
              precision    recall  f1-score   support

           0       0.99      0.88      0.93    250813
           1       0.84      0.99      0.91    166477

    accuracy                           0.92    417290
   macro avg       0.92      0.93      0.92    417290
weighted avg       0.93      0.92      0.92    417290

Time to train: 4.831802845001221


In [41]:

# Names of the columns produced by the fitted preprocessing step.
features = pipeline.named_steps['preprocessing'].get_feature_names_out()

In [26]:
# Display the feature names currently bound to `features`.
# NOTE(review): the saved output below lists 27 names without transformer
# prefixes, while the In [44] feature_log_prob_ output has 28 columns, so it
# looks stale. Re-run after the cell that assigns `features` to confirm.
features

['party_entity_btw',
 'party_account_btw',
 'volume_30d_sum',
 'USD_amount',
 'stat_30d_median',
 'prev_USD_amount',
 'volume_14d_sum',
 'stat_30d_mad',
 'stat_14d_median',
 'party_entity_deg',
 'cparty_is_account',
 'volume_7d_sum',
 'stat_7d_median',
 'cparty_is_cash',
 'is_credit',
 'stat_14d_mad',
 'std_txn_method',
 'cparty_l1_btw',
 'cparty_l2_btw',
 'std_txn_type',
 'is_crossborder',
 'under_threshold_14d_count',
 'under_threshold_14d_sum',
 'under_threshold_30d_count',
 'under_threshold_30d_sum',
 'under_threshold_7d_count',
 'under_threshold_7d_sum']

In [46]:
# Fetch the fitted classifier from the pipeline.
classifier_model = pipeline['classifier']
# importance = np.std(classifier_model.feature_log_prob_, axis=0)
log_probs = classifier_model.feature_log_prob_

# Per-feature gap between the first and second rows of the log-probability
# table: positive values mean the feature is more probable under the first row.
# NOTE(review): rows presumably follow classifier_model.classes_ (0 then 1); confirm.
importance = np.subtract(log_probs[0], log_probs[1])

# One row per feature, largest gap first. `features` must line up with the
# columns of feature_log_prob_.
feature_importance = pd.DataFrame({'feature': features, 'importance': importance})
feature_importance = feature_importance.sort_values('importance', ascending=False)


In [44]:
# Inspect the raw feature log-probability table of the fitted classifier.
classifier_model.feature_log_prob_

array([[ -4.22803479,  -4.55227087,  -2.89864037,  -8.60963631,
         -9.73075464,  -9.04198807,  -2.28913596,  -4.58981874,
         -8.40503246,  -3.4167876 ,  -7.85409939,  -8.30765452,
         -2.82360589,  -8.88644533,  -2.31699791,  -8.32095053,
         -1.85587061,  -8.49497763,  -9.18554656,  -9.42263553,
         -5.11513714,  -2.3596165 ,  -3.24718018,  -5.152627  ,
         -4.19042248,  -1.90427506,  -1.93410267,  -5.21543678],
       [ -9.96318842, -10.15762792,  -3.73431381,  -3.97258017,
         -4.82752828,  -4.40974318,  -2.23435767, -10.21424743,
         -3.75260063,  -3.00901379,  -3.02302881,  -3.51250406,
         -2.54242742,  -4.20798509,  -2.15516467,  -3.53630949,
         -1.82765922,  -3.8179026 ,  -4.31163663,  -4.77294361,
        -10.67457474,  -2.70777482,  -3.40256105, -10.73338213,
         -9.90266886,  -2.70777482,  -2.73438994,  -5.5749223 ]])

In [12]:
# Inspect the raw feature log-probability table of the fitted classifier.
# NOTE(review): this repeats the In [44] cell, and its saved output has 30
# columns instead of 28, so it appears to come from a different fit.
# Confirm, then re-run or remove this cell.
classifier_model.feature_log_prob_

array([[ -5.19377172,  -5.49181494,  -8.14665391, -10.27246556,
         -8.34967788,  -2.6539073 ,  -2.71013955,  -5.53024236,
         -9.31133522,  -2.8301567 ,  -2.97319844,  -9.20441569,
         -2.87107645,  -9.71975885,  -2.5030185 ,  -9.21568436,
         -2.24976105,  -9.39011991,  -9.90254155,  -8.59324597,
         -6.03499244,  -2.21984527,  -2.72527614,  -6.0735403 ,
         -5.15548723,  -3.61990029,  -2.11760785,  -2.6476801 ,
         -2.86354457,  -3.36439281],
       [ -8.79146797,  -8.80561891,  -4.0090128 ,  -4.86376091,
         -4.4444464 ,  -2.56835548,  -2.18362747,  -8.85582926,
         -3.79171437,  -3.34936465,  -3.3848866 ,  -3.54885944,
         -2.62250484,  -4.24509428,  -2.73182338,  -3.5770105 ,
         -2.56618343,  -3.85812242,  -4.34766738,  -4.80368822,
         -9.12917571,  -2.45054307,  -3.07516267,  -9.18240779,
         -8.73924316,  -4.76907758,  -2.48748114,  -3.51182003,
         -2.87521005,  -3.15586188]])