In [1]:
from imblearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, matthews_corrcoef, confusion_matrix
import time

# Define the preprocessing steps as functions
def replace_inf(df):
    """Return a new frame with infinities capped and two ``prev_*`` columns NaN-filled.

    Every ``+inf`` becomes ``1e6`` and every ``-inf`` becomes ``-1e6``. When the
    columns ``prev_age_delta`` and/or ``prev_USD_amount`` are present, their
    missing values are replaced by ``0``. NaNs in any other column are left as
    they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified because ``DataFrame.replace`` returns
        a new object.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame, with the same columns as ``df``.
    """
    capped = df.replace([np.inf], 1e6)
    capped = capped.replace([-np.inf], -1e6)

    # Only these two columns get their NaNs zero-filled.
    for column in ('prev_age_delta', 'prev_USD_amount'):
        if column in capped.columns:
            capped[column] = capped[column].fillna(0)

    return capped

def pipeline_init_(model, numerical_columns, categorical_columns, random_state=None):
    """Build the preprocessing -> SMOTE -> classifier pipeline.

    Parameters
    ----------
    model : estimator
        Classifier placed as the final ``'classifier'`` step.
    numerical_columns : list of str
        Columns passed through ``replace_inf`` and then ``MinMaxScaler``.
    categorical_columns : list of str
        Columns one-hot encoded (first level dropped, unknown levels ignored).
    random_state : int, RandomState instance or None, default=None
        Seed forwarded to ``SMOTE``. The default ``None`` keeps the previous
        unseeded behaviour; pass an integer to make the oversampling (and
        therefore the fitted model) reproducible across runs.

    Returns
    -------
    imblearn.pipeline.Pipeline
        Unfitted pipeline with steps ``'preprocessing'``, ``'smote'`` and
        ``'classifier'``. Columns not listed in either argument are dropped by
        the ``ColumnTransformer`` (its default ``remainder``).
    """
    numeric_branch = Pipeline([
        ('replace_inf', FunctionTransformer(replace_inf)),
        ('scaler', MinMaxScaler()),
    ])

    categorical_branch = Pipeline([
        ('ohe', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')),
    ])

    preprocessing = ColumnTransformer([
        ('num_pipeline', numeric_branch, numerical_columns),
        ('cat_pipeline', categorical_branch, categorical_columns),
    ])

    pipeline = Pipeline([
        ('preprocessing', preprocessing),
        # SMOTE draws random synthetic samples; seed it via random_state.
        ('smote', SMOTE(random_state=random_state)),
        ('classifier', model),
    ])

    return pipeline

def get_features(feature_selection_df, n=20):
    """Select model features from a score table and split them by type.

    The ``n`` rows with the largest ``MI_Score`` are taken from
    ``feature_selection_df`` (rows whose ``Feature`` is one of the fixed
    typological features are excluded from the ranking), and the typological
    features are then always appended.

    Parameters
    ----------
    feature_selection_df : pandas.DataFrame
        Must contain the columns ``Feature`` and ``MI_Score``.
    n : int, default=20
        Number of top-ranked non-typological features to keep.

    Returns
    -------
    features : list of str
        Top-ranked features followed by the typological features.
    categorical_cols : list of str
        Members of ``features`` that are known categorical columns.
    numerical_cols : list of str
        Members of ``features`` that are neither categorical nor time columns.
        The time columns stay in ``features`` but appear in neither split list.

    Notes
    -----
    ``categorical_cols`` and ``numerical_cols`` follow the order of
    ``features``. Building them from ``set`` objects would make their order
    depend on string hashing and therefore vary between interpreter sessions.
    """
    typological_features = [
        'is_crossborder',
        'under_threshold_14d_count',
        'under_threshold_14d_sum',
        'under_threshold_30d_count',
        'under_threshold_30d_sum',
        'under_threshold_7d_count',
        'under_threshold_7d_sum'
        ]
    def_time_columns = ['txn_time_hr', 'txn_time_mm']
    def_categorical_columns = ['std_txn_type', 'std_txn_method', 'prev_std_txn_type', 'prev_std_txn_method']

    is_typological = feature_selection_df['Feature'].isin(typological_features)
    # Address the name column explicitly rather than by position, matching the
    # filter above.
    top_ranked = feature_selection_df[~is_typological].nlargest(n, 'MI_Score')['Feature'].tolist()
    features = top_ranked + typological_features

    # De-duplicate while keeping first-seen order so the split is deterministic.
    ordered_unique = list(dict.fromkeys(features))
    categorical_cols = [name for name in ordered_unique if name in def_categorical_columns]
    not_numerical = set(categorical_cols) | set(def_time_columns)
    numerical_cols = [name for name in ordered_unique if name not in not_numerical]

    return features, categorical_cols, numerical_cols

def calculate_informedness_markedness(y_true, y_pred):
    """Compute informedness and markedness from binary predictions.

    The four cells of ``confusion_matrix(y_true, y_pred)`` are unpacked in
    the order ``tn, fp, fn, tp``, so the matrix must be 2x2.

    Returns
    -------
    tuple
        ``(informedness, markedness)`` where
        informedness = TPR + TNR - 1 and markedness = PPV + NPV - 1.
    """
    true_neg, false_pos, false_neg, true_pos = confusion_matrix(y_true, y_pred).ravel()

    # Row-wise rates (conditioned on the true class)
    tpr = true_pos / (true_pos + false_neg)
    tnr = true_neg / (true_neg + false_pos)

    # Column-wise rates (conditioned on the predicted class)
    pos_pred_value = true_pos / (true_pos + false_pos)
    neg_pred_value = true_neg / (true_neg + false_neg)

    informedness = tpr + tnr - 1
    markedness = pos_pred_value + neg_pred_value - 1

    return informedness, markedness

In [2]:
# Columns removed from both splits before modelling. Kept in one place so the
# train and test frames can never end up with different column sets.
DROPPED_COLUMNS = [
    'Time_step', 'Transaction_Id', 'Transaction_Type',
    'party_Id', 'party_Account', 'party_Country',
    'cparty_Id', 'cparty_Account', 'cparty_Country',
]


def _load_split(path):
    """Read one parquet split and drop the columns listed in DROPPED_COLUMNS."""
    return pd.read_parquet(path).drop(columns=DROPPED_COLUMNS)


df_train = _load_split('../data/split/resplit/ds3_train.parquet')
df_test = _load_split('../data/split/resplit/ds3_test.parquet')

# 'Label' is the target; everything else is a candidate feature.
X_train = df_train.drop(columns=['Label'])
y_train = df_train['Label']

X_test = df_test.drop(columns=['Label'])
y_test = df_test['Label']

In [3]:
# get_features accepts (feature_selection_df, n). The earlier call also passed
# df_train.columns positionally, which raises TypeError against that signature.
mi_scores = pd.read_csv('../data/feature_selection/ds3_mi_to_target.csv')
features, categorical_cols, numerical_cols = get_features(mi_scores, n=5)

# Fail early, with a readable message, if a selected column is absent from the
# training frame (the ColumnTransformer would otherwise fail inside fit).
missing_cols = [col for col in numerical_cols + categorical_cols if col not in X_train.columns]
if missing_cols:
    raise ValueError(f'Selected features missing from X_train: {missing_cols}')

pipeline = pipeline_init_(LogisticRegression(), numerical_cols, categorical_cols)

# Time only the fit call (preprocessing + SMOTE + classifier training).
start_time = time.time()

pipeline.fit(X_train, y_train)

end_time = time.time()


# Hard predictions for threshold metrics, class-1 probabilities for ROC AUC.
y_pred = pipeline.predict(X_test)
y_pred_proba = pipeline.predict_proba(X_test)[:,1]

roc_auc = roc_auc_score(y_test, y_pred_proba)
mcc = matthews_corrcoef(y_test, y_pred)
informedness, markedness = calculate_informedness_markedness(y_test, y_pred)

print(f'ROC AUC: {roc_auc}')
print(f'MCC: {mcc}')
print(f'Informedness: {informedness}')
print(f'Markedness: {markedness}')

print(classification_report(y_test, y_pred))
print(f'Time to train: {end_time - start_time}')

ROC AUC: 0.9815208385022023
MCC: 0.9619719075767297
Informedness: 0.9406413780593141
Markedness: 0.9837861405544714
              precision    recall  f1-score   support

           0       0.99      1.00      0.99    691344
           1       1.00      0.94      0.97    175250

    accuracy                           0.99    866594
   macro avg       0.99      0.97      0.98    866594
weighted avg       0.99      0.99      0.99    866594

Time to train: 10.55804181098938


In [6]:
classifier_model = pipeline.named_steps['classifier']
feature_importances = classifier_model.coef_[0]

# coef_ has one entry per column produced by the 'preprocessing' step, not per
# raw entry of `features`: time columns are dropped and each categorical column
# expands into several one-hot columns. Rebuild the names in the transformer's
# output order: numerical columns first, then the one-hot columns.
preprocessing = pipeline.named_steps['preprocessing']
transformed_feature_names = list(numerical_cols)
if categorical_cols:
    # The encoder is only fitted when at least one categorical column was selected.
    fitted_ohe = preprocessing.named_transformers_['cat_pipeline'].named_steps['ohe']
    transformed_feature_names += list(fitted_ohe.get_feature_names_out())

assert len(transformed_feature_names) == len(feature_importances), (
    'feature names and coefficients are misaligned'
)

feature_importance_df = pd.DataFrame({'Feature': transformed_feature_names, 'Importance': feature_importances})