In [1]:
from imblearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import numpy as np
import pandas as pd

# Define the preprocessing steps as functions
def replace_inf(df, fill_zero_columns=('prev_USD_amount', 'prev_age_delta')):
    """Return a copy of ``df`` with infinities capped and selected NaNs zero-filled.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is not modified.
    fill_zero_columns : iterable of str, optional
        Columns whose missing values are replaced with 0. Columns that are not
        present in ``df`` are skipped, so the function also works on a subset
        of the numeric features.

    Returns
    -------
    pandas.DataFrame
        New frame in which ``+inf`` became ``1e6`` and ``-inf`` became ``-1e6``.
    """
    # ``replace`` returns a new frame, so the assignments below never touch the caller's data.
    df_encoded = df.replace([np.inf], 1e6).replace([-np.inf], -1e6)
    for column in fill_zero_columns:
        # Guard against feature subsets that do not include this column.
        if column in df_encoded.columns:
            df_encoded[column] = df_encoded[column].fillna(0)
    return df_encoded


def ohe_encoder(df, categorical_features=None):
    """One-hot encode the given columns and return the widened frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    categorical_features : str or list of str, optional
        Columns to encode. When omitted, every ``object``/``category`` column
        of ``df`` is encoded.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the encoded columns, followed by the one-hot columns
        (first level of each feature dropped), aligned on ``df.index``.

    Notes
    -----
    The encoder is fitted on every call, so the output columns depend on the
    categories present in ``df``.
    """
    if categorical_features is None:
        categorical_features = df.select_dtypes(include=['object', 'category']).columns.tolist()
    elif isinstance(categorical_features, str):
        # A bare column name would otherwise select a 1-D Series.
        categorical_features = [categorical_features]

    if len(categorical_features) == 0:
        # Nothing to encode; hand back an independent copy.
        return df.copy()

    # ``sparse_output`` is the current keyword and matches the pipelines defined later in the notebook.
    onehot = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')
    encoded_features = onehot.fit_transform(df[categorical_features])
    encoded_feature_names = onehot.get_feature_names_out(categorical_features)
    encoded_frame = pd.DataFrame(encoded_features, columns=encoded_feature_names, index=df.index)
    df_encoded = pd.concat([df.drop(columns=categorical_features), encoded_frame], axis=1)
    return df_encoded

# Columns routed through the numeric branch of the preprocessing pipeline.
# The order is significant: it fixes the column order of the transformed matrix.
numerical_columns = [
    'USD_amount',
    'txn_age_days',
    'prev_USD_amount',
    'prev_age_delta',
    'volume_7d_sum',
    'velocity_7d_count',
    'stat_7d_median',
    'stat_7d_mad',
    'under_threshold_7d_count',
    'under_threshold_7d_sum',
    'volume_14d_sum',
    'velocity_14d_count',
    'stat_14d_median',
    'stat_14d_mad',
    'under_threshold_14d_count',
    'under_threshold_14d_sum',
    'volume_30d_sum',
    'velocity_30d_count',
    'stat_30d_median',
    'stat_30d_mad',
    'under_threshold_30d_count',
    'under_threshold_30d_sum',
    'is_crossborder',
    'stat_7d_modzscr',
    'stat_14d_modzscr',
    'stat_30d_modzscr',
    'party_entity_btw',
    'party_entity_deg',
    'party_account_btw',
    'party_account_deg',
    'cparty_l1_btw',
    'cparty_l1_deg',
    'cparty_l2_btw',
    'cparty_l2_deg',
]

# The pipeline itself is created in a later cell, once the data and the model are defined.


In [2]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc

In [3]:
# Columns removed from both splits before modelling.
DROPPED_COLUMNS = ['Time_step', 'Transaction_Id', 'Transaction_Type', 'party_Id',
                   'party_Account', 'party_Country', 'cparty_Id', 'cparty_Account',
                   'cparty_Country']
SAMPLE_FRACTION = 0.2
SAMPLE_SEED = 42  # fixed so that Restart & Run All draws the same rows every time


def load_sample(path, frac=SAMPLE_FRACTION, seed=SAMPLE_SEED):
    """Read a parquet split, draw a seeded row sample and drop the unused columns.

    Parameters
    ----------
    path : str
        Location of the parquet file.
    frac : float, optional
        Fraction of rows to keep.
    seed : int, optional
        Seed passed to ``DataFrame.sample`` for reproducibility.

    Returns
    -------
    pandas.DataFrame
        The sampled rows without ``DROPPED_COLUMNS``.
    """
    return (
        pd.read_parquet(path)
        .sample(frac=frac, random_state=seed)
        .drop(columns=DROPPED_COLUMNS)
    )


df_train_samp = load_sample('../data/split/resplit/ds1_train.parquet')
df_test_samp = load_sample('../data/split/resplit/ds1_test.parquet')

In [6]:
# Separate the target column from the predictors of the training sample.
y_train = df_train_samp['Label']
X_train = df_train_samp.drop(labels=['Label'], axis='columns')

In [24]:
# Separate the target column from the predictors of the test sample.
y_test = df_test_samp['Label']
X_test = df_test_samp.drop(labels=['Label'], axis='columns')

In [8]:
# Baseline classifier used as the final pipeline step.
model = LogisticRegression()

pipeline = Pipeline([
    ('preprocessing', ColumnTransformer([
        # Numeric branch: cap infinities / fill the "prev_*" gaps, then scale to [0, 1].
        ('num_pipeline', Pipeline([
            ('replace_inf', FunctionTransformer(replace_inf)),
            ('scaler', MinMaxScaler())
        ]), numerical_columns),

        # Categorical branch: one-hot encode, dropping the first level of each feature.
        ('cat_pipeline', Pipeline([
            ('ohe', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'))
        ]), ['std_txn_type', 'std_txn_method', 'prev_std_txn_type', 'prev_std_txn_method'])
    ])),
    # Seeded so that the synthetic minority samples are identical on every run.
    ('smote', SMOTE(random_state=42)),
    ('classifier', model)
])

In [9]:
# Fit the preprocessing, SMOTE and classifier steps on the training sample.
pipeline.fit(X_train, y_train)

In [14]:
# Hard class predictions for the test sample.
y_pred = pipeline.predict(X_test)
# Per-class probabilities; column 1 is used later as the score for ROC AUC.
y_pred_proba = pipeline.predict_proba(X_test)

In [25]:
from sklearn.metrics import roc_auc_score, matthews_corrcoef, confusion_matrix

def calculate_informedness_markedness(y_true, y_pred):
    """Compute informedness and markedness for a binary classification result.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.

    Returns
    -------
    tuple of float
        ``(informedness, markedness)`` where informedness is
        ``sensitivity + specificity - 1`` and markedness is ``ppv + npv - 1``.
        A ratio whose denominator is zero (for example PPV when the positive
        class is never predicted) is counted as 0.0 instead of producing NaN.

    Raises
    ------
    ValueError
        If the confusion matrix is not 2x2, i.e. the labels seen in ``y_true``
        and ``y_pred`` do not form exactly two classes.
    """
    matrix = confusion_matrix(y_true, y_pred)
    if matrix.shape != (2, 2):
        raise ValueError(
            "informedness/markedness need exactly two classes; "
            f"got a confusion matrix of shape {matrix.shape}"
        )
    tn, fp, fn, tp = (int(count) for count in matrix.ravel())

    def _ratio(numerator, denominator):
        # An undefined ratio is reported as 0.0 rather than NaN.
        return numerator / denominator if denominator else 0.0

    # Calculate components
    sensitivity = _ratio(tp, tp + fn)  # also called TPR
    specificity = _ratio(tn, tn + fp)  # also called TNR
    ppv = _ratio(tp, tp + fp)  # positive predictive value
    npv = _ratio(tn, tn + fn)  # negative predictive value

    # Calculate metrics
    informedness = sensitivity + specificity - 1
    markedness = ppv + npv - 1

    return informedness, markedness

# Threshold-based metrics are computed from the hard predictions.
informedness, markedness = calculate_informedness_markedness(y_true=y_test, y_pred=y_pred)
mcc = matthews_corrcoef(y_true=y_test, y_pred=y_pred)
# ROC AUC is computed from the probabilities in column 1 of predict_proba.
roc_auc = roc_auc_score(y_true=y_test, y_score=y_pred_proba[:, 1])

In [31]:
# Pull the final estimator out of the pipeline by its step name.
classifier_model = pipeline.named_steps['classifier']
# NOTE(review): `coef_` is read here under the name "feature_importances"; the classifier
# placed in `pipeline` is a LogisticRegression, so these are model coefficients — confirm
# that this is the intended notion of importance.
feature_importances = classifier_model.coef_

In [42]:
# Oversample the minority class of the training sample.
# The target is referenced by name (`y_train`) rather than by column position, and the
# sampler is seeded so that the synthetic rows are reproducible.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [49]:
# Rebinds `model` to a gradient-boosting classifier; the pipeline built earlier keeps its
# own reference to the previous estimator. Seeded for reproducible trees.
model = GradientBoostingClassifier(random_state=42)

model.fit(X_train_resampled, y_train_resampled)

In [50]:

# Predict labels for the held-out test sample with the model fitted on the resampled data.
predictions = model.predict(X_test)


In [51]:
# Display the predicted labels returned by `model.predict`.
predictions

array([1, 0, 1, ..., 1, 0, 1])

In [57]:
import argparse

def pipeline_init_(model, numerical_columns, categorical_columns, random_state=None):
    """Build the preprocessing + SMOTE + classifier pipeline for one estimator.

    Parameters
    ----------
    model : estimator
        Classifier used as the final ``'classifier'`` step.
    numerical_columns : list of str
        Columns sent through ``replace_inf`` and ``MinMaxScaler``.
    categorical_columns : list of str
        Columns that are one-hot encoded (first level dropped).
    random_state : int or None, optional
        Seed forwarded to ``SMOTE``. The default ``None`` leaves the sampler
        unseeded; pass an integer for reproducible resampling.

    Returns
    -------
    imblearn.pipeline.Pipeline
        Unfitted pipeline with steps ``'preprocessing'``, ``'smote'`` and
        ``'classifier'``.
    """
    numeric_branch = Pipeline([
        ('replace_inf', FunctionTransformer(replace_inf)),
        ('scaler', MinMaxScaler())
    ])
    categorical_branch = Pipeline([
        ('ohe', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'))
    ])
    preprocessing = ColumnTransformer([
        ('num_pipeline', numeric_branch, numerical_columns),
        ('cat_pipeline', categorical_branch, categorical_columns)
    ])

    pipeline = Pipeline([
        ('preprocessing', preprocessing),
        ('smote', SMOTE(random_state=random_state)),
        ('classifier', model)
    ])

    return pipeline

def get_features(feature_selection_df, top_k=20):
    """Select model features from a mutual-information ranking.

    Parameters
    ----------
    feature_selection_df : pandas.DataFrame
        Ranking table with a ``'Feature'`` column (feature names) and an
        ``'MI_Score'`` column (score used for ranking).
    top_k : int, optional
        Number of highest-scoring non-typological features to keep.

    Returns
    -------
    tuple of (list, list, list)
        ``features``: the ``top_k`` ranked features followed by the fixed
        typological features; ``categorical_cols``: the selected features that
        are known categorical columns; ``numerical_cols``: the selected
        features that are neither categorical nor time columns. The two
        sub-lists keep the order of ``features`` so results are deterministic.
    """
    typological_features = [
        'is_crossborder',
        'under_threshold_14d_count',
        'under_threshold_14d_sum',
        'under_threshold_30d_count',
        'under_threshold_30d_sum',
        'under_threshold_7d_count',
        'under_threshold_7d_sum'
        ]
    def_time_columns = ['txn_time_hr', 'txn_time_mm']
    def_categorical_columns = ['std_txn_type', 'std_txn_method', 'prev_std_txn_type', 'prev_std_txn_method']

    # Typological features are always included, so they are excluded from the ranking.
    is_typological = feature_selection_df['Feature'].isin(typological_features)
    ranked = feature_selection_df[~is_typological].nlargest(top_k, 'MI_Score')
    # Select by name: the first column is not guaranteed to be 'Feature'
    # (e.g. when the CSV was written with its index).
    features = ranked['Feature'].tolist() + typological_features

    # De-duplicate while keeping order, instead of going through unordered sets.
    unique_features = list(dict.fromkeys(features))
    categorical_cols = [name for name in unique_features if name in def_categorical_columns]
    excluded = set(categorical_cols) | set(def_time_columns)
    numerical_cols = [name for name in unique_features if name not in excluded]

    return features, categorical_cols, numerical_cols

def get_models(model_selection):
    """Return the estimators to train for a numeric selection code.

    Parameters
    ----------
    model_selection : int
        ``0`` selects every implemented model; ``1``-``5`` select
        LogisticRegression, NaiveBayes, RandomForest, GradientBoosting and
        NeuralNetwork respectively.

    Returns
    -------
    dict
        Maps the model name to a new, unfitted estimator. ``'NeuralNetwork'``
        maps to ``None`` because no implementation exists yet; it is not part
        of the "all models" selection.

    Raises
    ------
    ValueError
        If ``model_selection`` is not one of the codes above.
    """
    # Factories are called lazily so that only the requested estimators are constructed.
    factories = {
        1: ('LogisticRegression', lambda: LogisticRegression()),
        2: ('NaiveBayes', lambda: MultinomialNB()),
        3: ('RandomForest', lambda: RandomForestClassifier()),
        4: ('GradientBoosting', lambda: GradientBoostingClassifier()),
        5: ('NeuralNetwork', lambda: None),  # placeholder until NeuralNetwork() exists
    }

    if model_selection == 0:
        # Run all implemented models (the neural-network placeholder is left out).
        selected_codes = [1, 2, 3, 4]
    elif model_selection in factories:
        # Run single selected model
        selected_codes = [model_selection]
    else:
        raise ValueError(
            f"Unknown model_selection {model_selection!r}; expected an integer from 0 to 5"
        )

    models = {}
    for code in selected_codes:
        name, factory = factories[code]
        models[name] = factory()
    return models

def main(mdl_select=0):
    """Train and evaluate the selected model(s) on the notebook's train/test samples.

    Parameters
    ----------
    mdl_select : int, optional
        Selection code forwarded to ``get_models`` (``0`` runs every
        implemented model).

    Returns
    -------
    dict
        Maps each model name to a dict with the fitted ``pipeline``, the
        ``roc_auc``, ``mcc``, ``informedness`` and ``markedness`` scores, and
        ``feature_importances`` (``coef_``, ``feature_importances_`` or
        ``feature_log_prob_`` of the classifier, whichever exists, else
        ``None``).

    Notes
    -----
    Reads the module-level ``X_train``, ``y_train``, ``X_test``, ``y_test`` and
    ``numerical_columns``.
    """
    feature_selection_df = pd.read_csv('../data/feature_selection/ds1_mi_to_target.csv')
    features_, cat_cols_, num_cols_ = get_features(feature_selection_df)
    # NOTE(review): the selected feature lists above are not wired into the pipeline yet;
    # the pipeline still uses the full `numerical_columns` list and the fixed categorical
    # columns below. TODO: decide whether `num_cols_` / `cat_cols_` should replace them.
    categorical_columns = ['std_txn_type', 'std_txn_method', 'prev_std_txn_type', 'prev_std_txn_method']

    # Get the models
    models = get_models(mdl_select)
    results = {}
    for name, estimator in models.items():
        if estimator is None:
            # Placeholder entry (no implementation available): nothing to fit.
            print(f"Skipping {name}: no estimator is implemented for it")
            continue

        pipeline = pipeline_init_(estimator, numerical_columns, categorical_columns)
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        y_pred_proba = pipeline.predict_proba(X_test)

        roc_auc = roc_auc_score(y_test, y_pred_proba[:, 1])
        mcc = matthews_corrcoef(y_test, y_pred)
        informedness, markedness = calculate_informedness_markedness(y_test, y_pred)

        # Not every classifier exposes `coef_`; fall back to the attribute it does have.
        classifier_model = pipeline.named_steps['classifier']
        feature_importances = None
        for attribute in ('coef_', 'feature_importances_', 'feature_log_prob_'):
            value = getattr(classifier_model, attribute, None)
            if value is not None:
                feature_importances = value
                break

        results[name] = {
            'pipeline': pipeline,
            'roc_auc': roc_auc,
            'mcc': mcc,
            'informedness': informedness,
            'markedness': markedness,
            'feature_importances': feature_importances,
        }

    return results
    
    # Rest of the code
