In [39]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import numpy as np
import pandas as pd

# Define the preprocessing steps as functions
def replace_inf(df, inf_value=1e6, fill_zero_columns=('prev_USD_amount', 'prev_age_delta')):
    """Return a copy of ``df`` with infinities capped and selected NaNs set to 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a new frame is returned.
    inf_value : float, default 1e6
        ``+inf`` is replaced by ``inf_value`` and ``-inf`` by ``-inf_value``
        in every column.
    fill_zero_columns : iterable of str, default ('prev_USD_amount', 'prev_age_delta')
        Columns whose missing values are replaced by 0. Missing values in
        all other columns are left untouched.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy of ``df``.

    Raises
    ------
    KeyError
        If a column named in ``fill_zero_columns`` is not present in ``df``.
    """
    # Both signs are handled in one pass; `replace` returns a new frame, so
    # the caller's data stays intact.
    df_encoded = df.replace([np.inf, -np.inf], [inf_value, -inf_value])

    fill_zero_columns = list(fill_zero_columns)
    if fill_zero_columns:
        df_encoded[fill_zero_columns] = df_encoded[fill_zero_columns].fillna(0)
    return df_encoded


def ohe_encoder(df, categorical_features=None):
    """One-hot encode categorical columns of ``df`` and return the widened frame.

    The encoder is fitted on ``df`` itself on every call, so the generated
    columns depend on the categories present in that particular frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    categorical_features : list of str, optional
        Columns to encode. When omitted, every ``object`` / ``category``
        dtype column of ``df`` is encoded.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the encoded columns, followed by the one-hot columns
        (first level of each feature dropped), sharing ``df``'s index.
    """
    if categorical_features is None:
        categorical_features = df.select_dtypes(include=['object', 'category']).columns.tolist()
    else:
        categorical_features = list(categorical_features)

    # Nothing to encode: hand back an untouched copy instead of fitting an
    # encoder on zero columns.
    if not categorical_features:
        return df.copy()

    # `sparse_output` is the current spelling of the deprecated `sparse`
    # keyword and is what the pipeline's OneHotEncoder in this notebook uses.
    onehot = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')
    encoded_features = onehot.fit_transform(df[categorical_features])
    encoded_frame = pd.DataFrame(
        encoded_features,
        columns=onehot.get_feature_names_out(categorical_features),
        index=df.index,
    )
    return pd.concat([df.drop(columns=categorical_features), encoded_frame], axis=1)

# Numeric input columns, listed in the order they are passed to the numeric
# branch of the preprocessing pipeline.
numerical_columns = [
    # per-transaction columns
    'USD_amount', 'txn_age_days', 'prev_USD_amount', 'prev_age_delta',
    # columns tagged `7d`
    'volume_7d_sum', 'velocity_7d_count', 'stat_7d_median', 'stat_7d_mad',
    'under_threshold_7d_count', 'under_threshold_7d_sum',
    # columns tagged `14d`
    'volume_14d_sum', 'velocity_14d_count', 'stat_14d_median', 'stat_14d_mad',
    'under_threshold_14d_count', 'under_threshold_14d_sum',
    # columns tagged `30d`
    'volume_30d_sum', 'velocity_30d_count', 'stat_30d_median', 'stat_30d_mad',
    'under_threshold_30d_count', 'under_threshold_30d_sum',
    # flag and `modzscr` columns
    'is_crossborder',
    'stat_7d_modzscr', 'stat_14d_modzscr', 'stat_30d_modzscr',
    # `btw` / `deg` columns for party and counterparty
    'party_entity_btw', 'party_entity_deg',
    'party_account_btw', 'party_account_deg',
    'cparty_l1_btw', 'cparty_l1_deg',
    'cparty_l2_btw', 'cparty_l2_deg',
]

# Build the preprocessing pipeline from its two column-wise branches.
categorical_columns = ['std_txn_type', 'std_txn_method', 'prev_std_txn_type', 'prev_std_txn_method']

# Numeric branch: clean the values with replace_inf, then min-max scale them.
num_pipeline = Pipeline(steps=[
    ('replace_inf', FunctionTransformer(replace_inf)),
    ('scaler', MinMaxScaler()),
])

# Categorical branch: dense one-hot encoding with the first level dropped;
# categories unseen during fit are ignored at transform time.
cat_pipeline = Pipeline(steps=[
    ('ohe', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')),
])

# Columns not named in either branch are dropped (ColumnTransformer default).
column_transformer = ColumnTransformer(transformers=[
    ('num_pipeline', num_pipeline, numerical_columns),
    ('cat_pipeline', cat_pipeline, categorical_columns),
])

preprocessor = Pipeline(steps=[('preprocessing', column_transformer)])



In [37]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc

In [5]:
# Columns removed from both splits before preprocessing.
DROPPED_COLUMNS = ['Time_step', 'Transaction_Id', 'Transaction_Type', 'party_Id',
                   'party_Account', 'party_Country', 'cparty_Id', 'cparty_Account',
                   'cparty_Country']
SAMPLE_FRAC = 0.2   # fraction of each split kept for this experiment
RANDOM_STATE = 42   # fixed seed so re-running the notebook draws the same rows


def sample_and_drop(frame, frac=SAMPLE_FRAC, random_state=RANDOM_STATE):
    """Return a seeded random sample of ``frame`` without ``DROPPED_COLUMNS``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Full split as read from disk; it is not modified.
    frac : float
        Fraction of rows to keep.
    random_state : int
        Seed forwarded to ``DataFrame.sample``.

    Returns
    -------
    pandas.DataFrame
    """
    return frame.sample(frac=frac, random_state=random_state).drop(columns=DROPPED_COLUMNS)


df = pd.read_parquet('../data/split/resplit/ds1_train.parquet')
df_train_samp = sample_and_drop(df)

# `df` is rebound to the raw test split and stays bound to it after this cell.
df = pd.read_parquet('../data/split/resplit/ds1_test.parquet')
df_test_samp = sample_and_drop(df)

In [29]:
# Apply replace_inf to the sampled training frame directly, outside the
# pipeline (the pipeline's numeric branch calls replace_inf on its own).
df_train_samp_preprocess = replace_inf(df_train_samp)

In [40]:

# Everything after column 0 is model input; column 0 itself is used as the
# target when resampling further down.
train_features = df_train_samp.iloc[:, 1:]
test_features = df_test_samp.iloc[:, 1:]

# Fit the preprocessing on the training sample only, then apply the fitted
# transformers to the test sample.
X_train = preprocessor.fit_transform(train_features)
X_test = preprocessor.transform(test_features)

In [41]:
# Inspect the preprocessed training matrix returned by the pipeline.
X_train

array([[1.22610586e-03, 8.94308943e-02, 1.59333014e-04, ...,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
       [1.25024470e-03, 2.53467241e-01, 1.31321428e-03, ...,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
       [1.93827581e-03, 3.29029173e-01, 1.38158561e-03, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [1.52543614e-03, 1.79818269e-01, 8.82934605e-04, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.26992563e-03, 3.20420851e-01, 1.25065561e-03, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.20628183e-03, 1.99904352e-01, 2.37613636e-03, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [27]:
# Preview the first rows of the sampled training frame.
df_train_samp.head()

In [42]:
# Column 0 of the sampled frame is the target; it was excluded from the
# features handed to the preprocessor.
y_train = df_train_samp.iloc[:, 0]

# Seed the over-sampler so the resampled training set is identical on
# every run of the notebook.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [49]:
# Seed the estimator so the fitted model is reproducible across runs.
model = GradientBoostingClassifier(random_state=42)

# Train on the SMOTE-resampled training data.
model.fit(X_train_resampled, y_train_resampled)

In [50]:

# Predict class labels for the preprocessed test sample.
predictions = model.predict(X_test)


In [51]:
# Show the labels predicted for the test sample.
predictions

array([1, 0, 1, ..., 1, 0, 1])