In [1]:
import sys
import os

# Make the project's helper scripts importable. '../scripts' is resolved
# against the kernel's current working directory, so this is the `scripts`
# directory that sits beside the working directory (not the parent itself).
# The membership guard keeps re-runs of this cell from stacking duplicate
# entries onto sys.path.
scripts_dir = os.path.abspath('../scripts')
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)

# Import necessary libraries
import sys
import pandas as pd
import scorecardpy as sc
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import joblib  # For saving models
from load_data import load_data

warnings.filterwarnings('ignore')


In [2]:
# Step 1: Read the raw transactions CSV through the project's load_data helper.
# The path is relative to the kernel's working directory; the recorded run
# reports a loaded shape of (95662, 16).
raw_csv_path = '../data/data.csv'
data = load_data(raw_csv_path)

Data loaded successfully. Shape: (95662, 16)


In [3]:
# Step 2: Drop identifier-like and other columns that are not used as features.
# NOTE(review): the earlier rationale was "too many unique values"; that is
# plausible for the *Id columns but unverified for CurrencyCode/CountryCode
# — confirm.
# The frame is rebound (no inplace=True) and errors='ignore' is passed so the
# cell can be re-run on an already-processed frame without raising KeyError.
# Trade-off: a column that is genuinely missing from the input no longer
# raises here.
unused_columns = [
    'TransactionId', 'BatchId', 'AccountId', 'SubscriptionId',
    'CustomerId', 'CurrencyCode', 'CountryCode',
]
data = data.drop(columns=unused_columns, errors='ignore')

# Step 3: Extract temporal features.
# Skipped when TransactionStartTime has already been consumed (i.e. a re-run).
if 'TransactionStartTime' in data.columns:
    # errors='coerce' converts unparseable timestamps to NaT, which surface as
    # NaN in the derived columns below instead of raising.
    start_time = pd.to_datetime(data['TransactionStartTime'], errors='coerce')

    # Add the calendar parts, then drop the raw timestamp now that its
    # components have been extracted.
    data = data.assign(
        transaction_hour=start_time.dt.hour,
        transaction_day=start_time.dt.day,
        transaction_month=start_time.dt.month,
        transaction_year=start_time.dt.year,
    ).drop(columns=['TransactionStartTime'])

In [4]:
# Step 4: Partition the prepared frame into training and testing portions
# (ratio=0.7, i.e. 70/30). FraudResult is passed as the second argument and
# the seed is fixed so the split can be reproduced.
split_frames = sc.split_df(
    data,
    'FraudResult',
    ratio=0.7,
    seed=999,
)
# Unpacking relies on the returned mapping yielding the training frame first
# and the testing frame second — presumably keys 'train' and 'test'; confirm.
train, test = split_frames.values()

In [5]:
# Step 5: Apply WoE binning
# The bins are fitted on the training split only (`test` is not passed here),
# with FraudResult as the target column. The resulting `woe_bins` object is
# what the next step uses to transform both splits.
woe_bins = sc.woebin(train, y='FraudResult')

[INFO] creating woe binning ...


In [6]:
# Step 6: Apply the WoE bins to both splits (training frame first, then the
# testing frame), using the same `woe_bins` for each.
train_woe, test_woe = [sc.woebin_ply(frame, woe_bins) for frame in (train, test)]

# Separate the FraudResult target from the remaining (encoded) columns.
y_train = train_woe['FraudResult']
X_train = train_woe.drop(columns=['FraudResult'])
y_test = test_woe['FraudResult']
X_test = test_woe.drop(columns=['FraudResult'])

[INFO] converting into woe values ...
[INFO] converting into woe values ...
