In [2]:
# Imports
import numpy as np
import pandas as pd
import ipaddress
from pandas.tseries.holiday import USFederalHolidayCalendar
from sklearn.preprocessing import OneHotEncoder

# Training libs
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest

In [3]:
# Feature groups
anomaly_detection_input_features = [
    # Non-feature columns
    "account_id",                 # Unique identifier for the user account
    "transaction_id",             # Unique identifier for the transaction (dropped in training)
    "date",                       # (String) Transaction date (YYYY-MM-DD)

    # Feature columns
    "amount",                     # (Float) Transaction amount
    "merchant_name",              # (String) Name of the merchant (OHEd)
    "transaction_type",           # (String) Credit vs Debit (OHEd)
    "location",                   # (String) City or ZIP code of transaction location (OHEd)
    "login_attempts",        # (Integer) Number of login attempts before transaction
    "transaction_duration",       # (Integer) Time in seconds from initiation to settlement
    "is_weekend",                 # (Boolean) True if tx on Saturday or Sunday
    "day_of_week",                # (Integer) Day of week (0=Mon ... 6=Sun)"
    "recurrence_flag",               # (Boolean) Recurring (True) vs one-off (False)
    "rolling_mean_3mo_amount",    # (Float) 3-month trailing mean transaction amount
    "rolling_std_3mo_amount",     # (Float) 3-month trailing std deviation of amounts
    "time_since_last_tx_days",    # (Float) Days since previous transaction

    # Derived features
    "is_holiday",                 # (Bool`ean) True if tx date is a public holiday
    "amount_percentile",          # (Float) Percentile rank in user’s historical amounts
]

anomaly_detection_output_features = [
    # Output features
    "is_anomaly", # (Boolean): Flagged by the model as anomalous
    "anomaly_confidence", # (Float): Model’s confidence (0–1) in that flag
]

In [4]:
# Load the dataset with date parsing
df = pd.read_csv("assets/unlabeled_data/transactions.csv", parse_dates=['TransactionDate', 'PreviousTransactionDate'])

In [5]:
# Rename columns to match the feature list
rename_map = {
    'TransactionID': 'transaction_id',
    'AccountID': 'account_id',
    'TransactionAmount': 'amount',
    'TransactionDate': 'date',
    'TransactionType': 'transaction_type',
    'Location': 'location',
    'DeviceID': 'device_id',
    'IP Address': 'ip_address',
    'MerchantID': 'merchant_name',
    'Channel': 'channel',
    'CustomerAge': 'age',
    'CustomerOccupation': 'customer_occupation',
    'TransactionDuration': 'transaction_duration',
    'LoginAttempts': 'login_attempts',
    'AccountBalance': 'account_balance',
    'PreviousTransactionDate': 'previous_transaction_date'
}

df = df.rename(columns=rename_map)

In [6]:
# Preprocess features
def preprocess(v_df):
    #* Sort Data
    v_df = v_df.sort_values(["account_id","date"])

    #* Extract date components
    v_df["day_of_week"]    = v_df["date"].dt.dayofweek
    v_df["is_weekend"]     = v_df["day_of_week"] >= 5

    #* Rolling statistics
    if "rolling_mean_3mo_amount" not in v_df.columns:
        v_df["rolling_mean_3mo_amount"] = (
            v_df.groupby("account_id")["amount"]
            .transform(lambda x: x.rolling(90, min_periods=1).mean())
            .fillna(0)
        )
    if "rolling_std_3mo_amount" not in v_df.columns:
        v_df["rolling_std_3mo_amount"] = (
            v_df.groupby("account_id")["amount"]
            .transform(lambda x: x.rolling(90, min_periods=1).std())
            .fillna(0)
        )

    #* Recurrence
    if "recurrence_flag" not in v_df.columns:
        v_df["recurrence_flag"]   = v_df.duplicated(
            subset=["account_id","merchant_name","amount"], keep=False
        )


In [7]:
# Derive features
def derive(v_df):
    #* Holidays
    years = v_df["date"].dt.year.unique()
    hols = USFederalHolidayCalendar().holidays(
        start=f"{years.min()-1}-01-01",
        end=  f"{years.max()+1}-12-31"
    )
    v_df["is_holiday"] = v_df["date"].dt.normalize().isin(hols)

    #* Percentiles
    v_df["amount_percentile"] = v_df.groupby("account_id")["amount"].rank(pct=True)

In [8]:
# Final Cleanup
preprocess(df)
derive(df)

final_features = [c for c in anomaly_detection_input_features if c in df.columns]
df = df[final_features]

# df.head()

In [9]:
# output the cleaned dataframe to a new CSV file
df.to_csv("assets/unlabeled_data/transactions_cleaned.csv", index=False)

In [10]:
def encode(v_df, cols_to_encode):
    # One-hot encode categorical columns and join into v_df
    for col in cols_to_encode:
        ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
        loc_mat = ohe.fit_transform(v_df[[col]])
        loc_cols = [f"{col}_{c}" for c in ohe.categories_[0]]
        df_loc = pd.DataFrame(loc_mat, columns=loc_cols, index=v_df.index)

        # drop the original column and join the new columns
        v_df = v_df.drop(columns=[col])
        v_df = pd.concat([v_df, df_loc], axis=1)
    
    return v_df

In [11]:
# Convert categorical features to numerical
# Load preprocessed feature CSV
df = pd.read_csv("assets/unlabeled_data/transactions_cleaned.csv", parse_dates=["date"])

#! Drop non-features
cols_to_drop = ["account_id", "transaction_id", "date"]
df = df.drop(columns=cols_to_drop)

# One hot encoding for categorical features
df = encode(df, ["merchant_name", "location"])

# Other categorical columns
# Convert any boolean columns to 0/1
bool_cols = df.select_dtypes(include="bool").columns
for c in bool_cols:
    df[c] = df[c].astype(int)

# Convert credit/debit to binary
df['transaction_type'] = df['transaction_type'].map({
    'Credit': 1,
    'Debit': 0
})

In [12]:
# Scaling and Training
# Prepare numeric matrix and scale
X = df.astype(float)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train IsolationForest
iso = IsolationForest(
    n_estimators=500,
    max_samples=1.0,      
    max_features=1.0,
    bootstrap=True,
    contamination=0.05,
    n_jobs=-1,
    random_state=42,
    verbose=0,
)
iso.fit(X_scaled)

# Predict & score
labels     = iso.predict(X_scaled)         # 1 = normal, -1 = anomaly
raw_scores = iso.score_samples(X_scaled)   # higher = more normal

df["is_anomaly"] = labels == -1

inv = -raw_scores
df["anomaly_confidence"] = (inv - inv.min()) / (inv.max() - inv.min())

In [13]:
# Save/print results
important_head_cols = [
    "amount", 
    "is_anomaly", "anomaly_confidence"
]
print(df[important_head_cols])

# df.to_csv("assets/unlabeled_data/transactions_with_anomalies.csv", index=False)

      amount  is_anomaly  anomaly_confidence
0      14.09       False            0.295466
1     376.24       False            0.289169
2     126.29       False            0.415668
3     184.50       False            0.500506
4      13.45       False            0.457014
...      ...         ...                 ...
2507  856.21       False            0.437856
2508  251.54       False            0.185300
2509   28.63       False            0.220011
2510  185.97       False            0.171144
2511  243.08       False            0.390393

[2512 rows x 3 columns]


In [14]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score
# Calculate Metrics
##! Calculate accuracy with the labeled datasets

#* Synthetic dataset
#* Load the synthetic dataset
df_synthetic = pd.read_csv("assets/labeled_data/synthetic_labeled_transactions.csv", parse_dates=["date"])

#? If account_id is not in the synthetic dataset, add a column with the same value. This column will be dropped later.
if "account_id" not in df_synthetic.columns:
    df_synthetic["account_id"] = "account_id" # [ GMX_user_file: model.ipynb ]

##! Preprocess the synthetic dataset
preprocess(df_synthetic) # [ GMX_user_file: model.ipynb ]
derive(df_synthetic) # [ GMX_user_file: model.ipynb ]

#* Make sure all required columns are present
missing_cols = [col for col in anomaly_detection_input_features if col not in df_synthetic.columns] # [ GMX_user_file: model.ipynb ]
if missing_cols:
    raise ValueError(f"Missing columns in synthetic dataset: {missing_cols}")

#* Drop non-feature columns
# Keep 'is_anomaly' for evaluation
cols_to_drop_features_only = ["account_id", "transaction_id", "date"] # Modified from original
df_synthetic_model = df_synthetic.drop(columns=cols_to_drop_features_only) # Modified from original

# Store true labels before potentially dropping the column
y_true = df_synthetic["is_anomaly"].astype(int) # [ GMX_user_file: model.ipynb ]

# Drop the true label column if it's still in df_synthetic_model
if "is_anomaly" in df_synthetic_model.columns:
     df_synthetic_model = df_synthetic_model.drop(columns=["is_anomaly"]) # [ GMX_user_file: model.ipynb ]


#* One hot encoding for categorical features
df_synthetic_model = encode(df_synthetic_model, ["merchant_name", "location"]) # [ GMX_user_file: model.ipynb ]

#* Convert any boolean columns to 0/1
bool_cols = df_synthetic_model.select_dtypes(include="bool").columns # [ GMX_user_file: model.ipynb ]
for c in bool_cols:
    df_synthetic_model[c] = df_synthetic_model[c].astype(int) # [ GMX_user_file: model.ipynb ]

#* Convert credit/debit to binary
df_synthetic_model['transaction_type'] = df_synthetic_model['transaction_type'].map({
    'Credit': 1,
    'Debit': 0
}) # [ GMX_user_file: model.ipynb ]

#* Ensure columns match the training data (df)
# Add missing columns with zeros (OHE encoded columns)
# Use the columns from the *scaled* training data `X` before anomaly columns were added
train_cols = X.columns # Assuming X is the DataFrame before scaling in cell 11
for col in train_cols:
    if col not in df_synthetic_model.columns:
        df_synthetic_model[col] = 0 # [ GMX_user_file: model.ipynb ]

# Remove extra columns not in training data
df_synthetic_model = df_synthetic_model[train_cols] # Use train_cols order and selection [ GMX_user_file: model.ipynb ]


##! Predict on the synthetic dataset
#* Scaling
X_synthetic = df_synthetic_model.astype(float) # [ GMX_user_file: model.ipynb ]
X_synthetic_scaled = scaler.transform(X_synthetic) # [ GMX_user_file: model.ipynb ]

#* Predict & score.
labels_synthetic = iso.predict(X_synthetic_scaled)         # 1 = normal, -1 = anomaly [ GMX_user_file: model.ipynb ]
raw_scores_synthetic = iso.score_samples(X_synthetic_scaled)   # higher = more normal [ GMX_user_file: model.ipynb ]

# Create prediction column: is_anomaly_pred (convert -1 to 1, 1 to 0)
y_pred = (labels_synthetic == -1).astype(int) # [ GMX_user_file: model.ipynb ]

# Use the negative raw scores for ROC AUC (higher score -> more anomalous)
y_scores = -raw_scores_synthetic # [ GMX_user_file: model.ipynb ]


##! Calculate and Print Metrics

# Testing Accuracy (already calculated in a similar way)
# Ensure y_true and y_pred are correctly aligned if df_synthetic index was changed
# Re-calculate using sklearn function for consistency
test_accuracy = accuracy_score(y_true, y_pred)
print(f"Testing Accuracy: {test_accuracy:.4f}")

# Precision
precision = precision_score(y_true, y_pred, zero_division=0) # Handles cases with no predicted positives
print(f"Precision: {precision:.4f}")

# Recall
recall = recall_score(y_true, y_pred, zero_division=0) # Handles cases with no actual positives
print(f"Recall: {recall:.4f}")

# F1 Score
f1 = f1_score(y_true, y_pred, zero_division=0) # Handles cases with no positives in either true or pred
print(f"F1 Score: {f1:.4f}")

# ROC AUC Score
# Check if there is more than one class in true labels before calculating ROC AUC
if len(np.unique(y_true)) > 1:
    roc_auc = roc_auc_score(y_true, y_scores)
    print(f"ROC AUC Score: {roc_auc:.4f}")
else:
    print("ROC AUC Score: Not defined (only one class present in true labels)")

Testing Accuracy: 0.8883
Precision: 0.6122
Recall: 0.2381
F1 Score: 0.3429
ROC AUC Score: 0.6760
