In [3]:
import pandas as pd
from sklearn.ensemble import IsolationForest
import xgboost as xgb
import joblib

# Load the transaction sample and expand the categorical `type` column into
# one indicator column per category (prefixed with `type_`).
df = pd.read_csv('transactions_sample.csv')
df_processed = pd.get_dummies(df, columns=['type'], prefix='type')

# Columns kept out of the model inputs; every other column of the encoded
# frame becomes a feature, in its original column order.
non_feature_cols = {'step', 'nameOrig', 'nameDest', 'isFraud', 'isFlaggedFraud'}
features = [col for col in df_processed.columns if col not in non_feature_cols]
X = df_processed[features]

print("Data prepared. Features to use:")
print(X.columns)

Data prepared. Features to use:
Index(['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest',
       'newbalanceDest', 'type_CASH_IN', 'type_CASH_OUT', 'type_DEBIT',
       'type_PAYMENT', 'type_TRANSFER'],
      dtype='object')


In [4]:

# Unsupervised anomaly detector over the feature matrix. `contamination=0.01`
# sets the expected share of outliers; `random_state` makes the fit repeatable.
iso_forest = IsolationForest(n_estimators=100, contamination=0.01, random_state=42)

print("Training the Isolation Forest model...")
iso_forest.fit(X)
print("Model trained!")

# `predict` marks outliers with -1. Remap to 1 = anomaly, 0 = normal with a
# single vectorized comparison instead of a Python-level loop over every row.
anomaly_predictions = iso_forest.predict(X)
df_processed['anomaly_label'] = (anomaly_predictions == -1).astype('int64')

print("\nNumber of anomalies detected by Isolation Forest:")
print(df_processed['anomaly_label'].value_counts())

Training the Isolation Forest model...
Model trained!

Number of anomalies detected by Isolation Forest:
anomaly_label
0    629899
1      6363
Name: count, dtype: int64


In [5]:
# Binary classifier that will be fitted to reproduce the Isolation Forest's flags.
xgb_surrogate = xgb.XGBClassifier(
    objective='binary:logistic',
    random_state=42,
)

# Training target: the 0/1 anomaly flags stored on the processed frame.
y = df_processed['anomaly_label']

print("\nTraining the XGBoost surrogate model...")
xgb_surrogate.fit(X, y)
print("Surrogate model trained!")


Training the XGBoost surrogate model...
Surrogate model trained!


In [6]:
import os

# Make sure the output folder exists; a pre-existing folder is not an error.
os.makedirs('models', exist_ok=True)

# Objects to persist, paired with their destination files. The feature list is
# saved alongside the models so the training columns can be recovered later.
artifacts = (
    (iso_forest, 'models/isolation_forest_model.joblib'),
    (xgb_surrogate, 'models/xgb_surrogate_model.joblib'),
    (features, 'models/features.joblib'),
)
for artifact, target_path in artifacts:
    joblib.dump(artifact, target_path)

print("\nAll models and feature list have been saved to the 'models' folder.")


All models and feature list have been saved to the 'models' folder.


In [7]:
from sklearn.metrics import classification_report, confusion_matrix

# Reference labels come from the dataset's `isFraud` column; the "predictions"
# are the Isolation Forest's 0/1 anomaly flags stored on the same frame.
y_true_fraud, y_pred_anomaly = df_processed['isFraud'], df_processed['anomaly_label']

print("--- Evaluating Isolation Forest's ability to find REAL FRAUD ---")
print("\nClassification Report:")

# Anomaly flags are scored as if they were fraud predictions.
fraud_report = classification_report(y_true_fraud, y_pred_anomaly)
print(fraud_report)

--- Evaluating Isolation Forest's ability to find REAL FRAUD ---

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99    635445
           1       0.01      0.06      0.01       817

    accuracy                           0.99    636262
   macro avg       0.50      0.52      0.50    636262
weighted avg       1.00      0.99      0.99    636262



In [8]:
# Surrogate predictions on X. Note this is the same data the surrogate was
# fitted on, so the report below measures in-sample agreement only.
y_xgb_pred = xgb_surrogate.predict(X)

# The Isolation Forest's anomaly flags act as the reference labels here.
y_iso_forest_labels = y

print("\n--- Evaluating XGBoost's ability to MIMIC the Isolation Forest ---")
print("\nClassification Report:")
surrogate_report = classification_report(y_iso_forest_labels, y_xgb_pred)
print(surrogate_report)


--- Evaluating XGBoost's ability to MIMIC the Isolation Forest ---

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    629899
           1       0.98      0.98      0.98      6363

    accuracy                           1.00    636262
   macro avg       0.99      0.99      0.99    636262
weighted avg       1.00      1.00      1.00    636262



In [9]:
pip freeze > requirements.txt

Note: you may need to restart the kernel to use updated packages.
