In [13]:
import pandas as pd

# Dataset file; it is expected to sit in the same folder as this notebook.
DATA_FILE = "anomaly_data.csv"
df = pd.read_csv(DATA_FILE)

# Sanity check: print (rows, columns), then preview the first rows.
print(df.shape)
df.head()


(5000, 7)


Unnamed: 0,TransactionID,CustomerAge,TransactionAmount,TransactionTimeHour,NumPrevTransactions,LocationRiskScore,IsFraud
0,1,56,316.02,15,17,0.95,0
1,2,69,50.92,19,45,0.37,0
2,3,46,183.13,5,24,0.83,0
3,4,32,20.89,10,27,0.19,0
4,5,60,111.61,23,7,0.81,0


In [14]:
# Show the column names as a plain list, then summary statistics for every
# column (include="all" asks describe() to cover non-numeric columns as well).
print(list(df.columns))
df.describe(include="all")


['TransactionID', 'CustomerAge', 'TransactionAmount', 'TransactionTimeHour', 'NumPrevTransactions', 'LocationRiskScore', 'IsFraud']


Unnamed: 0,TransactionID,CustomerAge,TransactionAmount,TransactionTimeHour,NumPrevTransactions,LocationRiskScore,IsFraud
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,2500.5,43.5846,97.259972,11.7428,24.9502,0.50756,0.0352
std,1443.520003,14.919094,97.444622,6.827442,14.264544,0.287437,0.184303
min,1.0,18.0,0.0,0.0,1.0,0.0,0.0
25%,1250.75,31.0,27.675,6.0,13.0,0.26,0.0
50%,2500.5,43.0,66.61,12.0,25.0,0.51,0.0
75%,3750.25,56.0,134.5025,18.0,37.0,0.75,0.0
max,5000.0,69.0,772.35,23.0,49.0,1.0,1.0


In [16]:
from pycaret.anomaly import *

# Columns kept out of the experiment: TransactionID is only a row identifier.
# NOTE(review): "IsFraud" is not excluded, so the ground-truth label is passed
# to the detector as an ordinary feature (it shows up in the saved pipeline's
# column list). Any comparison of flagged anomalies against IsFraud is therefore
# optimistic. Excluding it here also removes it from assign_model() output, so
# cells that read labeled_df["IsFraud"] must switch to df["IsFraud"] first.
ignored_columns = ["TransactionID"]

# Initialise the PyCaret anomaly-detection experiment.
anom_exp = setup(
    data=df,
    ignore_features=ignored_columns,
    normalize=True,    # scale features before modelling
    session_id=123,    # fixed seed so runs are repeatable
    verbose=False,     # hides extra logs (used in place of the old 'silent=True')
)


In [17]:
# Fit an Isolation Forest on the experiment data, then attach the per-row
# Anomaly flag and Anomaly_Score columns and preview the result.
iforest = create_model(model="iforest")
labeled_df = assign_model(model=iforest)
labeled_df.head()


Unnamed: 0,CustomerAge,TransactionAmount,TransactionTimeHour,NumPrevTransactions,LocationRiskScore,IsFraud,Anomaly,Anomaly_Score
0,56,316.019989,15,17,0.95,0,0,-0.057221
1,69,50.919998,19,45,0.37,0,0,-0.071065
2,46,183.130005,5,24,0.83,0,0,-0.117112
3,32,20.889999,10,27,0.19,0,0,-0.143919
4,60,111.610001,23,7,0.81,0,0,-0.055577


In [18]:
# Train the Isolation Forest detector ("iforest" is PyCaret's model ID for it)
# and echo the fitted estimator so its hyper-parameters are visible.
iforest = create_model(model="iforest")
iforest


IForest(behaviour='new', bootstrap=False, contamination=0.05,
    max_features=1.0, max_samples='auto', n_estimators=100, n_jobs=-1,
    random_state=123, verbose=0)

In [19]:
# Attach the model's verdict to each row: `Anomaly` (0/1 flag) and
# `Anomaly_Score`, then look at the first 20 labelled rows.
labeled_df = assign_model(model=iforest)
labeled_df.head(n=20)


Unnamed: 0,CustomerAge,TransactionAmount,TransactionTimeHour,NumPrevTransactions,LocationRiskScore,IsFraud,Anomaly,Anomaly_Score
0,56,316.019989,15,17,0.95,0,0,-0.057221
1,69,50.919998,19,45,0.37,0,0,-0.071065
2,46,183.130005,5,24,0.83,0,0,-0.117112
3,32,20.889999,10,27,0.19,0,0,-0.143919
4,60,111.610001,23,7,0.81,0,0,-0.055577
5,25,377.26001,2,9,0.66,0,0,-0.020055
6,38,10.75,19,43,0.19,0,0,-0.110697
7,56,0.84,9,35,0.3,0,0,-0.129521
8,36,56.84,0,12,0.13,0,0,-0.085027
9,40,9.72,19,21,0.38,0,0,-0.144841


In [20]:
# Compare model's anomalies with real fraud labels.
#
# Ground truth is taken from the original `df`, aligned on the row index,
# instead of from `labeled_df`. The ignored TransactionID column is missing from
# `labeled_df`, so `labeled_df["IsFraud"]` is only available while the label is
# also being used as a model feature; reading it from `df` keeps this cell
# working once IsFraud is excluded from training.
# NOTE(review): while IsFraud is still among the training features, the overlap
# reported below is optimistic and should not be read as detection performance.
is_fraud = df.loc[labeled_df.index, "IsFraud"] == 1
is_flagged = labeled_df["Anomaly"] == 1

actual_frauds = is_fraud.sum()
model_anomalies = is_flagged.sum()
overlap = int((is_fraud & is_flagged).sum())

# Ratios make the raw counts interpretable; fall back to NaN when a denominator
# is zero (no frauds in the data, or nothing flagged).
recall = overlap / actual_frauds if actual_frauds else float("nan")
precision = overlap / model_anomalies if model_anomalies else float("nan")

print(f"✅ Actual frauds in data: {actual_frauds}")
print(f"✅ Anomalies flagged by model: {model_anomalies}")
print(f"✅ Frauds correctly identified as anomalies: {overlap}")
print(f"Recall (share of frauds flagged): {recall:.1%}")
print(f"Precision (share of flags that are frauds): {precision:.1%}")


✅ Actual frauds in data: 176
✅ Anomalies flagged by model: 250
✅ Frauds correctly identified as anomalies: 176


In [21]:
# Persist the preprocessing pipeline together with the trained detector so it
# can be reloaded later; PyCaret appends ".pkl" to the given name.
save_model(model=iforest, model_name="anomaly_iforest_model")


Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(include=['CustomerAge', 'TransactionAmount',
                                              'TransactionTimeHour',
                                              'NumPrevTransactions',
                                              'LocationRiskScore', 'IsFraud'],
                                     transformer=SimpleImputer())),
                 ('categorical_imputer',
                  TransformerWrapper(include=[],
                                     transformer=SimpleImputer(strategy='most_frequent'))),
                 ('normalize', TransformerWrapper(transformer=StandardScaler())),
                 ('trained_model',
                  IForest(behaviour='new', bootstrap=False, contamination=0.05,
     max_features=1.0, max_samples='auto', n_estimators=100, n_jobs=-1,
     random_state=123, verbose=0))]),
 'anomaly_iforest_model.pkl')

In [22]:
# Visualise the fitted detector with PyCaret's t-SNE plot.
plot_model(model=iforest, plot="tsne")
