In [39]:
import warnings

import joblib
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
# NOTE(review): no category/module filter is given, so this silences every
# warning for the rest of the session (including solver-convergence and
# deprecation warnings from the libraries used below) — confirm that is intended.
warnings.filterwarnings("ignore")

In [40]:
!pip install xgboost

Defaulting to user installation because normal site-packages is not writeable


In [41]:
# Load the raw transaction table from the CSV that sits next to the notebook.
DATA_PATH = "AIML Dataset.csv"
df = pd.read_csv(DATA_PATH)

In [42]:
# Preview the first rows to sanity-check the loaded columns.
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [43]:
# (rows, columns) of the loaded frame.
df.shape

(6362620, 11)

In [44]:
# Balance-consistency features.
#
# The *_diff columns hold the residual of the bookkeeping identity on each side
# of a transaction (0 when the sender's balance drops by exactly `amount`, or
# the receiver's balance rises by exactly `amount`).
df['sender_balance_diff'] = df['oldbalanceOrg'] - df['newbalanceOrig'] - df['amount']
df['receiver_balance_diff'] = df['newbalanceDest'] - df['oldbalanceDest'] - df['amount']

# Residuals smaller than this are treated as floating-point noise.
# NOTE(review): assumes amounts/balances are recorded to two decimals, so any
# genuine mismatch is at least 0.01 — confirm against the data dictionary.
BALANCE_TOL = 1e-3

# Derive the binary flags from the residuals with a tolerance rather than an
# exact float `!=`: binary floating point leaves residues such as 1.455e-11 on
# rows that balance, and an exact comparison then depends on the order in
# which the subtraction/addition happens to be evaluated.
# `~(... <= tol)` (instead of `... > tol`) keeps NaN rows flagged as mismatches,
# which is what the exact `!=` comparison did.
df['is_sender_balance_mismatch'] = (~df['sender_balance_diff'].abs().le(BALANCE_TOL)).astype(int)
df['is_receiver_balance_mismatch'] = (~df['receiver_balance_diff'].abs().le(BALANCE_TOL)).astype(int)

In [45]:
# Re-inspect the head to confirm the four engineered columns were added.
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,sender_balance_diff,receiver_balance_diff,is_sender_balance_mismatch,is_receiver_balance_mismatch
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0,1.455192e-11,-9839.64,0,1
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0,-1.136868e-12,-1864.28,0,1
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0,0.0,-181.0,0,1
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0,0.0,-21363.0,0,1
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0,0.0,-11668.14,0,1


In [46]:
# Build the modelling frame by dropping the columns that are not fed to the
# models: `step`, the two account-name columns and `isFlaggedFraud`.
# (Each label is listed once; the earlier list repeated 'nameDest'.)
data = df.drop(columns=['step', 'nameOrig', 'nameDest', 'isFlaggedFraud'])

In [47]:
# Preview the reduced frame after the column drop.
data.head()

Unnamed: 0,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,sender_balance_diff,receiver_balance_diff,is_sender_balance_mismatch,is_receiver_balance_mismatch
0,PAYMENT,9839.64,170136.0,160296.36,0.0,0.0,0,1.455192e-11,-9839.64,0,1
1,PAYMENT,1864.28,21249.0,19384.72,0.0,0.0,0,-1.136868e-12,-1864.28,0,1
2,TRANSFER,181.0,181.0,0.0,0.0,0.0,1,0.0,-181.0,0,1
3,CASH_OUT,181.0,181.0,0.0,21182.0,0.0,1,0.0,-21363.0,0,1
4,PAYMENT,11668.14,41554.0,29885.86,0.0,0.0,0,0.0,-11668.14,0,1


In [48]:
# Separate the binary fraud label from the feature matrix.
y = data['isFraud']
X = data.drop(columns='isFraud')

In [49]:
# Feature groups; each group is routed to its own transformer in the
# preprocessing step (one-hot / scaling / passthrough).
categorical_cols = ['type']
boolean_cols = [
    'is_sender_balance_mismatch',
    'is_receiver_balance_mismatch',
]
numerical_cols = [
    'amount',
    'oldbalanceOrg', 'newbalanceOrig',
    'oldbalanceDest', 'newbalanceDest',
    'sender_balance_diff', 'receiver_balance_diff',
]

In [50]:
# Column-wise preprocessing:
#   - categorical columns are one-hot encoded (categories unseen at fit time
#     are ignored at transform time instead of raising),
#   - numerical columns are standardised,
#   - the 0/1 flag columns are passed through untouched.
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ('num', StandardScaler(), numerical_cols),
    ('bool', 'passthrough', boolean_cols),
])

In [51]:
# Class-imbalance ratio (negatives per positive); passed to XGBoost as
# `scale_pos_weight` when the classifiers are built.
n_negative = int((y == 0).sum())
n_positive = int((y == 1).sum())
pos_weight = n_negative / n_positive
print(f"scale_pos_weight for XGBoost: {pos_weight:.2f}")

scale_pos_weight for XGBoost: 773.70


In [52]:
# =========================================
# Classifiers + Hyperparameter Grids
# =========================================
# Maps a display name to (unfitted estimator, param grid). Grid keys carry the
# `classifier__` prefix because each estimator is tuned as the 'classifier'
# step of a Pipeline. Class imbalance is handled with class_weight="balanced"
# for the scikit-learn models and with scale_pos_weight for XGBoost.
models = {
    "Logistic Regression": (
        LogisticRegression(max_iter=1000, class_weight="balanced"),
        {"classifier__C": [0.01, 0.1, 1, 10]}
    ),
    "Decision Tree": (
        DecisionTreeClassifier(class_weight="balanced", random_state=42),
        {"classifier__max_depth": [5, 10, 20, None],
         "classifier__min_samples_split": [2, 5, 10]}
    ),
    "Random Forest": (
        RandomForestClassifier(class_weight="balanced", random_state=42, n_jobs=-1),
        {"classifier__n_estimators": [100, 200],
         "classifier__max_depth": [10, 20, None]}
    ),
    "XGBoost": (
        # `use_label_encoder` is intentionally not passed: it is deprecated and
        # has no effect in current XGBoost releases (it only triggers an
        # "unused parameter" warning). The labels here are already 0/1 integers.
        XGBClassifier(eval_metric="logloss",
                      random_state=42, n_jobs=-1, scale_pos_weight=pos_weight),
        {"classifier__n_estimators": [100, 200],
         "classifier__learning_rate": [0.05, 0.1],
         "classifier__max_depth": [3, 5]}
    )
}

In [53]:
# Hold out 20% of the rows for testing; stratifying on the label keeps the
# fraud rate the same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [54]:
# Tune every candidate with a 3-fold grid search on the training split, then
# score the refit best estimator once on the held-out test split.
# Each entry of `results` is
# [model name, accuracy, precision, recall, F1, best params].
results = []

for name, (model, params) in models.items():
    print(f"\n🚀 Training {name} ...")
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', model)])

    # Model selection is driven by F1 on the positive (fraud) class. The search
    # itself runs serially (n_jobs=1); the forest and XGBoost estimators
    # parallelise internally through their own n_jobs=-1.
    grid = GridSearchCV(pipe, param_grid=params, scoring="f1", n_jobs=1, cv=3, verbose=1)
    grid.fit(X_train, y_train)

    best_model = grid.best_estimator_
    y_pred = best_model.predict(X_test)

    # zero_division=0 on all three ratio metrics so an undefined score is
    # reported as 0 consistently (previously only precision set it).
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    print(f"\n✅ Best Params for {name}: {grid.best_params_}")
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred, digits=4))

    results.append([name, acc, prec, rec, f1, grid.best_params_])


🚀 Training Logistic Regression ...
Fitting 3 folds for each of 4 candidates, totalling 12 fits

✅ Best Params for Logistic Regression: {'classifier__C': 0.01}
Confusion Matrix:
 [[1237594   33287]
 [      5    1638]]
Classification Report:
               precision    recall  f1-score   support

           0     1.0000    0.9738    0.9867   1270881
           1     0.0469    0.9970    0.0896      1643

    accuracy                         0.9738   1272524
   macro avg     0.5234    0.9854    0.5382   1272524
weighted avg     0.9988    0.9738    0.9856   1272524


🚀 Training Decision Tree ...
Fitting 3 folds for each of 12 candidates, totalling 36 fits

✅ Best Params for Decision Tree: {'classifier__max_depth': 5, 'classifier__min_samples_split': 2}
Confusion Matrix:
 [[1270881       0]
 [      4    1639]]
Classification Report:
               precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000   1270881
           1     1.0000    0.9976    0.9

In [55]:
# Collect the per-model test metrics gathered by the training loop into one table.
results_df = pd.DataFrame(
    results,
    columns=["Model", "Accuracy", "Precision", "Recall", "F1-score", "Best Params"],
)
# to_string() renders every cell in full; printing the frame directly cuts the
# "Best Params" dicts off with "..." and wraps the table.
print("\n📊 Final Comparison:\n", results_df.to_string())


📊 Final Comparison:
                  Model  Accuracy  Precision    Recall  F1-score  \
0  Logistic Regression  0.973838   0.046901  0.996957  0.089587   
1        Decision Tree  0.999997   1.000000  0.997565  0.998781   
2        Random Forest  0.999997   1.000000  0.997565  0.998781   
3              XGBoost  0.999924   0.946305  0.997565  0.971259   

                                         Best Params  
0                            {'classifier__C': 0.01}  
1  {'classifier__max_depth': 5, 'classifier__min_...  
2  {'classifier__max_depth': 10, 'classifier__n_e...  
3  {'classifier__learning_rate': 0.1, 'classifier...  


# Random Forest pipeline

In [65]:

# =============================
# Random Forest Pipeline
# =============================
# Final Random Forest with fixed hyper-parameters.
# The column groups and `preprocessor` come from the preprocessing cells
# earlier in the notebook; they are reused here instead of re-declaring
# identical copies, and X_train / X_test / y_train / y_test come from the
# existing train/test split cell.
rf = RandomForestClassifier(
    class_weight="balanced",
    n_estimators=100,
    max_depth=10,
    n_jobs=-1,
    random_state=42
)

# Pipeline.fit fits its steps in place, so give this pipeline its own unfitted
# copy of the preprocessor rather than sharing one mutable instance with the
# other pipelines in the notebook.
rf_pipeline = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('classifier', rf)
])

# =============================
# Train & Evaluate
# =============================
rf_pipeline.fit(X_train, y_train)
y_pred = rf_pipeline.predict(X_test)

print("✅ Random Forest Results")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred, digits=4))

# Save model (preprocessing + classifier are persisted together)
joblib.dump(rf_pipeline, "rf_pipeline.pkl")

✅ Random Forest Results
Confusion Matrix:
 [[1270881       0]
 [      4    1639]]
Classification Report:
               precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000   1270881
           1     1.0000    0.9976    0.9988      1643

    accuracy                         1.0000   1272524
   macro avg     1.0000    0.9988    0.9994   1272524
weighted avg     1.0000    1.0000    1.0000   1272524



['rf_pipeline.pkl']

# XGBoost

In [68]:
# =============================
# XGBoost Pipeline
# =============================
# Final XGBoost model with fixed hyper-parameters.
# `pos_weight` (negatives per positive) is the value computed earlier in the
# notebook; `preprocessor` and the train/test split are likewise reused.
# `use_label_encoder` is intentionally not passed: it is deprecated and has no
# effect in current XGBoost releases.
xgb = XGBClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=5,
    scale_pos_weight=pos_weight,
    eval_metric="logloss",
    random_state=42,
    n_jobs=-1
)

# Pipeline.fit fits its steps in place, so use a fresh unfitted copy of the
# preprocessor instead of the instance another pipeline may already hold.
xgb_pipeline = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('classifier', xgb)
])

# =============================
# Train & Evaluate
# =============================
xgb_pipeline.fit(X_train, y_train)
y_pred = xgb_pipeline.predict(X_test)

print("\n✅ XGBoost Results")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred, digits=4))

# Save model (preprocessing + classifier are persisted together)
joblib.dump(xgb_pipeline, "xgb_pipeline.pkl")



✅ XGBoost Results
Confusion Matrix:
 [[1270788      93]
 [      4    1639]]
Classification Report:
               precision    recall  f1-score   support

           0     1.0000    0.9999    1.0000   1270881
           1     0.9463    0.9976    0.9713      1643

    accuracy                         0.9999   1272524
   macro avg     0.9732    0.9987    0.9856   1272524
weighted avg     0.9999    0.9999    0.9999   1272524



['xgb_pipeline.pkl']