In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of
# every file found under it (one path per line).
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/bank-account-fraud-dataset-neurips-2022/Base.csv
/kaggle/input/bank-account-fraud-dataset-neurips-2022/Variant IV.csv
/kaggle/input/bank-account-fraud-dataset-neurips-2022/Variant V.csv
/kaggle/input/bank-account-fraud-dataset-neurips-2022/Variant I.csv
/kaggle/input/bank-account-fraud-dataset-neurips-2022/Variant III.csv
/kaggle/input/bank-account-fraud-dataset-neurips-2022/Variant II.csv


In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import shap
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import logging
import os
import pickle
from tqdm import tqdm
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from collections import Counter
from sklearn.ensemble import IsolationForest

# ----- Setup Logging -----
# Request INFO-level logging on the root logger and create the module-level
# `logger` that the pipeline functions below use for progress messages.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# ----- Step 1: Load Data -----
def load_data(filepath):
    """Read the CSV at ``filepath`` into a pandas DataFrame.

    The source path is logged before reading and the resulting shape is
    logged afterwards.

    Args:
        filepath: Location of the CSV file; passed unchanged to ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    logger.info("Loading data from %s", filepath)
    frame = pd.read_csv(filepath)
    frame_shape = frame.shape
    logger.info("Data loaded. Shape: %s", frame_shape)
    return frame

# ----- Step 2: Preprocess Data -----
def preprocess_data(data):
    """Split ``data`` into train/test sets and scale/encode the features.

    The ``fraud_bool`` column is used as the target. Numeric feature columns
    are standardised, the five listed categorical columns are one-hot encoded
    (unknown categories are ignored at transform time), and any remaining
    column is passed through untouched. The transformer is fitted on the
    training split only and then applied to the test split.

    Args:
        data: DataFrame containing ``fraud_bool`` plus the feature columns.

    Returns:
        A 5-tuple ``(X_train_proc, X_test_proc, y_train, y_test, preprocessor)``
        where the first two items are the transformed feature matrices, the
        next two are the matching target splits and the last is the fitted
        ``ColumnTransformer``.
    """
    logger.info("Preprocessing data")
    target = data['fraud_bool']
    features = data.drop(columns='fraud_bool')

    # Numeric columns are detected from dtypes; categorical ones are fixed.
    numeric_features = features.select_dtypes(include=[np.number]).columns.tolist()
    categorical_features = [
        'payment_type',
        'employment_status',
        'housing_status',
        'source',
        'device_os',
    ]

    scaling_step = ("num", StandardScaler(), numeric_features)
    encoding_step = ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
    preprocessor = ColumnTransformer(
        transformers=[scaling_step, encoding_step],
        remainder="passthrough",
    )

    # 70/30 split, stratified on the target so both splits keep the class ratio.
    features_train, features_test, y_train, y_test = train_test_split(
        features,
        target,
        test_size=0.3,
        stratify=target,
        random_state=42,
    )

    # Fit on the training split only; the test split is merely transformed.
    train_matrix = preprocessor.fit_transform(features_train)
    test_matrix = preprocessor.transform(features_test)

    return train_matrix, test_matrix, y_train, y_test, preprocessor

# ----- Step 3: Oversampling (Hybrid SMOTE + ADASYN) -----
def oversample_data(X_train, y_train):
    """Resample the training set with SMOTE followed by ADASYN.

    Both samplers are created with ``random_state=42`` and otherwise default
    settings; ADASYN is applied to the output of SMOTE. The final class
    distribution is logged.

    NOTE(review): with default settings SMOTE presumably already balances the
    classes, in which case the ADASYN pass may have nothing left to generate --
    confirm against the logged class distribution.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels aligned with ``X_train``.

    Returns:
        ``(X_resampled, y_resampled)`` after both resampling passes.
    """
    logger.info("Performing hybrid oversampling using SMOTE and ADASYN")

    # Pass 1: SMOTE on the original training data.
    X_smote, y_smote = SMOTE(random_state=42).fit_resample(X_train, y_train)

    # Pass 2: ADASYN on top of the SMOTE output.
    X_balanced, y_balanced = ADASYN(random_state=42).fit_resample(X_smote, y_smote)

    class_counts = Counter(y_balanced)
    logger.info("Oversampling complete. New class distribution: %s", class_counts)
    return X_balanced, y_balanced

# ----- Step 4: Fit IsolationForest -----
def fit_isolation_forest(X_train):
    """Fit an ``IsolationForest`` on ``X_train`` and return it.

    The forest is built with ``random_state=42``, ``contamination=0.1`` and
    ``n_jobs=-1``.

    Args:
        X_train: Feature matrix the detector is fitted on.

    Returns:
        The fitted ``IsolationForest`` instance.
    """
    logger.info("Fitting IsolationForest")
    forest_settings = {"random_state": 42, "contamination": 0.1, "n_jobs": -1}
    detector = IsolationForest(**forest_settings)
    detector.fit(X_train)
    logger.info("IsolationForest fitted.")
    return detector

# ----- Step 5: Compute Anomaly Scores -----
def compute_anomaly_scores(iso_forest, X):
    """Return ``iso_forest.decision_function(X)``.

    Args:
        iso_forest: Fitted detector exposing ``decision_function``.
        X: Feature matrix to score.

    Returns:
        Whatever ``decision_function`` returns for ``X``.
    """
    logger.info("Computing anomaly scores")
    return iso_forest.decision_function(X)

# ----- Step 6: Augment Data with Anomaly Scores -----
def augment_data_with_anomaly_scores(X, anomaly_scores, preprocessor):
    """Wrap ``X`` in a DataFrame and append an ``anomaly_score`` column.

    Column labels are taken from ``preprocessor.get_feature_names_out()``.

    Args:
        X: Transformed feature matrix whose columns match the preprocessor output.
        anomaly_scores: One score per row of ``X``.
        preprocessor: Fitted transformer that supplies the output feature names.

    Returns:
        A DataFrame holding the columns of ``X`` followed by ``anomaly_score``.
    """
    logger.info("Augmenting data with anomaly scores")
    column_labels = preprocessor.get_feature_names_out().tolist()
    augmented = pd.DataFrame(X, columns=column_labels)
    augmented['anomaly_score'] = anomaly_scores
    return augmented

# ----- Step 6b: Train XGBoost in Batches (no grid search is performed) -----
def _take_rows(data, positions):
    """Select rows of ``data`` by integer position (pandas, array or sequence)."""
    if hasattr(data, "iloc"):
        return data.iloc[positions]
    if hasattr(data, "shape"):
        return data[positions]
    return np.asarray(data)[positions]


def train_xgboost_in_batches(X_train, y_train, batch_size=50000):
    """Train an ``XGBClassifier`` incrementally over shuffled row batches.

    The rows are shuffled once (seeded, so runs are repeatable) and split into
    near-equal batches of at most ``batch_size`` rows. The first batch trains a
    fresh model; every later batch continues from the previous booster via the
    ``xgb_model`` argument of ``fit``.

    Shuffling matters because the caller passes oversampled data: contiguous
    slices of such data can consist of a single class (presumably the
    synthetic rows are appended at the end -- verify), which breaks or
    degrades per-batch fitting.

    NOTE(review): each ``fit`` call presumably adds another ``n_estimators``
    boosting rounds on top of the loaded booster, so the final model grows with
    the number of batches -- confirm this is intended.

    Args:
        X_train: Feature matrix (DataFrame, ndarray or sparse matrix).
        y_train: Labels aligned row-for-row with ``X_train``.
        batch_size: Maximum number of rows per training batch; must be > 0.

    Returns:
        The fitted ``xgb.XGBClassifier``.

    Raises:
        ValueError: If ``batch_size`` is not positive, ``X_train`` has no rows,
            or ``X_train`` and ``y_train`` differ in length.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    n_samples = X_train.shape[0] if hasattr(X_train, "shape") else len(X_train)
    if n_samples == 0:
        raise ValueError("X_train is empty; nothing to train on")
    if len(y_train) != n_samples:
        raise ValueError("X_train and y_train must have the same number of rows")

    logger.info("Training XGBoost in batches using CPU & GPU")

    params = {
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "random_state": 42,
        # "hist" + device="cuda" is the current way to request GPU training;
        # "gpu_hist" is the deprecated spelling of the same thing.
        "tree_method": "hist",
        "device": "cuda",
        "max_depth": 5,
        "learning_rate": 0.1,
        "subsample": 0.8,
        "n_estimators": 200,
        "n_jobs": -1
    }

    model = xgb.XGBClassifier(**params)
    num_batches = int(np.ceil(n_samples / batch_size))

    # Shuffle once with a fixed seed, then cut into near-equal batches so that
    # no batch is a tiny remainder and each one mixes both classes.
    shuffled = np.random.default_rng(params["random_state"]).permutation(n_samples)
    batches = np.array_split(shuffled, num_batches)

    start = 0
    for i, batch_positions in enumerate(batches):
        end = start + len(batch_positions)
        logger.info(f"Training batch {i+1}/{num_batches}: Samples {start}-{end}")

        X_batch = _take_rows(X_train, batch_positions)
        y_batch = _take_rows(y_train, batch_positions)

        # Continue from the previous booster on every batch after the first.
        previous_booster = model.get_booster() if i > 0 else None
        model.fit(X_batch, y_batch, xgb_model=previous_booster)
        start = end

    return model

# ----- Step 7: Save Model -----
def save_model(model, preprocessor, iso_forest, directory="saved_models"):
    """Persist the three fitted pipeline objects with ``joblib``.

    ``directory`` is created first when it does not exist. The files written
    are ``xgb_model.pkl``, ``preprocessor.pkl`` and ``iso_forest.pkl``.

    Args:
        model: Trained classifier to store.
        preprocessor: Fitted feature transformer to store.
        iso_forest: Fitted anomaly detector to store.
        directory: Destination folder for the dump files.
    """
    directory_missing = not os.path.exists(directory)
    if directory_missing:
        os.makedirs(directory)

    # Dump in a fixed order: classifier, preprocessor, anomaly detector.
    artifacts = (
        (model, "xgb_model.pkl"),
        (preprocessor, "preprocessor.pkl"),
        (iso_forest, "iso_forest.pkl"),
    )
    for artifact, file_name in artifacts:
        joblib.dump(artifact, os.path.join(directory, file_name))

# ----- Step 8: Generate and Save Analytical Graphs -----
def generate_graphs(model, X_test, y_test, directory="saved_graphs"):
    """Evaluate ``model`` on the test set and save diagnostic plots as PNGs.

    Four files are written into ``directory`` (created if needed):
    ``confusion_matrix.png``, ``roc_curve.png``, ``feature_importance.png``
    and ``shap_summary.png``.

    Every plot gets its own figure, which is closed right after saving. This
    keeps figures from piling up in memory and stops a later plot from being
    drawn onto an earlier, still-current figure. Because the figures are
    closed, they are saved to disk but not left open for inline display.

    Args:
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        X_test: Test features in the same layout the model was trained on.
        y_test: True labels for ``X_test``.
        directory: Destination folder for the PNG files.
    """
    os.makedirs(directory, exist_ok=True)

    # Confusion Matrix Heatmap
    y_pred = model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title("Confusion Matrix")
    ax.set_xlabel("Predicted Label")
    ax.set_ylabel("True Label")
    fig.savefig(os.path.join(directory, "confusion_matrix.png"))
    plt.close(fig)

    # ROC Curve (uses the predicted probability of the positive class)
    y_pred_prob = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.plot(fpr, tpr, label='ROC Curve')
    ax.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")
    ax.set_title("ROC Curve")
    ax.legend()
    fig.savefig(os.path.join(directory, "roc_curve.png"))
    plt.close(fig)

    # Feature Importance: plot_importance returns the Axes it drew on.
    importance_ax = xgb.plot_importance(model)
    importance_ax.figure.savefig(os.path.join(directory, "feature_importance.png"))
    plt.close(importance_ax.figure)

    # SHAP Analysis
    explainer = shap.Explainer(model)
    shap_values = explainer(X_test)
    # Start from a fresh figure: summary_plot appears to draw on the current
    # pyplot figure, which would otherwise be a leftover from a previous plot.
    plt.figure()
    shap.summary_plot(shap_values, X_test, show=False)
    shap_fig = plt.gcf()
    shap_fig.savefig(os.path.join(directory, "shap_summary.png"))
    plt.close(shap_fig)

# ----- Main Execution -----
def main():
    """Run the fraud-detection pipeline end to end.

    Steps: load the base CSV, preprocess and split it, oversample the training
    split, fit an IsolationForest on the oversampled data, append its anomaly
    score as an extra feature to both the training and test matrices, train
    XGBoost in batches on the augmented training data, save the fitted
    objects, and write evaluation plots for the augmented test data.
    """
    filepath = "/kaggle/input/bank-account-fraud-dataset-neurips-2022/Base.csv"
    data = load_data(filepath)
    X_train, X_test, y_train, y_test, preprocessor = preprocess_data(data)
    X_train_resampled, y_train_resampled = oversample_data(X_train, y_train)

    iso_forest = fit_isolation_forest(X_train_resampled)
    anomaly_scores_train = compute_anomaly_scores(iso_forest, X_train_resampled)
    anomaly_scores_test = compute_anomaly_scores(iso_forest, X_test)
    X_train_aug = augment_data_with_anomaly_scores(X_train_resampled, anomaly_scores_train, preprocessor)
    X_test_aug = augment_data_with_anomaly_scores(X_test, anomaly_scores_test, preprocessor)

    # Train and evaluate on the anomaly-augmented matrices so the extra
    # feature is actually used; train and test must share the same columns.
    best_model = train_xgboost_in_batches(X_train_aug, y_train_resampled)
    save_model(best_model, preprocessor, iso_forest)
    generate_graphs(best_model, X_test_aug, y_test)
    logger.info("Model training complete and saved.")

# Entry point: run the whole pipeline when this code is executed directly
# (the guard is skipped when the code is imported as a module).
if __name__ == "__main__":
    main()


# Create a model with more efficient data consumption

In [None]:
# import pandas as pd
# import numpy as np
# import xgboost as xgb
# import shap
# import matplotlib.pyplot as plt
# import seaborn as sns
# import joblib
# import logging
# import os
# import pickle
# from tqdm import tqdm
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import confusion_matrix, classification_report, roc_curve
# from imblearn.over_sampling import SMOTE, ADASYN
# from sklearn.preprocessing import StandardScaler, OneHotEncoder
# from sklearn.compose import ColumnTransformer
# from collections import Counter
# from sklearn.ensemble import IsolationForest
# import cudf  # GPU-accelerated pandas

# # ----- Setup Logging -----
# logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# logger = logging.getLogger(__name__)

# # ----- Step 1: Load Data -----
# def load_data(filepath):
#     logger.info(f"Loading data from {filepath}")
#     data = cudf.read_csv(filepath)
#     logger.info(f"Data loaded. Shape: {data.shape}")
#     return data

# # ----- Step 2: Preprocess Data -----
# def preprocess_data(data):
#     logger.info("Preprocessing data")
#     y = data['fraud_bool']
#     X = data.drop('fraud_bool', axis=1)

#     categorical_cols = ['payment_type', 'employment_status', 'housing_status', 'source', 'device_os']
#     numerical_cols = X.select_dtypes(include=[np.number]).columns.tolist()

#     preprocessor = ColumnTransformer(
#         transformers=[
#             ("num", StandardScaler(), numerical_cols),
#             ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)
#         ],
#         remainder="passthrough"
#     )

#     X_train, X_test, y_train, y_test = train_test_split(X.to_pandas(), y.to_pandas(), test_size=0.3, stratify=y.to_pandas(), random_state=42)
#     X_train_proc = preprocessor.fit_transform(X_train)
#     X_test_proc = preprocessor.transform(X_test)

#     logger.info("Preprocessing complete.")
#     return X_train_proc, X_test_proc, y_train, y_test, preprocessor

# # ----- Step 3: Oversampling -----
# def oversample_data(X_train, y_train):
#     logger.info("Performing hybrid oversampling using SMOTE and ADASYN")
#     smote = SMOTE(random_state=42)
#     X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
#     adasyn = ADASYN(random_state=42)
#     X_resampled, y_resampled = adasyn.fit_resample(X_resampled, y_resampled)
#     logger.info(f"Oversampling complete. New class distribution: {Counter(y_resampled)}")
#     return X_resampled, y_resampled

# # ----- Step 4: Fit IsolationForest -----
# def fit_isolation_forest(X_train):
#     logger.info("Fitting IsolationForest")
#     iso_forest = IsolationForest(random_state=42, contamination=0.1, n_jobs=-1)
#     iso_forest.fit(X_train)
#     logger.info("IsolationForest fitted.")
#     return iso_forest

# # ----- Step 5: Compute Anomaly Scores -----
# def compute_anomaly_scores(iso_forest, X):
#     logger.info("Computing anomaly scores")
#     return iso_forest.decision_function(X)

# # ----- Step 6: Augment Data with Anomaly Scores -----
# def augment_data_with_anomaly_scores(X, anomaly_scores, preprocessor):
#     logger.info("Augmenting data with anomaly scores")
#     feature_names = preprocessor.get_feature_names_out().tolist()
#     X_aug = pd.DataFrame(X, columns=feature_names)
#     X_aug['anomaly_score'] = anomaly_scores
#     return X_aug

# # ----- Step 7: Train XGBoost on GPU -----
# def train_xgboost(X_train, y_train):
#     logger.info("Training XGBoost using GPU")
#     dtrain = xgb.DMatrix(X_train, label=y_train)
#     params = {
#         "objective": "binary:logistic",
#         "eval_metric": "logloss",
#         "random_state": 42,
#         "tree_method": "hist",  # Corrected method
#         "device": "cuda",
#         "max_depth": 5,
#         "learning_rate": 0.1,
#         "subsample": 0.8,
#         "n_estimators": 200,
#         "n_jobs": -1
#     }
#     model = xgb.train(params, dtrain, num_boost_round=200)
#     logger.info("XGBoost training complete.")
#     return model

# # ----- Step 8: Save Model -----
# def save_model(model, preprocessor, directory="saved_models"):
#     if not os.path.exists(directory):
#         os.makedirs(directory)
#     joblib.dump(model, os.path.join(directory, "xgb_model.pkl"))
#     joblib.dump(preprocessor, os.path.join(directory, "preprocessor.pkl"))
#     logger.info("Model and preprocessor saved.")

# # ----- Step 9: Generate Graphs -----
# def generate_graphs(model, X_test, y_test, preprocessor, directory="saved_graphs"):
#     if not os.path.exists(directory):
#         os.makedirs(directory)
    
#     logger.info("Generating graphs and SHAP analysis")
#     y_pred_prob = model.predict(xgb.DMatrix(X_test))
#     y_pred = y_pred_prob > 0.5
    
#     cm = confusion_matrix(y_test, y_pred)
#     plt.figure(figsize=(6, 4))
#     sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
#     plt.title("Confusion Matrix")
#     plt.xlabel("Predicted Label")
#     plt.ylabel("True Label")
#     plt.savefig(os.path.join(directory, "confusion_matrix.png"))
#     plt.close()
#     logger.info("Confusion matrix saved.")
    
#     fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
#     plt.figure(figsize=(6, 4))
#     plt.plot(fpr, tpr, label='ROC Curve')
#     plt.title("ROC Curve")
#     plt.xlabel("False Positive Rate")
#     plt.ylabel("True Positive Rate")
#     plt.legend()
#     plt.savefig(os.path.join(directory, "roc_curve.png"))
#     plt.close()
#     logger.info("ROC curve saved.")
    
#     explainer = shap.Explainer(model)
#     shap_values = explainer(X_test)
#     shap_df = pd.DataFrame(shap_values.values, columns=preprocessor.get_feature_names_out().tolist())
#     shap_df.to_csv(os.path.join(directory, "shap_analysis.csv"), index=False)
#     logger.info("SHAP analysis saved.")

# # ----- Main Execution -----
# def main():
#     filepath = "/kaggle/input/bank-account-fraud-dataset-neurips-2022/Base.csv"
#     data = load_data(filepath)
#     X_train, X_test, y_train, y_test, preprocessor = preprocess_data(data)
#     X_train_resampled, y_train_resampled = oversample_data(X_train, y_train)
#     iso_forest = fit_isolation_forest(X_train_resampled)
#     X_train_aug = augment_data_with_anomaly_scores(X_train_resampled, compute_anomaly_scores(iso_forest, X_train_resampled), preprocessor)
#     X_test_aug = augment_data_with_anomaly_scores(X_test, compute_anomaly_scores(iso_forest, X_test), preprocessor)
#     model = train_xgboost(X_train_resampled, y_train_resampled)
#     save_model(model, preprocessor)
#     generate_graphs(model, X_test, y_test, preprocessor)
#     logger.info("Model training and analysis complete.")

# if __name__ == "__main__":
#     main()


In [None]:
# import pandas as pd

# # Load the dataset
# file_path = "/kaggle/input/bank-account-fraud-dataset-neurips-2022/Base.csv"  # Replace with the actual file path
# data = pd.read_csv(file_path)

# # Check for missing values
# print(data.isnull().sum())
# print(data.head())


In [None]:
# import numpy as np
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler, OneHotEncoder
# from sklearn.compose import ColumnTransformer

# def preprocess_data(data):
#     y = data['fraud_bool']
#     X = data.drop('fraud_bool', axis=1)

#     categorical_cols = ['payment_type', 'employment_status', 'housing_status', 'source', 'device_os']
#     numerical_cols = X.select_dtypes(include=[np.number]).columns.tolist()

#     preprocessor = ColumnTransformer(
#         transformers=[
#             ("num", StandardScaler(), numerical_cols),
#             ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)
#         ],
#         remainder="passthrough"
#     )

#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
#     X_train_proc = preprocessor.fit_transform(X_train)
#     X_test_proc = preprocessor.transform(X_test)

#     return X_train_proc, X_test_proc, y_train, y_test, preprocessor

# X_train, X_test, y_train, y_test, preprocessor = preprocess_data(data)
# print("Preprocessing done!")


In [None]:
# from tqdm import tqdm
# from imblearn.combine import SMOTEENN
# from imblearn.over_sampling import ADASYN

# def apply_oversampling(X_train, y_train):
#     tqdm.pandas(desc="Applying SMOTE + ADASYN")
#     # smote = SMOTE(random_state=42)
#     # X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
#     # adasyn = ADASYN(random_state=42)
#     # X_resampled, y_resampled = adasyn.fit_resample(X_resampled, y_resampled)
#     # logger.info(f"Oversampling complete. New class distribution: {Counter(y_resampled)}")
#     # return X_resampled, y_resampled
    
#     # SMOTE + ENN
#     smote_adasyn = SMOTEENN(sampling_strategy="auto")
#     X_train_resampled, y_train_resampled = smote_adasyn.fit_resample(X_train, y_train)
    
#     # ADASYN
#     adasyn = ADASYN(sampling_strategy="auto", random_state=42)
#     X_train_final, y_train_final = adasyn.fit_resample(X_train_resampled, y_train_resampled)
    
#     return X_train_final, y_train_final

# X_train_bal, y_train_bal = apply_oversampling(X_train, y_train)
# print(f"Oversampling done! New shape: {X_train_bal.shape}")


In [None]:
# from sklearn.ensemble import IsolationForest

# def fit_isolation_forest(X_train):
#     print("Training Isolation Forest...")
#     iso_forest = IsolationForest(n_estimators=100, contamination=0.02, random_state=42, n_jobs=-1)
#     iso_forest.fit(X_train)
    
#     anomaly_scores = iso_forest.decision_function(X_train)  # Get anomaly scores
#     return iso_forest, anomaly_scores

# iso_forest_model, anomaly_scores = fit_isolation_forest(X_train_bal)
# print("Isolation Forest training done!")


In [None]:
# import numpy as np

# # Add anomaly score as a new feature
# X_train_bal = np.hstack((X_train_bal, anomaly_scores.reshape(-1, 1)))
# X_test = np.hstack((X_test, iso_forest_model.decision_function(X_test).reshape(-1, 1)))

# print("Anomaly scores added to dataset!")


In [None]:
# from xgboost import XGBClassifier
# from sklearn.model_selection import GridSearchCV

# def train_xgboost(X_train, y_train):
#     print("Training XGBoost with Grid Search...")

#     param_grid = {
#         'n_estimators': [100, 200],
#         'max_depth': [3, 5, 7],
#         'learning_rate': [0.01, 0.1, 0.2],
#         'subsample': [0.8, 1.0],
#         'colsample_bytree': [0.8, 1.0]
#     }

#     xgb = XGBClassifier(random_state=42, n_jobs=-1)
#     grid_search = GridSearchCV(xgb, param_grid, scoring="f1", cv=3, verbose=2, n_jobs=-1)
#     grid_search.fit(X_train, y_train)

#     print(f"Best parameters: {grid_search.best_params_}")
#     return grid_search.best_estimator_

# xgb_model = train_xgboost(X_train_bal, y_train_bal)
# print("XGBoost training complete!")


In [None]:
# import joblib

# joblib.dump(xgb_model, "fraud_detection_xgb.pkl")
# print("Model saved successfully!")


In [None]:
# import matplotlib.pyplot as plt
# import seaborn as sns
# from sklearn.metrics import confusion_matrix

# # Predict on test data
# y_pred = xgb_model.predict(X_test)

# # Generate Confusion Matrix
# cm = confusion_matrix(y_test, y_pred)

# plt.figure(figsize=(6, 5))
# sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=["No Fraud", "Fraud"], yticklabels=["No Fraud", "Fraud"])
# plt.xlabel("Predicted")
# plt.ylabel("Actual")
# plt.title("Confusion Matrix")
# plt.show()


In [None]:
# from sklearn.metrics import roc_curve, auc

# y_pred_prob = xgb_model.predict_proba(X_test)[:, 1]
# fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
# roc_auc = auc(fpr, tpr)

# plt.figure(figsize=(6, 5))
# plt.plot(fpr, tpr, color="blue", label=f"AUC = {roc_auc:.2f}")
# plt.plot([0, 1], [0, 1], "r--")
# plt.xlabel("False Positive Rate")
# plt.ylabel("True Positive Rate")
# plt.title("ROC Curve")
# plt.legend()
# plt.show()


In [None]:
# from sklearn.metrics import classification_report

# report = classification_report(y_test, y_pred)
# print("Classification Report:\n", report)


In [None]:
# import shap
# import numpy as np
# import pandas as pd

# explainer = shap.Explainer(xgb_model)
# shap_values = explainer(X_test)

# # Convert SHAP values to DataFrame
# shap_df = pd.DataFrame(shap_values.values, columns=[f"Feature_{i}" for i in range(X_test.shape[1])])
# shap_df.to_csv("shap_analysis.csv", index=False)

# print("SHAP values saved to shap_analysis.csv")
