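# Fraud Detection - Model Building (Interim 2)
## Task 2: Model Building and Training

This notebook builds and trains fraud-detection models: it handles class imbalance with SMOTE, fits a logistic-regression baseline and an XGBoost model, cross-validates, compares the two, and saves the selected model.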

## 1. Setup and Imports

In [None]:
import pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nfrom sklearn.model_selection import train_test_split, StratifiedKFold\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve, auc\nimport warnings\nwarnings.filterwarnings("ignore")\n\n# Set style\nplt.style.use("seaborn-v0_8-darkgrid")\nsns.set_palette("husl")

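## 2. Load and Prepare Data

*Note: for demonstration, this notebook generates synthetic data; replace this cell with the actual data loading. The synthetic features are random noise drawn independently of the labels, so the scores reported in later sections are not indicative of real-world performance.*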

In [None]:
# Build a small, heavily imbalanced synthetic dataset (demonstration only).
np.random.seed(42)
n_samples = 10000
n_fraud = int(n_samples * 0.015)  # 1.5% fraud rate

# Ten standard-normal feature columns; they are drawn without reference to
# the labels, so they carry no signal about the target.
X = np.random.randn(n_samples, 10)

# Label vector: n_fraud ones followed by zeros, then shuffled in place so the
# fraud rows end up at random positions.
y = np.concatenate([np.ones(n_fraud), np.zeros(n_samples - n_fraud)])
np.random.shuffle(y)

# Stratified 80/20 hold-out split keeps the fraud rate equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Report split sizes and fraud rates, one line each.
print(
    f"Training set: {X_train.shape}",
    f"Test set: {X_test.shape}",
    f"Fraud rate in training: {y_train.mean():.3%}",
    f"Fraud rate in test: {y_test.mean():.3%}",
    sep="\n",
)

## 3. Handle Class Imbalance with SMOTE

In [None]:
from imblearn.over_sampling import SMOTE

# Class counts of the training split as it comes out of the hold-out split.
print(
    "Class distribution before SMOTE:",
    pd.Series(y_train).value_counts(),
    sep="\n",
)

# Resample the training split only; the test split is never passed to SMOTE.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Class counts after resampling (leading blank line separates the two reports).
print(
    "\nClass distribution after SMOTE:",
    pd.Series(y_train_resampled).value_counts(),
    sep="\n",
)

## 4. Train Baseline Model (Logistic Regression)

In [None]:
from sklearn.linear_model import LogisticRegression
from src.train import evaluate_model

# Baseline model: logistic regression fitted on the SMOTE-resampled training
# data. fit() returns the estimator itself, so construction and fitting can be
# chained into a single assignment.
lr_model = LogisticRegression(
    max_iter=1000,
    random_state=42,
    class_weight="balanced",
).fit(X_train_resampled, y_train_resampled)

# Score the baseline on the untouched hold-out split. evaluate_model is a
# project helper from src.train whose implementation is not visible here.
lr_results = evaluate_model(lr_model, X_test, y_test, "Logistic Regression")

## 5. Train Ensemble Model (XGBoost)

In [None]:
from xgboost import XGBClassifier

# Imbalance weight = negatives / positives, measured on the labels the model
# is actually fitted on. The model trains on the SMOTE-resampled set, so the
# ratio must come from y_train_resampled: taking it from the original y_train
# would re-weight positives a second time on data that SMOTE has already
# balanced. Computing it (rather than hard-coding 1.0) keeps the value correct
# if the resampling step is later changed or removed.
scale_pos_weight = float(
    (y_train_resampled == 0).sum() / (y_train_resampled == 1).sum()
)

# Gradient-boosted trees, monitored with area under the precision-recall curve.
# The former `use_label_encoder=False` argument is intentionally not passed:
# current XGBoost releases no longer recognise it.
xgb_model = XGBClassifier(
    n_estimators=200,
    max_depth=5,
    learning_rate=0.1,
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    eval_metric="aucpr",
)
xgb_model.fit(X_train_resampled, y_train_resampled)

# Score on the untouched hold-out split with the project helper.
xgb_results = evaluate_model(xgb_model, X_test, y_test, "XGBoost")

## 6. Cross-Validation

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.base import clone
from sklearn.model_selection import cross_val_score, StratifiedKFold

# Stratified 5-fold split so every fold keeps the original fraud rate.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# SMOTE runs INSIDE the pipeline, so it is fitted on the training part of each
# fold only. Cross-validating on data that was resampled up front would place
# synthetic points interpolated from validation samples into the training
# folds and would score on artificially balanced validation folds, both of
# which inflate the result.
xgb_cv_pipeline = Pipeline(
    steps=[
        ("smote", SMOTE(random_state=42)),
        # Unfitted copy with the same hyper-parameters as xgb_model.
        ("model", clone(xgb_model)),
    ]
)

# Evaluate on the original (imbalanced) training split; the imblearn pipeline
# applies the sampler during fit and skips it when scoring.
xgb_cv_scores = cross_val_score(
    xgb_cv_pipeline,
    X_train,
    y_train,
    cv=cv,
    scoring="average_precision",
)

print(f"XGBoost Cross-Validation AUC-PR Scores: {xgb_cv_scores}")
print(f"Mean: {xgb_cv_scores.mean():.4f}, Std: {xgb_cv_scores.std():.4f}")

## 7. Model Comparison and Selection

In [None]:
# Gather the hold-out metrics of both models into one table.
# Assumes evaluate_model returned mappings with numeric "auc_pr" and
# "f1_score" entries - TODO confirm against src.train.
comparison_df = pd.DataFrame({
    "Model": ["Logistic Regression", "XGBoost"],
    "AUC-PR": [lr_results["auc_pr"], xgb_results["auc_pr"]],
    "F1-Score": [lr_results["f1_score"], xgb_results["f1_score"]],
})

print("Model Performance Comparison:")
print(comparison_df)

# Visual comparison: one bar chart per metric, same layout for both panels.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
panels = [("AUC-PR", "AUC-PR Score"), ("F1-Score", "F1-Score")]
for ax, (metric, y_label) in zip(axes, panels):
    ax.bar(
        comparison_df["Model"],
        comparison_df[metric],
        color=["skyblue", "lightcoral"],
    )
    ax.set_title(f"{metric} Comparison")
    ax.set_ylabel(y_label)
    ax.tick_params(axis="x", rotation=45)

plt.tight_layout()
plt.show()

# Select the winner from the measured numbers instead of announcing a fixed
# model: rank by AUC-PR (the metric used for XGBoost training and for
# cross-validation in this notebook); on a tie the first row wins.
# NOTE: the save step further down always persists xgb_model - revisit it if
# the model reported here is ever a different one.
best_row = comparison_df.loc[comparison_df["AUC-PR"].idxmax()]
print(
    f"\n✅ Best Model Selected: {best_row['Model']} "
    f"(AUC-PR: {best_row['AUC-PR']:.4f}, F1-Score: {best_row['F1-Score']:.4f})"
)

## 8. Save Best Model

In [None]:
from pathlib import Path

import joblib

# Persist the fitted XGBoost model. The target directory is created first
# because joblib.dump does not create missing parent folders and would raise
# FileNotFoundError on a fresh checkout.
model_path = Path("models/xgb_fraud_model.pkl")
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(xgb_model, model_path)
print("Best model saved to models/xgb_fraud_model.pkl")