In [1]:
import pandas as pd
import numpy as np
import gc

# 1. Load the two data layers (assuming they are unzipped in the current directory)
print("Loading transaction and identity data...")
train_transaction = pd.read_csv('train_transaction.csv')
train_identity = pd.read_csv('train_identity.csv')

# 2. Merge the two files on 'TransactionID' (the core multilayered step)
# Use a left merge to keep all transaction records.
print("Merging data layers...")
df_train = pd.merge(train_transaction, train_identity, on='TransactionID', how='left')

# 3. Clean up memory for the next steps
del train_transaction, train_identity
gc.collect()

print(f"Merged Data Shape: {df_train.shape}")
print("Data loading and merging complete. Ready for preprocessing.")

Loading transaction and identity data...
Merging data layers...
Merged Data Shape: (590540, 434)
Data loading and merging complete. Ready for preprocessing.


In [8]:
# Define features (X) and target (y)
y = df_train['isFraud']
X = df_train.drop(['TransactionID', 'isFraud'], axis=1)

# Separate numerical and categorical columns
# Note: XGBoost handles numerical missing values (NaN) efficiently by default
categorical_cols = X.select_dtypes(include='object').columns

# Split data (CRITICAL: Use stratify=y to ensure fraud cases are split correctly)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42, 
    stratify=y 
)

# Convert categorical strings to integers (Label Encoding) for XGBoost efficiency
from sklearn.preprocessing import LabelEncoder
for col in categorical_cols:
    le = LabelEncoder()
    # Fit on concatenated, safely-typed data to ensure consistency across splits
    # Fill missing/NaN values with 'missing' string to handle them explicitly
    combined_data = pd.concat([X_train[col], X_test[col]]).astype(str).fillna('missing')
    le.fit(combined_data)
    
    X_train[col] = le.transform(X_train[col].astype(str).fillna('missing'))
    X_test[col] = le.transform(X_test[col].astype(str).fillna('missing'))

print("Data preprocessing complete. Ready for GPU training.")

Data preprocessing complete. Ready for GPU training.


In [9]:
import xgboost as xgb
from sklearn.metrics import classification_report, accuracy_score, recall_score, confusion_matrix

# 1. Calculate the weight ratio for the imbalanced data
ratio = float(np.sum(y_train == 0)) / np.sum(y_train == 1) 
print(f"Calculated scale_pos_weight: {ratio:.0f}")

# 2. Instantiate the Model (GPU Optimized - Modern Syntax)
clf = xgb.XGBClassifier(
    n_estimators=1000, 
    max_depth=9, 
    learning_rate=0.05,
    scale_pos_weight=ratio, 
    
    # --- GPU ACCELERATION (Updated Professional Syntax) ---
    tree_method='hist', 
    device='cuda', # Specifies the GPU (CUDA) device
    
    random_state=42, 
    n_jobs=-1,
    enable_categorical=True # Use this flag as we encoded categorical data
)

# 3. Train the Model
print("Starting XGBoost training (GPU-accelerated)...")
clf.fit(X_train, y_train)
print("Training complete.")

# 4. Make Predictions
y_pred = clf.predict(X_test)


# --- 5. Final Evaluation and Documentation ---
print("\n--- Model Performance Metrics ---")
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Overall Accuracy: {accuracy:.4f}")
print(f"Recall (Minority Class): {recall:.4f}")

# Print the detailed Classification Report
print("\n--- Classification Report ---")
print(classification_report(y_test, y_pred))

# Print the Confusion Matrix
print("\n--- Confusion Matrix ---")
print(conf_matrix)

Calculated scale_pos_weight: 28
Starting XGBoost training (GPU-accelerated)...
Training complete.

--- Model Performance Metrics ---
Overall Accuracy: 0.9742
Recall (Minority Class): 0.8043

--- Classification Report ---
              precision    recall  f1-score   support

           0       0.99      0.98      0.99    113975
           1       0.60      0.80      0.69      4133

    accuracy                           0.97    118108
   macro avg       0.79      0.89      0.84    118108
weighted avg       0.98      0.97      0.98    118108


--- Confusion Matrix ---
[[111731   2244]
 [   809   3324]]
