In [None]:
import pandas as pd
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import xgboost as xgb

# Column name -> pandas dtype, handed to pd.read_csv via its `dtype` argument.
# Amounts are stored as 32-bit floats, the label as an 8-bit integer, and the
# bank / currency / payment-format columns as categoricals.
dtype = dict(
    [
        ('Timestamp', 'str'),
        ('From Bank', 'category'),
        ('Account', 'str'),
        ('To Bank', 'category'),
        ('Amount Received', 'float32'),
        ('Receiving Currency', 'category'),
        ('Amount Paid', 'float32'),
        ('Payment Currency', 'category'),
        ('Payment Format', 'category'),
        ('Is Laundering', 'int8'),
    ]
)

# Load the data in chunks
# The CSV is parsed `chunk_size` rows at a time and the pieces are then
# concatenated into a single DataFrame, so the whole dataset still has to fit
# in memory once loading finishes.
# NOTE(review): pandas may fall back to object dtype when concatenating a
# categorical column whose categories differ between chunks -- confirm the
# 'category' dtypes survive if the memory savings matter.
print("Loading dataset in chunks...")
chunk_size = 100000  # Adjust chunk size to your system's memory
try:
    chunks = pd.read_csv('HI-Large_Trans.csv', dtype=dtype, chunksize=chunk_size)
    data = pd.concat(chunks, ignore_index=True)
except Exception as e:
    print(f"Error loading dataset: {e}")
    # Stop with a non-zero exit status so callers can detect the failure;
    # the bare `exit()` helper reports success (status 0).
    raise SystemExit(1) from e
else:
    print(f"Dataset loaded successfully. Shape: {data.shape}")

# Data Reduction: Downsample the majority class
# Rows labelled 0 are randomly reduced (without replacement) to the number of
# rows labelled 1, then both groups are combined and shuffled. Fixed
# random_state values keep the sampling and the shuffle reproducible.
print("\nReducing and balancing the dataset...")
try:
    majority_class = data[data['Is Laundering'] == 0]
    minority_class = data[data['Is Laundering'] == 1]
    print(f"Majority class size: {len(majority_class)}")
    print(f"Minority class size: {len(minority_class)}")

    # Downsample the majority class to the minority class size
    majority_downsampled = resample(
        majority_class,
        replace=False,
        n_samples=len(minority_class),
        random_state=42,
    )
    data_reduced = pd.concat([majority_downsampled, minority_class])
    # Shuffle so the two classes are interleaved, and renumber the index
    data_reduced = data_reduced.sample(frac=1, random_state=42).reset_index(drop=True)
    print(f"Balanced dataset shape: {data_reduced.shape}")
except Exception as e:
    print(f"Error during data reduction: {e}")
    # Stop with a non-zero exit status so callers can detect the failure;
    # the bare `exit()` helper reports success (status 0).
    raise SystemExit(1) from e

# Data Preprocessing: Extract datetime features
# Adds 'hour', 'day' (day of month) and 'month' columns derived from
# 'Timestamp', then drops the raw 'Timestamp' column.
print("\nExtracting datetime features...")
try:
    # Parse the timestamp strings once and reuse the result for every feature
    timestamps = pd.to_datetime(data_reduced['Timestamp'])
    data_reduced['hour'] = timestamps.dt.hour
    data_reduced['day'] = timestamps.dt.day
    data_reduced['month'] = timestamps.dt.month
    data_reduced = data_reduced.drop(['Timestamp'], axis=1)
    print(f"Dataset shape after feature extraction: {data_reduced.shape}")
except Exception as e:
    print(f"Error extracting datetime features: {e}")
    # Stop with a non-zero exit status so callers can detect the failure;
    # the bare `exit()` helper reports success (status 0).
    raise SystemExit(1) from e

# Encoding categorical columns
# One-hot encodes the bank, payment-format and currency columns. With
# drop_first=True the first level of each column is omitted from the output.
print("\nEncoding categorical columns...")
try:
    data_reduced = pd.get_dummies(
        data_reduced,
        columns=['From Bank', 'To Bank', 'Payment Format', 'Receiving Currency', 'Payment Currency'],
        drop_first=True,
    )
    print(f"Dataset shape after encoding: {data_reduced.shape}")
except Exception as e:
    print(f"Error during encoding: {e}")
    # Stop with a non-zero exit status so callers can detect the failure;
    # the bare `exit()` helper reports success (status 0).
    raise SystemExit(1) from e

# Split into features and target
# 'Is Laundering' is the target; every remaining numeric/boolean column is a
# feature. The split is 80/20, stratified on the target, with a fixed seed.
print("\nSplitting dataset into features (X) and target (y)...")
X = data_reduced.drop('Is Laundering', axis=1)
y = data_reduced['Is Laundering']

# Columns that are still text at this point (e.g. 'Account', which is loaded
# as 'str' and never encoded) cannot be consumed by XGBoost and would make
# model.fit fail, so they are removed from the feature matrix.
non_numeric_cols = X.select_dtypes(exclude=['number', 'bool']).columns.tolist()
if non_numeric_cols:
    print(f"Dropping non-numeric feature columns: {non_numeric_cols}")
    X = X.drop(columns=non_numeric_cols)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

# Train XGBoost
# Fits a binary logistic XGBoost classifier on the training split, using log
# loss as the evaluation metric.
print("\nTraining the XGBoost model...")
try:
    model = xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        use_label_encoder=False,
    )
    model.fit(X_train, y_train)
    print("Model training completed.")
except Exception as e:
    print(f"Error during model training: {e}")
    # Stop with a non-zero exit status so callers can detect the failure;
    # the bare `exit()` helper reports success (status 0).
    raise SystemExit(1) from e

# Evaluate the model
# Predicts labels for the held-out test split and prints scikit-learn's
# classification report comparing them with the true labels.
print("\nEvaluating the model...")
try:
    y_pred = model.predict(X_test)
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
except Exception as e:
    print(f"Error during evaluation: {e}")
    # Stop with a non-zero exit status so callers can detect the failure;
    # the bare `exit()` helper reports success (status 0).
    raise SystemExit(1) from e


Loading dataset in chunks...
