In [1]:
import pandas as pd
import numpy as np
import json
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import cohen_kappa_score
import gc
import warnings
warnings.filterwarnings('ignore')

# Configuration Class
class CFG:
    """Central place for the constants shared by the pipeline functions below."""

    # Name of the binary label column attached to the training rows.
    TARGET_COL: str = 'fraud'

    # Random seed passed to the cross-validation splitter and to LightGBM.
    SEED: int = 42

    # Number of stratified cross-validation folds.
    N_SPLITS: int = 5

    # Directory prefix (with trailing slash) that file names are appended to.
    DATA_PATH: str = "/kaggle/input/nwu-data/NWU_CSE_FEST_2025_DATATHON_COMPETITION/"

# Provided function to load the tricky JSON file
def robust_load_json_with_clean(file_path):
    """Read a UTF-8 JSON file, drop every newline character, and parse the result.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.loads`` on the newline-free text.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist (re-raised after logging).
        json.JSONDecodeError: If the text is still invalid JSON after the
            newlines are removed (re-raised after logging).
    """
    print(f"Attempting to load and clean: {file_path}")

    try:
        handle = open(file_path, 'r', encoding='utf-8')
    except FileNotFoundError:
        print(f"FATAL ERROR: File not found at {file_path}")
        raise
    with handle:
        text = handle.read()

    # Raw line breaks are what trips the standard parser on the labels file,
    # so they are stripped before decoding.
    flattened = ''.join(text.split('\n'))

    try:
        parsed = json.loads(flattened)
    except json.JSONDecodeError as err:
        print(f"\nFATAL JSON DECODE ERROR after cleaning: {err}")
        raise
    else:
        print("JSON loaded successfully after cleaning.")
        return parsed

def reduce_mem_usage(df, verbose=True, use_float16=True):
    """Downcast numeric columns to the smallest dtype that covers their value range.

    Columns are replaced on ``df`` itself and ``df`` is also returned. Only plain
    NumPy signed-integer, unsigned-integer and floating columns are touched;
    booleans, objects, categories, datetimes, timedeltas and pandas extension
    dtypes are left exactly as they are. A column is never widened.

    Args:
        df: DataFrame whose numeric columns should be downcast.
        verbose: When true, print memory usage before and after.
        use_float16: When true (the default), floating columns may be stored as
            ``float16``. Half precision keeps only about three significant
            decimal digits, so pass ``False`` to stop at ``float32``.

    Returns:
        The same DataFrame object, with downcast columns.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate dtypes per NumPy kind code, ordered from narrowest to widest.
    float_candidates = (np.float16, np.float32, np.float64) if use_float16 else (np.float32, np.float64)
    candidates_by_kind = {
        'i': (np.iinfo, (np.int8, np.int16, np.int32, np.int64)),
        'u': (np.iinfo, (np.uint8, np.uint16, np.uint32, np.uint64)),
        'f': (np.finfo, float_candidates),
    }

    for col in df.columns:
        col_type = df[col].dtype
        # Extension dtypes (category, nullable Int64, tz-aware datetime, ...) are
        # not NumPy dtypes; leave them alone rather than guess a conversion.
        if not isinstance(col_type, np.dtype):
            continue
        # Dispatch on the dtype kind instead of the dtype's string prefix, so
        # that bool ('b'), unsigned ('u') and timedelta ('m') columns are not
        # mistaken for floats.
        if col_type.kind not in candidates_by_kind:
            continue
        limits_of, candidates = candidates_by_kind[col_type.kind]

        c_min = df[col].min()
        c_max = df[col].max()
        # Empty or all-NaN columns have no range to fit; keep their dtype.
        if pd.isna(c_min) or pd.isna(c_max):
            continue

        for candidate in candidates:
            # Candidates are ordered by size: once we reach the current width
            # there is nothing smaller left to try.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            limits = limits_of(candidate)
            # Inclusive bounds: a value equal to the type's min/max still fits.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Decreased by {:.1f}%'.format(saved_pct))

    return df

def _client_mode(frame, column):
    """Return, per ``client_id``, the most frequent value of *column*.

    Ties resolve to the first entry of ``Series.mode()``; a client whose values
    are all missing gets the placeholder ``'MISSING'``.
    """
    def _first_mode(values):
        modes = values.mode()  # evaluated once per client
        return modes[0] if not modes.empty else 'MISSING'

    return frame.groupby('client_id')[column].agg(_first_mode)


def feature_engineering(df):
    """Create model features from raw transaction rows.

    The frame must provide the columns ``transaction_id``, ``client_id``,
    ``date``, ``amount``, ``use_chip``, ``merchant_id``, ``merchant_city``,
    ``merchant_state``, ``zip``, ``mcc`` and ``errors``.

    Features that depend on each client's chronological order are computed on a
    copy sorted by ``(client_id, date)`` and written back by index alignment, so
    the result always has exactly one row per input row, in the input order,
    even when ``transaction_id`` contains duplicates or missing values.

    Args:
        df: Raw transactions (train and test may be concatenated).

    Returns:
        A DataFrame with a fresh ``0..n-1`` index holding the original columns
        (``amount`` parsed to float, missing location/error values replaced by
        ``'MISSING'``) plus the engineered features; ``date`` and other
        intermediate helper columns are dropped.
    """
    print("Starting feature engineering...")

    # A unique positional index is what lets the sorted-order features below be
    # aligned back to their rows without joining on transaction_id.
    df = df.reset_index(drop=True)

    # 1. Date/Time Features
    df['date'] = pd.to_datetime(df['date'])
    df['dt_year'] = df['date'].dt.year
    df['dt_month'] = df['date'].dt.month
    df['dt_day'] = df['date'].dt.day
    df['dt_hour'] = df['date'].dt.hour
    df['dt_minute'] = df['date'].dt.minute
    df['dt_dayofweek'] = df['date'].dt.dayofweek
    df['dt_dayofyear'] = df['date'].dt.dayofyear
    df['is_weekend'] = (df['dt_dayofweek'] >= 5).astype(int)

    # 2. Amount Features
    # Raw string for the regex: '\$' in a normal literal is an invalid escape.
    df['amount'] = df['amount'].replace({r'\$': ''}, regex=True).astype(float)
    df['is_refund'] = (df['amount'] < 0).astype(int)
    df['log_amount'] = np.log1p(df['amount'].abs())

    # 3. Categorical Features Preprocessing
    # Fill NaNs before creating aggregations or label encoding
    for col in ['merchant_city', 'merchant_state', 'zip', 'errors']:
        df[col] = df[col].fillna('MISSING')

    # Additional features
    df['has_error'] = (df['errors'] != 'MISSING').astype(int)
    df['is_online'] = (df['merchant_city'] == 'ONLINE').astype(int)

    # 4. Aggregation Features (Client-based)
    # These features describe the typical behavior of a client
    df['client_avg_amount'] = df.groupby('client_id')['amount'].transform('mean')
    df['client_std_amount'] = df.groupby('client_id')['amount'].transform('std')
    df['client_median_amount'] = df.groupby('client_id')['amount'].transform('median')
    df['client_transaction_count'] = df.groupby('client_id')['transaction_id'].transform('count')
    df['client_distinct_merchant_count'] = df.groupby('client_id')['merchant_id'].transform('nunique')
    df['client_unique_cities'] = df.groupby('client_id')['merchant_city'].transform('nunique')
    df['client_unique_states'] = df.groupby('client_id')['merchant_state'].transform('nunique')

    # 5. Interaction Features
    # How does this transaction's amount compare to the client's typical amount?
    # The 1e-6 keeps the denominator away from an exact zero.
    df['amount_to_avg_ratio'] = df['amount'] / (df['client_avg_amount'] + 1e-6)
    df['amount_to_median_ratio'] = df['amount'] / (df['client_median_amount'] + 1e-6)

    # 6. Time Delta Features
    # Time since the client's first transaction
    df['client_first_transaction'] = df.groupby('client_id')['date'].transform('min')
    df['time_since_first_transaction'] = (df['date'] - df['client_first_transaction']).dt.total_seconds()

    # Chronological view per client. sort_values keeps the original index
    # labels, which is what the index-aligned assignments below rely on.
    df_sorted = df.sort_values(['client_id', 'date'])
    df_sorted['time_since_last_transaction'] = df_sorted.groupby('client_id')['date'].diff().dt.total_seconds()
    df_sorted['transaction_order'] = df_sorted.groupby('client_id').cumcount() + 1

    # Assigning a Series aligns on the index, so each value lands on its own row
    # and the row count cannot change (a merge on transaction_id would multiply
    # rows whenever an id repeats).
    # A client's first transaction has no predecessor -> gap of 0 seconds.
    df['time_since_last_transaction'] = df_sorted['time_since_last_transaction'].fillna(0)
    df['transaction_order'] = df_sorted['transaction_order']

    # Client total span and avg interval
    df['client_total_span'] = df.groupby('client_id')['time_since_first_transaction'].transform('max')
    df['client_avg_interval'] = df['client_total_span'] / (df['client_transaction_count'] - 1 + 1e-6)
    df['interval_deviation'] = df['time_since_last_transaction'] / (df['client_avg_interval'] + 1e-6)

    # Position of this transaction within the client's history, in (0, 1]
    df['transaction_fraction'] = df['transaction_order'] / df['client_transaction_count']

    # 7. Merchant-based features
    df['merchant_popularity'] = df.groupby('merchant_id')['client_id'].transform('nunique')
    df['merchant_avg_amount'] = df.groupby('merchant_id')['amount'].transform('mean')
    df['amount_to_merchant_avg'] = df['amount'] / (df['merchant_avg_amount'] + 1e-6)

    # 8. Additional Shift and Novelty Features on the sorted frame
    # Novelty: 1 when this row carries the earliest date the client has with
    # that merchant / city / state / mcc.
    df_sorted['is_new_merchant'] = (df_sorted.groupby(['client_id', 'merchant_id'])['date'].transform('min') == df_sorted['date']).astype(int)
    df_sorted['is_new_city'] = (df_sorted.groupby(['client_id', 'merchant_city'])['date'].transform('min') == df_sorted['date']).astype(int)
    df_sorted['is_new_state'] = (df_sorted.groupby(['client_id', 'merchant_state'])['date'].transform('min') == df_sorted['date']).astype(int)
    df_sorted['is_new_mcc'] = (df_sorted.groupby(['client_id', 'mcc'])['date'].transform('min') == df_sorted['date']).astype(int)

    # Shift features (comparison to the client's previous transaction)
    df_sorted['prev_merchant'] = df_sorted.groupby('client_id')['merchant_id'].shift(1)
    df_sorted['is_same_merchant'] = (df_sorted['merchant_id'] == df_sorted['prev_merchant']).astype(int)

    df_sorted['prev_city'] = df_sorted.groupby('client_id')['merchant_city'].shift(1)
    df_sorted['is_same_city'] = (df_sorted['merchant_city'] == df_sorted['prev_city']).astype(int)

    df_sorted['prev_use_chip'] = df_sorted.groupby('client_id')['use_chip'].shift(1)
    df_sorted['is_same_use_chip'] = (df_sorted['use_chip'] == df_sorted['prev_use_chip']).astype(int)

    df_sorted['prev_amount'] = df_sorted.groupby('client_id')['amount'].shift(1).fillna(0)
    df_sorted['amount_ratio_prev'] = df_sorted['amount'] / (df_sorted['prev_amount'] + 1e-6)

    # Running average of the client's earlier amounts (current row excluded)
    df_sorted['cum_amount'] = df_sorted.groupby('client_id')['amount'].cumsum()
    df_sorted['cum_count'] = df_sorted.groupby('client_id')['transaction_id'].cumcount() + 1
    df_sorted['running_sum_prev'] = df_sorted['cum_amount'] - df_sorted['amount']
    df_sorted['running_count_prev'] = df_sorted['cum_count'] - 1
    df_sorted['running_avg_prev'] = np.where(df_sorted['running_count_prev'] > 0, df_sorted['running_sum_prev'] / df_sorted['running_count_prev'], 0)
    df_sorted['amount_to_running_avg'] = df_sorted['amount'] / (df_sorted['running_avg_prev'] + 1e-6)

    # Bring the sorted-order features back onto the original rows (index-aligned)
    new_cols = [
        'is_new_merchant', 'is_new_city', 'is_new_state', 'is_new_mcc',
        'is_same_merchant', 'is_same_city', 'is_same_use_chip',
        'amount_ratio_prev', 'running_avg_prev', 'amount_to_running_avg'
    ]
    for col in new_cols:
        df[col] = df_sorted[col]

    # 9. Mode-based features (usual city, state, use_chip)
    df['client_mode_city'] = df['client_id'].map(_client_mode(df_sorted, 'merchant_city'))
    df['is_mode_city'] = (df['merchant_city'] == df['client_mode_city']).astype(int)

    df['client_mode_state'] = df['client_id'].map(_client_mode(df_sorted, 'merchant_state'))
    df['is_mode_state'] = (df['merchant_state'] == df['client_mode_state']).astype(int)

    df['client_mode_use_chip'] = df['client_id'].map(_client_mode(df_sorted, 'use_chip'))
    df['is_mode_use_chip'] = (df['use_chip'] == df['client_mode_use_chip']).astype(int)

    # Drop intermediate columns
    df = df.drop(['date', 'client_first_transaction', 'client_mode_city', 'client_mode_state', 'client_mode_use_chip'], axis=1, errors='ignore')

    print("Feature engineering complete.")
    return df

def run_training():
    """Run the full pipeline: load data, build features, train, threshold, and submit.

    Steps:
        1. Read the train/test CSVs and the JSON labels from ``CFG.DATA_PATH`` and
           attach a 0/1 target to the labelled training rows.
        2. Build features on train and test together via ``feature_engineering``.
        3. Cast the categorical columns to the pandas ``category`` dtype.
        4. Split back into train/test by position and shrink dtypes.
        5. Train one LightGBM classifier per stratified fold, collecting
           out-of-fold and averaged test probabilities.
        6. Pick the probability threshold that maximises Cohen's kappa on the
           out-of-fold predictions.
        7. Write ``submission.csv`` with ``Yes``/``No`` predictions.

    Raises:
        ValueError: If feature engineering changed the number of rows (the
            positional train/test split would then be wrong), or if the
            training labels contain no positive example.
    """

    # --- 1. Load Data ---
    print("Loading data...")
    train_df = pd.read_csv(CFG.DATA_PATH + 'train_transactions_data.csv')
    test_df = pd.read_csv(CFG.DATA_PATH + 'test_transactions_data.csv')
    labels_json = robust_load_json_with_clean(CFG.DATA_PATH + 'train_fraud_labels.json')
    labels_df = pd.DataFrame(labels_json['target'].items(), columns=['transaction_id', CFG.TARGET_COL])
    # JSON keys are strings; cast so they can be joined with the CSV ids
    labels_df['transaction_id'] = labels_df['transaction_id'].astype(float)
    train_df = pd.merge(train_df, labels_df, on='transaction_id', how='left')

    # Map target to 0/1
    train_df[CFG.TARGET_COL] = train_df[CFG.TARGET_COL].map({'Yes': 1, 'No': 0})

    # Rows without a (recognised) label cannot be used for supervised training
    train_df.dropna(subset=[CFG.TARGET_COL], inplace=True)
    train_df[CFG.TARGET_COL] = train_df[CFG.TARGET_COL].astype(np.int8)

    # Store test transaction_ids for submission
    test_ids = test_df['transaction_id']

    # Row counts that the positional split in step 4 depends on
    n_train = len(train_df)
    n_test = len(test_df)

    # --- 2. Feature Engineering ---
    # Combine train and test for consistent feature creation
    combined_df = pd.concat([train_df.drop(CFG.TARGET_COL, axis=1), test_df], ignore_index=True)
    combined_df = feature_engineering(combined_df)

    # Fail fast: if the feature step added or dropped rows, slicing by position
    # would silently pair labels and ids with the wrong transactions.
    if len(combined_df) != n_train + n_test:
        raise ValueError(
            f"feature_engineering returned {len(combined_df)} rows, "
            f"expected {n_train + n_test}; cannot split train/test by position."
        )

    # --- 3. Set Categorical Features ---
    print("Setting categorical features...")
    categorical_cols = ['use_chip', 'merchant_city', 'merchant_state', 'zip', 'mcc', 'errors']
    for col in categorical_cols:
        combined_df[col] = combined_df[col].astype('category')

    # --- 4. Final Data Preparation ---
    train_df_fe = combined_df.iloc[:n_train]
    test_df_fe = combined_df.iloc[n_train:]

    # Add target back to the training set
    train_df_fe[CFG.TARGET_COL] = train_df[CFG.TARGET_COL].values

    # Reduce memory usage
    train_df_fe = reduce_mem_usage(train_df_fe)
    test_df_fe = reduce_mem_usage(test_df_fe)

    # Identify feature columns
    features = [col for col in test_df_fe.columns if col not in ['transaction_id', CFG.TARGET_COL]]
    X = train_df_fe[features]
    y = train_df_fe[CFG.TARGET_COL]
    X_test = test_df_fe[features]

    print(f"Training with {len(features)} features on {len(X)} samples.")

    del train_df, test_df, labels_df, combined_df, train_df_fe, test_df_fe
    gc.collect()

    # --- 5. Model Training (LGBM with Stratified K-Fold) ---
    skf = StratifiedKFold(n_splits=CFG.N_SPLITS, shuffle=True, random_state=CFG.SEED)
    oof_preds = np.zeros(len(X))
    test_preds = np.zeros(len(X_test))

    # Calculate scale_pos_weight for imbalanced classes
    n_positive = int((y == 1).sum())
    if n_positive == 0:
        raise ValueError("No positive labels in the training data; cannot train a fraud classifier.")
    scale_pos_weight = (y == 0).sum() / n_positive
    print(f"Scale Pos Weight: {scale_pos_weight:.2f}")

    lgb_params = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'n_estimators': 10000,  # upper bound; early stopping decides the actual count
        'learning_rate': 0.02,
        'num_leaves': 128,
        'max_depth': -1,
        'seed': CFG.SEED,
        'n_jobs': -1,
        'verbose': -1,
        'colsample_bytree': 0.6,
        'subsample': 0.8,
        'reg_alpha': 0.5,
        'reg_lambda': 1.0,
        'min_child_samples': 50,
        'min_child_weight': 0.01,
        'scale_pos_weight': scale_pos_weight,  # Crucial for imbalanced data
    }

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        print(f"========== Fold {fold+1} ==========")
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        model = lgb.LGBMClassifier(**lgb_params)
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            eval_metric='auc',
            callbacks=[lgb.early_stopping(100, verbose=False)],
            categorical_feature=categorical_cols
        )

        val_preds = model.predict_proba(X_val)[:, 1]
        oof_preds[val_idx] = val_preds

        # Ensemble predictions for the test set (mean over the folds)
        test_preds += model.predict_proba(X_test)[:, 1] / CFG.N_SPLITS

        del X_train, y_train, X_val, y_val, model
        gc.collect()

    # --- 6. Find Optimal Threshold for Cohen's Kappa ---
    print("Finding optimal threshold for Cohen's Kappa...")
    thresholds = np.linspace(0.01, 0.99, 100)
    kappa_scores = [cohen_kappa_score(y, (oof_preds > t).astype(int)) for t in thresholds]
    best_threshold = thresholds[np.argmax(kappa_scores)]
    best_kappa = max(kappa_scores)
    print(f"Overall OOF Cohen's Kappa: {best_kappa:.5f} at threshold {best_threshold:.3f}")

    # --- 7. Create Submission File ---
    print("Creating submission file...")
    final_preds = (test_preds > best_threshold).astype(int)
    submission = pd.DataFrame({
        'transaction_id': test_ids,
        CFG.TARGET_COL: final_preds
    })
    submission[CFG.TARGET_COL] = submission[CFG.TARGET_COL].map({1: 'Yes', 0: 'No'})
    submission.to_csv('submission.csv', index=False)
    print("Submission file 'submission.csv' created successfully!")
    print(submission.head())
    print(f"Predicted fraud distribution:\n{submission[CFG.TARGET_COL].value_counts(normalize=True)}")

# Execute the pipeline only when this file is run as a script (or notebook
# cell), not when it is imported as a module.
if __name__ == "__main__":
    run_training()

Loading data...
Attempting to load and clean: /kaggle/input/nwu-data/NWU_CSE_FEST_2025_DATATHON_COMPETITION/train_fraud_labels.json
JSON loaded successfully after cleaning.
Starting feature engineering...
Feature engineering complete.
Setting categorical features...
Memory usage of dataframe is 2226.87 MB
Memory usage after optimization is: 631.90 MB
Decreased by 71.6%
Memory usage of dataframe is 952.42 MB
Memory usage after optimization is: 268.86 MB
Decreased by 71.8%
Training with 54 features on 6240474 samples.
Scale Pos Weight: 667.72
Finding optimal threshold for Cohen's Kappa...
Overall OOF Cohen's Kappa: 0.92955 at threshold 0.693
Creating submission file...
Submission file 'submission.csv' created successfully!
   transaction_id fraud
0       8677815.0    No
1      18228653.0    No
2      11775845.0    No
3      11156207.0    No
4      15615886.0    No
Predicted fraud distribution:
fraud
No     0.998618
Yes    0.001382
Name: proportion, dtype: float64
