Dataset Loading

### Import Modules and Verification

In [1]:
import torch
print("PyTorch - CUDA available:", torch.cuda.is_available())

import xgboost as xgb

# Create a simple DMatrix
dtrain = xgb.DMatrix(data=[[1, 2], [3, 4], [5, 6]], label=[0, 1, 0])

# Train a small model with GPU support
params = {
 'tree_method': 'hist',  # Use the updated method
        'device': 'cuda',       # Specify GPU usage
}

# Train the model
bst = xgb.train(params, dtrain, num_boost_round=10)

# If no errors occurred, XGBoost is using the GPU
print("XGBoost is using the GPU.")



# CatBoost check
from catboost import CatBoostClassifier
# Initialize CatBoost model with GPU usage
model = CatBoostClassifier(task_type='GPU', iterations=10)

# Train on dummy data
model.fit([[0, 1], [1, 0], [0, 0], [1, 1]], [0, 1, 0, 1])

print("CatBoost successfully used the GPU.")

# FastAI check
from fastai.tabular.all import *

# Create a simple DataFrame
df = pd.DataFrame({'a': [1, 2, 3, 4, 5], 'b': [1, 0, 1, 0, 1], 'y': [0, 1, 0, 1, 0]})

# Split the data
dls = TabularDataLoaders.from_df(df, y_names='y', cat_names=['a'], cont_names=['b'], procs=[Categorify, Normalize])

# Initialize a simple model
learn = tabular_learner(dls, metrics=accuracy)

# Check if FastAI is using GPU
if torch.cuda.is_available():
    learn.to_fp16()
    print("FastAI is using the GPU.")
else:
    print("FastAI is not using the GPU.")




PyTorch - CUDA available: True
XGBoost is using the GPU.
Learning rate set to 0.5
0:	learn: 0.6242674	total: 6.2ms	remaining: 55.8ms
1:	learn: 0.5917165	total: 6.79ms	remaining: 27.2ms
2:	learn: 0.5615844	total: 7.22ms	remaining: 16.9ms
3:	learn: 0.5336707	total: 8.19ms	remaining: 12.3ms
4:	learn: 0.4862918	total: 8.57ms	remaining: 8.57ms
5:	learn: 0.4450405	total: 8.93ms	remaining: 5.96ms
6:	learn: 0.4089812	total: 9.42ms	remaining: 4.04ms
7:	learn: 0.3773298	total: 9.78ms	remaining: 2.45ms
8:	learn: 0.3494304	total: 10.1ms	remaining: 1.13ms
9:	learn: 0.3360929	total: 10.6ms	remaining: 0us
CatBoost successfully used the GPU.
FastAI is using the GPU.


In [2]:
# LightGBM check
import lightgbm as lgb
import numpy as np

# Create a simple dataset
data = np.array([[1, 2], [3, 4], [5, 6]])  # Convert to NumPy array
label = np.array([0, 1, 0])  # Convert to NumPy array
train_data = lgb.Dataset(data, label=label)

# Define parameters for LightGBM with GPU usage
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',

}

# Train the LightGBM model
gbm = lgb.train(params, train_data, num_boost_round=10)

# If no errors occurred, LightGBM is using the GPU
print("LightGBM is using the GPU.")

[LightGBM] [Info] Number of positive: 1, number of negative: 2
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 3, number of used features: 0
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.333333 -> initscore=-0.693147
[LightGBM] [Info] Start training from score -0.693147
LightGBM is using the GPU.


In [3]:

import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from autogluon.tabular import TabularPredictor, TabularDataset

## Evaluate the Model & save the preprocessor
import joblib
# Define the root directory
root_dir = '/workspace/Dataset/FSI/'

# Load the datasets
# train_df = pd.read_csv(root_dir + 'train_sample.csv')
# train_generated_df = pd.read_csv(root_dir + 'submission/filtered_data_3000.csv')
# test_df = pd.read_csv(root_dir + 'test.csv')


train_df = pd.read_csv(root_dir + 'final_data.csv')
train_generated_df = pd.read_csv(root_dir + 'final_data.csv')
test_df = pd.read_csv(root_dir + 'test.csv')

# Concatenate the training datasets
# combined_train_df = pd.concat([train_df, train_generated_df], ignore_index=True)

combined_train_df = train_generated_df

# Drop the 'ID' column from training data (but not 'Fraud_Type')
combined_train_df = combined_train_df.drop(columns=['ID'], errors='ignore')
print("combined_train_df after dropping ID:")
display(combined_train_df.head())  # Displays the first few rows after dropping 'ID'

test_df = test_df.drop(columns=['Fraud_Type'],errors='ignore')
print("test_df after dropping ID:")
display(test_df.head())  # Displays the first few rows after dropping 'ID'

# Check if val.csv exists; if not, split the combined_train_df
val_file_path = root_dir + 'train.csv'
if os.path.exists(val_file_path):
    val_df = pd.read_csv(val_file_path)
    val_df = val_df.drop(columns=['ID'], errors='ignore')  # Only drop 'ID', keep 'Fraud_Type' as the label
    print("val_df (loaded from train.csv):")
    display(val_df.head())  # Displays the first few rows of val_df
else:
    combined_train_df, val_df = train_test_split(combined_train_df, test_size=0.2, random_state=42)
    print("combined_train_df after splitting:")
    display(combined_train_df.head())  # Displays the first few rows of combined_train_df after splitting
    print("val_df (created by splitting):")
    display(val_df.head())  # Displays the first few rows of val_df created by splitting


  from .autonotebook import tqdm as notebook_tqdm
  train_df = pd.read_csv(root_dir + 'final_data.csv')
  train_generated_df = pd.read_csv(root_dir + 'final_data.csv')


combined_train_df after dropping ID:


Unnamed: 0,Customer_Birthyear,Customer_Gender,Customer_personal_identifier,Customer_identification_number,Customer_registration_datetime,Customer_credit_rating,Customer_flag_change_of_authentication_1,Customer_flag_change_of_authentication_2,Customer_flag_change_of_authentication_3,Customer_flag_change_of_authentication_4,...,Last_atm_transaction_datetime,Last_bank_branch_transaction_datetime,Flag_deposit_more_than_tenMillion,Unused_account_status,Recipient_account_suspend_status,Number_of_transaction_with_the_account,Transaction_history_with_the_account,First_time_iOS_by_vulnerable_user,Fraud_Type,Transaction_resumed_date
0,1950.0,female,조현준,uCPmTt-UEKUIYN,2008-01-30 03:49:41,A,0,1,1,0,...,2009-01-20 23:10:39,2039-02-24 08:23:20,1,0,0,0,4,0,a,2003-04-12 14:53:58
1,1995.0,female,안수진,QedXNL-ZYiNKYB,2008-11-16 14:28:15,C,1,1,1,1,...,2019-09-27 04:31:25,2037-01-18 14:26:51,0,0,1,1,2,0,a,2024-09-16 15:58:27
2,2004.0,female,류민준,qoGhFc-HUXuEjd,2010-05-11 06:53:06,B,1,0,0,1,...,2011-05-05 14:37:19,2037-09-23 18:50:20,1,1,1,1,0,0,a,2016-01-02 06:00:10
3,1964.0,female,우은주,TLIaDc-KXecOaP,2009-07-15 21:50:12,B,1,0,1,1,...,2024-11-21 06:07:23,2043-04-01 17:34:12,1,1,0,0,0,0,a,2003-11-28 06:14:09
4,1968.0,male,이성진,DOcbBI-GDIdPah,2008-06-09 20:14:26,E,1,0,1,1,...,2025-09-27 22:17:47,2018-07-12 12:39:16,0,0,0,1,1,0,a,2003-04-12 14:53:58


test_df after dropping ID:


Unnamed: 0,ID,Customer_Birthyear,Customer_Gender,Customer_personal_identifier,Customer_identification_number,Customer_registration_datetime,Customer_credit_rating,Customer_flag_change_of_authentication_1,Customer_flag_change_of_authentication_2,Customer_flag_change_of_authentication_3,...,Unused_terminal_status,Last_atm_transaction_datetime,Last_bank_branch_transaction_datetime,Flag_deposit_more_than_tenMillion,Unused_account_status,Recipient_account_suspend_status,Number_of_transaction_with_the_account,Transaction_history_with_the_account,First_time_iOS_by_vulnerable_user,Transaction_resumed_date
0,TEST_000000,1960,female,주지아,DOMcBN-kRMFflJ,2003-01-07 10:59:08,E,1,0,0,...,1,2003-01-10 05:27:56,2003-01-08 05:27:56,0,1,1,0,0,0,2003-01-08 05:27:56
1,TEST_000001,1960,female,주지아,DOMcBN-kRMFflJ,2003-01-07 10:59:08,E,1,1,1,...,0,2003-01-11 21:29:50,2003-01-08 05:27:56,0,1,0,0,0,0,2003-01-08 05:27:56
2,TEST_000002,1951,male,김정수,pZrAvI-mhxfVyw,2003-01-06 18:10:55,B,1,1,1,...,0,2003-01-13 01:08:19,2003-01-13 01:08:19,1,0,0,2,2,0,2003-01-13 01:08:19
3,TEST_000003,1999,female,김현지,fVlbzX-wvugTpH,2003-01-08 05:28:53,B,0,1,1,...,1,2003-01-21 10:03:32,2003-01-26 13:49:24,0,1,1,0,0,0,2003-01-20 10:03:32
4,TEST_000004,1996,female,박은정,chYftA-AjVuXMW,2003-01-17 03:37:22,A,0,1,0,...,1,2003-01-28 19:04:19,2003-01-28 19:04:19,0,1,1,0,0,0,2003-01-28 19:04:19


val_df (loaded from train.csv):


Unnamed: 0,Customer_Birthyear,Customer_Gender,Customer_personal_identifier,Customer_identification_number,Customer_registration_datetime,Customer_credit_rating,Customer_flag_change_of_authentication_1,Customer_flag_change_of_authentication_2,Customer_flag_change_of_authentication_3,Customer_flag_change_of_authentication_4,...,Last_atm_transaction_datetime,Last_bank_branch_transaction_datetime,Flag_deposit_more_than_tenMillion,Unused_account_status,Recipient_account_suspend_status,Number_of_transaction_with_the_account,Transaction_history_with_the_account,First_time_iOS_by_vulnerable_user,Fraud_Type,Transaction_resumed_date
0,1980,male,이상호,BJWQxd-WBASPLJ,2003-01-06 18:38:01,B,0,1,0,1,...,2003-01-22 23:38:48,2003-01-22 23:38:48,1,1,1,0,0,0,m,2003-01-22 23:38:48
1,1964,male,박상철,kurCwX-odPUXEt,2003-01-07 16:40:44,C,0,1,0,0,...,2003-01-21 21:29:08,2003-01-31 00:19:46,0,1,0,0,0,0,m,2003-01-19 21:29:08
2,1982,female,조옥자,OiERQa-CTXBoaX,2003-01-11 14:08:36,B,1,1,0,0,...,2003-01-31 07:13:28,2003-01-31 07:13:28,0,0,1,1,1,0,m,2003-01-31 07:13:28
3,1982,female,조옥자,OiERQa-CTXBoaX,2003-01-11 14:08:36,B,1,1,1,0,...,2003-01-31 11:49:56,2003-01-31 07:13:28,1,1,0,0,0,0,m,2003-01-31 07:13:28
4,1982,female,조옥자,OiERQa-CTXBoaX,2003-01-11 14:08:36,B,1,1,1,0,...,2003-01-31 11:49:56,2003-01-31 07:13:28,1,0,0,1,1,0,m,2003-01-31 07:13:28


## Train the Model

We now specify the `Fraud_Type` as the target column and use AutoGluon's `TabularPredictor` to train the model with the `best_quality` preset.

In [4]:
# Specify the target column for classification
label = 'Fraud_Type'


# Initialize the TabularPredictor
predictor = TabularPredictor(label=label, eval_metric='accuracy' ).fit(
    train_data=TabularDataset(combined_train_df), 
    tuning_data=TabularDataset(val_df), 
    # presets='best_quality',# Options: 'best_quality', 'high_quality', 'good_quality', 'medium_quality'
)



leaderboard = predictor.leaderboard(silent=False)

No path specified. Models will be saved in: "AutogluonModels/ag-20240827_130156"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.10.13
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #129~20.04.1-Ubuntu SMP Wed Aug 7 13:07:13 UTC 2024
CPU Count:          20
Memory Avail:       24.61 GB / 31.15 GB (79.0%)
Disk Space Avail:   191.92 GB / 467.89 GB (41.0%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='best_quality'   : Maximize accuracy. Default time_limit=3600.
	presets='high_quality'   : Strong accuracy with fast inference speed. Default time_limit=3600.
	presets='good_quality'   : Good accuracy with very fast inference speed. Default time_limit=3600.
	presets='medium_quality' : Fast training time, ideal for initial prototyping.


                  model  score_val eval_metric  pred_time_val    fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0   WeightedEnsemble_L2   0.997025    accuracy     159.250082  398.818386                0.008063           3.748007            2       True         14
1            LightGBMXT   0.993425    accuracy       0.948882   13.996720                0.948882          13.996720            1       True          4
2              LightGBM   0.992167    accuracy       0.874973   15.500237                0.874973          15.500237            1       True          5
3         LightGBMLarge   0.990775    accuracy       0.639179   11.904474                0.639179          11.904474            1       True         13
4        KNeighborsDist   0.989483    accuracy     156.479294    0.916233              156.479294           0.916233            1       True          2
5        KNeighborsUnif   0.989367    accuracy     156.067117    0.916059              1

## Evaluate the Model 

Let's evaluate the model on both the training and validation datasets to see how well it performs.

In [5]:
# Evaluate performance on the training and validation datasets
train_performance = predictor.evaluate(combined_train_df)
val_performance = predictor.evaluate(val_df)

# Display performance metrics
print("Training performance metrics:")
print(train_performance)

print("\nValidation performance metrics:")
print(val_performance)


Training performance metrics:
{'accuracy': 1.0, 'balanced_accuracy': 1.0, 'mcc': 1.0}

Validation performance metrics:
{'accuracy': 0.997025, 'balanced_accuracy': 0.8060858585858586, 'mcc': 0.8426993150522791}


## Save the preprocessor used in Autogluon for future use

In [6]:
# Access the feature generator from the predictor
preprocessor = predictor._learner.feature_generator

# Save the preprocessor using joblib
import joblib
joblib.dump(preprocessor, root_dir + 'preprocessor.pkl')

['/workspace/Dataset/FSI/preprocessor.pkl']

## Detailed Classification Report

We'll generate a detailed classification report for each fraud type using the validation dataset.

In [7]:
# Make predictions on the validation set and calculate per-class scores
val_predictions = predictor.predict(val_df)
val_true = val_df[label]
classification_report_per_class = classification_report(val_true, val_predictions, output_dict=True)

# Display classification report for each fraud type
for fraud_type, metrics in classification_report_per_class.items():
    if isinstance(metrics, dict):  # Filter out 'accuracy' and other aggregate scores
        print(f"Fraud Type: {fraud_type}")
        for metric, score in metrics.items():
            print(f"  {metric}: {score}")
        print()

Fraud Type: a
  precision: 0.96
  recall: 0.48
  f1-score: 0.64
  support: 100.0

Fraud Type: b
  precision: 0.7966101694915254
  recall: 0.94
  f1-score: 0.8623853211009174
  support: 100.0

Fraud Type: c
  precision: 0.925531914893617
  recall: 0.87
  f1-score: 0.8969072164948454
  support: 100.0

Fraud Type: d
  precision: 0.9125
  recall: 0.73
  f1-score: 0.811111111111111
  support: 100.0

Fraud Type: e
  precision: 0.9245283018867925
  recall: 0.98
  f1-score: 0.9514563106796116
  support: 100.0

Fraud Type: f
  precision: 0.9705882352941176
  recall: 0.99
  f1-score: 0.9801980198019802
  support: 100.0

Fraud Type: g
  precision: 0.8362068965517241
  recall: 0.97
  f1-score: 0.8981481481481481
  support: 100.0

Fraud Type: h
  precision: 0.9620253164556962
  recall: 0.76
  f1-score: 0.8491620111731845
  support: 100.0

Fraud Type: i
  precision: 0.6
  recall: 0.12
  f1-score: 0.19999999999999998
  support: 100.0

Fraud Type: j
  precision: 0.9479166666666666
  recall: 0.91
  f1-

## Make Predictions on the Test Set

We now make predictions on the test set and prepare the submission file.

In [8]:
# Make predictions on the test set
predictions = predictor.predict(test_df)


# Prepare the submission file using the ID column from the test data
submission = pd.DataFrame({
    'ID': test_df['ID'],  # Replace 'ID' with the actual ID column name in your test.csv
    'Fraud_Type': predictions
})

# Save the submission file
submission_file_path = root_dir + 'clf_submission.csv'
submission.to_csv(submission_file_path, index=False)

print(f"Submission file saved to {submission_file_path}")







Submission file saved to /workspace/Dataset/FSI/clf_submission.csv


Syn_Submission Save

In [10]:
train_generated_df['Predicted_Fraud_Type'] = predictor.predict(train_generated_df)
probabilities = predictor.predict_proba(train_generated_df)
train_generated_df['Confidence_Score'] = probabilities.max(axis=1)

correct_predictions_df = train_generated_df[train_generated_df['Fraud_Type'] == train_generated_df['Predicted_Fraud_Type']]

# Step 3: Sort by confidence score and select the top 1000 samples per Fraud_Type
sampled_df = correct_predictions_df.groupby('Fraud_Type').apply(lambda x: x.nlargest(1000, 'Confidence_Score')).reset_index(drop=True)

# Step 4: (Optional) Remove the 'Predicted_Fraud_Type' and 'Confidence_Score' columns if not needed
sampled_df = sampled_df.drop(columns=['Predicted_Fraud_Type', 'Confidence_Score'])







  sampled_df = correct_predictions_df.groupby('Fraud_Type').apply(lambda x: x.nlargest(1000, 'Confidence_Score')).reset_index(drop=True)


In [17]:
submission_format = pd.read_csv(root_dir + 'generated_data_submission.csv')
submission_dtypes = test_df.dtypes
sampled_df  = sampled_df[submission_format.columns]

# Convert the syn_submission.csv columns to match the train.csv data types
for column in submission_format.columns:
    if column =='ID' or column=="Fraud_Type":
        continue
    if sampled_df[column].dtype != submission_dtypes [column]:
        sampled_df[column] = sampled_df[column].astype(submission_dtypes [column])
        print(f"column '{column}' different!")


# Save the sampled DataFrame
sampled_df.to_csv(root_dir + 'syn_submission.csv', index=False)