Author:
        
        PARK, JunHo, junho@ccnets.org
        
    COPYRIGHT (c) 2024. CCNets. All Rights reserved.

In [None]:
import sys

# Relative path two directories above this notebook. It is added to the module
# search path; presumably this is where the project packages imported in later
# cells (`tools`, `nn`, `trainer_hub`) live — confirm against the repo layout.
path_append = "../../"

# Guard the append so that re-running this cell does not pile up duplicate
# entries in sys.path.
if path_append not in sys.path:
    sys.path.append(path_append)


In [None]:
# Data source: https://www.kaggle.com/competitions/santander-customer-transaction-prediction
import pandas as pd 

file_name = 'Santander Customer Transaction Prediction Dataset'

# CSV locations, expressed relative to path_append
train_dataroot = path_append + f"../data/{file_name}/train.csv"
test_dataroot = path_append + f"../data/{file_name}/test.csv"

# Read the training file first, then the test file
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in (train_dataroot, test_dataroot)
)

# Display the raw training frame as the cell output
df_train

In [None]:
import torch
from torch.utils.data import Dataset
from tools.preprocessing.data_frame import auto_preprocess_dataframe

# Row count of the training frame, captured before preprocessing
len_train = len(df_train)

# Only the training frame is preprocessed: 'target' is declared as the label
# column and 'ID_code' is dropped. The earlier variant that concatenated
# df_train with df_test and split the result back at len_train is disabled,
# so df_test is left untouched by this cell.
df_train, description = auto_preprocess_dataframe(
    df_train,
    target_columns=['target'],
    drop_columns=['ID_code'],
)

# Feature and class counts as reported in the preprocessing description
num_features, num_classes = (
    description[key] for key in ('num_features', 'num_classes')
)

print(num_features, num_classes)

In [None]:

# Labeled dataset class: pairs each feature row with its label
class LabeledDataset(Dataset):
    """Map-style dataset that pairs feature rows with their labels.

    Both inputs are stored as independent ``torch.float32`` tensors, whether
    they arrive as array-likes (NumPy arrays, nested lists) or as tensors.
    Tensor inputs are detached and copied, so later in-place changes to the
    source tensors do not leak into the dataset.

    Args:
        x: Features, indexed by sample along the first dimension.
        y: Labels, holding the same number of samples as ``x``.

    Raises:
        ValueError: If ``x`` and ``y`` hold a different number of samples.
    """

    def __init__(self, x, y):
        self.x = self._as_float_tensor(x)
        self.y = self._as_float_tensor(y)

        # Fail at construction instead of with an IndexError in the middle of
        # an epoch, since __len__ is derived from x alone.
        if len(self.x) != len(self.y):
            raise ValueError(
                f"x and y must contain the same number of samples, "
                f"got {len(self.x)} and {len(self.y)}"
            )

    @staticmethod
    def _as_float_tensor(data):
        """Return ``data`` as a detached float32 tensor that owns its memory."""
        if isinstance(data, torch.Tensor):
            # copy=True always yields a new tensor, even when the dtype already
            # matches, so the dataset never aliases the caller's tensor.
            return data.detach().to(dtype=torch.float32, copy=True)
        return torch.tensor(data, dtype=torch.float32)

    def __len__(self):
        """Return the number of samples (size of the first dimension of x)."""
        return len(self.x)

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        return self.x[index], self.y[index]

#### Dataset Splitting for Training and Testing

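The preprocessed dataset is split into a training part (80%) and a held-out evaluation part (20%), using a fixed random state so the split is reproducible. This step is crucial for validating the effectiveness of the training: the model's performance is measured on data it did not see during training.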


In [None]:
import torch
from sklearn.model_selection import train_test_split

# Hold out 20% of the preprocessed rows for evaluation. The training part is
# bound to a new name instead of overwriting df_train, so re-running this cell
# does not split an already-split frame and silently shrink the training data.
df_fit, df_eval = train_test_split(df_train, test_size=0.2, random_state=42)

# Features are every column except the last; the label is the last column,
# kept two-dimensional by the `-1:` slice.
# NOTE(review): assumes auto_preprocess_dataframe places 'target' last — confirm.
X_train, y_train = df_fit.iloc[:, :-1], df_fit.iloc[:, -1:]
X_eval, y_eval = df_eval.iloc[:, :-1], df_eval.iloc[:, -1:]

# Labeled datasets for supervised learning tasks
trainset = LabeledDataset(X_train.values, y_train.values)  # training split
evalset = LabeledDataset(X_eval.values, y_eval.values)     # held-out evaluation split

# Printing the shapes of the datasets for verification
print(f"Labeled Trainset Shape: {len(trainset)}, {trainset.x.shape[1]}")
print(f"Labeled Testset Shape: {len(evalset)}, {evalset.x.shape[1]}")

#### Initial Setup and Model Configuration

This section initializes the environment by setting a fixed random seed to ensure reproducibility of results. It imports the necessary configuration classes and initializes the model parameters: the CCNet network is set to 'tabnet' with 4 layers, a model type particularly tailored for structured or tabular data such as the Santander customer transaction records used here.


In [None]:
# Fix the random seed first so the experiment is repeatable
from nn.utils.init import set_random_seed
set_random_seed(0)

# Project configuration classes and the trainer front end
from tools.setting.ml_params import MLParameters
from tools.setting.data_config import DataConfig
from trainer_hub import TrainerHub

# Dataset description: name, task type, per-sample feature shape and label size
data_config = DataConfig(
    dataset_name=file_name,
    task_type='binary_classification',
    obs_shape=[num_features],
    label_size=num_classes,
)

# Model parameters with the CCNet network set to 'tabnet', configured with 4 layers
ml_params = MLParameters(ccnet_network='tabnet')
ml_params.model.ccnet_config.num_layers = 4

# Use the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# TrainerHub instance that the following cells use for training and testing
causal_trainer_from_dataset = TrainerHub(ml_params, data_config, device, use_print=True)


In [None]:
# Train the first trainer on the labeled training split. evalset is passed as
# the second argument — presumably used for evaluation during training;
# confirm against TrainerHub.train.
causal_trainer_from_dataset.train(trainset, evalset)

In [None]:
# Run the first trainer's test routine on the held-out evaluation split.
causal_trainer_from_dataset.test(evalset)   

In [None]:
from xgboost import XGBClassifier
from tools.report import get_test_results
import torch.nn.functional as F

# Baseline model: gradient-boosted classifier with 500 trees of maximum depth 3
xgb_model = XGBClassifier(max_depth=3, n_estimators=500)
xgb_model.fit(X_train, y_train)

# Score the held-out evaluation split with the project's reporting helper.
# NOTE(review): eval_predictions is presumably 1-D while y_eval.values is a
# single-column 2-D array — confirm get_test_results handles that difference.
eval_predictions = xgb_model.predict(X_eval)
metrics = get_test_results(
    eval_predictions,
    y_eval.values,
    task_type='binary_classification',
    num_classes=2,
)
print(metrics)

In [None]:
# Label the training features with the fitted XGBoost model's own predictions
train_xgb_predictions = xgb_model.predict(X_train)

# Pass NumPy arrays, exactly as trainset is built, so that this dataset goes
# through the same float32 conversion in LabeledDataset for both features and
# labels. reshape(-1, 1) gives the labels one column per sample, matching the
# layout of y_train.values.
training_data_with_xgb_predictions = LabeledDataset(
    X_train.values,
    train_xgb_predictions.reshape(-1, 1),
)

print("LabeledDataset X: ", training_data_with_xgb_predictions.x.shape)
print("LabeledDataset Y: ", training_data_with_xgb_predictions.y.shape)

In [None]:
# Second TrainerHub instance, built from the same ml_params, data_config and
# device as causal_trainer_from_dataset; it is the one trained on XGBoost's
# predicted labels.
# NOTE(review): the random seed is not reset before this construction, so its
# initial state may differ from the first trainer's — confirm whether the two
# runs are meant to start from the same seed.
causal_trainer_from_xgboost = TrainerHub(ml_params, data_config, device, use_print=True)

In [None]:
# Train the second trainer on the training features labeled with XGBoost's
# predictions rather than the true labels; evalset still carries the true labels.
causal_trainer_from_xgboost.train(training_data_with_xgb_predictions, evalset)

In [None]:
# Run the second trainer's test routine on the same held-out evaluation split
# that was used for the first trainer.
causal_trainer_from_xgboost.test(evalset)