Author:
        
        KIM, JeongYoong, jeongyoong@ccnets.org
        
    COPYRIGHT (c) 2024. CCNets. All Rights reserved.

## Train CCNet on the Titanic dataset
https://www.kaggle.com/competitions/titanic/data

<a id="1"></a>
> <h1 style = 'font-family: Times New Roman'><b> <b style = 'color: #42c2f5'>1.</b> Import Necessary Libraries </b></h1>

In [None]:
import os
import sys
import warnings
# Install a blanket "ignore" filter so no warning of any category is shown
# in the cell outputs for the rest of the session.
warnings.simplefilter("ignore")

# Relative prefix two directories above this notebook. It is appended to the
# import search path (presumably so the project-local `tools` / `trainer_hub`
# packages resolve - confirm against the repository layout) and is reused
# below as the prefix of the data path.
path_append = "../../"
sys.path.append(path_append)

In [None]:
import pandas  as pd
# Human-readable dataset label; it is later passed to DataConfig as
# `dataset_name`, so it must describe the data actually loaded here.
dataset_name = "Titanic"

# Read the Kaggle Titanic training split. The location is built from
# `path_append`, i.e. "../../" + "../data/titanic/train.csv".
df = pd.read_csv(path_append + '../data/titanic/train.csv')
df.head()

<a id="2"></a>
> <h1 style = 'font-family: Times New Roman'><b> <b style = 'color: #4290f5'>2.</b> Modeling: Preprocess </b></h1>

In [None]:
from tools.preprocessing.data_frame import auto_preprocess_dataframe
# Column groups handed to the preprocessing helper, in the order it expects:
# target list, drop list, encode list.
target_columns = ['Survived']
drop_columns = [
    'PassengerId',
    'Name',
    'Cabin',
    'Ticket',
]
encode_columns = [
    'Sex',
    'Embarked',
    'Pclass',
]

# `df` is rebound to the helper's first return value, so re-running this cell
# without reloading the CSV feeds it an already-processed frame.
# `description` is read further down for 'num_features' and 'num_classes'.
df, description = auto_preprocess_dataframe(
    df,
    target_columns,
    drop_columns,
    encode_columns,
)

In [None]:
import torch
from sklearn.model_selection import train_test_split
from tools.preprocessing.template_dataset import TemplateDataset

# Hold out 20% of the rows for testing, then 20% of what remains for validation.
# Both splits are shuffled with a fixed seed so they are reproducible.
train_df, test_df = train_test_split(df, test_size=0.2, shuffle=True, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.2, shuffle=True, random_state=42)


def _split_features_target(frame):
    """Return `(frame without its last column, last column only)`, both as DataFrames."""
    return frame.iloc[:, :-1], frame.iloc[:, -1:]


# NOTE(review): this treats the LAST column as the label - confirm that
# auto_preprocess_dataframe places the target column at the end.
train_df_x, train_df_y = _split_features_target(train_df)
val_df_x, val_df_y = _split_features_target(val_df)
test_df_x, test_df_y = _split_features_target(test_df)

print('train df shape: ', train_df.shape)
print('val df shape: ', val_df.shape)
print('test df shape: ', test_df.shape)

# Wrap each (features, label) pair in the project dataset class.
trainset = TemplateDataset(train_df_x, train_df_y)
valset = TemplateDataset(val_df_x, val_df_y)
testset = TemplateDataset(test_df_x, test_df_y)

In [None]:
from tools.setting.data_config import DataConfig
from tools.setting.ml_params import MLParameters
from trainer_hub import TrainerHub

# Feature count and class count as reported by the preprocessing step.
num_features = description['num_features']
num_classes = description['num_classes']

# Describe the data to the trainer: flat observation vector, binary task.
data_config = DataConfig(
    dataset_name=dataset_name,
    task_type='binary_classification',
    obs_shape=[num_features],
    label_size=num_classes,
)

# Start from the MLParameters defaults for a 'tabnet' CCNet without an
# encoder, then override the values tuned for this notebook.
ml_params = MLParameters(ccnet_network='tabnet', encoder_network='none')
ml_params.training.num_epoch = 100
ml_params.training.batch_size = 32
ml_params.model.ccnet_config.num_layers = 3

# Use the GPU when one is available, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# Build the trainer with console printing enabled (print_interval=20) and
# Weights & Biases logging disabled.
trainer_hub = TrainerHub(
    ml_params,
    data_config,
    device,
    use_print=True,
    use_wandb=False,
    print_interval=20,
)

In [None]:
# Train CCNet: the training split is the first argument, the validation split the second.
trainer_hub.train(trainset, valset)

## Proving independence of X and E

In [None]:
import numpy as np
# Walk the test split in stored order, 64 rows at a time, keeping the final
# partial batch so every label is counted.
test_loader = torch.utils.data.DataLoader(
    dataset=testset,
    batch_size=64,
    shuffle=False,
    drop_last=False,
)

# Gather the labels of every batch into one flat array.
all_labels = []
for _, labels in test_loader:
    labels = labels.numpy().reshape(-1)
    all_labels.extend(labels)
all_labels = np.array(all_labels)

# Share of each distinct label value in the test split.
unique_labels, counts = np.unique(all_labels, return_counts=True)
label_ratios = counts / counts.sum()

for label, ratio in zip(unique_labels, label_ratios):
    print(f'Label: {label}, Ratio: {ratio:.4f}')

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, f1_score
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split

class SimpleNN(nn.Module):
    """Small feed-forward classifier: Linear(input_size, 50) -> ReLU -> Linear(50, 2) -> Sigmoid."""

    def __init__(self, input_size):
        """Create the layers.

        Args:
            input_size: Number of input features per sample.
        """
        super().__init__()
        self.layer1 = nn.Linear(input_size, 50)
        self.relu = nn.ReLU()
        self.layer2 = nn.Linear(50, 2)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return two sigmoid-activated scores per input row."""
        hidden = self.relu(self.layer1(x))
        return self.sigmoid(self.layer2(hidden))

In [None]:
import tqdm

def train_model(model, train_loader, num_epochs=30):
    """Train `model` with Adam (lr=0.001) and BCELoss against 2-class one-hot targets.

    Args:
        model: Network whose output is compared element-wise with the one-hot target.
        train_loader: Iterable yielding `(data, target)` batches of tensors.
        num_epochs: Number of full passes over `train_loader`.

    Returns:
        dict with the keys 'loss', 'accuracy' and 'f1_score'. Only 'loss' is
        filled (mean batch loss, one entry per epoch); the other two lists are
        returned empty.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.BCELoss()

    metrics = {'loss': [], 'accuracy': [], 'f1_score': []}

    for epoch in range(num_epochs):
        model.train()
        batch_losses = []

        for data, target in tqdm.tqdm(train_loader):
            data = data.float().to(device)
            target = target.float().to(device)

            optimizer.zero_grad()
            prediction = model(data)

            # One-hot encode the class ids, then drop dim 1 when it has size 1
            # (presumably targets arrive as (batch, 1) - confirm against the dataset).
            class_ids = target.to(torch.int64)
            target_one_hot = torch.nn.functional.one_hot(class_ids, num_classes=2).float()
            target_one_hot = target_one_hot.squeeze(1)

            loss = criterion(prediction, target_one_hot)
            loss.backward()
            optimizer.step()
            batch_losses.append(loss.item())

        avg_loss = sum(batch_losses) / len(batch_losses)
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}')
        metrics['loss'].append(avg_loss)

    return metrics


In [None]:
def test_model(model, val_loader):
    """Evaluate `model` on `val_loader` and return per-sample accuracy and F1.

    Predictions and targets are each reduced to ONE class id per sample before
    scoring. Flattening the raw (batch, 2) tensors instead would score every
    class slot as its own sample, and would give mismatched list lengths when
    the targets are class indices.

    Args:
        model: Network returning either two scores per row, or a single
            probability per row.
        val_loader: Iterable yielding `(data, target)` batches. `target` may be
            one-hot (last dimension > 1) or hold class ids directly
            (shape `(batch,)` or `(batch, 1)`).

    Returns:
        Tuple `(accuracy, f1)` as computed by scikit-learn's `accuracy_score`
        and `f1_score` (binary average, positive class 1).
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()
    y_true, y_pred = [], []

    with torch.no_grad():
        for data, target in val_loader:
            data, target = data.float().to(device), target.float().to(device)
            output = model(data)

            # Multi-column output: the predicted class is the highest-scoring column.
            # Single-column output: keep the 0.5 probability threshold.
            if output.dim() > 1 and output.size(-1) > 1:
                predicted = output.argmax(dim=-1)
            else:
                predicted = (output > 0.5).long()

            # One-hot targets collapse to class ids; class-id targets pass through.
            if target.dim() > 1 and target.size(-1) > 1:
                target = target.argmax(dim=-1)

            y_pred.extend(predicted.reshape(-1).long().tolist())
            y_true.extend(target.reshape(-1).long().tolist())

    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    return acc, f1

### Before running the experiments, train the SimpleNN baseline on the raw data

In [None]:
# Baseline classifier sized to the preprocessed feature count.
model = SimpleNN(num_features)

# Serve the raw training split in its stored order, 32 rows per batch,
# keeping the final partial batch.
train_loader = torch.utils.data.DataLoader(
    dataset=trainset,
    batch_size=32,
    shuffle=False,
    drop_last=False,
)

# Per-epoch losses returned by the training helper.
train_metrics = train_model(model, train_loader)

### Load the test data and retrieve the trained CCNet from `trainer_hub`

In [None]:
# Rebuild the test loader with 32-row batches, in stored order and keeping
# the final partial batch; the generation cells below iterate over it.
test_loader = torch.utils.data.DataLoader(
    dataset=testset,
    batch_size=32,
    shuffle=False,
    drop_last=False,
)

# Handle to the CCNet instance held by the trainer.
ccnet = trainer_hub.ccnet

### 1-1 Infer Y from X created using original Explanation

In [None]:
# Regenerate every test sample from its true label and its OWN explanation.
# Batches are collected in lists and concatenated once after the loop, which
# avoids re-copying the growing tensor on every iteration.
original_e_batches, original_label_batches = [], []

# Inference only: no autograd graph is needed for explain/produce here.
with torch.no_grad():
    for data, labels in test_loader:
        data = data.float().to(device)
        labels = labels.float().to(device)
        # Class ids -> one-hot rows; squeeze(1) drops dim 1 when it has size 1.
        labels = torch.nn.functional.one_hot(labels.to(torch.int64), num_classes=2).float()
        labels = labels.squeeze(1)

        explanation = ccnet.explain(data)
        synthetic_data = ccnet.produce(labels, explanation)

        synthetic_data = synthetic_data.detach().cpu()
        labels = labels.detach().cpu()

        original_e_batches.append(synthetic_data)
        original_label_batches.append(labels)

original_e_data = torch.cat(original_e_batches, dim=0)
original_labels = torch.cat(original_label_batches, dim=0)

print(f"Original Explanation Data Shape: {original_e_data.shape}")
original_testset = TemplateDataset(original_e_data.numpy(), original_labels.numpy())


### 1-2 Infer Y from X' created using E' = E[1:]+E[:1]

In [None]:
# Regenerate every test sample from its true label and the explanation of the
# NEXT sample in the same batch (explanations rotated by one position).
# Batches are collected in lists and concatenated once after the loop, which
# avoids re-copying the growing tensor on every iteration.
shifted_e1_batches, shifted_e1_label_batches = [], []

# Inference only: no autograd graph is needed for explain/produce here.
with torch.no_grad():
    for data, labels in test_loader:
        data = data.float().to(device)
        labels = labels.float().to(device)
        # Class ids -> one-hot rows; squeeze(1) drops dim 1 when it has size 1.
        labels = torch.nn.functional.one_hot(labels.to(torch.int64), num_classes=2).float()
        labels = labels.squeeze(1)

        explanation = ccnet.explain(data)
        # Rotate within the batch: row i receives the explanation of row i+1,
        # and the last row receives the first one.
        explanation = torch.cat((explanation[1:], explanation[:1]), dim=0)
        synthetic_data = ccnet.produce(labels, explanation)

        synthetic_data = synthetic_data.detach().cpu()
        labels = labels.detach().cpu()

        shifted_e1_batches.append(synthetic_data)
        shifted_e1_label_batches.append(labels)

shifted_e1_data = torch.cat(shifted_e1_batches, dim=0)
original_labels = torch.cat(shifted_e1_label_batches, dim=0)

print(f"Manipulated Data Shape: {shifted_e1_data.shape}")
shifted_explanation_testset = TemplateDataset(shifted_e1_data.numpy(), original_labels.numpy())


### 1-3 Infer Y from X'' created using E'' = E[5:]+E[:5]

In [None]:
# Regenerate every test sample from its true label and the explanation of the
# sample five positions later in the same batch (explanations rotated by five).
# Batches are collected in lists and concatenated once after the loop, which
# avoids re-copying the growing tensor on every iteration.
shifted_e5_batches, shifted_e5_label_batches = [], []

# Inference only: no autograd graph is needed for explain/produce here.
with torch.no_grad():
    for data, labels in test_loader:
        data = data.float().to(device)
        labels = labels.float().to(device)
        # Class ids -> one-hot rows; squeeze(1) drops dim 1 when it has size 1.
        labels = torch.nn.functional.one_hot(labels.to(torch.int64), num_classes=2).float()
        labels = labels.squeeze(1)

        explanation = ccnet.explain(data)
        # Rotate within the batch: row i receives the explanation of row i+5,
        # and the last five rows receive the first five. A batch with five or
        # fewer rows is left unchanged by this slicing.
        explanation = torch.cat((explanation[5:], explanation[:5]), dim=0)
        synthetic_data = ccnet.produce(labels, explanation)

        synthetic_data = synthetic_data.detach().cpu()
        labels = labels.detach().cpu()

        shifted_e5_batches.append(synthetic_data)
        shifted_e5_label_batches.append(labels)

shifted_e5_data = torch.cat(shifted_e5_batches, dim=0)
original_labels = torch.cat(shifted_e5_label_batches, dim=0)

print(f"Shifted Explanation2 Shape: {shifted_e5_data.shape}")
shifted_explanation5_testset = TemplateDataset(shifted_e5_data.numpy(), original_labels.numpy())


In [None]:
# Wrap the three synthetic sets in loaders: batches of 10, reshuffled on each pass.
# Despite the `train_` prefix, these loaders are handed to test_model in the next cell.
synthetic_loader_kwargs = {"batch_size": 10, "shuffle": True}

train_loader1 = DataLoader(original_testset, **synthetic_loader_kwargs)
train_loader2 = DataLoader(shifted_explanation_testset, **synthetic_loader_kwargs)
train_loader3 = DataLoader(shifted_explanation5_testset, **synthetic_loader_kwargs)

In [None]:
# Score the baseline classifier on each synthetic set:
# 1 = original explanations, 2 = shifted by one, 3 = shifted by five.
test_accuracy1, test_f11 = test_model(model, train_loader1)
test_accuracy2, test_f12 = test_model(model, train_loader2)
test_accuracy3, test_f13 = test_model(model, train_loader3)

# One report line per experiment, emitted together.
report_lines = [
    f"Original: Test Accuracy: {test_accuracy1:.4f}, Test F1 Score: {test_f11:.4f}",
    f"Shifted E1: Test Accuracy: {test_accuracy2:.4f}, Test F1 Score: {test_f12:.4f}",
    f"Shifted E5: Test Accuracy: {test_accuracy3:.4f}, Test F1 Score: {test_f13:.4f}",
]
print("\n".join(report_lines))

### Plotting the results