# Data Exploration for Tabular XAI Experiments

This notebook explores the three datasets used in our experiments:
1. Breast Cancer Dataset
2. Adult Income Dataset
3. Bank Marketing Dataset

In [None]:
import sys
import os
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from utils.data_loader import DataLoader

# Set plotting style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 8)

%matplotlib inline

## 1. Breast Cancer Dataset

In [None]:
# Breast cancer: build the loader, then unpack the two values returned by
# load_data() into X_bc and y_bc
loader_bc = DataLoader('breast_cancer', random_state=42)
X_bc, y_bc = loader_bc.load_data()

# Headline numbers, one per line: shape, feature count, per-class counts
print(
    f"Dataset shape: {X_bc.shape}",
    f"Number of features: {len(loader_bc.feature_names)}",
    f"Target distribution:\n{y_bc.value_counts()}",
    sep="\n",
)
# Whatever the loader reports about the dataset, set apart by a blank line
print(f"\nDataset info: {loader_bc.get_dataset_info()}")

In [None]:
# Summary statistics for X_bc; the trailing bare expression is what the
# notebook renders as this cell's output
print("Basic Statistics:")
bc_summary_stats = X_bc.describe()
bc_summary_stats

In [None]:
# Bar chart of the number of samples per value of y_bc
fig, ax = plt.subplots(figsize=(8, 6))
bc_class_counts = y_bc.value_counts()
bc_class_counts.plot.bar(ax=ax, rot=0)  # rot=0 keeps tick labels horizontal
ax.set_title('Breast Cancer - Target Distribution')
ax.set_xlabel('Class')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Pairwise feature correlations for X_bc, drawn as a heatmap whose colour
# scale is centred on zero
fig, ax = plt.subplots(figsize=(14, 12))
correlation_matrix = X_bc.corr()
sns.heatmap(correlation_matrix, center=0, cmap='coolwarm', ax=ax)
ax.set_title('Breast Cancer - Feature Correlation Heatmap')
fig.tight_layout()
plt.show()

## 2. Adult Income Dataset

In [None]:
# Adult income: build the loader, then unpack the two values returned by
# load_data() into X_adult and y_adult
loader_adult = DataLoader('adult_income', random_state=42)
X_adult, y_adult = loader_adult.load_data()

# One line per fact: shape, feature counts by kind, per-class counts
print(
    f"Dataset shape: {X_adult.shape}",
    f"Number of features: {len(loader_adult.feature_names)}",
    f"Categorical features: {len(loader_adult.categorical_features)}",
    f"Numerical features: {len(loader_adult.numerical_features)}",
    f"Target distribution:\n{y_adult.value_counts()}",
    sep="\n",
)

In [None]:
# Peek at the top of X_adult; the trailing bare expression is what the
# notebook renders as this cell's output
print("First few rows:")
adult_preview = X_adult.head()
adult_preview

In [None]:
# Bar chart of the number of samples per value of y_adult
fig, ax = plt.subplots(figsize=(8, 6))
adult_class_counts = y_adult.value_counts()
adult_class_counts.plot.bar(ax=ax, rot=0)  # rot=0 keeps tick labels horizontal
ax.set_title('Adult Income - Target Distribution')
ax.set_xlabel('Income Class')
ax.set_ylabel('Count')
plt.show()

## 3. Bank Marketing Dataset

In [None]:
# Bank marketing: build the loader, then unpack the two values returned by
# load_data() into X_bank and y_bank
loader_bank = DataLoader('bank_marketing', random_state=42)
X_bank, y_bank = loader_bank.load_data()

# One line per fact: shape, feature counts by kind, per-class counts
print(
    f"Dataset shape: {X_bank.shape}",
    f"Number of features: {len(loader_bank.feature_names)}",
    f"Categorical features: {len(loader_bank.categorical_features)}",
    f"Numerical features: {len(loader_bank.numerical_features)}",
    f"Target distribution:\n{y_bank.value_counts()}",
    sep="\n",
)

In [None]:
# Peek at the top of X_bank; the trailing bare expression is what the
# notebook renders as this cell's output
print("First few rows:")
bank_preview = X_bank.head()
bank_preview

In [None]:
# Bar chart of the number of samples per value of y_bank
fig, ax = plt.subplots(figsize=(8, 6))
bank_class_counts = y_bank.value_counts()
bank_class_counts.plot.bar(ax=ax, rot=0)  # rot=0 keeps tick labels horizontal
ax.set_title('Bank Marketing - Target Distribution')
ax.set_xlabel('Subscription')
ax.set_ylabel('Count')
plt.show()

## Summary

We have explored three diverse tabular datasets:
- **Breast Cancer**: Medical diagnosis dataset with numerical features
- **Adult Income**: Census data with mixed categorical and numerical features
- **Bank Marketing**: Marketing campaign data with both categorical and numerical features

These datasets provide a comprehensive testbed for evaluating gradient boosting and deep learning models with explainability techniques.