In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import cross_val_score, KFold
import xgboost as xgb
from pandas.plotting import scatter_matrix
import textwrap
import seaborn as sns
import random

# Fetch the Breast Cancer Wisconsin dataset shipped with scikit-learn
cancer = load_breast_cancer()
X_cancer, y_cancer = cancer.data, cancer.target

# Tabular view: one column per feature, plus the diagnosis label as a categorical column
cancer_df = pd.DataFrame(data=X_cancer, columns=cancer.feature_names).assign(
    diagnosis=pd.Categorical.from_codes(y_cancer, cancer.target_names)
)
# Helper used to make long feature names fit into narrow axis labels
def wrap_text(text, width=10):
    """Return ``text`` re-flowed so that no line is longer than ``width`` characters.

    Lines are separated by newline characters; an empty input yields an
    empty string.
    """
    return textwrap.fill(text, width)

num_features = 10

# Draw the feature subset from a dedicated, seeded generator so that a fresh
# "Restart & Run All" always picks the same columns and regenerates the same
# figure. The seed matches the random_state used for the cross-validation folds.
feature_rng = random.Random(42)
random_features = feature_rng.sample(list(cancer.feature_names), num_features)

# Selected features plus the diagnosis label, under their original column names
cancer_df = (
    pd.DataFrame(X_cancer, columns=cancer.feature_names)
    .loc[:, random_features]
    .assign(diagnosis=pd.Categorical.from_codes(y_cancer, cancer.target_names))
)

# Line-wrapped versions of the feature names so they fit as axis labels
wrapped_feature_names = [wrap_text(name) for name in random_features]

# Same data as cancer_df, with the feature columns renamed to their wrapped
# form; the 'diagnosis' column is left untouched and stays last.
cancer_df_wrapped = cancer_df.rename(
    columns=dict(zip(random_features, wrapped_feature_names))
)

# Custom histogram function
def colored_hist(x, color, **kwargs):
    """Draw a semi-transparent histogram of ``x`` through ``plt.hist``.

    Parameters
    ----------
    x : values passed to ``plt.hist`` as the data to bin.
    color : colour passed to ``plt.hist``.
    **kwargs : any further keyword arguments are forwarded to ``plt.hist``.
        ``bins`` defaults to 20 and ``alpha`` to 0.5 when the caller does not
        supply them.

    Returns
    -------
    None
    """
    # Treat bins/alpha as overridable defaults. Hard-coding alpha next to
    # **kwargs made an explicit alpha= from the caller fail with
    # "got multiple values for keyword argument 'alpha'".
    kwargs.setdefault('bins', 20)
    kwargs.setdefault('alpha', 0.5)
    plt.hist(x, color=color, **kwargs)

# Pair plot with only the lower triangle (and diagonal) left visible
def colored_scatter_matrix(data, hue, hue_order=None, hue_cmap='viridis', **kwargs):
    """Build a hue-coloured seaborn pair plot and hide its upper triangle.

    Parameters
    ----------
    data : passed to ``sns.pairplot`` as the data to plot.
    hue : passed to ``sns.pairplot`` as ``hue``.
    hue_order : passed to ``sns.pairplot`` as ``hue_order``.
    hue_cmap : passed to ``sns.pairplot`` as ``palette``.
    **kwargs : forwarded unchanged to ``sns.pairplot``.

    Returns
    -------
    The grid object returned by ``sns.pairplot``. Every axes strictly above
    the main diagonal has been made invisible.
    """
    grid = sns.pairplot(
        data,
        hue=hue,
        hue_order=hue_order,
        palette=hue_cmap,
        diag_kind='hist',
        **kwargs,
    )
    # Panels above the diagonal mirror those below it, so switch them off
    upper_triangle = np.triu_indices_from(grid.axes, k=1)
    for mirrored_ax in grid.axes[upper_triangle]:
        mirrored_ax.set_visible(False)
    return grid

# Draw the colored scatter matrix for the sampled features, one colour per diagnosis

g = colored_scatter_matrix(
    cancer_df_wrapped,
    hue='diagnosis',
    hue_order=cancer.target_names,
    markers='o',
    height=2,
    aspect=1.2,
)

# Write the figure to disk for the markdown cell to embed, then close it
plt.savefig('scatter_matrix_colored.png', dpi=100, bbox_inches='tight')
plt.close()


# Breast Cancer Wisconsin (Diagnostic) Dataset Scatter Matrix

This scatter matrix displays the pairwise relationships among 10 features sampled at random from the Breast Cancer dataset. Each scatter plot below the diagonal shows the relationship between two features, with points colored by diagnosis (malignant or benign); the mirror-image plots above the diagonal are hidden because they would repeat the same information. The diagonal plots show the distribution of each feature as a histogram.

**Key points:**
- Scatter matrices provide a high-level overview of feature relationships but may not capture complex patterns or interactions.
- When dealing with high-dimensional datasets, scatter matrices can become overwhelming, and alternative visualization techniques such as PCA or t-SNE might be more appropriate.
- In the scatter matrix, each row and each column represents a feature in the dataset. The scatter plots off the diagonal show the relationship between pairs of features, while the diagonal plots show the distribution of a single feature.
- Some general observations you can make from a scatter plot:

    * Positive Correlation: If the dots form an upward trend (from bottom-left to top-right), it indicates that there is a positive correlation between the two features. As one feature increases, the other also tends to increase.

    * Negative Correlation: If the dots form a downward trend (from top-left to bottom-right), it suggests a negative correlation between the two features. As one feature increases, the other tends to decrease.

    * No Correlation: If the dots form a random pattern with no apparent trend, it means that there is little or no correlation between the two features.

    * Non-linear Relationship: If the dots form a curve or any other non-linear pattern, it indicates a non-linear relationship between the features.

<img src="scatter_matrix_colored.png" alt="Breast Cancer Scatter Matrix" style="width: 100%; max-width: 800px;"/>


In [3]:
# Wrap the feature matrix and labels in XGBoost's DMatrix container
D_cancer = xgb.DMatrix(data=X_cancer, label=y_cancer)

# Booster settings: binary logistic objective, trees of depth 3, learning rate 0.3
params_cancer = dict(objective='binary:logistic', max_depth=3, eta=0.3)

# Cross-validate 100 boosting rounds on 5 shuffled folds, tracking the error metric
num_rounds_cancer = 100
kfold_cancer = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores_cancer = xgb.cv(
    params=params_cancer,
    dtrain=D_cancer,
    num_boost_round=num_rounds_cancer,
    folds=kfold_cancer,
    metrics='error',
    as_pandas=True,
)

# Summarise the last boosting round: accuracy is one minus the mean test error
final_round_cancer = cv_scores_cancer.iloc[-1]
mean_score_cancer = 1 - final_round_cancer['test-error-mean']
std_score_cancer = final_round_cancer['test-error-std']

print(f"Cross-validated accuracy: {mean_score_cancer * 100:.2f}% (+/- {std_score_cancer * 100:.2f}%)")

Cross-validated accuracy: 97.19% (+/- 1.29%)
