## Step 1: Import Required Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.datasets import load_breast_cancer, load_diabetes, make_classification
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import xgboost as xgb
from xgboost import XGBClassifier, XGBRegressor
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning category for the rest of the session so that library
# notices do not clutter the cell outputs below.
warnings.filterwarnings(action='ignore')

# Confirm the environment is ready and record which XGBoost version is in use.
print(
    "All libraries imported successfully!",
    f"XGBoost version: {xgb.__version__}",
    sep="\n",
)

## Step 2: Load and Prepare Dataset

We'll use the Breast Cancer dataset for classification demonstration.

In [None]:
# Fetch the Breast Cancer dataset that ships with scikit-learn
cancer = load_breast_cancer()
X, y = cancer.data, cancer.target

# Summarise the dataset: size, a sample of the feature names, and class balance
n_samples, n_features = X.shape
separator = "=" * 60
print("Dataset Information:")
print(separator)
print(f"Number of samples: {n_samples}")
print(f"Number of features: {n_features}")
print(f"Feature names (first 5): {cancer.feature_names[:5]}")
print(f"Target names: {cancer.target_names}")
print("Class distribution:")
print(f"  - Malignant (0): {(y == 0).sum()}")
print(f"  - Benign (1): {(y == 1).sum()}")
print(separator)

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# ratio the same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Report the shape of each split, then the per-class counts of each split
print()
for split_name, split_features in (("Training", X_train), ("Testing", X_test)):
    print(f"{split_name} set size: {split_features.shape}")
for split_name, split_labels in (("Training", y_train), ("Testing", y_test)):
    print(f"{split_name} class distribution: {np.bincount(split_labels)}")

## Step 3: Train Basic XGBoost Classifier

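Let's start with a simple baseline configuration: 100 shallow trees (`max_depth=3`) with `learning_rate=0.1`. Note that these are not XGBoost's library defaults, which use deeper trees (`max_depth=6`) and a larger learning rate (`0.3`).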

In [None]:
# Baseline hyper-parameters for the first XGBoost model
xgb_params = {
    'n_estimators': 100,        # number of boosting rounds
    'max_depth': 3,             # maximum depth of each tree
    'learning_rate': 0.1,       # step size shrinkage (eta)
    'random_state': 42,
    'eval_metric': 'logloss',   # evaluation metric
}

# Build the classifier from the settings above and fit it on the training split
xgb_clf = XGBClassifier(**xgb_params)
xgb_clf.fit(X_train, y_train)

# Echo the configuration back from the fitted estimator
print("XGBoost Classifier trained successfully!")
for label, value in (
    ("Number of estimators", xgb_clf.n_estimators),
    ("Max depth", xgb_clf.max_depth),
    ("Learning rate", xgb_clf.learning_rate),
    ("Number of features", xgb_clf.n_features_in_),
):
    print(f"{label}: {value}")

## Step 4: Make Predictions and Evaluate

In [None]:
# Predict class labels for both splits with the fitted classifier
y_train_pred, y_test_pred = (xgb_clf.predict(split) for split in (X_train, X_test))

# Accuracy on each split; a large train/test gap would hint at overfitting
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

separator = "=" * 60
print("XGBoost Performance:")
print(separator)
print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Testing Accuracy:  {test_accuracy:.4f}")
print(separator)

# Per-class precision / recall / F1 on the held-out test split
test_report = classification_report(
    y_test, y_test_pred, target_names=cancer.target_names
)
print("\nClassification Report (Test Set):")
print(separator)
print(test_report)

## Step 5: Feature Importance Analysis

XGBoost provides feature importance scores to understand which features are most influential.

In [None]:
# Importance scores exposed by the fitted scikit-learn wrapper, one per input column.
# NOTE: which metric this reports depends on the estimator's `importance_type`
# setting (per the XGBoost docs, tree boosters fall back to gain-based importance
# when it is unset), so it is NOT the same metric as the split-count plot below.
feature_importance = xgb_clf.feature_importances_

# Pair each score with its human-readable feature name and rank them
feature_df = pd.DataFrame({
    'Feature': cancer.feature_names,
    'Importance': feature_importance
}).sort_values('Importance', ascending=False)

print("Top 10 Most Important Features:")
print("="*60)
print(feature_df.head(10).to_string(index=False))

# Horizontal bar chart of the 15 highest-ranked features
top_features = feature_df.head(15)
fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(top_features['Feature'], top_features['Importance'])
ax.set_xlabel('Importance Score')
ax.set_ylabel('Features')
ax.set_title('Top 15 Feature Importance (XGBoost)')
ax.invert_yaxis()  # most important feature at the top
fig.tight_layout()
plt.show()

# Built-in XGBoost plot.
# The model was fitted on a bare NumPy array, so the booster only knows the
# columns by positional placeholders ("f0", "f1", ...). Translate those keys
# back to the real feature names so the chart is readable; any key that is not
# a placeholder is passed through unchanged.
placeholder_to_name = {
    f"f{idx}": str(name) for idx, name in enumerate(cancer.feature_names)
}
split_counts = xgb_clf.get_booster().get_score(importance_type='weight')
named_split_counts = {
    placeholder_to_name.get(key, key): value
    for key, value in split_counts.items()
}

fig, ax = plt.subplots(figsize=(10, 8))
# plot_importance also accepts a plain {feature: score} dict
xgb.plot_importance(named_split_counts, max_num_features=15, ax=ax)
# 'weight' = number of times a feature is used to split across all trees;
# say so in the title because it ranks features differently from the chart above.
plt.title('Feature Importance by Split Count (XGBoost Built-in Plot, importance_type="weight")')
plt.tight_layout()
plt.show()

## Step 6: Confusion Matrix Visualization

In [None]:
# Rows are the actual classes, columns are the predicted classes
cm = confusion_matrix(y_test, y_test_pred)

# Draw the matrix as an annotated heatmap, labelled with the class names
plt.figure(figsize=(8, 6))
heatmap_ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=cancer.target_names,
    yticklabels=cancer.target_names,
)
heatmap_ax.set_title(f'XGBoost Confusion Matrix\nAccuracy: {test_accuracy:.4f}')
heatmap_ax.set_ylabel('Actual Label')
heatmap_ax.set_xlabel('Predicted Label')
plt.show()

# Also print the raw counts for reference
print("Confusion Matrix:", cm, sep="\n")

## Step 7: Effect of Number of Estimators (n_estimators)

Let's see how the number of trees affects performance.

In [None]:
# Test different numbers of estimators
n_estimators_range = [10, 25, 50, 100, 150, 200, 300, 500]
train_scores = []
test_scores = []
# Mean cross-validated accuracy on the TRAINING split only. This is what the
# "optimal" setting is chosen from, so the test set stays untouched by model
# selection and its score remains an honest estimate.
cv_scores = []
CV_FOLDS = 5

for n_est in n_estimators_range:
    xgb_temp = XGBClassifier(
        n_estimators=n_est,
        max_depth=3,
        learning_rate=0.1,
        random_state=42,
        eval_metric='logloss'
    )

    # cross_val_score fits clones of the estimator, one per fold
    fold_scores = cross_val_score(
        xgb_temp, X_train, y_train, cv=CV_FOLDS, scoring='accuracy'
    )
    cv_scores.append(fold_scores.mean())

    # Refit on the full training split for the train/test learning curves
    xgb_temp.fit(X_train, y_train)

    train_scores.append(xgb_temp.score(X_train, y_train))
    test_scores.append(xgb_temp.score(X_test, y_test))

# Plot results
plt.figure(figsize=(10, 6))
plt.plot(n_estimators_range, train_scores, marker='o', label='Training Accuracy', linewidth=2)
plt.plot(n_estimators_range, cv_scores, marker='^', label=f'{CV_FOLDS}-Fold CV Accuracy (train split)', linewidth=2)
plt.plot(n_estimators_range, test_scores, marker='s', label='Testing Accuracy', linewidth=2)
plt.xlabel('Number of Estimators (Trees)')
plt.ylabel('Accuracy')
plt.title('XGBoost Performance vs Number of Estimators')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

# Find optimal n_estimators using the cross-validated score, not the test score:
# picking the setting that maximises test accuracy would leak the test set into
# model selection and make the reported "best" accuracy optimistically biased.
best_idx = int(np.argmax(cv_scores))
optimal_n = n_estimators_range[best_idx]
print(f"Optimal number of estimators (selected by {CV_FOLDS}-fold CV): {optimal_n}")
print(f"Cross-validated accuracy at that setting: {cv_scores[best_idx]:.4f}")
print(f"Test accuracy at that setting: {test_scores[best_idx]:.4f}")

## Step 8: Effect of Learning Rate

Learning rate controls how much each tree contributes to the final prediction.

In [None]:
# Candidate learning rates (eta) to compare; everything else is held fixed
learning_rates = [0.01, 0.05, 0.1, 0.2, 0.3, 0.5]
train_scores_lr, test_scores_lr = [], []

for lr in learning_rates:
    # Fresh model per learning rate, otherwise identical to the baseline setup
    xgb_temp = XGBClassifier(
        learning_rate=lr,
        n_estimators=100,
        max_depth=3,
        eval_metric='logloss',
        random_state=42,
    )
    xgb_temp.fit(X_train, y_train)

    # Record accuracy on both splits for the curve below
    train_scores_lr.append(xgb_temp.score(X_train, y_train))
    test_scores_lr.append(xgb_temp.score(X_test, y_test))

# Accuracy-vs-learning-rate curves for the training and test splits
plt.figure(figsize=(10, 6))
for curve, marker_style, curve_label in (
    (train_scores_lr, 'o', 'Training Accuracy'),
    (test_scores_lr, 's', 'Testing Accuracy'),
):
    plt.plot(learning_rates, curve, marker=marker_style, label=curve_label, linewidth=2)
plt.xlabel('Learning Rate')
plt.ylabel('Accuracy')
plt.title('XGBoost Performance vs Learning Rate')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

# Rule-of-thumb summary printed under the chart
dash_rule = "-" * 60
summary_lines = (
    "Learning Rate Analysis:",
    dash_rule,
    "Small learning rate (e.g., 0.01):",
    "  - More conservative updates",
    "  - Requires more trees (higher n_estimators)",
    "  - More robust, less prone to overfitting",
    "\nLarge learning rate (e.g., 0.3-0.5):",
    "  - Aggressive updates",
    "  - Faster training",
    "  - Risk of overfitting",
    dash_rule,
)
for summary_line in summary_lines:
    print(summary_line)

## Step 9: Effect of Max Depth

Max depth controls the complexity of individual trees.

In [None]:
# Test different max depths
max_depths = [1, 2, 3, 4, 5, 6, 7, 8]
train_scores_depth = []
test_scores_depth = []
# Mean cross-validated accuracy on the TRAINING split only. This is what the
# "optimal" depth is chosen from, so the test set stays untouched by model
# selection and its score remains an honest estimate.
cv_scores_depth = []
CV_FOLDS = 5

for depth in max_depths:
    xgb_temp = XGBClassifier(
        n_estimators=100,
        max_depth=depth,
        learning_rate=0.1,
        random_state=42,
        eval_metric='logloss'
    )

    # cross_val_score fits clones of the estimator, one per fold
    fold_scores = cross_val_score(
        xgb_temp, X_train, y_train, cv=CV_FOLDS, scoring='accuracy'
    )
    cv_scores_depth.append(fold_scores.mean())

    # Refit on the full training split for the train/test curves
    xgb_temp.fit(X_train, y_train)

    train_scores_depth.append(xgb_temp.score(X_train, y_train))
    test_scores_depth.append(xgb_temp.score(X_test, y_test))

# Plot results
plt.figure(figsize=(10, 6))
plt.plot(max_depths, train_scores_depth, marker='o', label='Training Accuracy', linewidth=2)
plt.plot(max_depths, cv_scores_depth, marker='^', label=f'{CV_FOLDS}-Fold CV Accuracy (train split)', linewidth=2)
plt.plot(max_depths, test_scores_depth, marker='s', label='Testing Accuracy', linewidth=2)
plt.xlabel('Max Depth')
plt.ylabel('Accuracy')
plt.title('XGBoost Performance vs Max Depth')
plt.legend()
plt.grid(True, alpha=0.3)
plt.xticks(max_depths)
plt.show()

# Find optimal max_depth using the cross-validated score, not the test score:
# picking the depth that maximises test accuracy would leak the test set into
# model selection and make the reported "best" accuracy optimistically biased.
best_depth_idx = int(np.argmax(cv_scores_depth))
optimal_depth = max_depths[best_depth_idx]
print(f"Optimal max depth (selected by {CV_FOLDS}-fold CV): {optimal_depth}")
print(f"Cross-validated accuracy at that setting: {cv_scores_depth[best_depth_idx]:.4f}")
print(f"Test accuracy at that setting: {test_scores_depth[best_depth_idx]:.4f}")

## Key Parameters and Best Practices

### Key Parameters in XGBoost:

**Tree-Specific Parameters:**
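- **n_estimators**: Number of boosting rounds (trees)
  - More trees can improve performance up to a point, but they lengthen training and can overfit unless the learning rate is lowered or early stopping is used
  - Typical range: 100-1000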

- **max_depth**: Maximum depth of trees
  - Controls tree complexity
  - Typical range: 3-10
  - Deeper = more complex, risk of overfitting

- **learning_rate (eta)**: Step size shrinkage
  - Lower = more robust but needs more trees
  - Typical range: 0.01-0.3

- **subsample**: Fraction of samples for each tree
  - Prevents overfitting
  - Typical range: 0.5-1.0

- **colsample_bytree**: Fraction of features for each tree
  - Random feature selection per tree
  - Typical range: 0.5-1.0

**Regularization Parameters:**
- **gamma**: Minimum loss reduction for split
  - Higher = more conservative
  - Range: 0-infinity

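- **reg_alpha**: L1 regularization (Lasso-style penalty on leaf weights)
  - Pushes leaf weights toward zero, making the model more conservative; this can help with high-dimensional or sparse features, but it does not remove input features by itself
  - Default: 0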

- **reg_lambda**: L2 regularization (Ridge)
  - Smooth weights
  - Default: 1

### Best Practices:

1. **Start Simple**: Begin with default parameters, then tune

2. **Learning Rate Trade-off**:
   - Low learning rate + many estimators = usually better generalization, at the cost of longer training
   - High learning rate + fewer estimators = faster training, but a higher risk of overfitting

3. **Use Early Stopping**: 
   - Monitor validation set
   - Stop when no improvement

4. **Handle Imbalanced Data**: 
   - Use `scale_pos_weight` parameter
   - Ratio of negative to positive samples

5. **Regularization**:
   - Increase `gamma`, `reg_alpha`, `reg_lambda` to reduce overfitting
   - Decrease `max_depth`, increase `min_child_weight`

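6. **Feature Engineering**:
   - XGBoost benefits from good features
   - Consider removing highly correlated features: they rarely hurt accuracy for tree models, but they split importance between them and make feature-importance scores harder to interpret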

7. **Cross-Validation**: Always validate with CV before final model

### Typical Parameter Combinations:

**Fast Training (Quick Baseline):**
```python
n_estimators=100, max_depth=3, learning_rate=0.1
```

**High Accuracy (Competition):**
```python
n_estimators=1000, max_depth=5, learning_rate=0.01,
subsample=0.8, colsample_bytree=0.8
```

**Prevent Overfitting:**
```python
max_depth=3, min_child_weight=5, gamma=1,
subsample=0.7, colsample_bytree=0.7
```

## Conclusion:

XGBoost is a state-of-the-art gradient boosting algorithm that:
- Provides exceptional performance and accuracy
- Uses advanced regularization to prevent overfitting
- Efficiently handles large datasets with parallel processing
- Automatically manages missing values
- Offers comprehensive feature importance analysis
- Dominates machine learning competitions

**Key Insight**: XGBoost's success comes from combining gradient boosting with regularization, efficient computation, and smart handling of edge cases. The key to mastering XGBoost is understanding the balance between model complexity (max_depth, n_estimators) and regularization (gamma, lambda, alpha) to achieve optimal performance!