# XGBoost Analysis with Feather File

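This notebook demonstrates how to read a Feather file into a pandas DataFrame and fit an XGBoost model to it. If the file cannot be found, a small synthetic dataset is generated instead so that every cell still runs. The last column is assumed to be the target, and the notebook chooses between a classifier and a regressor based on the target's dtype and number of distinct values, then reports evaluation metrics and feature importances.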

## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
import warnings
# Silence every warning category for the rest of the kernel session.
# Note that this also hides deprecation and data-quality warnings.
warnings.simplefilter('ignore')

print("Libraries imported successfully!")

## Read Feather File

In [None]:
# Replace 'your_file.feather' with the actual path to your feather file
file_path = 'your_file.feather'

try:
    # Read the feather file
    df = pd.read_feather(file_path)
    print(f"Data loaded successfully!")
    print(f"Shape: {df.shape}")
    print(f"\nColumns: {list(df.columns)}")
except FileNotFoundError:
    print(f"File '{file_path}' not found. Please check the file path.")
    # Create sample data for demonstration
    print("Creating sample data for demonstration...")
    np.random.seed(42)
    n_samples = 1000
    df = pd.DataFrame({
        'feature1': np.random.normal(0, 1, n_samples),
        'feature2': np.random.normal(2, 0.5, n_samples),
        'feature3': np.random.randint(0, 5, n_samples),
        'feature4': np.random.exponential(1, n_samples),
        'target': np.random.randint(0, 3, n_samples)  # Multi-class target
    })
    print(f"Sample data created with shape: {df.shape}")

## Explore the Data

In [None]:
# Display basic information about the dataset
print("Dataset Info:")
# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called on its own; wrapping it in print() would emit a stray "None".
df.info()
print("\nFirst few rows:")
print(df.head())
print("\nBasic statistics:")
print(df.describe())

In [None]:
# Count the missing entries in each column
missing_per_column = df.isna().sum()
print("Missing values:")
print(missing_per_column)

# List the dtype pandas assigned to each column
column_dtypes = df.dtypes
print("\nData types:")
print(column_dtypes)

## Prepare Data for XGBoost

In [None]:
# The target column is not known in advance: this cell assumes it is the LAST
# column of the frame. Set `target_column` explicitly to match your data.
target_column = df.columns[-1]
print(f"Using '{target_column}' as target variable")

# Rows without a label cannot be used for supervised training, and a missing
# label breaks both the stratified split and the model fit, so drop them here.
labelled_rows = df.dropna(subset=[target_column])
n_dropped = len(df) - len(labelled_rows)
if n_dropped:
    print(f"Dropped {n_dropped} rows with a missing target value")

# Separate features and target
X = labelled_rows.drop(columns=[target_column])
y = labelled_rows[target_column]

# Text / categorical labels are mapped to integer codes 0..n_classes-1, which
# is the label format the XGBoost classifier expects. The fitted encoder is
# kept so the original class names can be recovered via `target_encoder.classes_`
# (it stays None when the target was already numeric).
target_encoder = None
if pd.api.types.is_object_dtype(y) or isinstance(y.dtype, pd.CategoricalDtype):
    target_encoder = LabelEncoder()
    # Cast to str so a column mixing strings and numbers can still be sorted.
    encoded_labels = target_encoder.fit_transform(y.astype(str))
    y = pd.Series(encoded_labels, index=y.index, name=y.name).astype('int64')
    print(f"Encoded target classes: {list(target_encoder.classes_)}")

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print("Target distribution:")
print(y.value_counts())

In [None]:
# NOTE: the encoders and medians below are computed on the full feature table,
# i.e. before the train/test split performed further down. For a strictly
# leakage-free evaluation, compute them on the training fold only.

# Handle categorical variables if any
categorical_columns = X.select_dtypes(include=['object', 'category']).columns
# One fitted encoder per column, so every mapping can be inspected or inverted
# later (a single shared encoder would only remember the last column it saw).
label_encoders = {}
if len(categorical_columns) > 0:
    print(f"Categorical columns found: {list(categorical_columns)}")
    X = X.copy()  # work on a private copy before assigning encoded columns
    for col in categorical_columns:
        encoder = LabelEncoder()
        # Cast to str first: the encoder cannot sort a column that mixes
        # strings and numbers, and this turns missing entries into an explicit
        # 'nan' category. Codes follow the sorted string form of the values.
        X[col] = encoder.fit_transform(X[col].astype(str))
        label_encoders[col] = encoder
    print("Categorical variables encoded")
else:
    print("No categorical variables found")

# Handle missing values if any
if X.isnull().sum().sum() > 0:
    print("Filling missing values with median...")
    # numeric_only=True keeps median() from raising on any non-numeric column
    # that was not covered by the encoding step above.
    X = X.fillna(X.median(numeric_only=True))
    print("Missing values handled")

## Train-Test Split

In [None]:
# Split the data into training and testing sets.
# Stratifying keeps the label proportions equal in both folds, but
# scikit-learn rejects it when any label occurs fewer than twice, which is
# practically always the case for a continuous regression target. It is
# therefore only requested when the target has a small number of distinct,
# repeated values (same "< 20 distinct values" threshold used for problem-type
# detection in the model-fitting cell).
label_counts = y.value_counts()
can_stratify = len(label_counts) < 20 and label_counts.min() >= 2
stratify_labels = y if can_stratify else None

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=stratify_labels
)

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")
print("Training target distribution:")
print(y_train.value_counts())

## Fit XGBoost Model

In [None]:
# Determine if this is a classification or regression problem.
# Heuristic: the target is treated as a class label when it has fewer than 20
# distinct values AND a discrete dtype. The dtype test uses pandas' dtype
# helpers rather than comparing against the literal name 'int64', so integer
# labels of any width (e.g. int32, int8, nullable Int64) are recognised too.
n_unique_targets = len(y.unique())
has_discrete_dtype = (
    pd.api.types.is_integer_dtype(y)
    or pd.api.types.is_object_dtype(y)
    or isinstance(y.dtype, pd.CategoricalDtype)
)
is_classification = n_unique_targets < 20 and has_discrete_dtype

# Hyperparameters shared by both estimator types
common_params = dict(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)

if is_classification:
    print("Detected classification problem")
    # XGBoost Classifier: multi-class log loss for more than two classes,
    # binary log loss otherwise.
    model = xgb.XGBClassifier(
        eval_metric='mlogloss' if n_unique_targets > 2 else 'logloss',
        **common_params,
    )
else:
    print("Detected regression problem")
    # XGBoost Regressor
    model = xgb.XGBRegressor(**common_params)

# Fit the model
print("Training XGBoost model...")
model.fit(X_train, y_train)
print("Model training completed!")

## Make Predictions and Evaluate

In [None]:
# Score the held-out test set with the fitted model
y_pred = model.predict(X_test)

if not is_classification:
    # Regression: report MSE, its square root, and R²
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    print(f"Mean Squared Error: {mse:.4f}")
    print(f"Root Mean Squared Error: {rmse:.4f}")
    print(f"R² Score: {r2:.4f}")
else:
    # Classification: overall accuracy followed by the per-class report
    accuracy = accuracy_score(y_test, y_pred)
    per_class_report = classification_report(y_test, y_pred)
    print(f"Accuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(per_class_report)

## Feature Importance

In [None]:
# Pair each feature name with the importance score exposed by the fitted
# model, then rank from most to least important.
importance_table = pd.DataFrame(
    {'feature': X.columns, 'importance': model.feature_importances_}
)
feature_importance = importance_table.sort_values('importance', ascending=False)

print("Feature Importance:")
print(feature_importance)

# Horizontal bar chart of the ranking (skipped when matplotlib is missing)
try:
    import matplotlib.pyplot as plt
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.barh(feature_importance['feature'], feature_importance['importance'])
    ax.set_xlabel('Feature Importance')
    ax.set_title('XGBoost Feature Importance')
    ax.invert_yaxis()  # flip the axis so the top-ranked feature is drawn first
    fig.tight_layout()
    plt.show()
except ImportError:
    print("Matplotlib not available for plotting")

## Model Summary

In [None]:
# Recap of the run: model, data sizes and problem type
problem_type = 'Classification' if is_classification else 'Regression'
print("\n".join([
    "=== Model Summary ===",
    f"Model type: {type(model).__name__}",
    f"Number of features: {X.shape[1]}",
    f"Training samples: {X_train.shape[0]}",
    f"Test samples: {X_test.shape[0]}",
    f"Problem type: {problem_type}",
]))

# Headline metrics computed in the evaluation cell
if not is_classification:
    print(f"Test R² score: {r2:.4f}")
    print(f"Test RMSE: {rmse:.4f}")
else:
    print(f"Number of classes: {len(y.unique())}")
    print(f"Test accuracy: {accuracy:.4f}")

# The table is already sorted by importance, so its first three rows are the top three
print("\nTop 3 most important features:")
top_features = feature_importance.head(3)
ranked_pairs = zip(top_features['feature'], top_features['importance'])
for rank, (feature_name, score) in enumerate(ranked_pairs, start=1):
    print(f"{rank}. {feature_name}: {score:.4f}")