<a href="https://colab.research.google.com/github/bhanup6663/COMP691_DL/blob/bhanu/XGBoost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import torchvision
from torchvision import datasets, transforms
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
import xgboost as xgb
import torch

In [2]:
train_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomRotation(degrees=10),
    transforms.RandomCrop(32, padding=4),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.2023, 0.1994, 0.2010])
])

test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.2023, 0.1994, 0.2010])
])

In [3]:
trainset = datasets.CIFAR10(root='./data', train=True, download=True, transform=train_transform)
testset = datasets.CIFAR10(root='./data', train=False, download=True, transform=test_transform)

Files already downloaded and verified
Files already downloaded and verified


In [4]:
selected_classes = np.random.choice(range(10), 2, replace=False)

In [5]:
class_mapping = {original: new for new, original in enumerate(selected_classes)}

In [6]:
train_indices = []
test_indices = []

for idx, (image, label) in enumerate(trainset):
    if label in selected_classes:  # Include all samples from selected classes for training
        train_indices.append(idx)

for idx, (image, label) in enumerate(testset):
    if label in selected_classes:  # Include all samples from selected classes for testing
        test_indices.append(idx)

# Shuffle the indices
np.random.shuffle(train_indices)
np.random.shuffle(test_indices)

# Check if enough samples are found for training and testing
if len(train_indices) < 25 or len(test_indices) < 2000:
    raise ValueError("Insufficient samples found for training or testing.")

In [7]:
train_indices = train_indices[:25]
test_indices = test_indices[:2000]

trainloader = torch.utils.data.DataLoader(trainset, batch_size=25, sampler=torch.utils.data.SubsetRandomSampler(train_indices))
testloader = torch.utils.data.DataLoader(testset, batch_size=2000, sampler=torch.utils.data.SubsetRandomSampler(test_indices))

In [8]:
train_data = []
train_labels = []
test_data = []
test_labels = []

for images, labels in trainloader:
    for image, label in zip(images, labels):
        train_data.append(image.numpy().flatten())  # Flatten the image tensor
        train_labels.append(label.item())

for images, labels in testloader:
    for image, label in zip(images, labels):
        test_data.append(image.numpy().flatten())  # Flatten the image tensor
        test_labels.append(label.item())

In [9]:
train_data = np.array(train_data)
train_labels = np.array(train_labels)
test_data = np.array(test_data)
test_labels = np.array(test_labels)

train_remapped = np.array([class_mapping[label] for label in train_labels])
test_remapped = np.array([class_mapping[label] for label in test_labels])

In [10]:
pca = PCA(n_components=0.70)
X_train_pca = pca.fit_transform(np.array(train_data))
X_test_pca = pca.transform(np.array(test_data))

In [11]:
X_train, X_val, y_train, y_val = train_test_split(X_train_pca, np.array(train_remapped), test_size=0.2, stratify=train_remapped)


In [12]:
xgb_model = xgb.XGBClassifier(
    objective='multi:softmax',
    eval_metric='mlogloss',
    use_label_encoder=False,
    num_class=2,  # Assuming you are working with binary or two-class classification
    tree_method='gpu_hist',  # Use GPU-accelerated tree construction
    gpu_id=0,  # GPU ID, use 0 for the first GPU in your system
    seed=42
)

In [13]:
param_grid = {
    'n_estimators': [25, 50, 100, 150, 200, ],
    'max_depth': [2, 3, 6, 8, 10],
    'learning_rate': [0.001, 0.01, 0.05, 0.1],
    'subsample': [0.3, 0.6, 0.8, 1.0],
    'colsample_bytree': [0.3, 0.6, 0.8, 1.0],
    'lambda': [1, 1.5],


}

In [14]:
grid_search = GridSearchCV(xgb_model, param_grid, cv=3, scoring='accuracy', verbose=1)
grid_search.fit(X_train, y_train, early_stopping_rounds=10, eval_set=[(X_val, y_val)], verbose=False)

Fitting 3 folds for each of 3200 candidates, totalling 9600 fits


[1;30;43mStreaming output truncated to the last 5000 lines.[0m

    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


   

In [15]:
best_model = grid_search.best_estimator_
print(f"Best hyperparameters: {grid_search.best_params_}")

Best hyperparameters: {'colsample_bytree': 0.3, 'lambda': 1, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 150, 'subsample': 1.0}


In [16]:
predictions = best_model.predict(X_test_pca)
accuracy = accuracy_score(test_remapped, predictions)
print(f'XGBoost Test Accuracy: {accuracy*100:.2f}%')

XGBoost Test Accuracy: 73.40%
