In [1]:
import os
import warnings
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_covtype
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.metrics import balanced_accuracy_score
from dataheroes import CoresetTreeServiceDTC

from sklearn.metrics import balanced_accuracy_score
import xgboost as xgb


## Prepare Data

In [22]:
# Fetch the covertype dataset as a (features, labels) pair.
X, y = fetch_covtype(return_X_y=True)

# Hold out 20% of the rows as a test set. The split is seeded for
# repeatability and stratified on the labels.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
print(f'Number of data points in the training data: {len(X_train)}')


Number of data points in the training data: 464809


In [23]:
# Standardize the features. The scaler is fitted on the training split only
# and the fitted scaler is then applied to both splits.
ss = StandardScaler().fit(X_train)
X_train = ss.transform(X_train)
X_test = ss.transform(X_test)

# Encode the class labels the same way: fit on the training labels, then
# apply the fitted encoder to both label vectors.
le = LabelEncoder().fit(y_train)
y_train = le.transform(y_train)
y_test = le.transform(y_test)

### Training with the Full Dataset

In [25]:
# Baseline: fit an XGBoost classifier on every row of the training split.
full_dataset_model = xgb.XGBClassifier(random_state=42).fit(X_train, y_train)

# Predict on the held-out split and report the balanced accuracy.
y_pred_full = full_dataset_model.predict(X_test)
full_balanced = balanced_accuracy_score(y_true=y_test, y_pred=y_pred_full)
print(f'Balanced Accuracy Score: {full_balanced}')


Balanced Accuracy Score: 0.8296036929211656


In [26]:
%%timeit
full_dataset_model = xgb.XGBClassifier(random_state=42)
full_dataset_model.fit(X_train, y_train)

3min 5s ± 12 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Training with a Coreset

In [9]:
from dataheroes import CoresetTreeServiceDTC

# Configure the coreset tree service with the number of classes and the
# number of training rows it will receive.
service_obj = CoresetTreeServiceDTC(
    n_instances=len(X_train),
    n_classes=7,
    optimized_for='training',
)

# Build the tree from the training split. This stays the last expression of
# the cell so that its return value is displayed.
service_obj.build(X_train, y_train)

<dataheroes.services.coreset_tree.dtc.CoresetTreeServiceDTC at 0x2119332c640>

In [27]:
# Request the coreset at tree level 5 and unpack its rows and weights.
coreset = service_obj.get_coreset(level=5)
w = coreset['w']
indices, X_train_coreset, y_train_coreset = coreset['data']
n_samples_coreset = y_train_coreset.shape[0]

# Fit XGBoost on the coreset rows, passing the coreset weights as the
# per-sample weights, then predict on the held-out split.
coreset_model = xgb.XGBClassifier(random_state=42)
coreset_model.fit(X_train_coreset, y_train_coreset, sample_weight=w)
y_pred_coreset = coreset_model.predict(X_test)

print(f'Number of samples in the coreset: {n_samples_coreset}')

Number of samples in the coreset: 68856


In [28]:

# Score the coreset-trained model on the held-out split.
# For reference, the full-dataset run recorded in this notebook printed
# 0.8296036929211656.
coreset_score = balanced_accuracy_score(y_true=y_test, y_pred=y_pred_coreset)

print(f"Balanced score: {coreset_score}")


Balanced score: 0.8388975901035508


In [29]:
%%timeit

# time the training process with the coreset
coreset_model = xgb.XGBClassifier(random_state=42).fit(X_train_coreset, y_train_coreset, sample_weight=w)


25.9 s ± 1.77 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Training with a Sample

In [31]:
import random

# Seed the stdlib RNG so the same rows are drawn on every run.
random.seed(42)

# Draw exactly as many rows as the coreset holds, so the two reduced
# training sets are compared at equal size. Reading the size from
# `n_samples_coreset` (set in the coreset cell) keeps the comparison fair
# even if the coreset size changes, instead of relying on a copied literal.
sample_length = n_samples_coreset

# Sample row positions without replacement. Sampling from a `range` avoids
# materializing a list of every row index, and no variable named `indices`
# is assigned here, so the coreset indices unpacked earlier are not
# overwritten.
random_indices = random.sample(range(X_train.shape[0]), sample_length)

# NumPy integer-array indexing gathers the selected rows in one vectorized
# step; the result equals building the arrays row by row in a Python loop.
X_train_sample = X_train[random_indices]
y_train_sample = y_train[random_indices]

# Train the model on the (unweighted) random sample.
sample_model = xgb.XGBClassifier(random_state=42).fit(X_train_sample, y_train_sample)

# Evaluate the model on the held-out split.
sample_balanced = balanced_accuracy_score(y_test, sample_model.predict(X_test))

print(f"Balanced score: {sample_balanced}")


Balanced score: 0.7843310695650707


In [19]:
%%timeit 

# time the training with the random sample
sample_model = xgb.XGBClassifier(random_state=42).fit(X_train_sample, y_train_sample)


25.6 s ± 1.79 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Summary

In [34]:
# Summarize the three experiments in a single table.
#
# Row counts and scores are read from the variables computed in the earlier
# cells rather than retyped, so the table cannot drift from the results.
# Scores are rounded to four decimals; the previously hand-copied coreset
# score had been truncated to 0.8388 instead of rounded to 0.8389.
#
# Training times are still copied by hand from the %%timeit outputs, because
# those cells do not capture their measurements in a variable. Update these
# strings after re-running the timing cells.
data = [
    ['Full Dataset', f'{X_train.shape[0]:,}', '3min 5s', f'{full_balanced:.4f}'],
    ['Coreset', f'{n_samples_coreset:,}', '25.9 s', f'{coreset_score:.4f}'],
    ['Random Sample', f'{X_train_sample.shape[0]:,}', '25.6 s', f'{sample_balanced:.4f}'],
]

pd.DataFrame(data, columns=['Data', 'Number of Samples', 'Training Time', 'Balanced Accuracy Score'])

Unnamed: 0,Data,Number of Samples,Training Time,Balanced Accuracy Score
0,Full Dataset,464809,3min 5s,0.8296
1,Coreset,68856,25.9 s,0.8388
2,Random Sample,68856,25.6s,0.7843
