### Install Libraries

In [54]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from feature_engine.wrappers import SklearnTransformerWrapper
from sklearn.metrics import roc_auc_score
from dataheroes import CoresetTreeServiceLG

from sklearn.linear_model import LogisticRegression
import openml
import xgboost as xgb

# execute once
#from dataheroes.utils import activate_account
#activate_account("aashishnair8925@gmail.com")

import warnings

warnings.filterwarnings(action='ignore')

### Load Data

In [55]:
# load click data
trains = openml.datasets.get_dataset(45556) 

# Get the data itself as a dataframe (or otherwise)
df, y, _, _ = trains.get_data(dataset_format="dataframe")
df.head()

Could not download file from http://openml1.win.tue.nl/dataset45556/dataset_45556.pq: Bucket does not exist or is private.


Unnamed: 0,impression,url_hash,ad_id,advertiser_id,depth,position,query_id,keyword_id,title_id,description_id,user_id,target
0,345.0,0.373763,0.409091,0.453513,2,1,0.338462,0.474684,0.409091,0.409091,0.594475,0.0
1,1.0,0.439948,0.582474,0.380261,2,2,0.499815,0.857143,0.863636,0.833333,0.499815,1.0
2,1.0,0.554103,0.725275,0.553467,1,1,0.499815,1.0,1.0,0.755102,0.499815,1.0
3,1.0,0.597701,0.633333,0.597701,1,1,0.65625,0.717391,0.555556,0.555556,0.499815,1.0
4,1.0,0.385417,0.499815,0.385417,2,1,0.499815,0.0,0.499815,0.499815,0.499815,0.0


In [56]:
# info of the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 12 columns):
 #   Column          Non-Null Count    Dtype   
---  ------          --------------    -----   
 0   impression      1000000 non-null  float64 
 1   url_hash        1000000 non-null  float64 
 2   ad_id           1000000 non-null  float64 
 3   advertiser_id   1000000 non-null  float64 
 4   depth           1000000 non-null  uint8   
 5   position        1000000 non-null  uint8   
 6   query_id        1000000 non-null  float64 
 7   keyword_id      1000000 non-null  float64 
 8   title_id        1000000 non-null  float64 
 9   description_id  1000000 non-null  float64 
 10  user_id         1000000 non-null  float64 
 11  target          1000000 non-null  category
dtypes: category(1), float64(9), uint8(2)
memory usage: 71.5 MB


In [57]:
# get input and output data
target = 'target'
X = df.drop(target, axis=1)
y = df[target]

# create training and testing splits
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)

In [58]:
print(f'Number of data points in the training data: {X_train.shape[0]}')

Number of data points in the training data: 670000


### Data Preprocessing

In [59]:
# Define the categorical and numerical features
cat_var = [col for col in X_train.columns if X_train[col].dtypes in ['object', 'category']]
num_var = [col for col in X_train.columns if X_train[col].dtypes in ['int64', 'float64']]

In [60]:
# Define column transformer for one-hot encoding categorical features
categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Define column transformer for one-hot encoding categorical features
numeric_transformer = Pipeline([
    ('scaler', StandardScaler())
])

# Define preprocessing pipeline
preprocessing_pipeline = ColumnTransformer([
    ('cat_transformer', categorical_transformer, cat_var),
    ('num_transformer', numeric_transformer, num_var)
  # replace missing categorical values with mode and one-hot encode
], remainder='passthrough')


# transform the training and test data
X_train = preprocessing_pipeline.fit_transform(X_train)
X_test = preprocessing_pipeline.transform(X_test)



In [61]:
# encode the target label
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

### Train Model with Coresets

In [62]:
from dataheroes import CoresetTreeServiceLG

# build a coreset for a logistic regression model
service_obj = CoresetTreeServiceLG(
                                   optimized_for='training',
                                   n_classes=2,
                                   n_instances=len(X_train)
                                  )
service_obj.build(X_train, y_train)

<dataheroes.services.coreset_tree.lg.CoresetTreeServiceLG at 0x1229c910610>

In [63]:
# Get the top level coreset and weights
coreset = service_obj.get_coreset(level=2)
indices, X, y = coreset['data']
w = coreset['w']

# Train a logistic regression model on the coreset.
log_coreset = LogisticRegression(random_state=42)
log_coreset.fit(X, y, sample_weight=w)

In [78]:
# create the service object
service_obj = CoresetTreeServiceLG(
                                   optimized_for='training',
                                   n_classes=2,
                                   n_instances=len(X_train)
                                  )
service_obj.build(X_train, y_train)

 # Get the coreset and weights
coreset = service_obj.get_coreset(level=2)
indices, X, y = coreset['data']
w = coreset['w']

In [79]:
print(f'Number of data points in the coreset: {X.shape[0]}')

Number of data points in the coreset: 5320


In [80]:
# Train a logistic regression model on the coreset.
log_coreset = LogisticRegression(random_state=42)
log_coreset.fit(X, y, sample_weight=w)

In [81]:
%%timeit 

# Train a logistic regression model on the coreset.
log_coreset = LogisticRegression(random_state=42)
log_coreset.fit(X, y, sample_weight=w)

27.4 ms ± 3.51 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [82]:
n_samples_coreset = len(y)
print(n_samples_coreset)

5320


In [83]:
# get the roc auc score
y_pred_coreset = log_coreset.predict(X_test)
roc_auc_coreset = roc_auc_score(y_pred_coreset, y_test)
print(f'ROC AUC Score for the coreset: {roc_auc_coreset}')

ROC AUC Score for the coreset: 0.737835738946004


### Train Model with Entire Training Data

In [70]:
# train the logistic regression model
log_full = LogisticRegression(random_state=42)
log_full.fit(X_train, y_train)

In [71]:
%%timeit

# train the logistic regression model
log_full = LogisticRegression(random_state=42)
log_full.fit(X_train, y_train)

2.86 s ± 193 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [72]:
# generate predictions
y_pred_full = log_full.predict(X_test)

# get roc_auc score
score_full = roc_auc_score(y_pred_full, y_test)
print(f'ROC AUC score with full dataset: {score_full}')

ROC AUC score with full dataset: 0.7380139330737792


### Train Model with a Random Sample

In [73]:
# get number of samples in the coreset
n_samples_coreset = len(service_obj.get_coreset(level=2)['w'])
random_idxs = np.random.choice(n_samples_coreset, n_samples_coreset, replace=False)


In [74]:

# train xgboost with random sample with the same size as the coreset
log_sample = LogisticRegression(random_state=42)
log_sample.fit(X_train[random_idxs, :], y_train[random_idxs])

In [75]:
%%timeit

# train xgboost with random sample with the same size as the coreset
log_sample = LogisticRegression(random_state=42)
log_sample.fit(X_train[random_idxs, :], y_train[random_idxs])

26.6 ms ± 3.44 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [76]:

# evaluate model
y_pred_sample = log_sample.predict(X_test)
score_sample = roc_auc_score(y_pred_sample, y_test)
print(f'ROC AUC Score with the random sample: {score_sample}')

ROC AUC Score with the random sample: 0.7344895450582795


### Summary

In [84]:
# summarize the data with a data frame
data = [['Full Dataset', '670,000', '2.86 s', '0.7380'],
        ['Coreset', '5,320', '27.4 ms', '0.7378'],
        ['Random Sample', '5,320','26.6 ms', '0.7345']
] 

pd.DataFrame(data, columns = ['Data', 'Number of Samples', 'Training Time', 'ROC AUC Score'])

Unnamed: 0,Data,Number of Samples,Training Time,ROC AUC Score
0,Full Dataset,670000,2.86 s,0.738
1,Coreset,5320,27.4 ms,0.7378
2,Random Sample,5320,26.6 ms,0.7345
