In [2]:
# current dir
%pwd

%cd '~/privacy_active_learning'

/Users/andreasopsahlferstad/privacy_active_learning


In [3]:
from xgboost.sklearn import XGBClassifier
import xgboost as xgb
from xgboost.core import XGBoostError
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
import matplotlib.pylab as plt
import numpy as np

In [None]:
model = XGBClassifier(learning_rate=0.1,
            n_estimators=650,
            max_depth=5,
            min_child_weight=1,
            gamma=0,
            subsample=0.8,
            colsample_bytree=0.8,
            objective='multi:softmax',
            nthread=8,
            scale_pos_weight=1,
            seed=0)

In [4]:
def cv_grid(param_space, model, X, y, cv_folds = 5):
    try:
        grid_search = GridSearchCV(estimator=model, param_grid=param_space, scoring='f1_macro', n_jobs=8, cv=cv_folds,
                                   refit=True)
        grid_search.fit(X, y)
    except ValueError:
        if cv_folds > 2:
            return cv_grid(param_space, model, X, y, cv_folds=cv_folds-1)
        else:
            print('2-FOLD DATASET CONTAINS TOO FEW POS OR NEG SAMPLES. NO TUNING PERFORMED')
            return model

    print('Parameter Space: ' + str(param_space))
    print('Best Params: ' + str(grid_search.best_params_))
    return grid_search.best_estimator_


data = get_voice_data(n_components=300)

train = data.sample(n=300, random_state=0)
X = train.iloc[:,1:]
y = train.iloc[:,0]

test = data.drop(X.index)
X_test = test.iloc[:,1:]
y_test = test.iloc[:,0]


model = XGBClassifier(learning_rate=0.1,
            n_estimators=1000,
            max_depth=5,
            min_child_weight=1,
            gamma=0,
            subsample=0.8,
            colsample_bytree=0.8,
            objective='multi:softmax',
            nthread=8,
            scale_pos_weight=1,
            seed=0)

# n_estimators_tuning
param_space = {
    'n_estimators': list(range(50, 1000, 200)),
}
model = cv_grid(param_space, model, X,y)


# max_depth and min_child_weight tuning
param_space = {
    'max_depth': list(range(1, 10, 1)),
    'min_child_weight': list(range(1, 10, 1))
}
model = cv_grid(param_space, model, X,y)

# gamma tuning
param_space = {
    'gamma': np.linspace(0, 0.5, 5)
}
model = cv_grid(param_space, model, X,y)

# n_estimator re-tuning
param_space = {
    'n_estimators': list(range(50, 1000, 200)),
}
model = cv_grid(param_space, model, X,y)

# subsample and colsample_bytree tuning
param_space = {
    'subsample': np.linspace(0.5, 1, 5),
    'colsample_bytree': np.linspace(0, 1, 5)
}
model = cv_grid(param_space, model, X, y)

# reg_alpha tuning
param_space = {
    'reg_alpha': [1e-13, 1e-10, 1e-8, 1e-5, 1e-2, 0.1, 1, 100]
}
model = cv_grid(param_space, model, X, y)

# set learning rate low and re-tune n_estimators
model.set_params(learning_rate=0.01, n_estimators=5000)
param_space = {
    'n_estimators': list(range(50, 1000, 200)),
}
model = cv_grid(param_space, model, X,y)

variance kept due to PCA: 0.98
Parameter Space: {'n_estimators': [50, 250, 450, 650, 850]}
Best Params: {'n_estimators': 650}


KeyboardInterrupt: 

In [9]:
d = model.get_xgb_params()
import alsDataManager

alsDataManager.save_dict_as_json(d, 'd')

In [6]:
sum(model.predict(X_test) == y_test) / len(y_test)

0.6550620248099239

In [8]:
# max_depth and min_child_weight tuning
param_space = {
    'max_depth': list(range(1, 10, 1)),
    'min_child_weight': list(range(1, 10, 1))  # list(range(0,10,1))
}
#model = cv_grid(param_space, model, X,y)
cv_folds = 5

grid_search = GridSearchCV(estimator=model, param_grid=param_space, scoring='f1_macro', n_jobs=4, cv=cv_folds,
                           refit=True)
grid_search.fit(X,y)



GridSearchCV(cv=5, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=0.8, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=5, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     nthread=8, objective='multi:softprob',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=0, silent=None,
                                     subsample=0.8, verbosity=1),
             iid='deprecated', n_jobs=4,
             param_grid={'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9],
                         'min_child_weight': array([ 0.        ,  1.11111111,  2.22222222,  3.33333333,  4.44444444,

In [10]:
model = grid_search.best_estimator_

In [14]:
# gamma tuning
param_space = {
    'gamma': np.linspace(0, 0.5, 5)
}
model = cv_grid(param_space, model, X,y)

In [16]:
# subsample and colsample_bytree tuning
param_space = {
    'subsample': np.linspace(0.5, 1, 5),
    'colsample_bytree': np.linspace(0, 1, 5)
}
model = cv_grid(param_space, model, X, y)

In [18]:
# reg_alpha tuning
param_space = {
    'reg_alpha': [1e-13, 1e-8, 1e-5, 1e-2, 0.1, 1, 100]
}
model = cv_grid(param_space, model, X, y)

In [19]:
sum(model.predict(X_test) == y_test) / len(y_test)

0.6638655462184874

In [13]:
def cv_grid(param_space, model, X, y, cv_folds = 5):
    try:
        grid_search = GridSearchCV(estimator=model, param_grid=param_space, scoring='f1_macro', n_jobs=8, cv=cv_folds,
                                   refit=True)
        grid_search.fit(X, y)
    except ValueError:
        if cv_folds > 2:
            return cv_grid(param_space, model, X, y, cv_folds=cv_folds-1)
        else:
            print('2-FOLD DATASET CONTAINS TOO FEW POS OR NEG SAMPLES. NO TUNING PERFORMED')
            return model

    # print(grid_search.best_params_)

    return grid_search.best_estimator_

In [69]:
sorted(sklearn.metrics.SCORERS.keys())

NameError: name 'sklearn' is not defined

In [34]:
def cv_n_estimators(model, X, y, cv_folds=5, early_stopping_rounds=50, num_class = num_class):
    model_params = model.get_xgb_params()  # returns all parameters of model
    model_params['num_class'] = num_class #number of classes
    # converts data to xg matrix
    xgtrain = xgb.DMatrix(X.values, label=y.values)


    #try:
    cvresult = xgb.cv(model_params,
                      xgtrain,
                      num_boost_round=model.get_params()['n_estimators'],
                      nfold=cv_folds,
                      metrics='error',
                      early_stopping_rounds=early_stopping_rounds)
    #except XGBoostError:
    #    if cv_folds > 2:
    #        print(cv_folds)
    #        return cv_n_estimators(model, X, y, cv_folds=cv_folds-1)
    #    else:
    #        print('2-FOLD DATASET CONTAINS TOO FEW POS OR NEG SAMPLES. NO TUNING PERFORMED')
    #        return model

    model.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the data
    model.fit(X, y, eval_metric='auc')

    # print('n estimators set to : ' + str(model.get_num_boosting_rounds()))
    return model

model = cv_n_estimators(model, X, y)


XGBoostError: [12:38:00] src/metric/elementwise_metric.cu:326: Check failed: preds.Size() == info.labels_.Size() (6240 vs. 240) : label and prediction size not match, hint: use merror or mlogloss for multi-class classification
Stack trace:
  [bt] (0) 1   libxgboost.dylib                    0x00000001233b4319 dmlc::LogMessageFatal::~LogMessageFatal() + 57
  [bt] (1) 2   libxgboost.dylib                    0x00000001234256f8 xgboost::metric::EvalEWiseBase<xgboost::metric::EvalError>::Eval(xgboost::HostDeviceVector<float> const&, xgboost::MetaInfo const&, bool) + 568
  [bt] (2) 3   libxgboost.dylib                    0x00000001233b125f xgboost::LearnerImpl::EvalOneIter(int, std::__1::vector<xgboost::DMatrix*, std::__1::allocator<xgboost::DMatrix*> > const&, std::__1::vector<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::allocator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > > > const&) + 1487
  [bt] (3) 4   libxgboost.dylib                    0x00000001233cf7cd XGBoosterEvalOneIter + 909
  [bt] (4) 5   _ctypes.cpython-38-darwin.so        0x0000000108455077 ffi_call_unix64 + 79
  [bt] (5) 6   ???                                 0x00007ffee97cbf00 0x0 + 140732815687424



In [None]:
# max_depth and min_child_weight tuning
    param_space = {
        'max_depth': list(range(1, 10, 1)),
        'min_child_weight': np.linspace(0, 10, 30)  # list(range(0,10,1))
    }
    model = cv_grid(param_space, model, X,y)

In [9]:
#xg.fit(X,y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=None, n_estimators=1000, n_jobs=1,
              nthread=8, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=0,
              silent=None, subsample=0.8, verbosity=1)

In [11]:
test = data.drop(X.index)
X_test = test.iloc[:,1:]
y_test = test.iloc[:,0]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-9.419547,0.459947,-7.288776,4.110508,-6.179064,-1.794985,-0.00106,3.68269,2.262369,-0.75434,...,0.426717,-0.391139,-0.1901,-0.044736,-0.246842,-0.032605,0.151998,-0.047171,-0.054338,0.252969
1,-5.124728,-1.297131,-7.837942,2.811276,-2.980114,-3.17939,-3.616721,-1.658861,0.993507,1.924557,...,0.476081,0.707167,0.027435,-0.201637,0.974762,-0.146275,0.102927,0.754423,0.149169,0.175828


In [22]:
import numpy as np
np.sum(xg.predict_proba(X_test.iloc[:2,:]), axis = 1)

array([0.99999994, 0.99999994], dtype=float32)

In [23]:
from scipy.stats import entropy


In [26]:
entropy([0.5, 0.5], base = 2)

1.0

In [9]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from xgboost.core import XGBoostError
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
import matplotlib.pylab as plt
import numpy as np

In [None]:
model = 