In [16]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV

In [10]:
def get_auc(indexList,
            data_path='C:\\Users\\tzcha\\Documents\\aibpd\\application\\CBECS_4heating_clf.plk'):
    """Score a feature subset with 5-fold XGBoost and return the mean ``1 - AUC``.

    The pickled table at ``data_path`` is loaded, the features selected by
    ``indexList`` are imputed / scaled / one-hot encoded, and a
    ``binary:logistic`` booster is trained on each of 5 ``KFold`` splits with
    early stopping (patience 3) monitored on that split's test fold.

    Parameters
    ----------
    indexList : sequence of int
        Positions into the 19-entry ``allFeature`` list below that pick the
        columns to use.
    data_path : str, optional
        Location of the pickled DataFrame. It must contain the selected
        feature columns and a ``heatingLevel`` column, which is used as the
        label. Defaults to the path that was previously hard-coded.

    Returns
    -------
    float
        Mean over the 5 folds of ``1 - bst.best_score``, where ``best_score``
        is the best test-fold AUC seen during training. The value is also
        printed.
    """
    from sklearn.model_selection import KFold

    CBECS_mat = pd.read_pickle(data_path)

    allFeature = ['buildingAreaCategory', 'buildingShape', 'censusRegion', 'climateZone', 'HDD65',
                  'HVACUpgrade', 'insulationUpgrade', 'MAINHT', 'MONUSEC',
                  'OWNTYPE', 'region', 'RENWLL', 'roofConstruction', 'wallConstruction',
                  'WHOPPR', 'WINTYP', 'WKHRSC', 'WWR', 'yearOfConstruction']
    numeric_candidates = ['HDD65', 'WWR', 'yearOfConstruction']
    categorical_candidates = ['buildingAreaCategory', 'buildingShape',
                              'censusRegion', 'climateZone', 'HVACUpgrade',
                              'insulationUpgrade', 'MAINHT', 'MONUSEC', 'OWNTYPE',
                              'region', 'RENWLL', 'roofConstruction', 'wallConstruction',
                              'WHOPPR', 'WINTYP', 'WKHRSC']

    # Resolve the requested positions to column names, then route each column
    # to the numeric or the categorical preprocessing branch.
    al_iter = list(np.array(allFeature)[indexList])
    numeric_cols = [name for name in al_iter if name in numeric_candidates]
    categorical_cols = [name for name in al_iter if name in categorical_candidates]

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())])
    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)])

    # NOTE: the preprocessor is fitted on all rows before the folds are split.
    X = preprocessor.fit_transform(CBECS_mat[al_iter])
    # fit_transform returns a NumPy array or a SciPy sparse matrix, neither of
    # which has `.values`; both can be row-indexed directly. Sparse output is
    # normalised to CSR so integer-array row indexing is supported.
    if hasattr(X, 'tocsr'):
        X = X.tocsr()
    y = CBECS_mat['heatingLevel'].values

    param = {'max_depth': 2, 'eta': 0.8, 'verbosity': 1,
             'objective': 'binary:logistic', 'gamma': 0.5,
             'nthread': 4, 'eval_metric': 'auc'}
    num_round = 20

    errorList = []
    for train_index, test_index in KFold(n_splits=5).split(X):
        dtrain = xgb.DMatrix(X[train_index], label=y[train_index])
        dtest = xgb.DMatrix(X[test_index], label=y[test_index])
        # The last entry of `evals` ('test') is the one early stopping watches.
        # `early_stopping_rounds` / `verbose_eval` replace the removed
        # `xgb.callback.early_stop` / `print_evaluation` helpers.
        bst = xgb.train(param, dtrain, num_boost_round=num_round,
                        evals=[(dtrain, 'train'), (dtest, 'test')],
                        early_stopping_rounds=3, verbose_eval=True)
        errorList.append(1 - bst.best_score)

    error2 = np.mean(errorList)
    print(error2)
    return error2

In [13]:
def logregobj(preds, dtrain):
    """Custom logistic-loss objective: return (gradient, hessian) per row.

    ``preds`` are treated as raw scores and passed through the sigmoid; the
    labels come from ``dtrain.get_label()``.
    """
    y_true = dtrain.get_label()
    prob = 1.0 / (1.0 + np.exp(-preds))
    return prob - y_true, prob * (1.0 - prob)
def evalerror(preds, dtrain):
    """Custom metric: return ``('error', fraction of rows misclassified)``.

    A row is predicted positive when its score in ``preds`` is greater than
    0.0; the labels come from ``dtrain.get_label()``.
    """
    labels = dtrain.get_label()
    predicted_positive = preds > 0.0
    mismatches = labels != predicted_positive
    return 'error', float(mismatches.sum()) / len(labels)

In [24]:
def get_auc_cv(indexList,
               data_path='C:\\Users\\tzcha\\Documents\\aibpd\\application\\CBECS_4heating_clf.plk',
               random_state=0):
    """Score a feature subset with ``xgb.cv`` and return ``1 - best mean AUC``.

    The pickled table at ``data_path`` is loaded, the features selected by
    ``indexList`` are imputed / scaled / one-hot encoded, 80% of the rows are
    kept via ``train_test_split``, and 5-fold ``xgb.cv`` is run on that
    portion for 10 boosting rounds using the module-level ``logregobj``
    objective and ``evalerror`` metric.

    Parameters
    ----------
    indexList : sequence of int
        Positions into the 19-entry ``allFeature`` list below that pick the
        columns to use.
    data_path : str, optional
        Location of the pickled DataFrame. It must contain the selected
        feature columns and a ``heatingLevel`` column, which is used as the
        label. Defaults to the path that was previously hard-coded.
    random_state : int or None, optional
        Seed for the 80/20 split. The split used to be unseeded, so repeated
        calls gave different scores even though ``xgb.cv`` itself is seeded;
        pass ``None`` to get the old random behaviour.

    Returns
    -------
    float
        ``1 - max(test-auc-mean)`` over the boosting rounds. The value is also
        printed.
    """
    CBECS_mat = pd.read_pickle(data_path)

    allFeature = ['buildingAreaCategory', 'buildingShape', 'censusRegion', 'climateZone', 'HDD65',
                  'HVACUpgrade', 'insulationUpgrade', 'MAINHT', 'MONUSEC',
                  'OWNTYPE', 'region', 'RENWLL', 'roofConstruction', 'wallConstruction',
                  'WHOPPR', 'WINTYP', 'WKHRSC', 'WWR', 'yearOfConstruction']
    numeric_candidates = ['HDD65', 'WWR', 'yearOfConstruction']
    categorical_candidates = ['buildingAreaCategory', 'buildingShape',
                              'censusRegion', 'climateZone', 'HVACUpgrade',
                              'insulationUpgrade', 'MAINHT', 'MONUSEC', 'OWNTYPE',
                              'region', 'RENWLL', 'roofConstruction', 'wallConstruction',
                              'WHOPPR', 'WINTYP', 'WKHRSC']

    # Resolve the requested positions to column names, then route each column
    # to the numeric or the categorical preprocessing branch.
    al_iter = list(np.array(allFeature)[indexList])
    numeric_cols = [name for name in al_iter if name in numeric_candidates]
    categorical_cols = [name for name in al_iter if name in categorical_candidates]

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())])
    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)])

    X = preprocessor.fit_transform(CBECS_mat[al_iter])
    y = CBECS_mat['heatingLevel']

    # Only the 80% training portion is cross-validated; the held-out 20% is
    # not used by this function, so no DMatrix is built for it.
    X_train, _, y_train, _ = train_test_split(
        X, y, test_size=0.2, random_state=random_state)
    dtrain = xgb.DMatrix(X_train, label=y_train)

    param = {'max_depth': 2, 'eta': 0.8, 'verbosity': 1,
             'objective': 'binary:logistic', 'gamma': 0.5,
             'nthread': 4, 'eval_metric': 'auc'}
    num_round = 10

    df = xgb.cv(param, dtrain, num_boost_round=num_round, nfold=5,
                metrics={'auc'}, seed=0, obj=logregobj, feval=evalerror)
    error = 1 - np.max(df['test-auc-mean'].values)
    print(error)
    return error

In [50]:
# 0-based positions into the feature list defined inside get_auc / get_auc_cv:
# buildingAreaCategory, censusRegion, climateZone, MAINHT, MONUSEC, OWNTYPE,
# region, RENWLL, WINTYP, WWR.
indexList = [0, 2, 3, 7, 8, 9, 10, 11, 15, 17]

In [51]:
# NOTE: despite the variable name, get_auc_cv returns 1 - (best mean CV test AUC),
# i.e. an error score where lower is better.
auc=get_auc_cv(indexList)

0.21305619999999992


  if getattr(data, 'base', None) is not None and \
