# LOFO for GPU

## Libraries Used

In [1]:
import datetime

import cupy as cp
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.model_selection import KFold, TimeSeriesSplit, train_test_split

from cnr_methods import get_simplified_data, metric_cnr

## Leave One Feature Out

In [66]:
def lofo_df(df,y,features,feature_out):
    """Build an xgb.DMatrix from `df` with one feature column left out.

    Args:
        df: DataFrame containing (at least) the columns named in `features`.
        y: Labels attached to the DMatrix.
        features: Names of all candidate feature columns.
        feature_out: Column to exclude, or None to keep every feature.

    Returns:
        xgb.DMatrix built from a CuPy array of the remaining feature columns.
    """
    frame = df if feature_out is None else df.drop(feature_out, axis=1)
    kept_columns = [name for name in features if name != feature_out]
    device_values = cp.asarray(frame[kept_columns])
    return xgb.DMatrix(device_values, label=y)

In [67]:
def lofo_score(X,y,features,feature_out,model):
    """Cross-validate an XGBoost model with one feature left out.

    Args:
        X: DataFrame holding the candidate feature columns.
        y: Labels for the rows of `X`.
        features: Names of all candidate feature columns.
        feature_out: Feature to exclude, or None to score the full feature set.
        model: Estimator exposing `get_xgb_params()` (e.g. xgb.XGBRegressor);
            its parameters configure the cross-validation run.

    Returns:
        Single-element pandas Series: the mean over boosting rounds of the
        third column of the `xgb.cv` result (presumably the mean test metric;
        confirm against the xgb.cv column layout).
    """
    dtrain = lofo_df(X,y,features,feature_out=feature_out)
    # Pass the parameters of the model that was handed in; previously an
    # undefined name `params` was used and `model` was effectively ignored.
    model_params = model.get_xgb_params()
    cv_results = xgb.cv(params=model_params,dtrain=dtrain, nfold=5)
    return cv_results.iloc[:, [2]].mean()

In [68]:
def LOFO_GPU_Importance(X,y,features,model):
    """Compute leave-one-feature-out importance for every feature.

    A baseline score is computed with all features; then each feature is left
    out in turn and its importance is recorded as
    `baseline score - score without the feature`. Progress and the elapsed
    time of each iteration are printed.

    Args:
        X: DataFrame holding the candidate feature columns.
        y: Labels for the rows of `X`.
        features: Names of the features to evaluate.
        model: Estimator forwarded to `lofo_score`.

    Returns:
        DataFrame with columns 'feature' and 'score', sorted by ascending score.
    """
    base_score = lofo_score(X,y,features,None,model)
    scores = np.empty(0)
    total = len(features)
    for position, feature_out in enumerate(features, start=1):
        start_time = datetime.datetime.now()
        feature_score = lofo_score(X,y,features,feature_out,model)
        # np.append flattens the (single-element) difference into the 1-D array.
        scores = np.append(scores,base_score-feature_score)
        elapsed = datetime.datetime.now() - start_time
        # total_seconds() keeps the fractional part correctly zero-padded;
        # joining .seconds and .microseconds with '.' printed 2.092967 s as "2.92967".
        print('{}/{} {:.6f} s/it'.format(position,total,elapsed.total_seconds()))
    importance_df = pd.DataFrame({"feature": features, "score": scores})
    return importance_df.sort_values(by='score',ascending=True)

In [69]:
# Read the training features and the training targets, in that order.
x_train, y_train = (
    pd.read_csv(csv_path)
    for csv_path in ('Data/X_train.csv', 'Data/Y_train.csv')
)

In [70]:
# Candidate features: every column of x_train except 'ID', 'WF' and 'Time'.
features = x_train.drop(['ID','WF','Time'],axis=1).columns
model = xgb.XGBRegressor(tree_method='gpu_hist',max_depth=5)
# random_state only has an effect when shuffle=True; recent scikit-learn
# releases raise ValueError if it is set together with shuffle=False.
cv = KFold(n_splits=5, shuffle=False)
scoring = make_scorer(mean_squared_error, greater_is_better=False)
# NOTE: cv, scoring and n_jobs are not passed to LOFO_GPU_Importance; they are
# kept in case other cells rely on them.
n_jobs = 2

In [71]:
# Run the leave-one-feature-out evaluation against the 'Production' target.
importance_df = LOFO_GPU_Importance(
    X=x_train,
    y=y_train['Production'],
    features=features,
    model=model,
)

1/102 2.160516 s/it
2/102 2.192221 s/it
3/102 2.180734 s/it
4/102 2.297052 s/it
5/102 2.92967 s/it
6/102 2.119747 s/it
7/102 2.140981 s/it
8/102 2.125453 s/it
9/102 2.201463 s/it
10/102 2.164225 s/it
11/102 2.93454 s/it
12/102 2.111707 s/it
13/102 2.42634 s/it
14/102 2.18509 s/it
15/102 2.67307 s/it
16/102 2.83932 s/it
17/102 2.59300 s/it
18/102 2.44056 s/it
19/102 2.30547 s/it
20/102 2.10603 s/it
21/102 2.13918 s/it
22/102 2.37754 s/it
23/102 2.195575 s/it
24/102 2.987015 s/it
25/102 3.334085 s/it
26/102 3.170521 s/it
27/102 2.889383 s/it
28/102 2.502161 s/it
29/102 2.454040 s/it
30/102 2.490076 s/it
31/102 2.702162 s/it
32/102 2.305795 s/it
33/102 2.442085 s/it
34/102 3.165890 s/it
35/102 2.577025 s/it
36/102 2.304734 s/it
37/102 2.162587 s/it
38/102 2.215352 s/it
39/102 2.203236 s/it
40/102 2.275436 s/it
41/102 2.189384 s/it
42/102 2.298511 s/it
43/102 2.599305 s/it
44/102 2.821636 s/it
45/102 2.613248 s/it
46/102 2.625768 s/it
47/102 2.304152 s/it
48/102 2.303443 s/it
49/102 2.3810

In [72]:
# Display the LOFO importance table as the cell output.
importance_df

Unnamed: 0,feature,score
25,NWP1_00h_D_V,-0.005814
22,NWP1_18h_D-1_V,-0.003987
97,NWP4_00h_D_V,-0.002951
26,NWP1_00h_D_T,-0.001282
4,NWP1_06h_D-2_V,-0.001274
...,...,...
18,NWP1_12h_D-1_U,0.000778
16,NWP1_06h_D-1_V,0.001007
88,NWP4_12h_D-2_V,0.001106
89,NWP4_12h_D-2_CLCT,0.001652


## RFE (Recursive Feature Elimination)

In [2]:
# Load the feature table and the label table returned by the project helper
# get_simplified_data (imported from cnr_methods).
full_data, full_label = get_simplified_data()

In [3]:
# Keep only the rows whose 'Set' is 'Train' and whose 'WF' is 'WF1'.
is_train = full_data['Set'] == 'Train'
is_wf1 = full_data['WF'] == 'WF1'
X = full_data[is_train & is_wf1]
# Labels are selected by matching the label index against the kept 'ID' values.
y = full_label[full_label.index.isin(X['ID'])]

In [4]:
# Remove the columns that were only needed for filtering and label matching.
X = X.drop(columns=['ID','WF','Set'])

In [5]:
def gpu_df(df,y):
    """Wrap `df` and its labels `y` in an xgb.DMatrix backed by a CuPy array.

    Args:
        df: Feature data convertible with `cp.asarray`.
        y: Labels attached to the DMatrix.

    Returns:
        xgb.DMatrix built from the CuPy copy of `df`.
    """
    device_values = cp.asarray(df)
    return xgb.DMatrix(device_values, label=y)

In [6]:
def get_feat_scores(model,X_train):
    """Rank the columns of `X_train` by the booster's feature scores.

    The booster is expected to report features under positional names
    ('f0', 'f1', ...), which is what XGBoost uses when the DMatrix was built
    from an unnamed array; each position is mapped back to the column name at
    that position in `X_train`. Features absent from `get_fscore()` (never
    used in a split) do not appear in the result.

    Args:
        model: Trained booster exposing `get_fscore()`.
        X_train: DataFrame whose column order matches the training matrix.

    Returns:
        DataFrame with columns 'feature' and 'score', sorted by descending score.
    """
    # Use the booster that was passed in; previously a global `bst` was read
    # and the `model` argument was ignored.
    raw_scores = model.get_fscore()
    scores = pd.DataFrame({
        # 'f12' -> 12: strip the leading 'f' to recover the column position.
        'Features': pd.Series([int(name[1:]) for name in raw_scores], dtype='int64'),
        'score': list(raw_scores.values()),
    })

    columns_df = pd.DataFrame({'feature': X_train.columns})
    columns_df['Features'] = columns_df.index

    scores = scores.merge(columns_df,on='Features',how='left')
    scores = scores.sort_values(by='score',ascending=False).reset_index(drop=True)

    return scores[['feature','score']]

In [7]:
# Upper bound on boosting rounds for xgb.train.
num_boost_round = 1000
# Rounds without improvement on the evaluation set before training stops early.
early_stopping_rounds = 100

In [8]:
# XGBoost training parameters: use the GPU histogram tree method.
# NOTE(review): newer XGBoost releases may prefer tree_method='hist' with
# device='cuda' — confirm against the installed version.
param = {'tree_method' : 'gpu_hist'}

In [66]:
def rfe_score(X,y,param,num_boost_round,early_stopping_rounds):
    """Pick a feature subset by importance-ranked search with time-series CV.

    The last 33% of the rows is held out (no shuffling). On the remaining
    rows, for each TimeSeriesSplit fold:
      1. a booster is trained on all features and the features are ranked
         with `get_feat_scores`;
      2. for every k, a booster is trained on the top-k ranked features and
         scored on the fold's test part with `metric_cnr`.
    The k with the lowest mean score across folds is selected, and the top-k
    features of a final booster (trained on the CV part, early-stopped on the
    hold-out part) are returned.

    Args:
        X: Feature DataFrame. No split shuffles, so rows are presumably in
            chronological order — confirm with the caller.
        y: Label DataFrame with a 'Production' column, row-aligned with `X`.
        param: XGBoost parameter dict passed to `xgb.train`.
        num_boost_round: Maximum number of boosting rounds per training run.
        early_stopping_rounds: Early-stopping patience per training run.

    Returns:
        pandas Series named 'feature' holding the selected feature names,
        highest-ranked first.

    Raises:
        ValueError: If some fold produced no ranked features, so no subset
            size can be evaluated on every fold.
    """
    X_cv, X_hold, y_cv, y_hold = train_test_split(X, y, test_size=0.33, shuffle=False)

    fold_scores = []
    for train_index, test_index in TimeSeriesSplit(n_splits=5).split(X_cv):
        X_train, X_test = X_cv.iloc[train_index], X_cv.iloc[test_index]
        y_train, y_test = y_cv.iloc[train_index], y_cv.iloc[test_index]

        # Carve an early-stopping validation slice off the end of the training part.
        X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.143, shuffle=False)

        bst = _fit_booster(
            param,
            gpu_df(X_train,y_train['Production']),
            gpu_df(X_val,y_val['Production']),
            num_boost_round,
            early_stopping_rounds,
        )
        scores = get_feat_scores(bst,X_train)

        subset_score = []
        for subset in range(1,len(scores)+1):
            features = scores.iloc[:subset]['feature']

            dtrain = gpu_df(X_train[features],y_train['Production'])
            dval = gpu_df(X_val[features],y_val['Production'])
            dtest = gpu_df(X_test[features],y_test['Production'])

            bst = _fit_booster(param, dtrain, dval, num_boost_round, early_stopping_rounds)

            # NOTE(review): `ntree_limit` / `best_ntree_limit` are deprecated or
            # removed in newer XGBoost releases — confirm the pinned version.
            preds = bst.predict(dtest,ntree_limit=bst.best_ntree_limit)
            subset_score.append(metric_cnr(preds,dtest))
        fold_scores.append(subset_score)

    # Folds can rank different numbers of features (features never used in a
    # split are missing from the ranking), so only subset sizes evaluated on
    # every fold are compared; indexing by the first fold's length could
    # raise IndexError or silently ignore sizes.
    n_subsets = min(len(fold) for fold in fold_scores)
    if n_subsets == 0:
        raise ValueError('at least one fold produced no ranked features')

    # metric_cnr results are indexed with [1] to get the numeric value.
    subset_mean_score = np.array([
        np.mean([fold[i][1] for fold in fold_scores]) for i in range(n_subsets)
    ])

    subset_size = subset_mean_score.argmin() + 1

    bst = _fit_booster(
        param,
        gpu_df(X_cv,y_cv['Production']),
        gpu_df(X_hold,y_hold['Production']),
        num_boost_round,
        early_stopping_rounds,
    )

    scores = get_feat_scores(bst,X_cv)

    selected_features = scores.iloc[:subset_size]['feature']

    return selected_features


def _fit_booster(param, dtrain, dval, num_boost_round, early_stopping_rounds):
    """Train a booster on `dtrain`, early-stopping on `dval` with metric_cnr."""
    watchlist = [(dtrain,'train'),(dval,'eval')]
    # NOTE(review): `feval` is deprecated in newer XGBoost releases in favour
    # of `custom_metric` — confirm the pinned version before changing it.
    return xgb.train(param, dtrain, num_boost_round=num_boost_round, evals=watchlist, feval=metric_cnr,early_stopping_rounds=early_stopping_rounds,verbose_eval=False)

In [67]:
# Use the shared training constants instead of repeating their literal values.
selected_features = rfe_score(X,y,param,num_boost_round,early_stopping_rounds)

In [68]:
# Display the selected feature names as the cell output.
selected_features

0    U_100m
1    V_100m
2         T
3     V_10m
4     U_10m
5      CLCT
Name: feature, dtype: object