# Modelling

The objective fo this notebook is to utilize some methods to find and determine the optimal model to solve the challenge problem. 

## Libraries

In [1]:
import numpy as np 
import cupy as cp
import pandas as pd
from cnr_methods import get_selected_features, transform_data, revert_data,metric_cnr, get_simplified_data
import tsfresh


from sklearn.model_selection import TimeSeriesSplit, train_test_split
from collections import deque
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
import xgboost as xgb

## Read Data

Here, the data used correspond to the results of the Feature Engineering and Selection Step. (Add Later)

In [2]:
full_data = get_selected_features(100)

full_data = full_data.rename({'Unnamed: 0' : 'Time'},axis=1)
full_data = full_data.set_index('Time')


full_label = pd.read_csv('Data/Y_train.csv')
X = full_data[full_data['Set']=='Train']

WF = 'WF1'
X = X[X['WF']==WF]
y = full_label[full_label['ID'].isin(X['ID'])]

In [3]:
X = X.drop(['ID','WF','Set'],axis=1)

In [4]:
X = transform_data(X)

## Validation Scheme

Before proceeding to the Hyperparameter search, it is necessary first to have some way to reliably measure the performance of the model. For this purpose, it will be used a Time Split Cross Validation Method, were the "Test" Fold for each Iteration is going to be used as the Validation Data, and so, to make Early Stopping on the data.

In [5]:
k_fold_splits = 5
num_boost_round = 500
early_stopping_rounds = 10

In [6]:
def gpu_df(df,y):
    gpu_matrix = cp.asarray(df)
    gpu_matrix = xgb.DMatrix(gpu_matrix,label=y)
    return gpu_matrix

In [7]:
def objective(param,k_fold_splits=k_fold_splits,num_boost_round=num_boost_round,early_stopping_rounds=early_stopping_rounds):
    # Define Time Split Cross Validation
    tscv = TimeSeriesSplit(n_splits=k_fold_splits)

    # Separating Data from Hold Out Set

    X_cv, _, y_cv, _ = train_test_split(X, y, test_size=0.125, shuffle=False)

    # Set XGBoost for GPU
    param['tree_method'] = 'gpu_hist'

    train_scores = np.empty(0)
    val_scores = np.empty(0)
    test_scores = np.empty(0)
    for train_index, test_index in tscv.split(X_cv):

        # Get the Data of the Split
        X_train, X_test = X_cv.iloc[train_index], X_cv.iloc[test_index]
        y_train, y_test = y_cv.iloc[train_index], y_cv.iloc[test_index]

        # Separating Training Set of Split on Train and Validation Subsets
        X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.143, shuffle=False)

        # Transform the Subsets (Diff)
        '''
        X_train = transform_data(X_train)
        X_val = transform_data(X_val)
        X_test = transform_data(X_test)
        '''

        # Apply GPU-DF Transformation
        dtrain = gpu_df(X_train,y_train['Production'])
        dval = gpu_df(X_val,y_val['Production'])
        dtest = gpu_df(X_test,y_test['Production'])

        # Train the Model
        progress = dict()
        watchlist = [(dtrain,'train'),(dval,'eval')]
        bst = xgb.train(param, dtrain, num_boost_round=num_boost_round, evals=watchlist, feval=metric_cnr,early_stopping_rounds=early_stopping_rounds,verbose_eval=False,evals_result=progress)

        # Train and Validation Score
        train_score = np.array(progress['train']['CAPE']).mean()
        val_score = np.array(progress['eval']['CAPE']).mean()

        # Test Score
        preds = bst.predict(dtest,ntree_limit=bst.best_ntree_limit)
        test_score = metric_cnr(preds,dtest)

        train_scores = np.append(train_scores,train_score)
        val_scores = np.append(val_scores,val_score)
        test_scores = np.append(test_scores,test_score[1])

    return {'loss' : test_scores.mean(), 'params' : param, 'status' : STATUS_OK, 'train_loss' : train_scores.mean(), 'val_loss' : val_scores.mean(), 'test_score_array' : test_scores}

In [9]:
def objective_3(param,k_fold_splits=k_fold_splits,num_boost_round=num_boost_round,early_stopping_rounds=early_stopping_rounds):
    # Define Time Split Cross Validation
    tscv = TimeSeriesSplit(n_splits=k_fold_splits)

    # Separating Data from Hold Out Set
    X_cv, X_hold, y_cv, y_hold = train_test_split(X, y, test_size=0.125, shuffle=False)
    X_train, X_val, y_train, y_val = train_test_split(X_cv, y_cv, test_size=0.125, shuffle=False)

    # Transform the Subsets (Diff)
    X_train, y_train = transform_data(X_train), transform_data(y_train[['Production']])
    X_val, y_val = transform_data(X_val), transform_data(y_val[['Production']])
    X_hold, y_hold = transform_data(X_hold), transform_data(y_hold[['Production']])

    # Apply GPU-DF Transformation
    dtrain = gpu_df(X_train,y_train)
    dval = gpu_df(X_val,y_val)
    dtest = gpu_df(X_test,y_test)

    # Set XGBoost for GPU
    param['tree_method'] = 'gpu_hist'

    progress = dict()
    watchlist = [(dtrain,'train'),(dval,'eval')]
    bst = xgb.train(param, dtrain, num_boost_round=num_boost_round, evals=watchlist, feval=metric_cnr,early_stopping_rounds=early_stopping_rounds,verbose_eval=False,evals_result=progress)

    # Train and Validation Score
    train_score = np.array(progress['train']['CAPE']).mean()
    val_score = np.array(progress['eval']['CAPE']).mean()

    # Test Score
    preds = bst.predict(dtest,ntree_limit=bst.best_ntree_limit)
    test_score = metric_cnr(preds,dtest)

    return {'loss' : test_score, 'params' : param, 'status' : STATUS_OK, 'train_loss' : train_score, 'val_loss' : val_score}

## Hyperparameter Tuning

For the Hyperparameter Tuning, the HyperOpt Library will be used, which implements some techniques for a more efficient search for parameters.

### Domain Space

In [10]:
space = {
    'max_depth' : 1 + hp.randint('max_depth', 15),
    'subsample' : hp.uniform('subsample', 0, 1),
    'colsample_bytree' : hp.uniform('colsample_bytree', 0, 1),
    'colsample_bylevel' : hp.uniform('colsample_bylevel', 0, 1),
    'min_child_weight' : hp.uniform('min_child_weight', 0, 10),
    'lambda' : hp.uniform('lambda', 0, 1),
    'alpha' : hp.uniform('alpha', 0, 1),
    'eta' : hp.uniform('eta', 0, 1)
}

### Optimization Algorithm

In [11]:
tpe_algorithm = tpe.suggest
bayes_trials = Trials()

### Bayesian Optimization

In [12]:
MAX_EVALS = 300

In [13]:
best = fmin(fn = objective, space = space, algo = tpe.suggest, max_evals = MAX_EVALS, trials = bayes_trials, rstate = np.random.RandomState(50))

27%|██▋       | 82/300 [12:08<32:15,  8.88s/trial, best loss: 61.713628493948036]


KeyboardInterrupt: 

### Hold Out Score

In [23]:
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=0.125, shuffle=False)
X_holdout = transform_data(X_holdout)
dhold = gpu_df(X_holdout,y_holdout['Production'])

In [25]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.143, shuffle=False)

X_train = transform_data(X_train)
X_val = transform_data(X_val)

dtrain = gpu_df(X_train,y_train['Production'])
dval = gpu_df(X_val,y_val['Production'])

In [26]:
progress = dict()
watchlist = [(dtrain,'train'),(dval,'eval')]
bst = xgb.train(best, dtrain, num_boost_round=num_boost_round, evals=watchlist, feval=metric_cnr,early_stopping_rounds=early_stopping_rounds,verbose_eval=False,evals_result=progress)

In [27]:
preds = bst.predict(dhold,ntree_limit=bst.best_ntree_limit)
score = metric_cnr(preds,dhold)

In [28]:
print(score)

('CAPE', 65.52684202041121)


## Generating Predictions

In [36]:
best

{'alpha': 0.1417303862640725,
 'colsample_bylevel': 0.6378675049254598,
 'colsample_bytree': 0.992550137741651,
 'eta': 0.1279325116071544,
 'lambda': 0.21769987564354057,
 'max_depth': 6,
 'min_child_weight': 0.9633818080808089,
 'subsample': 0.468090761379298,
 'validate_parameters': 1}

In [37]:
preds = []
for WF in full_data['WF'].unique():
    X_WF = full_data[full_data['WF']==WF]
    X_train = X_WF[X_WF['Set']=='Train']
    y_train = full_label[full_label['ID'].isin(X_train['ID'])]
    X_test = X_WF[X_WF['Set']=='Test']

    #Transform Data
    X_train = transform_data(X_train.drop(['ID','WF','Set'],axis=1))
    X_test = transform_data(X_test.drop(['ID','WF','Set'],axis=1))
    dtrain = gpu_df(X_train,y_train['Production'])
    dtest = gpu_df(X_test,None)

    bst = xgb.train(best,dtrain)
    pred = bst.predict(dtest)

    preds = np.append(preds,pred)

### Generate Submissions

In [38]:
preds_id = pd.read_csv(r'C:\Users\andre_\OneDrive\Documentos\GitHub\cnr\Data\random_submission_example.csv')['ID']

In [39]:
submission = pd.DataFrame()
submission['ID'] = preds_id
submission['Production'] = preds
submission = submission.set_index('ID')

In [40]:
submission

Unnamed: 0_level_0,Production
ID,Unnamed: 1_level_1
37376,0.822704
37377,0.822704
37378,0.638979
37379,0.884808
37380,1.517015
...,...
73900,0.811004
73901,0.401514
73902,0.401514
73903,0.364643


In [41]:
submission.to_csv(r'Data\Submission.csv')