# Modelling

The objective fo this notebook is to utilize some methods to find and determine the optimal model to solve the challenge problem. 

## Libraries

In [1]:
import numpy as np 
import cupy as cp
import pandas as pd
from cnr_methods import get_selected_features, transform_data, revert_data,metric_cnr
import tsfresh


from sklearn.model_selection import TimeSeriesSplit
from collections import deque
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
import xgboost as xgb

## Read Data

Here, the data used correspond to the results of the Feature Engineering and Selection Step. (Add Later)

In [2]:
# Initially using the Original Data
full_data = get_selected_features(50)

In [3]:
full_data = full_data.rename({'Unnamed: 0' : 'Time'},axis=1)
full_data = full_data.set_index('Time')

In [4]:
full_label = pd.read_csv('Data/Y_train.csv')

In [5]:
X = full_data[full_data['Set']=='Train']

For initial debugging, only One Windfarm will be considered.

In [6]:
WF = 'WF1'
X = X[X['WF']==WF]
y = full_label[full_label['ID'].isin(X['ID'])]

In [7]:
X = transform_data(X.drop(['ID','WF','Set'],axis=1))
y = transform_data(y)['Production']

## Validation Scheme

Before proceeding to the Hyperparameter search, it is necessary first to have some way to reliably measure the performance of the model. For this purpose, it will be used a Time Split Cross Validation Method, were the "Test" Fold for each Iteration is going to be used as the Validation Data, and so, to make Early Stopping on the data.

In [8]:
k_fold_splits = 8
num_boost_round = 500
early_stopping_rounds = 10

In [9]:
def gpu_df(df,y):
    gpu_matrix = cp.asarray(df)
    gpu_matrix = xgb.DMatrix(gpu_matrix,label=y)
    return gpu_matrix

In [25]:
def objective(param,k_fold_splits=k_fold_splits,num_boost_round=num_boost_round,early_stopping_rounds=early_stopping_rounds):
    # Define Time Split Cross Validation
    tscv = TimeSeriesSplit(n_splits=k_fold_splits)

    # Separating a Holdout Set
    X_holdout = X[-round(len(X)/8):]
    y_holdout = y[-round(len(X)/8):]
    dhold = gpu_df(X_holdout,y_holdout)

    X_cv = X[:-round(len(X)/8)]
    y_cv = y[:-round(len(X)/8)]

    # Set XGBoost for GPU
    param['tree_method'] = 'gpu_hist'
    
    hold_scores = np.empty(0)
    for train_index, val_index in tscv.split(X_cv):
        # Get the Data of the Split
        X_train, X_val = X_cv.iloc[train_index], X_cv.iloc[val_index]
        y_train, y_val = y_cv.iloc[train_index], y_cv.iloc[val_index]
        dtrain = gpu_df(X_train,y_train)
        dval = gpu_df(X_val,y_val)

        # Train the Model
        watchlist = [(dtrain,'train'),(dval,'eval')]
        bst = xgb.train(param, dtrain, num_boost_round=num_boost_round, evals=watchlist, feval=metric_cnr,early_stopping_rounds=early_stopping_rounds,verbose_eval=False)

        preds = bst.predict(dhold,ntree_limit=bst.best_ntree_limit)
        score = metric_cnr(preds,dhold)
        hold_scores = np.append(hold_scores,score[1])

    return {'loss' : hold_scores.mean(), 'params' : param, 'status' : STATUS_OK}

## Hyperparameter Tuning

For the Hyperparameter Tuning, the HyperOpt Library will be used, which implements some techniques for a more efficient search for parameters.

### Domain Space

In [26]:
space = {
    'max_depth' : 1 + hp.randint('max_depth', 15),
    'subsample' : hp.uniform('subsample', 0, 1),
    'colsample_bytree' : hp.uniform('colsample_bytree', 0, 1),
    'colsample_bylevel' : hp.uniform('colsample_bylevel', 0, 1),
    'min_child_weight' : hp.uniform('min_child_weight', 0, 10),
    'lambda' : hp.uniform('lambda', 0, 1),
    'alpha' : hp.uniform('alpha', 0, 1),
    'eta' : hp.uniform('eta', 0, 1)
}

### Optimization Algorithm

In [28]:
tpe_algorithm = tpe.suggest
bayes_trials = Trials()

### Bayesian Optimization

In [29]:
MAX_EVALS = 100

In [30]:
best = fmin(fn = objective, space = space, algo = tpe.suggest, max_evals = MAX_EVALS, trials = bayes_trials, rstate = np.random.RandomState(50))

100%|██████████| 100/100 [20:32<00:00, 12.33s/trial, best loss: 909165.2489032997]


## Generating Predictions

In [16]:
best

NameError: name 'best' is not defined

In [17]:
preds = []
for WF in full_data['WF'].unique():
    X_WF = full_data[full_data['WF']==WF]
    X_train = X_WF[X_WF['Set']=='Train']
    y_train = full_label[full_label['ID'].isin(X_train['ID'])]
    X_test = X_WF[X_WF['Set']=='Test']

    #Transform Data
    X_train = transform_data(X_train.drop(['ID','WF','Set'],axis=1))
    X_test = transform_data(X_test.drop(['ID','WF','Set'],axis=1))
    y_train = transform_data(y_train)['Production']
    dtrain = gpu_df(X_train,y_train)
    dtest = gpu_df(X_test,None)

    bst = xgb.train(best,dtrain)
    pred = bst.predict(dtest)

    preds = np.append(preds,pred)

NameError: name 'best' is not defined

In [18]:
preds = revert_data(preds)

### Generate Submissions

In [19]:
preds_id = pd.read_csv(r'C:\Users\andre_\OneDrive\Documentos\GitHub\cnr\Data\random_submission_example.csv')['ID']

In [20]:
submission = pd.DataFrame()
submission['ID'] = preds_id
submission['Production'] = preds
submission = submission.set_index('ID')

ValueError: Length of values does not match length of index

In [21]:
submission.to_csv(r'Data\Submission.csv')

In [22]:
submission

Unnamed: 0,ID
0,37376
1,37377
2,37378
3,37379
4,37380
...,...
36524,73900
36525,73901
36526,73902
36527,73903


In [23]:
pd.read_csv(r'C:\Users\andre_\OneDrive\Documentos\GitHub\cnr\Data\random_submission_example.csv')

Unnamed: 0,ID,Production
0,37376,1.34
1,37377,7.28
2,37378,5.74
3,37379,10.14
4,37380,4.19
...,...,...
36524,73900,1.95
36525,73901,0.85
36526,73902,2.66
36527,73903,1.99


In [24]:
pd.read_csv(r'C:\Users\andre_\OneDrive\Documentos\GitHub\cnr\Data\Submission.csv')

Unnamed: 0.1,Unnamed: 0,ID
0,0,37376
1,1,37377
2,2,37378
3,3,37379
4,4,37380
...,...,...
36524,36524,73900
36525,36525,73901
36526,36526,73902
36527,36527,73903


Unnamed: 0,ID,Production
0,1,0.02
1,2,0.07
2,3,0.22
3,4,0.39
4,5,0.41
...,...,...
37370,37371,0.04
37371,37372,0.33
37372,37373,0.13
37373,37374,0.01
