# Modelling

The objective fo this notebook is to utilize some methods to find and determine the optimal model to solve the challenge problem. 

## Libraries

In [1]:
import numpy as np 
import cupy as cp
import pandas as pd
import matplotlib.pyplot as plt
from cnr_methods import get_selected_features, transform_data, revert_data,metric_cnr, get_simplified_data
import tsfresh


from sklearn.model_selection import TimeSeriesSplit, train_test_split
from collections import deque
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
import xgboost as xgb
import lightgbm as lgb

## Read Data

Here, full data is read and the column names are changed to avoid errors on LightGBM.

In [2]:
full_data = get_selected_features(500)

full_data = full_data.rename({'Unnamed: 0' : 'Time'},axis=1)
full_data = full_data.set_index('Time')

full_label = pd.read_csv('Data/Y_train.csv')

In [3]:
import re
columns = []
for column in full_data.columns:
    column = re.sub(r'[^a-zA-Z0-9]', '',column)
    columns.append(column)
full_data.columns = columns

Here, the data used correspond to the results of the Feature Engineering and Selection Step. For simplicity, during Hyperparameter Optimization, only Wind Farm 3 Training Data is used. Wind Farm 3 is selected here because it has the greatest Production, which is a positive thing for the Tuning.

In [4]:
X = full_data[full_data['Set']=='Train']
WF = 'WF3'
X = X[X['WF']==WF]
y = full_label[full_label['ID'].isin(X['ID'])]

In [5]:
X = X.drop(['ID','WF','Set'],axis=1)

Here, differentiation (numpy.diff) is applied to the data, before the model is trained.

In [6]:
X = transform_data(X)

## Validation Scheme

Before proceeding to the Hyperparameter search, it is necessary first to have some way to reliably measure the performance of the model. For this purpose, it will be used a Time Split Cross Validation Method, were the "Test" Fold for each Iteration is going to be used as the Validation Data, and so, to make Early Stopping on the data.

In [7]:
k_fold_splits = 5
num_boost_round = 1000
early_stopping_rounds = 100

In [8]:
def objective(param,k_fold_splits=k_fold_splits,num_boost_round=num_boost_round,early_stopping_rounds=early_stopping_rounds):
    # Define Time Split Cross Validation
    tscv = TimeSeriesSplit(n_splits=k_fold_splits)

    # Separating Data from Hold Out Set

    X_cv, _, y_cv, _ = train_test_split(X, y, test_size=0.125, shuffle=False)

    # Set Objective Functions
    param['objective'] = 'mean_absolute_percentage_error'
    param['metric'] = 'mean_absolute_percentage_error'


    test_scores = np.empty(0)
    for train_index, test_index in tscv.split(X_cv):

        # Get the Data of the Split
        X_train, X_test = X_cv.iloc[train_index], X_cv.iloc[test_index]
        y_train, y_test = y_cv.iloc[train_index], y_cv.iloc[test_index]

        # Separating Training Set of Split on Train and Validation Subsets
        X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.143, shuffle=False)

        # Apply GPU-DF Transformation
        dtrain = lgb.Dataset(X_train,label=y_train['Production'])
        dval = lgb.Dataset(X_val,label=y_val['Production'])
        dtest = lgb.Dataset(X_test,label=y_test['Production'],free_raw_data=False).construct()

        # Train the Model
        progress = dict()
        bst = lgb.train(param, dtrain, num_boost_round=num_boost_round, valid_sets=[dval], valid_names=['eval'], early_stopping_rounds=early_stopping_rounds,verbose_eval=False,evals_result=progress)

        # Test Score
        preds = bst.predict(dtest.get_data(),num_iteration=bst.best_iteration)
        test_score = metric_cnr(preds,dtest)

        test_scores = np.append(test_scores,test_score[1])

    return {'loss' : test_scores.mean(), 'params' : param, 'status' : STATUS_OK, 'test_score_array' : test_scores}

In [9]:
# [Test] Objective Function with Train, Validation and Test Sets Only (No CV, Not being used at the Moment)

def objective_2(param,k_fold_splits=k_fold_splits,num_boost_round=num_boost_round,early_stopping_rounds=early_stopping_rounds):
    # Define Time Split Cross Validation
    tscv = TimeSeriesSplit(n_splits=k_fold_splits)

    # Separating Data from Hold Out Set
    X_cv, X_hold, y_cv, y_hold = train_test_split(X, y, test_size=0.125, shuffle=False)
    X_train, X_test, y_train, y_test = train_test_split(X_cv, y_cv, test_size=0.125, shuffle=False)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.125, shuffle=False)

    # Apply GPU-DF Transformation
    dtrain = lgb.Dataset(X_train,label=y_train['Production'])
    dval = lgb.Dataset(X_val,label=y_val['Production'])
    dtest = lgb.Dataset(X_test,label=y_test['Production'],free_raw_data=False).construct()

    # Set Objective Functions
    param['objective'] = 'mean_absolute_percentage_error'
    param['metric'] = 'mean_absolute_percentage_error'

    progress = dict()
    bst = lgb.train(param, dtrain, num_boost_round=num_boost_round, valid_sets=[dval], valid_names=['eval'], early_stopping_rounds=early_stopping_rounds,verbose_eval=False,evals_result=progress)

    # Test Score
    preds = bst.predict(dtest.get_data(),num_iteration=bst.best_iteration)
    #preds = np.clip(preds,0,None)
    test_score = metric_cnr(preds,dtest)

    return {'loss' : test_score[1], 'params' : param, 'status' : STATUS_OK}

## Hyperparameter Tuning

For the Hyperparameter Tuning, the HyperOpt Library will be used, which implements some techniques for a more efficient search for parameters.

### Domain Space

In [10]:
space = {
    'max_depth' : 5 + hp.randint('max_depth', 30),
    'bagging_fraction' : hp.uniform('bagging_fraction', 0.5, 1),
    'feature_fraction' : hp.uniform('feature_fraction', 0.5, 1),
    'feature_fraction_bynode' : hp.uniform('feature_fraction_bynode', 0, 1),
    'min_data_in_leaf' : 1 + hp.randint('min_data_in_leaf', 100),
    'lambda_l2' : hp.uniform('lambda_l2', 0, 10),
    'lambda_l1' : hp.uniform('lambda_l1', 0, 10),
    'learning_rate' : hp.uniform('learning_rate', 0, 10),
    'num_leaves' : hp.randint('num_leaves', 50, 1000)
}

### Optimization Algorithm

In [11]:
tpe_algorithm = tpe.suggest
bayes_trials = Trials()

### Bayesian Optimization

In [12]:
MAX_EVALS = 100

In [13]:
best = fmin(fn = objective_2, space = space, algo = tpe.suggest, max_evals = MAX_EVALS, trials = bayes_trials, rstate = np.random.RandomState(50))

100%|██████████| 100/100 [27:40<00:00, 16.61s/trial, best loss: 70.87096655032013]


### Hold Out Score

Here, the best model selected by HyperOpt is applied to a Holdout Set, which occurs right after the data used in CV.

In [20]:
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=0.125, shuffle=False)
X_holdout = transform_data(X_holdout)
dhold = lgb.Dataset(X_holdout,label=y_holdout['Production'],free_raw_data=False).construct()

In [21]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.143, shuffle=False)

X_train = transform_data(X_train)
X_val = transform_data(X_val)

dtrain = lgb.Dataset(X_train,label=y_train['Production'])
dval = lgb.Dataset(X_val,label=y_val['Production'])

In [22]:
progress = dict()
bst = lgb.train(best, dtrain, num_boost_round=num_boost_round, valid_sets=[dval], valid_names=['eval'], early_stopping_rounds=early_stopping_rounds,verbose_eval=False,evals_result=progress)

ValueError: For early stopping, at least one dataset and eval metric is required for evaluation

In [17]:
preds = bst.predict(dhold.get_data(),num_iteration=bst.best_iteration)
#preds = np.clip(preds,0,None)
score = metric_cnr(preds,dhold)

NameError: name 'bst' is not defined

The Score for the HoldOut Set is shown here.

In [18]:
print(score)

NameError: name 'score' is not defined

In [19]:
plt.figure(figsize=(15,8))
len_x = np.arange(len(dhold.get_label()))
plt.plot(len_x,dhold.get_label(),label='True Values')
plt.plot(len_x,preds,'r--',label='Predicts')
plt.legend()

NameError: name 'preds' is not defined

## Generating Predictions

Finally, the best model selected is applied to all Wind Farms separately, using all Training Data for Training and Predicting all the Test Data at once (At least for the moment).

In [102]:
best

{'bagging_fraction': 0.6162117210484838,
 'feature_fraction': 0.8121297725774304,
 'feature_fraction_bynode': 0.600722729840607,
 'lambda_l1': 1.8162137175711186,
 'lambda_l2': 2.303994028222712,
 'learning_rate': 0.10474710600194667,
 'max_depth': 5,
 'min_data_in_leaf': 12,
 'num_leaves': 771,
 'objective': 'mean_absolute_percentage_error',
 'metric': 'mean_absolute_percentage_error'}

In [22]:
preds = []
for WF in full_data['WF'].unique():
    X_WF = full_data[full_data['WF']==WF]
    X_train = X_WF[X_WF['Set']=='Train']
    y_train = full_label[full_label['ID'].isin(X_train['ID'])]

    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.143, shuffle=False)

    X_test = X_WF[X_WF['Set']=='Test']

    #Transform Data
    X_train = transform_data(X_train.drop(['ID','WF','Set'],axis=1))
    X_val = transform_data(X_val.drop(['ID','WF','Set'],axis=1))
    X_test = transform_data(X_test.drop(['ID','WF','Set'],axis=1))

    dtrain = lgb.Dataset(X_train,label=y_train['Production'])
    dval = lgb.Dataset(X_val,label=y_val['Production'])

    bst = lgb.train(params=best,train_set=dtrain,num_boost_round=num_boost_round, valid_sets=[dval], valid_names=['eval'], early_stopping_rounds=early_stopping_rounds,verbose_eval=False)
    pred = bst.predict(X_test)

    preds = np.append(preds,pred)

### Generate Submissions

In [23]:
preds_id = pd.read_csv(r'C:\Users\andre_\OneDrive\Documentos\GitHub\cnr\Data\random_submission_example.csv')['ID']

In [24]:
submission = pd.DataFrame()
submission['ID'] = preds_id
submission['Production'] = np.clip(preds,0,None)
submission = submission.set_index('ID')

In [25]:
submission

Unnamed: 0_level_0,Production
ID,Unnamed: 1_level_1
37376,0.116467
37377,0.116467
37378,0.113701
37379,0.114883
37380,0.115353
...,...
73900,0.621626
73901,0.501482
73902,0.286979
73903,0.402168


In [26]:
submission.to_csv(r'C:\Users\andre_\OneDrive\Documentos\GitHub\cnr\Data/Submission_LightGBM.csv')